crawler-commons/crawler-commons

Merge pull request #430 from sebastian-nagel/cc-390-114-robots-closing-rule-group

[Robots.txt] Close groups of rules as defined in RFC 9309
Sebastian Nagel, 2023-07-12 10:35:48 +02:00 (committed by GitHub)
commit 871e4e61d2
Signed by: GitHub
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 151 additions and 18 deletions
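
In RFC 9309 terms, a group is one or more consecutive User-agent lines followed by the rules that apply to those agents; the next User-agent line after a rule closes the group. This change makes the Crawl-delay directive respect these group boundaries. A minimal sketch of the resulting behavior (not part of the diff; the agent names and URL are made up, and the long-standing String-based parseContent() overload is assumed):

import java.nio.charset.StandardCharsets;

import crawlercommons.robots.BaseRobotRules;
import crawlercommons.robots.SimpleRobotRulesParser;

public class GroupBoundarySketch {
    public static void main(String[] args) {
        String robotsTxt = "User-agent: mybot\n" // first group (specific agent)
                        + "Crawl-delay: 2\n" //
                        + "\n" //
                        + "User-agent: *\n" // a new User-agent line closes the first group
                        + "Crawl-delay: 1\n";
        BaseRobotRules rules = new SimpleRobotRulesParser().parseContent(
                        "https://www.example.com/robots.txt",
                        robotsTxt.getBytes(StandardCharsets.UTF_8), "text/plain", "mybot");
        // "mybot" keeps the Crawl-delay of its own group:
        System.out.println(rules.getCrawlDelay()); // 2000 (ms), not 1000 from the wildcard group
    }
}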

src/main/java/crawlercommons/robots/BaseRobotRules.java

@@ -47,10 +47,20 @@ public abstract class BaseRobotRules implements Serializable {
_sitemaps = new LinkedHashSet<>();
}
/**
* Get Crawl-delay (in milliseconds)
*
* @return Crawl-delay defined in the robots.txt for the given agent name,
* or {@link UNSET_CRAWL_DELAY} if not defined.
*/
public long getCrawlDelay() {
return _crawlDelay;
}
/**
* @param crawlDelay
* Crawl-Delay in milliseconds
*/
public void setCrawlDelay(long crawlDelay) {
_crawlDelay = crawlDelay;
}
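
A minimal usage sketch for the accessors above (not part of the diff; same assumptions and imports as the sketch after the commit header — the millisecond unit and the UNSET_CRAWL_DELAY sentinel are the points of interest):

byte[] content = "User-agent: mybot\nCrawl-delay: 3\n".getBytes(StandardCharsets.UTF_8);
BaseRobotRules rules = new SimpleRobotRulesParser()
                .parseContent("https://www.example.com/robots.txt", content, "text/plain", "mybot");
long delay = rules.getCrawlDelay(); // 3000: robots.txt counts seconds, the API milliseconds
if (delay == BaseRobotRules.UNSET_CRAWL_DELAY) {
    delay = 5000L; // no Crawl-delay defined; fall back to a crawler-specific default
}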

src/main/java/crawlercommons/robots/SimpleRobotRulesParser.java

@@ -207,12 +207,23 @@ public class SimpleRobotRulesParser extends BaseRobotsParser {
* agent.
*/
private boolean _addingRules;
/**
* Flag that is true while it is allowed to add a Crawl-delay for the
* given agent.
*/
private boolean _addingCrawlDelay;
/**
* Flag indicating whether all consecutive agent fields have been seen.
* It is set to false when an agent field is found and back to true
* once anything else has been found.
*/
private boolean _finishedAgentFields;
/**
* Flag indicating whether the Crawl-delay was set for the given agent
* name (false means either not set at all or set for the wildcard
* agent).
*/
private boolean _crawlDelaySetRealName;
/*
* Counter of warnings reporting invalid rules/lines in the robots.txt
@@ -259,6 +270,14 @@ public class SimpleRobotRulesParser extends BaseRobotsParser {
_addingRules = addingRules;
}
public boolean isAddingCrawlDelay() {
return _addingCrawlDelay;
}
public void setAddingCrawlDelay(boolean addingCrawlDelay) {
_addingCrawlDelay = addingCrawlDelay;
}
public boolean isFinishedAgentFields() {
return _finishedAgentFields;
}
@@ -275,8 +294,64 @@ public class SimpleRobotRulesParser extends BaseRobotsParser {
_curRules.addRule(prefix, allow);
}
/**
* Set the Crawl-delay given the current parse state.
*
* <p>
* The <a href=
* "https://en.wikipedia.org/wiki/Robots.txt#Crawl-delay_directive">Crawl
* -delay</a> is not part of RFC 9309 treating it (same as the Sitemap
* directive) as <a href=
* "https://www.rfc-editor.org/rfc/rfc9309.html#name-other-records">other
* records</a> and specifies: <cite>Parsing of other records MUST NOT
* interfere with the parsing of explicitly defined records</cite>.
* </p>
*
* This method sets the Crawl-delay in the robots rules
* ({@link BaseRobotRules#setCrawlDelay(long)})
* <ul>
* <li>always if the given agent name was matched in the current
* group</li>
* <li>for wildcard agent groups, only if the crawl-delay wasn't already
* defined for the given agent</li>
* </ul>
*
* <p>
* If the Crawl-delay is defined multiple times, resolve the conflict
* and override the current value only if the new value is shorter than
* the already defined one. But never override a Crawl-delay defined for
* a specific agent with a value defined for the wildcard agent.
* </p>
*/
public void setCrawlDelay(long delay) {
if (_matchedRealName) {
// set crawl-delay for the given agent name
if (_crawlDelaySetRealName && _addingCrawlDelay) {
/*
* Crawl-delay defined twice for the same agent or defined
* for more than one of multiple agent names.
*
* Resolve conflict and select shortest Crawl-delay.
*/
if (_curRules.getCrawlDelay() > delay) {
_curRules.setCrawlDelay(delay);
}
} else {
_curRules.setCrawlDelay(delay);
}
_crawlDelaySetRealName = true;
} else if (_curRules.getCrawlDelay() == BaseRobotRules.UNSET_CRAWL_DELAY) {
// not set yet
_curRules.setCrawlDelay(delay);
} else if (_curRules.getCrawlDelay() > delay) {
// Crawl-delay defined twice for the wildcard agent.
// Resolve conflict and select shortest Crawl-delay.
_curRules.setCrawlDelay(delay);
}
}
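/*
 * Illustration (not part of this diff; the agent name is made up): if
 * the same agent matches two groups with different Crawl-delay values,
 *
 *   User-agent: mybot
 *   Crawl-delay: 5
 *
 *   User-agent: mybot
 *   Crawl-delay: 1
 *
 * the shorter value wins: getRobotRules().getCrawlDelay() == 1000 (ms).
 * A wildcard group's Crawl-delay, in contrast, never replaces a value
 * set by a group that matched the agent name.
 */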
public void clearCrawlDelay() {
_curRules.setCrawlDelay(BaseRobotRules.UNSET_CRAWL_DELAY);
}
public SimpleRobotRules getRobotRules() {
@@ -689,36 +764,28 @@ public class SimpleRobotRulesParser extends BaseRobotsParser {
break;
case CRAWL_DELAY:
parseState.setFinishedAgentFields(true);
handleCrawlDelay(parseState, token);
parseState.setAddingCrawlDelay(false);
break;
case SITEMAP:
parseState.setFinishedAgentFields(true);
handleSitemap(parseState, token);
break;
case HTTP:
parseState.setFinishedAgentFields(true);
handleHttp(parseState, token);
break;
case UNKNOWN:
reportWarning(parseState, "Unknown directive in robots.txt file: {}", line);
parseState.setFinishedAgentFields(true);
break;
case MISSING:
reportWarning(parseState, "Unknown line in robots.txt file (size {}): {}", content.length, line);
parseState.setFinishedAgentFields(true);
break;
default:
// All others we just ignore
// TODO KKr - which of these should be setting
// finishedAgentFields to true?
// TODO KKr - handle no-index
// TODO KKr - handle request-rate and visit-time
break;
}
}
@@ -790,9 +857,10 @@ public class SimpleRobotRulesParser extends BaseRobotsParser {
*/
private void handleUserAgent(ParseState state, RobotToken token) {
// If we are adding rules, and had already finished reading user agent
// fields, then we are in a new section, hence stop adding rules.
if (state.isAddingRules() && state.isFinishedAgentFields()) {
state.setAddingRules(false);
state.setAddingCrawlDelay(false);
}
// Clearly we've encountered a new user agent directive, hence we need to start
@@ -810,14 +878,17 @@ public class SimpleRobotRulesParser extends BaseRobotsParser {
} else if (agentName.equals("*") && !state.isMatchedRealName()) {
state.setMatchedWildcard(true);
state.setAddingRules(true);
state.setAddingCrawlDelay(true);
} else if (targetNames.contains(agentName) || (!isValidUserAgentToObey(agentName) && userAgentProductTokenPartialMatch(agentName, targetNames))) {
if (state.isMatchedWildcard()) {
// Clear rules and Crawl-delay of the wildcard user-agent
// found before the non-wildcard user-agent match.
state.clearRules();
state.clearCrawlDelay();
}
state.setMatchedRealName(true);
state.setAddingRules(true);
state.setAddingCrawlDelay(true);
state.setMatchedWildcard(false);
}
} else {
@@ -834,6 +905,7 @@ public class SimpleRobotRulesParser extends BaseRobotsParser {
if (agentNameFull.equals("*") && !state.isMatchedRealName()) {
state.setMatchedWildcard(true);
state.setAddingRules(true);
state.setAddingCrawlDelay(true);
} else if (userAgentProductTokenPartialMatch(agentNameFull, targetNames)) {
// match "butterfly" in the line "User-agent: Butterfly/1.0"
matched = true;
@@ -854,12 +926,14 @@ public class SimpleRobotRulesParser extends BaseRobotsParser {
}
if (matched) {
if (state.isMatchedWildcard()) {
// Clear rules and Crawl-delay of the wildcard user-agent
// found before the non-wildcard user-agent match.
state.clearRules();
state.clearCrawlDelay();
}
state.setMatchedRealName(true);
state.setAddingRules(true);
state.setAddingCrawlDelay(true);
state.setMatchedWildcard(false);
}
}
@@ -967,7 +1041,7 @@ public class SimpleRobotRulesParser extends BaseRobotsParser {
* data for directive
*/
private void handleCrawlDelay(ParseState state, RobotToken token) {
if (!state.isAddingCrawlDelay()) {
return;
}
@@ -987,6 +1061,8 @@ public class SimpleRobotRulesParser extends BaseRobotsParser {
reportWarning(state, "Error parsing robots rules - can't decode crawl delay: {}", delayString);
}
}
state.setAddingCrawlDelay(false);
}
/**

src/test/java/crawlercommons/robots/SimpleRobotRulesParserTest.java

@@ -27,6 +27,8 @@ import java.util.Arrays;
import java.util.Collection;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import static java.nio.charset.StandardCharsets.US_ASCII;
@@ -942,10 +944,55 @@ public class SimpleRobotRulesParserTest {
+ "User-agent: qihoobot" + CR //
+ "Disallow: /";
// Note: In compliance with RFC 9309, for Googlebot a specific
// Crawl-delay is set. However, Googlebot is not allowed to crawl any
// page (same as qihoobot).
BaseRobotRules rules = createRobotRules("googlebot/2.1", krugleRobotsTxt, false);
assertEquals(1000L, rules.getCrawlDelay());
assertFalse(rules.isAllowed("http://www.krugle.com/examples/index.html"));
assertFalse(rules.isAllowed("http://www.krugle.com/"));
rules = createRobotRules("googlebot", krugleRobotsTxt, true);
assertEquals(1000L, rules.getCrawlDelay());
assertFalse(rules.isAllowed("http://www.krugle.com/examples/index.html"));
assertFalse(rules.isAllowed("http://www.krugle.com/"));
rules = createRobotRules("qihoobot", krugleRobotsTxt, true);
assertEquals(BaseRobotRules.UNSET_CRAWL_DELAY, rules.getCrawlDelay());
assertFalse(rules.isAllowed("http://www.krugle.com/examples/index.html"));
assertFalse(rules.isAllowed("http://www.krugle.com/"));
rules = createRobotRules("anybot", krugleRobotsTxt, true);
assertEquals(3000L, rules.getCrawlDelay());
assertFalse(rules.isAllowed("http://www.krugle.com/examples/index.html"));
assertTrue(rules.isAllowed("http://www.krugle.com/"));
}
/** Test selection of Crawl-delay directives in robots.txt (see #114) */
@Test
void testSelectCrawlDelayGroup() {
final String robotsTxt = "User-Agent: bingbot" + CRLF //
+ "Crawl-delay: 1" + CRLF + CRLF //
+ "User-Agent: msnbot" + CRLF //
+ "Crawl-delay: 2" + CRLF + CRLF //
+ "User-Agent: pinterestbot" + CRLF //
+ "Crawl-delay: 0.2" + CRLF + CRLF //
+ "User-agent: *" + CRLF //
+ "Disallow: /login" + CRLF //
+ "Sitemap: http://www.example.com/sitemap.xml" + CRLF //
+ "Disallow: /assets/";
// test for specific Crawl-delay values but same set of rules
Map<String, Long> expectedCrawlDelays = Map.of("bingbot", 1000L, "msnbot", 2000L, "anybot", BaseRobotRules.UNSET_CRAWL_DELAY);
List<String> sitemaps = List.of("http://www.example.com/sitemap.xml");
for (Entry<String, Long> e : expectedCrawlDelays.entrySet()) {
BaseRobotRules rules = createRobotRules(e.getKey(), robotsTxt);
assertEquals(e.getValue(), rules.getCrawlDelay(), "Crawl-delay for " + e.getKey() + " = " + e.getValue());
assertFalse(rules.isAllowed("http://www.example.com/login"), "Disallow path /login for all agents");
assertFalse(rules.isAllowed("http://www.example.com/assets/"), "Disallow path /assets for all agents");
assertTrue(rules.isAllowed("http://www.example.com/"), "Implicitly allowed");
assertEquals(sitemaps, rules.getSitemaps());
}
}
@Test