1
0
Fork 0
mirror of https://github.com/crawler-commons/crawler-commons synced 2024-05-04 14:36:04 +02:00

[Robots.txt] Clarify behavior when to close blocks of multiple user-agents, closes #390

[Robots.txt] Handle robots.txt with missing sections (and implicit master rules), fixes #114
- do not close rule blocks / groups on directives other than those specified
  in RFC 9309: groups are only closed on a user-agent line if at least
  one allow/disallow line was read before
- set Crawl-delay independently from grouping, but never override
  or set the value for a specific agent using a value defined for the
  wildcard agent
This commit is contained in:
Sebastian Nagel 2023-06-13 12:31:07 +02:00
parent d710c85871
commit 4524cfb5c0

View File

@ -204,6 +204,12 @@ public class SimpleRobotRulesParser extends BaseRobotsParser {
* something else has been found.
*/
private boolean _finishedAgentFields;
/**
* Flag indicating whether the Crawl-delay was set for the given agent
* name (false means either not set at all or set for the wildcard
* agent).
*/
private boolean _crawlDelaySetRealName;
/*
* Counter of warnings reporting invalid rules/lines in the robots.txt
@ -266,8 +272,64 @@ public class SimpleRobotRulesParser extends BaseRobotsParser {
_curRules.addRule(prefix, allow);
}
/**
 * Set the Crawl-delay given the current parse state.
 *
 * <p>
 * The <a href=
 * "https://en.wikipedia.org/wiki/Robots.txt#Crawl-delay_directive">Crawl-delay</a>
 * is not part of RFC 9309, which treats it (same as the Sitemap
 * directive) as one of the <a href=
 * "https://www.rfc-editor.org/rfc/rfc9309.html#name-other-records">other
 * records</a> and specifies: <cite>Parsing of other records MUST NOT
 * interfere with the parsing of explicitly defined records</cite>.
 * </p>
 *
 * <p>
 * This method sets the Crawl-delay in the robots rules
 * ({@link BaseRobotRules#setCrawlDelay(long)})
 * <ul>
 * <li>always if the given agent name was matched in the current
 * group</li>
 * <li>for wildcard agent groups, only if the crawl-delay wasn't already
 * defined for the given agent</li>
 * </ul>
 *
 * <p>
 * In case of a multiply defined crawl-delay, resolve the conflict and
 * override the current value if the new value is shorter than the
 * already defined one. But never override a Crawl-delay defined for a
 * specific agent by a value defined for the wildcard agent.
 * </p>
 *
 * @param delay Crawl-delay to set (unit as parsed, see caller)
 */
public void setCrawlDelay(long delay) {
    if (_matchedRealName) {
        // set crawl-delay for the given agent name
        if (_crawlDelaySetRealName) {
            /*
             * Crawl-delay defined twice for the same agent or defined
             * for more than one of multiple agent names.
             *
             * Resolve the conflict and select the shortest Crawl-delay.
             */
            if (_curRules.getCrawlDelay() > delay) {
                _curRules.setCrawlDelay(delay);
            }
        } else {
            // first Crawl-delay seen for the given agent name
            _curRules.setCrawlDelay(delay);
        }
        _crawlDelaySetRealName = true;
    } else if (_curRules.getCrawlDelay() == BaseRobotRules.UNSET_CRAWL_DELAY) {
        // not set yet: accept the wildcard agent's value
        _curRules.setCrawlDelay(delay);
    } else if (!_crawlDelaySetRealName && _curRules.getCrawlDelay() > delay) {
        // Crawl-delay defined twice for the wildcard agent.
        // Resolve the conflict and select the shortest Crawl-delay,
        // but never override a value set for the specific agent.
        _curRules.setCrawlDelay(delay);
    }
}
/**
 * Unset the Crawl-delay, i.e. reset it to
 * {@link BaseRobotRules#UNSET_CRAWL_DELAY}. Used to drop a value taken
 * from a wildcard user-agent group once a group matching the specific
 * agent name is found (see the callers in user-agent handling).
 */
public void clearCrawlDelay() {
    _curRules.setCrawlDelay(BaseRobotRules.UNSET_CRAWL_DELAY);
}
public SimpleRobotRules getRobotRules() {
@ -662,36 +724,27 @@ public class SimpleRobotRulesParser extends BaseRobotsParser {
break;
case CRAWL_DELAY:
parseState.setFinishedAgentFields(true);
handleCrawlDelay(parseState, token);
break;
case SITEMAP:
parseState.setFinishedAgentFields(true);
handleSitemap(parseState, token);
break;
case HTTP:
parseState.setFinishedAgentFields(true);
handleHttp(parseState, token);
break;
case UNKNOWN:
reportWarning(parseState, "Unknown directive in robots.txt file: {}", line);
parseState.setFinishedAgentFields(true);
break;
case MISSING:
reportWarning(parseState, "Unknown line in robots.txt file (size {}): {}", content.length, line);
parseState.setFinishedAgentFields(true);
break;
default:
// All others we just ignore
// TODO KKr - which of these should be setting
// finishedAgentFields to true?
// TODO KKr - handle no-index
// TODO KKr - handle request-rate and visit-time
break;
}
}
@ -763,7 +816,7 @@ public class SimpleRobotRulesParser extends BaseRobotsParser {
*/
private void handleUserAgent(ParseState state, RobotToken token) {
// If we are adding rules, and had already finished reading user agent
// fields, then we are in a new section, hence stop adding rules
// fields, then we are in a new section, hence stop adding rules.
if (state.isAddingRules() && state.isFinishedAgentFields()) {
state.setAddingRules(false);
}
@ -785,9 +838,10 @@ public class SimpleRobotRulesParser extends BaseRobotsParser {
state.setAddingRules(true);
} else if (targetNames.contains(agentName) || (!isValidUserAgentToObey(agentName) && userAgentProductTokenPartialMatch(agentName, targetNames))) {
if (state.isMatchedWildcard()) {
// Clear rules of the wildcard user-agent found
// before the non-wildcard user-agent match.
// Clear rules and Crawl-delay of the wildcard user-agent
// found before the non-wildcard user-agent match.
state.clearRules();
state.clearCrawlDelay();
}
state.setMatchedRealName(true);
state.setAddingRules(true);
@ -827,9 +881,10 @@ public class SimpleRobotRulesParser extends BaseRobotsParser {
}
if (matched) {
if (state.isMatchedWildcard()) {
// Clear rules of the wildcard user-agent found
// before the non-wildcard user-agent match.
// Clear rules and Crawl-delay of the wildcard user-agent
// found before the non-wildcard user-agent match.
state.clearRules();
state.clearCrawlDelay();
}
state.setMatchedRealName(true);
state.setAddingRules(true);