mirror of
https://github.com/crawler-commons/crawler-commons
synced 2024-05-03 22:26:15 +02:00
[Robots.txt] Clarify behavior when to close blocks of multiple user-agents
- must keep state (as a separate variable) whether Crawl-delay is already set for a specific agent
- add a unit test to ensure that an already set Crawl-delay is not overridden by a (lower) value of another agent
This commit is contained in:
parent
17e8544980
commit
e67299432c
|
@ -198,6 +198,11 @@ public class SimpleRobotRulesParser extends BaseRobotsParser {
|
|||
* agent.
|
||||
*/
|
||||
private boolean _addingRules;
|
||||
/**
|
||||
 * Flag that is true as long as it is allowed to add a Crawl-delay for the given
|
||||
* agent.
|
||||
*/
|
||||
private boolean _addingCrawlDelay;
|
||||
/**
|
||||
 * Flag indicating whether all consecutive agent fields have been seen.
|
||||
* It is set to false if an agent field is found and back to true if
|
||||
|
@ -256,6 +261,14 @@ public class SimpleRobotRulesParser extends BaseRobotsParser {
|
|||
_addingRules = addingRules;
|
||||
}
|
||||
|
||||
public boolean isAddingCrawlDelay() {
|
||||
return _addingCrawlDelay;
|
||||
}
|
||||
|
||||
public void setAddingCrawlDelay(boolean addingCrawlDelay) {
|
||||
_addingCrawlDelay = addingCrawlDelay;
|
||||
}
|
||||
|
||||
public boolean isFinishedAgentFields() {
|
||||
return _finishedAgentFields;
|
||||
}
|
||||
|
@ -304,7 +317,7 @@ public class SimpleRobotRulesParser extends BaseRobotsParser {
|
|||
public void setCrawlDelay(long delay) {
|
||||
if (_matchedRealName) {
|
||||
// set crawl-delay for the given agent name
|
||||
if (_crawlDelaySetRealName) {
|
||||
if (_crawlDelaySetRealName && _addingCrawlDelay) {
|
||||
/*
|
||||
* Crawl-delay defined twice for the same agent or defined
|
||||
* for more than one of multiple agent names.
|
||||
|
@ -725,6 +738,7 @@ public class SimpleRobotRulesParser extends BaseRobotsParser {
|
|||
|
||||
case CRAWL_DELAY:
|
||||
handleCrawlDelay(parseState, token);
|
||||
parseState.setAddingCrawlDelay(false);
|
||||
break;
|
||||
|
||||
case SITEMAP:
|
||||
|
@ -819,6 +833,7 @@ public class SimpleRobotRulesParser extends BaseRobotsParser {
|
|||
// fields, then we are in a new section, hence stop adding rules.
|
||||
if (state.isAddingRules() && state.isFinishedAgentFields()) {
|
||||
state.setAddingRules(false);
|
||||
state.setAddingCrawlDelay(false);
|
||||
}
|
||||
|
||||
// Clearly we've encountered a new user agent directive, hence we need to start
|
||||
|
@ -836,6 +851,7 @@ public class SimpleRobotRulesParser extends BaseRobotsParser {
|
|||
} else if (agentName.equals("*") && !state.isMatchedRealName()) {
|
||||
state.setMatchedWildcard(true);
|
||||
state.setAddingRules(true);
|
||||
state.setAddingCrawlDelay(true);
|
||||
} else if (targetNames.contains(agentName) || (!isValidUserAgentToObey(agentName) && userAgentProductTokenPartialMatch(agentName, targetNames))) {
|
||||
if (state.isMatchedWildcard()) {
|
||||
// Clear rules and Crawl-delay of the wildcard user-agent
|
||||
|
@ -845,6 +861,7 @@ public class SimpleRobotRulesParser extends BaseRobotsParser {
|
|||
}
|
||||
state.setMatchedRealName(true);
|
||||
state.setAddingRules(true);
|
||||
state.setAddingCrawlDelay(true);
|
||||
state.setMatchedWildcard(false);
|
||||
}
|
||||
} else {
|
||||
|
@ -861,6 +878,7 @@ public class SimpleRobotRulesParser extends BaseRobotsParser {
|
|||
if (agentNameFull.equals("*") && !state.isMatchedRealName()) {
|
||||
state.setMatchedWildcard(true);
|
||||
state.setAddingRules(true);
|
||||
state.setAddingCrawlDelay(true);
|
||||
} else if (userAgentProductTokenPartialMatch(agentNameFull, targetNames)) {
|
||||
// match "butterfly" in the line "User-agent: Butterfly/1.0"
|
||||
matched = true;
|
||||
|
@ -888,6 +906,7 @@ public class SimpleRobotRulesParser extends BaseRobotsParser {
|
|||
}
|
||||
state.setMatchedRealName(true);
|
||||
state.setAddingRules(true);
|
||||
state.setAddingCrawlDelay(true);
|
||||
state.setMatchedWildcard(false);
|
||||
}
|
||||
}
|
||||
|
@ -980,7 +999,7 @@ public class SimpleRobotRulesParser extends BaseRobotsParser {
|
|||
* data for directive
|
||||
*/
|
||||
private void handleCrawlDelay(ParseState state, RobotToken token) {
|
||||
if (!state.isAddingRules()) {
|
||||
if (!state.isAddingCrawlDelay()) {
|
||||
return;
|
||||
}
|
||||
|
||||
|
@ -1000,6 +1019,8 @@ public class SimpleRobotRulesParser extends BaseRobotsParser {
|
|||
reportWarning(state, "Error parsing robots rules - can't decode crawl delay: {}", delayString);
|
||||
}
|
||||
}
|
||||
|
||||
state.setAddingCrawlDelay(false);
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
@ -945,9 +945,11 @@ public class SimpleRobotRulesParserTest {
|
|||
@Test
|
||||
void testSelectCrawlDelayGroup() {
|
||||
final String robotsTxt = "User-Agent: bingbot" + CRLF //
|
||||
+ "Crawl-delay: 1" + CRLF //
|
||||
+ "Crawl-delay: 1" + CRLF + CRLF //
|
||||
+ "User-Agent: msnbot" + CRLF //
|
||||
+ "Crawl-delay: 2" + CRLF //
|
||||
+ "Crawl-delay: 2" + CRLF + CRLF //
|
||||
+ "User-Agent: pinterestbot" + CRLF //
|
||||
+ "Crawl-delay: 0.2" + CRLF + CRLF //
|
||||
+ "User-agent: *" + CRLF //
|
||||
+ "Disallow: /login" + CRLF //
|
||||
+ "Sitemap: http://www.example.com/sitemap.xml" + CRLF //
|
||||
|
@ -958,9 +960,9 @@ public class SimpleRobotRulesParserTest {
|
|||
List<String> sitemaps = List.of("http://www.example.com/sitemap.xml");
|
||||
for (Entry<String, Long> e : expectedCrawlDelays.entrySet()) {
|
||||
BaseRobotRules rules = createRobotRules(e.getKey(), robotsTxt);
|
||||
assertEquals(e.getValue(), rules.getCrawlDelay());
|
||||
assertEquals(e.getValue(), rules.getCrawlDelay(), "Crawl-delay for " + e.getKey() + " = " + e.getValue());
|
||||
assertFalse(rules.isAllowed("http://www.example.com/login"), "Disallow path /login for all agents");
|
||||
assertFalse(rules.isAllowed("http://www.example.com/assets/"), "Disallow path /login for all agents");
|
||||
assertFalse(rules.isAllowed("http://www.example.com/assets/"), "Disallow path /assets for all agents");
|
||||
assertTrue(rules.isAllowed("http://www.example.com/"), "Implicitly allowed");
|
||||
assertEquals(sitemaps, rules.getSitemaps());
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue