Mirror of https://github.com/crawler-commons/crawler-commons (synced 2024-05-03 22:26:15 +02:00)
Merge pull request #430 from sebastian-nagel/cc-390-114-robots-closing-rule-group
[Robots.txt] Close groups of rules as defined in RFC 9309
commit 871e4e61d2
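For context: RFC 9309 groups one or more consecutive User-agent lines with the rules that follow them; the next User-agent line after at least one rule closes the group and starts a new one. A small, invented robots.txt illustrates the boundary:

    User-agent: examplebot
    Disallow: /private/
    Crawl-delay: 2
    User-agent: otherbot
    Disallow: /other/

Here the second User-agent line starts a new group, so /other/ and any later directives apply to otherbot only. The changes below make the parser close the Crawl-delay handling of a group at such boundaries as well, instead of letting later Crawl-delay lines leak into the previous group.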
BaseRobotRules.java

@@ -47,10 +47,20 @@ public abstract class BaseRobotRules implements Serializable {
         _sitemaps = new LinkedHashSet<>();
     }
 
+    /**
+     * Get Crawl-delay (in milliseconds)
+     *
+     * @return Crawl-delay defined in the robots.txt for the given agent name,
+     *         or {@link UNSET_CRAWL_DELAY} if not defined.
+     */
     public long getCrawlDelay() {
         return _crawlDelay;
     }
 
+    /**
+     * @param crawlDelay
+     *            Crawl-Delay in milliseconds
+     */
     public void setCrawlDelay(long crawlDelay) {
         _crawlDelay = crawlDelay;
     }
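For orientation, a minimal usage sketch (not part of the patch) of reading the Crawl-delay off parsed rules. SimpleRobotRules is crawler-commons' concrete subclass of BaseRobotRules; the 2-second value is made up.

import crawlercommons.robots.BaseRobotRules;
import crawlercommons.robots.SimpleRobotRules;

public class CrawlDelaySketch {
    public static void main(String[] args) {
        BaseRobotRules rules = new SimpleRobotRules();
        rules.setCrawlDelay(2000L); // stored in milliseconds
        // UNSET_CRAWL_DELAY signals "no Crawl-delay defined in robots.txt"
        if (rules.getCrawlDelay() != BaseRobotRules.UNSET_CRAWL_DELAY) {
            System.out.println("wait " + rules.getCrawlDelay() + " ms between fetches");
        }
    }
}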
SimpleRobotRulesParser.java

@@ -207,12 +207,23 @@ public class SimpleRobotRulesParser extends BaseRobotsParser {
          * agent.
          */
         private boolean _addingRules;
+        /**
+         * Flag that is true as long as it is allowed to add a Crawl-delay for
+         * the given agent.
+         */
+        private boolean _addingCrawlDelay;
         /**
          * Flag indicating whether all consecutive agent fields have been seen.
          * It is set to false if an agent field is found and back to true if
          * something else has been found.
          */
         private boolean _finishedAgentFields;
+        /**
+         * Flag indicating whether the Crawl-delay was set for the given agent
+         * name (false means either not set at all or set for the wildcard
+         * agent).
+         */
+        private boolean _crawlDelaySetRealName;
 
         /*
          * Counter of warnings reporting invalid rules/lines in the robots.txt
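For orientation, a rough walk-through (not from the patch) of how these ParseState flags evolve while parsing a hypothetical file, with the crawler's own name being examplebot:

    User-agent: examplebot   ->  group matched: _addingRules=true, _addingCrawlDelay=true,
                                 _finishedAgentFields=false
    Disallow: /private/      ->  rule recorded; _finishedAgentFields=true
    Crawl-delay: 2           ->  delay recorded; _crawlDelaySetRealName=true,
                                 then _addingCrawlDelay=false
    User-agent: otherbot     ->  agent fields were already finished, so the previous
                                 group is closed: _addingRules=false, _addingCrawlDelay=false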
@@ -259,6 +270,14 @@ public class SimpleRobotRulesParser extends BaseRobotsParser {
             _addingRules = addingRules;
         }
 
+        public boolean isAddingCrawlDelay() {
+            return _addingCrawlDelay;
+        }
+
+        public void setAddingCrawlDelay(boolean addingCrawlDelay) {
+            _addingCrawlDelay = addingCrawlDelay;
+        }
+
         public boolean isFinishedAgentFields() {
             return _finishedAgentFields;
         }
@@ -275,8 +294,64 @@ public class SimpleRobotRulesParser extends BaseRobotsParser {
             _curRules.addRule(prefix, allow);
         }
 
+        /**
+         * Set the Crawl-delay given the current parse state.
+         *
+         * <p>
+         * The <a href=
+         * "https://en.wikipedia.org/wiki/Robots.txt#Crawl-delay_directive">Crawl-delay</a>
+         * is not part of RFC 9309, which treats it (like the Sitemap
+         * directive) as one of the <a href=
+         * "https://www.rfc-editor.org/rfc/rfc9309.html#name-other-records">other
+         * records</a> and specifies: <cite>Parsing of other records MUST NOT
+         * interfere with the parsing of explicitly defined records</cite>.
+         * </p>
+         *
+         * This method sets the Crawl-delay in the robots rules
+         * ({@link BaseRobotRules#setCrawlDelay(long)})
+         * <ul>
+         * <li>always, if the given agent name was matched in the current
+         * group</li>
+         * <li>for wildcard agent groups, only if the Crawl-delay wasn't already
+         * defined for the given agent</li>
+         * </ul>
+         *
+         * <p>
+         * If the Crawl-delay is defined multiple times, resolve the conflict
+         * and override the current value if the new value is shorter than the
+         * already defined one. But do not override a Crawl-delay defined for a
+         * specific agent by a value defined for the wildcard agent.
+         * </p>
+         */
         public void setCrawlDelay(long delay) {
-            _curRules.setCrawlDelay(delay);
+            if (_matchedRealName) {
+                // set crawl-delay for the given agent name
+                if (_crawlDelaySetRealName && _addingCrawlDelay) {
+                    /*
+                     * Crawl-delay defined twice for the same agent or defined
+                     * for more than one of multiple agent names.
+                     *
+                     * Resolve conflict and select shortest Crawl-delay.
+                     */
+                    if (_curRules.getCrawlDelay() > delay) {
+                        _curRules.setCrawlDelay(delay);
+                    }
+                } else {
+                    _curRules.setCrawlDelay(delay);
+                }
+                _crawlDelaySetRealName = true;
+            } else if (_curRules.getCrawlDelay() == BaseRobotRules.UNSET_CRAWL_DELAY) {
+                // not set yet
+                _curRules.setCrawlDelay(delay);
+            } else if (_curRules.getCrawlDelay() > delay) {
+                // Crawl-delay defined twice for the wildcard agent.
+                // Resolve conflict and select shortest Crawl-delay.
+                _curRules.setCrawlDelay(delay);
+            }
+        }
+
+        public void clearCrawlDelay() {
+            _curRules.setCrawlDelay(BaseRobotRules.UNSET_CRAWL_DELAY);
+        }
 
         public SimpleRobotRules getRobotRules() {
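A small sketch (not from the patch) of the conflict resolution implemented in setCrawlDelay above, using the public parseContent API. It assumes the String-based parseContent overload; the agent name "examplebot" and the robots.txt content are invented.

import crawlercommons.robots.BaseRobotRules;
import crawlercommons.robots.SimpleRobotRulesParser;
import static java.nio.charset.StandardCharsets.UTF_8;

public class CrawlDelayConflictSketch {
    public static void main(String[] args) {
        SimpleRobotRulesParser parser = new SimpleRobotRulesParser();

        // Crawl-delay defined in two groups matching the same agent:
        // the shorter value wins.
        String twoGroups = "User-agent: examplebot\n" + "Crawl-delay: 5\n\n" //
                        + "User-agent: examplebot\n" + "Crawl-delay: 2\n";
        BaseRobotRules rules = parser.parseContent("https://www.example.com/robots.txt", //
                        twoGroups.getBytes(UTF_8), "text/plain", "examplebot");
        System.out.println(rules.getCrawlDelay()); // 2000 (milliseconds)

        // A wildcard group seen later does not override the agent-specific value.
        String specificThenWildcard = "User-agent: examplebot\n" + "Crawl-delay: 2\n\n" //
                        + "User-agent: *\n" + "Crawl-delay: 10\n";
        rules = parser.parseContent("https://www.example.com/robots.txt", //
                        specificThenWildcard.getBytes(UTF_8), "text/plain", "examplebot");
        System.out.println(rules.getCrawlDelay()); // still 2000
    }
}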
@@ -689,36 +764,28 @@ public class SimpleRobotRulesParser extends BaseRobotsParser {
                     break;
 
                 case CRAWL_DELAY:
                     parseState.setFinishedAgentFields(true);
                     handleCrawlDelay(parseState, token);
+                    parseState.setAddingCrawlDelay(false);
                     break;
 
                 case SITEMAP:
                     parseState.setFinishedAgentFields(true);
                     handleSitemap(parseState, token);
                     break;
 
                 case HTTP:
                     parseState.setFinishedAgentFields(true);
                     handleHttp(parseState, token);
                     break;
 
                 case UNKNOWN:
                     reportWarning(parseState, "Unknown directive in robots.txt file: {}", line);
                     parseState.setFinishedAgentFields(true);
                     break;
 
                 case MISSING:
                     reportWarning(parseState, "Unknown line in robots.txt file (size {}): {}", content.length, line);
                     parseState.setFinishedAgentFields(true);
                     break;
 
                 default:
                     // All others we just ignore
                     // TODO KKr - which of these should be setting
                     // finishedAgentFields to true?
                     // TODO KKr - handle no-index
                     // TODO KKr - handle request-rate and visit-time
                     break;
             }
         }
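A sketch of the intended behavior (invented names and URLs, assuming the String-based parseContent overload): a Sitemap line is collected globally as an RFC 9309 "other record", but it still ends the run of agent lines, so the following User-agent line opens a new group whose Crawl-delay does not apply to examplebot.

import crawlercommons.robots.BaseRobotRules;
import crawlercommons.robots.SimpleRobotRulesParser;
import static java.nio.charset.StandardCharsets.UTF_8;

public class GroupClosingSketch {
    public static void main(String[] args) {
        String robotsTxt = "User-agent: examplebot\n" //
                        + "Disallow: /private/\n" //
                        + "Sitemap: https://www.example.com/sitemap.xml\n" //
                        + "User-agent: otherbot\n" //
                        + "Crawl-delay: 7\n";
        BaseRobotRules rules = new SimpleRobotRulesParser().parseContent( //
                        "https://www.example.com/robots.txt", //
                        robotsTxt.getBytes(UTF_8), "text/plain", "examplebot");
        System.out.println(rules.getSitemaps()); // sitemap recorded for any agent
        System.out.println(rules.isAllowed("https://www.example.com/private/x")); // false
        // otherbot's Crawl-delay must not leak into examplebot's group:
        System.out.println(rules.getCrawlDelay() == BaseRobotRules.UNSET_CRAWL_DELAY); // true
    }
}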
@@ -790,9 +857,10 @@ public class SimpleRobotRulesParser extends BaseRobotsParser {
      */
     private void handleUserAgent(ParseState state, RobotToken token) {
         // If we are adding rules, and had already finished reading user agent
-        // fields, then we are in a new section, hence stop adding rules
+        // fields, then we are in a new section, hence stop adding rules.
         if (state.isAddingRules() && state.isFinishedAgentFields()) {
             state.setAddingRules(false);
+            state.setAddingCrawlDelay(false);
         }
 
         // Clearly we've encountered a new user agent directive, hence we need to start
@@ -810,14 +878,17 @@ public class SimpleRobotRulesParser extends BaseRobotsParser {
             } else if (agentName.equals("*") && !state.isMatchedRealName()) {
                 state.setMatchedWildcard(true);
                 state.setAddingRules(true);
+                state.setAddingCrawlDelay(true);
             } else if (targetNames.contains(agentName) || (!isValidUserAgentToObey(agentName) && userAgentProductTokenPartialMatch(agentName, targetNames))) {
                 if (state.isMatchedWildcard()) {
-                    // Clear rules of the wildcard user-agent found
-                    // before the non-wildcard user-agent match.
+                    // Clear rules and Crawl-delay of the wildcard user-agent
+                    // found before the non-wildcard user-agent match.
                     state.clearRules();
+                    state.clearCrawlDelay();
                 }
                 state.setMatchedRealName(true);
                 state.setAddingRules(true);
+                state.setAddingCrawlDelay(true);
                 state.setMatchedWildcard(false);
             }
         } else {
|
@ -834,6 +905,7 @@ public class SimpleRobotRulesParser extends BaseRobotsParser {
|
|||
if (agentNameFull.equals("*") && !state.isMatchedRealName()) {
|
||||
state.setMatchedWildcard(true);
|
||||
state.setAddingRules(true);
|
||||
state.setAddingCrawlDelay(true);
|
||||
} else if (userAgentProductTokenPartialMatch(agentNameFull, targetNames)) {
|
||||
// match "butterfly" in the line "User-agent: Butterfly/1.0"
|
||||
matched = true;
|
||||
|
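The comment above ("match butterfly ...") can be exercised directly. A sketch with invented names: a crawler configured as "butterfly" obeys a group declared for the product token "Butterfly/1.0".

import crawlercommons.robots.BaseRobotRules;
import crawlercommons.robots.SimpleRobotRulesParser;
import static java.nio.charset.StandardCharsets.UTF_8;

public class ProductTokenMatchSketch {
    public static void main(String[] args) {
        String robotsTxt = "User-agent: Butterfly/1.0\n" //
                        + "Disallow: /private/\n";
        BaseRobotRules rules = new SimpleRobotRulesParser().parseContent( //
                        "https://www.example.com/robots.txt", //
                        robotsTxt.getBytes(UTF_8), "text/plain", "butterfly");
        System.out.println(rules.isAllowed("https://www.example.com/private/x")); // false
    }
}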
@@ -854,12 +926,14 @@ public class SimpleRobotRulesParser extends BaseRobotsParser {
             }
             if (matched) {
                 if (state.isMatchedWildcard()) {
-                    // Clear rules of the wildcard user-agent found
-                    // before the non-wildcard user-agent match.
+                    // Clear rules and Crawl-delay of the wildcard user-agent
+                    // found before the non-wildcard user-agent match.
                     state.clearRules();
+                    state.clearCrawlDelay();
                 }
                 state.setMatchedRealName(true);
                 state.setAddingRules(true);
+                state.setAddingCrawlDelay(true);
                 state.setMatchedWildcard(false);
             }
         }
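A sketch of the wildcard-clearing behavior (invented content, assuming the String-based parseContent overload): when a specific user-agent group follows a wildcard group, both the rules and, with this change, the Crawl-delay collected from the wildcard group are discarded for that agent.

import crawlercommons.robots.BaseRobotRules;
import crawlercommons.robots.SimpleRobotRulesParser;
import static java.nio.charset.StandardCharsets.UTF_8;

public class WildcardClearingSketch {
    public static void main(String[] args) {
        String robotsTxt = "User-agent: *\n" //
                        + "Disallow: /\n" //
                        + "Crawl-delay: 10\n" //
                        + "\n" //
                        + "User-agent: examplebot\n" //
                        + "Allow: /\n";
        BaseRobotRules rules = new SimpleRobotRulesParser().parseContent( //
                        "https://www.example.com/robots.txt", //
                        robotsTxt.getBytes(UTF_8), "text/plain", "examplebot");
        // examplebot follows only its own group; the wildcard group's
        // Disallow and Crawl-delay do not apply to it:
        System.out.println(rules.isAllowed("https://www.example.com/page.html")); // true
        System.out.println(rules.getCrawlDelay() == BaseRobotRules.UNSET_CRAWL_DELAY); // true
    }
}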
@@ -967,7 +1041,7 @@ public class SimpleRobotRulesParser extends BaseRobotsParser {
      * data for directive
      */
     private void handleCrawlDelay(ParseState state, RobotToken token) {
-        if (!state.isAddingRules()) {
+        if (!state.isAddingCrawlDelay()) {
             return;
         }
@@ -987,6 +1061,8 @@ public class SimpleRobotRulesParser extends BaseRobotsParser {
                 reportWarning(state, "Error parsing robots rules - can't decode crawl delay: {}", delayString);
             }
         }
+
+        state.setAddingCrawlDelay(false);
     }
 
     /**
SimpleRobotRulesParserTest.java

@@ -27,6 +27,8 @@ import java.util.Arrays;
 import java.util.Collection;
 import java.util.List;
 import java.util.Locale;
+import java.util.Map;
+import java.util.Map.Entry;
 import java.util.Set;
 
 import static java.nio.charset.StandardCharsets.US_ASCII;
@@ -942,10 +944,55 @@ public class SimpleRobotRulesParserTest {
                         + "User-agent: qihoobot" + CR //
                         + "Disallow: /";
 
+        // Note: In compliance with RFC 9309, for Googlebot a specific
+        // Crawl-delay is set. However, Googlebot is not allowed to crawl any
+        // page (same as qihoobot).
         BaseRobotRules rules = createRobotRules("googlebot/2.1", krugleRobotsTxt, false);
-        assertTrue(rules.isAllowed("http://www.krugle.com/examples/index.html"));
+        assertEquals(1000L, rules.getCrawlDelay());
+        assertFalse(rules.isAllowed("http://www.krugle.com/examples/index.html"));
         assertFalse(rules.isAllowed("http://www.krugle.com/"));
 
         rules = createRobotRules("googlebot", krugleRobotsTxt, true);
-        assertTrue(rules.isAllowed("http://www.krugle.com/examples/index.html"));
+        assertEquals(1000L, rules.getCrawlDelay());
+        assertFalse(rules.isAllowed("http://www.krugle.com/examples/index.html"));
         assertFalse(rules.isAllowed("http://www.krugle.com/"));
 
+        rules = createRobotRules("qihoobot", krugleRobotsTxt, true);
+        assertEquals(BaseRobotRules.UNSET_CRAWL_DELAY, rules.getCrawlDelay());
+        assertFalse(rules.isAllowed("http://www.krugle.com/examples/index.html"));
+        assertFalse(rules.isAllowed("http://www.krugle.com/"));
+
+        rules = createRobotRules("anybot", krugleRobotsTxt, true);
+        assertEquals(3000L, rules.getCrawlDelay());
+        assertFalse(rules.isAllowed("http://www.krugle.com/examples/index.html"));
+        assertTrue(rules.isAllowed("http://www.krugle.com/"));
     }
 
+    /** Test selection of Crawl-delay directives in robots.txt (see #114) */
+    @Test
+    void testSelectCrawlDelayGroup() {
+        final String robotsTxt = "User-Agent: bingbot" + CRLF //
+                        + "Crawl-delay: 1" + CRLF + CRLF //
+                        + "User-Agent: msnbot" + CRLF //
+                        + "Crawl-delay: 2" + CRLF + CRLF //
+                        + "User-Agent: pinterestbot" + CRLF //
+                        + "Crawl-delay: 0.2" + CRLF + CRLF //
+                        + "User-agent: *" + CRLF //
+                        + "Disallow: /login" + CRLF //
+                        + "Sitemap: http://www.example.com/sitemap.xml" + CRLF //
+                        + "Disallow: /assets/";
+
+        // test for specific Crawl-delay values but same set of rules
+        Map<String, Long> expectedCrawlDelays = Map.of("bingbot", 1000L, "msnbot", 2000L, "anybot", BaseRobotRules.UNSET_CRAWL_DELAY);
+        List<String> sitemaps = List.of("http://www.example.com/sitemap.xml");
+        for (Entry<String, Long> e : expectedCrawlDelays.entrySet()) {
+            BaseRobotRules rules = createRobotRules(e.getKey(), robotsTxt);
+            assertEquals(e.getValue(), rules.getCrawlDelay(), "Crawl-delay for " + e.getKey() + " = " + e.getValue());
+            assertFalse(rules.isAllowed("http://www.example.com/login"), "Disallow path /login for all agents");
+            assertFalse(rules.isAllowed("http://www.example.com/assets/"), "Disallow path /assets for all agents");
+            assertTrue(rules.isAllowed("http://www.example.com/"), "Implicitly allowed");
+            assertEquals(sitemaps, rules.getSitemaps());
+        }
+    }
+
     @Test
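Note that createRobotRules(...) is a helper of the test class, not public API. Outside the tests, the closest equivalent is roughly the following fragment (a sketch, assuming the String-based parseContent overload):

BaseRobotRules rules = new SimpleRobotRulesParser().parseContent( //
                "http://www.example.com/robots.txt", //
                robotsTxt.getBytes(java.nio.charset.StandardCharsets.UTF_8), //
                "text/plain", "bingbot");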