mirror of
https://github.com/crawler-commons/crawler-commons
synced 2024-09-28 10:41:04 +02:00
Improve robots check draft rfc compliance
This commit is contained in:
parent
d3ccb553df
commit
1f0e79b72a
@ -163,13 +163,36 @@ public class SimpleRobotRules extends BaseRobotRules {
|
||||
return true;
|
||||
}
|
||||
|
||||
boolean isAllowed = true;
|
||||
int longestRuleMatch = Integer.MIN_VALUE;
|
||||
for (RobotRule rule : _rules) {
|
||||
if (ruleMatches(pathWithQuery, rule._prefix)) {
|
||||
return rule._allow;
|
||||
int matchLength = ruleMatches(pathWithQuery, rule._prefix);
|
||||
if (matchLength == -1) {
|
||||
// See precedence-of-rules test case for an example
|
||||
// Some webmasters expect behavior close to google's, and this block is equivalent to:
|
||||
// https://github.com/google/robotstxt/blob/02bc6cdfa32db50d42563180c42aeb47042b4f0c/robots.cc#L605-L618
|
||||
// There are example robots.txt in the wild that benefit from this.
|
||||
// As of 2/7/2022, https://venmo.com/robots.txt for instance.
|
||||
if (rule._prefix.endsWith("index.htm") || rule._prefix.endsWith("index.html")) {
|
||||
matchLength = ruleMatches(pathWithQuery, rule._prefix.substring(0, rule._prefix.indexOf("index.htm")) + "$");
|
||||
if (matchLength == -1) {
|
||||
continue;
|
||||
}
|
||||
} else {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
if (longestRuleMatch < matchLength) {
|
||||
longestRuleMatch = matchLength;
|
||||
isAllowed = rule.isAllow();
|
||||
} else if (longestRuleMatch == matchLength) {
|
||||
isAllowed |= rule.isAllow();
|
||||
}
|
||||
// else we've already got a more specific rule, and this match doesn't matter
|
||||
}
|
||||
|
||||
return true;
|
||||
return isAllowed;
|
||||
}
|
||||
}
|
||||
|
||||
@ -197,7 +220,7 @@ public class SimpleRobotRules extends BaseRobotRules {
|
||||
}
|
||||
}
|
||||
|
||||
private boolean ruleMatches(String text, String pattern) {
|
||||
private int ruleMatches(String text, String pattern) {
|
||||
int patternPos = 0;
|
||||
int textPos = 0;
|
||||
|
||||
@ -223,7 +246,7 @@ public class SimpleRobotRules extends BaseRobotRules {
|
||||
patternPos += 1;
|
||||
if (patternPos >= patternEnd) {
|
||||
// Pattern ends with '*', we're all good.
|
||||
return true;
|
||||
return pattern.length();
|
||||
}
|
||||
|
||||
// TODO - don't worry about having two '*' in a row?
|
||||
@ -255,14 +278,14 @@ public class SimpleRobotRules extends BaseRobotRules {
|
||||
|
||||
// If we matched, we're all set, otherwise we failed
|
||||
if (!matched) {
|
||||
return false;
|
||||
return -1;
|
||||
}
|
||||
} else {
|
||||
// See if the pattern from patternPos to wildcardPos matches the
|
||||
// text starting at textPos
|
||||
while ((patternPos < wildcardPos) && (textPos < textEnd)) {
|
||||
if (text.charAt(textPos++) != pattern.charAt(patternPos++)) {
|
||||
return false;
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -277,7 +300,11 @@ public class SimpleRobotRules extends BaseRobotRules {
|
||||
// We're at the end, so we have a match if the pattern was completely
|
||||
// consumed, and either we consumed all the text or we didn't have to
|
||||
// match it all (no '$' at end of the pattern)
|
||||
return (patternPos == patternEnd) && ((textPos == textEnd) || !containsEndChar);
|
||||
if ((patternPos == patternEnd) && ((textPos == textEnd) || !containsEndChar)) {
|
||||
return pattern.length();
|
||||
} else {
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -167,11 +167,6 @@ public class SimpleRobotRulesParser extends BaseRobotsParser {
|
||||
*/
|
||||
private boolean _finishedAgentFields;
|
||||
|
||||
// True if we're done adding rules for a matched (not wildcard) agent
|
||||
// name. When this is true, we only consider sitemap directives, so we
|
||||
// skip all remaining user agent blocks.
|
||||
private boolean _skipAgents;
|
||||
|
||||
/*
|
||||
* Counter of warnings reporting invalid rules/lines in the robots.txt
|
||||
* file. The counter is used to limit the number of logged warnings.
|
||||
@ -225,14 +220,6 @@ public class SimpleRobotRulesParser extends BaseRobotsParser {
|
||||
_finishedAgentFields = finishedAgentFields;
|
||||
}
|
||||
|
||||
public boolean isSkipAgents() {
|
||||
return _skipAgents;
|
||||
}
|
||||
|
||||
public void setSkipAgents(boolean skipAgents) {
|
||||
_skipAgents = skipAgents;
|
||||
}
|
||||
|
||||
public void clearRules() {
|
||||
_curRules.clearRules();
|
||||
}
|
||||
@ -513,22 +500,27 @@ public class SimpleRobotRulesParser extends BaseRobotsParser {
|
||||
break;
|
||||
|
||||
case DISALLOW:
|
||||
parseState.setFinishedAgentFields(true);
|
||||
handleDisallow(parseState, token);
|
||||
break;
|
||||
|
||||
case ALLOW:
|
||||
parseState.setFinishedAgentFields(true);
|
||||
handleAllow(parseState, token);
|
||||
break;
|
||||
|
||||
case CRAWL_DELAY:
|
||||
parseState.setFinishedAgentFields(true);
|
||||
handleCrawlDelay(parseState, token);
|
||||
break;
|
||||
|
||||
case SITEMAP:
|
||||
parseState.setFinishedAgentFields(true);
|
||||
handleSitemap(parseState, token);
|
||||
break;
|
||||
|
||||
case HTTP:
|
||||
parseState.setFinishedAgentFields(true);
|
||||
handleHttp(parseState, token);
|
||||
break;
|
||||
|
||||
@ -593,21 +585,16 @@ public class SimpleRobotRulesParser extends BaseRobotsParser {
|
||||
* data for directive
|
||||
*/
|
||||
private void handleUserAgent(ParseState state, RobotToken token) {
|
||||
if (state.isMatchedRealName()) {
|
||||
if (state.isFinishedAgentFields()) {
|
||||
state.setSkipAgents(true);
|
||||
}
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
if (state.isFinishedAgentFields()) {
|
||||
// We've got a user agent field, so we haven't yet seen anything
|
||||
// that tells us we're done with this set of agent names.
|
||||
state.setFinishedAgentFields(false);
|
||||
// If we are adding rules, and had already finished reading user agent
|
||||
// fields, then we are in a new section, hence stop adding rules
|
||||
if (state.isAddingRules() && state.isFinishedAgentFields()) {
|
||||
state.setAddingRules(false);
|
||||
}
|
||||
|
||||
// Clearly we've encountered a new user agent directive, hence we need to start
|
||||
// processing until we are finished.
|
||||
state.setFinishedAgentFields(false);
|
||||
|
||||
// Handle the case when there are multiple target names are passed
|
||||
// We assume we should do case-insensitive comparison of target name.
|
||||
String[] targetNames = state.getTargetName().toLowerCase(Locale.ROOT).split(",");
|
||||
@ -621,21 +608,24 @@ public class SimpleRobotRulesParser extends BaseRobotsParser {
|
||||
// TODO KKr - catch case of multiple names, log as non-standard.
|
||||
String[] agentNames = token.getData().split("[ \t,]");
|
||||
for (String agentName : agentNames) {
|
||||
// TODO should we do case-insensitive matching? Probably yes.
|
||||
agentName = agentName.trim().toLowerCase(Locale.ROOT);
|
||||
if (agentName.isEmpty()) {
|
||||
// Ignore empty names
|
||||
} else if (agentName.equals("*") && !state.isMatchedWildcard()) {
|
||||
} else if (agentName.equals("*") && !state.isMatchedRealName()) {
|
||||
state.setMatchedWildcard(true);
|
||||
state.setAddingRules(true);
|
||||
} else {
|
||||
// TODO use longest match as winner
|
||||
for (String targetName : targetNameSplits) {
|
||||
if (targetName.startsWith(agentName)) {
|
||||
// In case we previously hit a wildcard rule match
|
||||
// but we could also have hit our own user agent before.
|
||||
// only clear if wildcard was matched earlier
|
||||
if (state.isMatchedWildcard()) {
|
||||
state.clearRules();
|
||||
}
|
||||
state.setMatchedRealName(true);
|
||||
state.setAddingRules(true);
|
||||
// In case we previously hit a wildcard rule match
|
||||
state.clearRules();
|
||||
state.setMatchedWildcard(false);
|
||||
break;
|
||||
}
|
||||
}
|
||||
@ -644,6 +634,17 @@ public class SimpleRobotRulesParser extends BaseRobotsParser {
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Add any uniform rules to clean up path directives
|
||||
* @param path
|
||||
* @return clean path
|
||||
*/
|
||||
private String normalizePathDirective(String path) {
|
||||
path = path.trim();
|
||||
|
||||
return path;
|
||||
}
|
||||
|
||||
/**
|
||||
* Handle the disallow: directive
|
||||
*
|
||||
@ -653,12 +654,6 @@ public class SimpleRobotRulesParser extends BaseRobotsParser {
|
||||
* data for directive
|
||||
*/
|
||||
private void handleDisallow(ParseState state, RobotToken token) {
|
||||
if (state.isSkipAgents()) {
|
||||
return;
|
||||
}
|
||||
|
||||
state.setFinishedAgentFields(true);
|
||||
|
||||
if (!state.isAddingRules()) {
|
||||
return;
|
||||
}
|
||||
@ -667,7 +662,7 @@ public class SimpleRobotRulesParser extends BaseRobotsParser {
|
||||
|
||||
try {
|
||||
path = URLDecoder.decode(path, "UTF-8");
|
||||
|
||||
path = normalizePathDirective(path);
|
||||
if (path.length() == 0) {
|
||||
// Disallow: <nothing> => allow all.
|
||||
state.clearRules();
|
||||
@ -688,12 +683,6 @@ public class SimpleRobotRulesParser extends BaseRobotsParser {
|
||||
* data for directive
|
||||
*/
|
||||
private void handleAllow(ParseState state, RobotToken token) {
|
||||
if (state.isSkipAgents()) {
|
||||
return;
|
||||
}
|
||||
|
||||
state.setFinishedAgentFields(true);
|
||||
|
||||
if (!state.isAddingRules()) {
|
||||
return;
|
||||
}
|
||||
@ -702,6 +691,7 @@ public class SimpleRobotRulesParser extends BaseRobotsParser {
|
||||
|
||||
try {
|
||||
path = URLDecoder.decode(path, "UTF-8");
|
||||
path = normalizePathDirective(path);
|
||||
} catch (Exception e) {
|
||||
reportWarning(state, "Error parsing robots rules - can't decode path: {}", path);
|
||||
}
|
||||
@ -723,12 +713,6 @@ public class SimpleRobotRulesParser extends BaseRobotsParser {
|
||||
* data for directive
|
||||
*/
|
||||
private void handleCrawlDelay(ParseState state, RobotToken token) {
|
||||
if (state.isSkipAgents()) {
|
||||
return;
|
||||
}
|
||||
|
||||
state.setFinishedAgentFields(true);
|
||||
|
||||
if (!state.isAddingRules()) {
|
||||
return;
|
||||
}
|
||||
|
@ -287,27 +287,28 @@ public class SimpleRobotRulesParserTest {
|
||||
}
|
||||
|
||||
@ParameterizedTest
|
||||
@CsvSource({ "False, http://www.domain.com/a",//
|
||||
"False, http://www.domain.com/a/",//
|
||||
"False, http://www.domain.com/a/bloh/foo.html",//
|
||||
"True, http://www.domain.com/b",//
|
||||
"False, http://www.domain.com/b/a",//
|
||||
"False, http://www.domain.com/b/a/index.html",//
|
||||
"True, http://www.domain.com/b/b/foo.html",//
|
||||
"True, http://www.domain.com/c",//
|
||||
"True, http://www.domain.com/c/a",//
|
||||
"True, http://www.domain.com/c/a/index.html",//
|
||||
"True, http://www.domain.com/c/b/foo.html",//
|
||||
"True, http://www.domain.com/d",//
|
||||
"True, http://www.domain.com/d/a",//
|
||||
"True, http://www.domain.com/e/a/index.html",//
|
||||
"True, http://www.domain.com/e/d",//
|
||||
"True, http://www.domain.com/e/d/foo.html",//
|
||||
"True, http://www.domain.com/e/doh.html",//
|
||||
"True, http://www.domain.com/f/index.html",//
|
||||
"True, http://www.domain.com/foo/bar/baz.html",//
|
||||
"True, http://www.domain.com/f/" })
|
||||
void testNutchCases(boolean isAllowed, String urlStr) {
|
||||
@CsvSource({
|
||||
"False, False, http://www.domain.com/a",//
|
||||
"False, False, http://www.domain.com/a/",//
|
||||
"False, False, http://www.domain.com/a/bloh/foo.html",//
|
||||
"True, True, http://www.domain.com/b",//
|
||||
"False, False, http://www.domain.com/b/a",//
|
||||
"False, False, http://www.domain.com/b/a/index.html",//
|
||||
"True, True, http://www.domain.com/b/b/foo.html",//
|
||||
"True, True, http://www.domain.com/c",//
|
||||
"True, True, http://www.domain.com/c/a",//
|
||||
"True, True, http://www.domain.com/c/a/index.html",//
|
||||
"True, True, http://www.domain.com/c/b/foo.html",//
|
||||
"True, False, http://www.domain.com/d",//
|
||||
"True, False, http://www.domain.com/d/a",//
|
||||
"True, True, http://www.domain.com/e/a/index.html",//
|
||||
"True, True, http://www.domain.com/e/d",//
|
||||
"True, False, http://www.domain.com/e/d/foo.html",//
|
||||
"True, True, http://www.domain.com/e/doh.html",//
|
||||
"True, True, http://www.domain.com/f/index.html",//
|
||||
"True, False, http://www.domain.com/foo/bar/baz.html",//
|
||||
"True, True, http://www.domain.com/f/" })
|
||||
void testNutchCases(boolean isAllowed, boolean isMergeAllowed, String urlStr) {
|
||||
// Run through the Nutch test cases.
|
||||
final String nutchRobotsTxt = "User-Agent: Agent1 #foo" + CR + "Disallow: /a" + CR + "Disallow: /b/a" + CR + "#Disallow: /c" + CR + "" + CR + "" + CR + "User-Agent: Agent2 Agent3#foo" + CR
|
||||
+ "User-Agent: Agent4" + CR + "Disallow: /d" + CR + "Disallow: /e/d/" + CR + "" + CR + "User-Agent: *" + CR + "Disallow: /foo/bar/" + CR;
|
||||
@ -317,11 +318,11 @@ public class SimpleRobotRulesParserTest {
|
||||
rules = createRobotRules("Agent1", nutchRobotsTxt);
|
||||
assertEquals(isAllowed, rules.isAllowed(urlStr));
|
||||
|
||||
// Note that the SimpleRobotRulesParser only parses the rule set of the
|
||||
// first matching agent name. For the following example, the parser
|
||||
// returns only the rules matching 'Agent1'.
|
||||
|
||||
|
||||
// Note that SimpleRobotRulesParser now merges all matching user agent rules.
|
||||
rules = createRobotRules("Agent5,Agent2,Agent1,Agent3,*", nutchRobotsTxt);
|
||||
assertEquals(isAllowed, rules.isAllowed(urlStr));
|
||||
assertEquals(isMergeAllowed, rules.isAllowed(urlStr));
|
||||
}
|
||||
|
||||
@ParameterizedTest
|
||||
@ -477,7 +478,13 @@ public class SimpleRobotRulesParserTest {
|
||||
@Test
|
||||
void testMultiWildcard() {
|
||||
// Make sure we only take the first wildcard entry.
|
||||
final String simpleRobotsTxt = "User-agent: *" + CRLF + "Disallow: /index.html" + CRLF + "Allow: /" + CRLF + CRLF + "User-agent: *" + CRLF + "Disallow: /";
|
||||
final String simpleRobotsTxt =
|
||||
"User-agent: *" + CRLF +
|
||||
"Disallow: /index.html" + CRLF +
|
||||
"Allow: /" + CRLF +
|
||||
CRLF +
|
||||
"User-agent: *" + CRLF +
|
||||
"Disallow: /";
|
||||
|
||||
BaseRobotRules rules = createRobotRules("Any-darn-crawler", simpleRobotsTxt);
|
||||
assertFalse(rules.isAllowed("http://www.domain.com/index.html"));
|
||||
@ -713,6 +720,31 @@ public class SimpleRobotRulesParserTest {
|
||||
assertFalse(rules.isAllowed("http://domain.com/bot-trap/"), "many-user-agents");
|
||||
}
|
||||
|
||||
@Test
|
||||
void testManyUserAgentsMergeRules() throws Exception {
|
||||
BaseRobotRules rules = createRobotRules("wget", readFile("/robots/merge-rules.txt"));
|
||||
assertTrue(rules.isAllowed("http://domain.com/foo/bar/bar.txt"), "merge-rules");
|
||||
assertFalse(rules.isAllowed("http://domain.com/tmp/bar/bar.txt"), "merge-rules");
|
||||
assertTrue(rules.isAllowed("http://domain.com/chk/bar/bar.txt"), "merge-rules");
|
||||
}
|
||||
|
||||
@Test
|
||||
void testPrecedenceOfRules() throws Exception {
|
||||
BaseRobotRules rules = createRobotRules("wget", readFile("/robots/precedence-of-rules.txt"));
|
||||
|
||||
assertTrue(rules.isAllowed("http://domain.com/"), "precedence-of-rules");
|
||||
assertTrue(rules.isAllowed("http://domain.com/foo/bar"), "precedence-of-rules");
|
||||
assertFalse(rules.isAllowed("http://domain.com/bar/foo"), "precedence-of-rules");
|
||||
|
||||
rules = createRobotRules("testy", readFile("/robots/precedence-of-rules.txt"));
|
||||
|
||||
assertTrue(rules.isAllowed("http://domain.com/"), "precedence-of-rules");
|
||||
assertTrue(rules.isAllowed("http://domain.com/foo/bar"), "precedence-of-rules");
|
||||
assertFalse(rules.isAllowed("http://domain.com/foo/blue"), "precedence-of-rules");
|
||||
assertFalse(rules.isAllowed("http://domain.com/foo/"), "precedence-of-rules");
|
||||
assertFalse(rules.isAllowed("http://domain.com/foo/tar"), "precedence-of-rules");
|
||||
}
|
||||
|
||||
@Test
|
||||
void testMalformedPathInRobotsFile() throws Exception {
|
||||
BaseRobotRules rules = createRobotRules("bot1", readFile("/robots/malformed-path.txt"));
|
||||
|
16
src/test/resources/robots/merge-rules.txt
Normal file
16
src/test/resources/robots/merge-rules.txt
Normal file
@ -0,0 +1,16 @@
|
||||
User-Agent: testy
|
||||
Disallow: /
|
||||
|
||||
User-Agent: curl
|
||||
Disallow: /
|
||||
|
||||
User-Agent: wget
|
||||
Allow: /foo/bar
|
||||
|
||||
User-Agent: *
|
||||
User-Agent: wget
|
||||
Disallow: /foo/bar
|
||||
Disallow: /tmp/bar
|
||||
Allow: /ckh/bar
|
||||
|
||||
|
15
src/test/resources/robots/precedence-of-rules.txt
Normal file
15
src/test/resources/robots/precedence-of-rules.txt
Normal file
@ -0,0 +1,15 @@
|
||||
User-Agent: testy
|
||||
Disallow: /foo
|
||||
|
||||
User-Agent: wget
|
||||
Allow: /
|
||||
Allow: /foo
|
||||
Allow: /index.html
|
||||
Disallow: /*
|
||||
|
||||
User-Agent: testy
|
||||
Allow: /foo/bar
|
||||
Disallow: /foo/b*
|
||||
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user