
Improve robots check draft rfc compliance

This commit is contained in:
Eduardo Jimenez 2021-12-14 11:47:42 -08:00 committed by Sebastian Nagel
parent d3ccb553df
commit 1f0e79b72a
5 changed files with 157 additions and 83 deletions
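The behavioral core of this change is in SimpleRobotRules.isAllowed(): instead of returning the result of the first matching rule, the code now picks the longest matching rule (per the robots.txt draft RFC), with ties between Allow and Disallow resolved in favor of Allow. Below is a minimal sketch of how a caller sees this, assuming the existing crawlercommons.robots API; the robots.txt content, host, and agent name are made up for illustration.

import java.nio.charset.StandardCharsets;

import crawlercommons.robots.BaseRobotRules;
import crawlercommons.robots.SimpleRobotRulesParser;

public class LongestMatchDemo {
    public static void main(String[] args) {
        // Hypothetical robots.txt: the Allow rule is longer (more specific)
        // than the Disallow rule, so it wins under longest-match precedence.
        String robotsTxt = "User-agent: mybot\n"
                + "Disallow: /shop\n"
                + "Allow: /shop/public\n";

        BaseRobotRules rules = new SimpleRobotRulesParser().parseContent(
                "http://example.com/robots.txt",
                robotsTxt.getBytes(StandardCharsets.UTF_8),
                "text/plain", "mybot");

        // Only "Disallow: /shop" matches here, so the URL is blocked.
        System.out.println(rules.isAllowed("http://example.com/shop/cart"));        // false
        // "Allow: /shop/public" is the longest match, so the URL is allowed.
        System.out.println(rules.isAllowed("http://example.com/shop/public/item")); // true
    }
}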

@ -163,13 +163,36 @@ public class SimpleRobotRules extends BaseRobotRules {
return true;
}
boolean isAllowed = true;
int longestRuleMatch = Integer.MIN_VALUE;
for (RobotRule rule : _rules) {
if (ruleMatches(pathWithQuery, rule._prefix)) {
return rule._allow;
int matchLength = ruleMatches(pathWithQuery, rule._prefix);
if (matchLength == -1) {
// See the precedence-of-rules test case for an example.
// Some webmasters expect behavior close to Google's; this block is equivalent to:
// https://github.com/google/robotstxt/blob/02bc6cdfa32db50d42563180c42aeb47042b4f0c/robots.cc#L605-L618
// There are robots.txt files in the wild that benefit from this,
// e.g. https://venmo.com/robots.txt as of 2/7/2022.
if (rule._prefix.endsWith("index.htm") || rule._prefix.endsWith("index.html")) {
matchLength = ruleMatches(pathWithQuery, rule._prefix.substring(0, rule._prefix.indexOf("index.htm")) + "$");
if (matchLength == -1) {
continue;
}
} else {
continue;
}
}
if (longestRuleMatch < matchLength) {
longestRuleMatch = matchLength;
isAllowed = rule.isAllow();
} else if (longestRuleMatch == matchLength) {
isAllowed |= rule.isAllow();
}
// else we've already got a more specific rule, and this match doesn't matter
}
return true;
return isAllowed;
}
}
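The index.htm/index.html branch above mirrors Google's parser: a rule whose path ends in index.htm(l) is also tried as the enclosing directory path with '$' appended. A hedged sketch of the effect, in the spirit of the venmo.com example; the robots.txt content and URLs are illustrative, not from the commit.

import java.nio.charset.StandardCharsets;

import crawlercommons.robots.BaseRobotRules;
import crawlercommons.robots.SimpleRobotRulesParser;

public class IndexHtmlDemo {
    public static void main(String[] args) {
        // Hypothetical robots.txt: everything is disallowed except the index
        // page. With the Google-style equivalence, "Allow: /index.html" is
        // also tried as "Allow: /$", so the site root stays crawlable.
        String robotsTxt = "User-agent: *\n"
                + "Disallow: /\n"
                + "Allow: /index.html\n";

        BaseRobotRules rules = new SimpleRobotRulesParser().parseContent(
                "http://example.com/robots.txt",
                robotsTxt.getBytes(StandardCharsets.UTF_8),
                "text/plain", "mybot");

        System.out.println(rules.isAllowed("http://example.com/"));            // true
        System.out.println(rules.isAllowed("http://example.com/index.html"));  // true
        System.out.println(rules.isAllowed("http://example.com/about"));       // false
    }
}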
@ -197,7 +220,7 @@ public class SimpleRobotRules extends BaseRobotRules {
}
}
private boolean ruleMatches(String text, String pattern) {
private int ruleMatches(String text, String pattern) {
int patternPos = 0;
int textPos = 0;
@ -223,7 +246,7 @@ public class SimpleRobotRules extends BaseRobotRules {
patternPos += 1;
if (patternPos >= patternEnd) {
// Pattern ends with '*', we're all good.
return true;
return pattern.length();
}
// TODO - don't worry about having two '*' in a row?
@ -255,14 +278,14 @@ public class SimpleRobotRules extends BaseRobotRules {
// If we matched, we're all set, otherwise we failed
if (!matched) {
return false;
return -1;
}
} else {
// See if the pattern from patternPos to wildcardPos matches the
// text starting at textPos
while ((patternPos < wildcardPos) && (textPos < textEnd)) {
if (text.charAt(textPos++) != pattern.charAt(patternPos++)) {
return false;
return -1;
}
}
}
@ -277,7 +300,11 @@ public class SimpleRobotRules extends BaseRobotRules {
// We're at the end, so we have a match if the pattern was completely
// consumed, and either we consumed all the text or we didn't have to
// match it all (no '$' at end of the pattern)
return (patternPos == patternEnd) && ((textPos == textEnd) || !containsEndChar);
if ((patternPos == patternEnd) && ((textPos == textEnd) || !containsEndChar)) {
return pattern.length();
} else {
return -1;
}
}
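Because ruleMatches() now returns a match length (-1 for no match) rather than a boolean, wildcard and '$'-anchored patterns take part in the longest-match comparison through their full pattern length. A small hedged sketch of the resulting behavior; the agent name and paths are made up.

import java.nio.charset.StandardCharsets;

import crawlercommons.robots.BaseRobotRules;
import crawlercommons.robots.SimpleRobotRulesParser;

public class WildcardDemo {
    public static void main(String[] args) {
        // Hypothetical robots.txt mixing a plain Allow with a wildcard/$ Disallow.
        String robotsTxt = "User-agent: mybot\n"
                + "Allow: /\n"
                + "Disallow: /*.php$\n";

        BaseRobotRules rules = new SimpleRobotRulesParser().parseContent(
                "http://example.com/robots.txt",
                robotsTxt.getBytes(StandardCharsets.UTF_8),
                "text/plain", "mybot");

        // "/*.php$" matches and is longer than "/", so .php URLs are blocked ...
        System.out.println(rules.isAllowed("http://example.com/info.php"));      // false
        // ... but the '$' anchor means a trailing query string breaks the match,
        System.out.println(rules.isAllowed("http://example.com/info.php?x=1"));  // true
        // and everything else falls back to the short "Allow: /".
        System.out.println(rules.isAllowed("http://example.com/page.html"));     // true
    }
}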
/**

@ -167,11 +167,6 @@ public class SimpleRobotRulesParser extends BaseRobotsParser {
*/
private boolean _finishedAgentFields;
// True if we're done adding rules for a matched (not wildcard) agent
// name. When this is true, we only consider sitemap directives, so we
// skip all remaining user agent blocks.
private boolean _skipAgents;
/*
* Counter of warnings reporting invalid rules/lines in the robots.txt
* file. The counter is used to limit the number of logged warnings.
@ -225,14 +220,6 @@ public class SimpleRobotRulesParser extends BaseRobotsParser {
_finishedAgentFields = finishedAgentFields;
}
public boolean isSkipAgents() {
return _skipAgents;
}
public void setSkipAgents(boolean skipAgents) {
_skipAgents = skipAgents;
}
public void clearRules() {
_curRules.clearRules();
}
@ -513,22 +500,27 @@ public class SimpleRobotRulesParser extends BaseRobotsParser {
break;
case DISALLOW:
parseState.setFinishedAgentFields(true);
handleDisallow(parseState, token);
break;
case ALLOW:
parseState.setFinishedAgentFields(true);
handleAllow(parseState, token);
break;
case CRAWL_DELAY:
parseState.setFinishedAgentFields(true);
handleCrawlDelay(parseState, token);
break;
case SITEMAP:
parseState.setFinishedAgentFields(true);
handleSitemap(parseState, token);
break;
case HTTP:
parseState.setFinishedAgentFields(true);
handleHttp(parseState, token);
break;
@ -593,21 +585,16 @@ public class SimpleRobotRulesParser extends BaseRobotsParser {
* data for directive
*/
private void handleUserAgent(ParseState state, RobotToken token) {
if (state.isMatchedRealName()) {
if (state.isFinishedAgentFields()) {
state.setSkipAgents(true);
}
return;
}
if (state.isFinishedAgentFields()) {
// We've got a user agent field, so we haven't yet seen anything
// that tells us we're done with this set of agent names.
state.setFinishedAgentFields(false);
// If we are adding rules and had already finished reading user-agent
// fields, then this is a new agent section, so stop adding rules.
if (state.isAddingRules() && state.isFinishedAgentFields()) {
state.setAddingRules(false);
}
// We've encountered a new user-agent directive, so resume collecting
// agent names until this group's rule fields are finished.
state.setFinishedAgentFields(false);
// Handle the case where multiple target names are passed.
// We assume target names should be compared case-insensitively.
String[] targetNames = state.getTargetName().toLowerCase(Locale.ROOT).split(",");
@ -621,21 +608,24 @@ public class SimpleRobotRulesParser extends BaseRobotsParser {
// TODO KKr - catch case of multiple names, log as non-standard.
String[] agentNames = token.getData().split("[ \t,]");
for (String agentName : agentNames) {
// TODO should we do case-insensitive matching? Probably yes.
agentName = agentName.trim().toLowerCase(Locale.ROOT);
if (agentName.isEmpty()) {
// Ignore empty names
} else if (agentName.equals("*") && !state.isMatchedWildcard()) {
} else if (agentName.equals("*") && !state.isMatchedRealName()) {
state.setMatchedWildcard(true);
state.setAddingRules(true);
} else {
// TODO use longest match as winner
for (String targetName : targetNameSplits) {
if (targetName.startsWith(agentName)) {
// We may have previously matched a wildcard group, but we could also
// have matched our own agent name before; only clear the collected
// rules if the wildcard was matched earlier.
if (state.isMatchedWildcard()) {
state.clearRules();
}
state.setMatchedRealName(true);
state.setAddingRules(true);
// In case we previously hit a wildcard rule match
state.clearRules();
state.setMatchedWildcard(false);
break;
}
}
@ -644,6 +634,17 @@ public class SimpleRobotRulesParser extends BaseRobotsParser {
}
}
/**
* Apply uniform clean-up rules to the path of an allow/disallow directive
*
* @param path
* path data from the directive
* @return the cleaned path
*/
private String normalizePathDirective(String path) {
path = path.trim();
return path;
}
/**
* Handle the disallow: directive
*
@ -653,12 +654,6 @@ public class SimpleRobotRulesParser extends BaseRobotsParser {
* data for directive
*/
private void handleDisallow(ParseState state, RobotToken token) {
if (state.isSkipAgents()) {
return;
}
state.setFinishedAgentFields(true);
if (!state.isAddingRules()) {
return;
}
@ -667,7 +662,7 @@ public class SimpleRobotRulesParser extends BaseRobotsParser {
try {
path = URLDecoder.decode(path, "UTF-8");
path = normalizePathDirective(path);
if (path.length() == 0) {
// Disallow: <nothing> => allow all.
state.clearRules();
@ -688,12 +683,6 @@ public class SimpleRobotRulesParser extends BaseRobotsParser {
* data for directive
*/
private void handleAllow(ParseState state, RobotToken token) {
if (state.isSkipAgents()) {
return;
}
state.setFinishedAgentFields(true);
if (!state.isAddingRules()) {
return;
}
@ -702,6 +691,7 @@ public class SimpleRobotRulesParser extends BaseRobotsParser {
try {
path = URLDecoder.decode(path, "UTF-8");
path = normalizePathDirective(path);
} catch (Exception e) {
reportWarning(state, "Error parsing robots rules - can't decode path: {}", path);
}
@ -723,12 +713,6 @@ public class SimpleRobotRulesParser extends BaseRobotsParser {
* data for directive
*/
private void handleCrawlDelay(ParseState state, RobotToken token) {
if (state.isSkipAgents()) {
return;
}
state.setFinishedAgentFields(true);
if (!state.isAddingRules()) {
return;
}
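Taken together, the handleUserAgent() changes mean rules are now collected from every group whose agent list matches the crawler, rather than only the first such group, and wildcard rules are discarded once a specific agent name has matched. A hedged sketch of that behavior follows; the robots.txt content and names are illustrative, not from the commit.

import java.nio.charset.StandardCharsets;

import crawlercommons.robots.BaseRobotRules;
import crawlercommons.robots.SimpleRobotRulesParser;

public class MergeGroupsDemo {
    public static void main(String[] args) {
        // Hypothetical robots.txt: 'mybot' appears in two separate groups,
        // with a wildcard group in between.
        String robotsTxt = "User-agent: mybot\n"
                + "Disallow: /private\n"
                + "\n"
                + "User-agent: *\n"
                + "Disallow: /tmp\n"
                + "\n"
                + "User-agent: mybot\n"
                + "Disallow: /archive\n";

        BaseRobotRules rules = new SimpleRobotRulesParser().parseContent(
                "http://example.com/robots.txt",
                robotsTxt.getBytes(StandardCharsets.UTF_8),
                "text/plain", "mybot");

        // Both 'mybot' groups are merged; the wildcard group is ignored
        // because a real agent name has matched.
        System.out.println(rules.isAllowed("http://example.com/private/a"));  // false
        System.out.println(rules.isAllowed("http://example.com/archive/a"));  // false
        System.out.println(rules.isAllowed("http://example.com/tmp/a"));      // true
    }
}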

@ -287,27 +287,28 @@ public class SimpleRobotRulesParserTest {
}
@ParameterizedTest
@CsvSource({ "False, http://www.domain.com/a",//
"False, http://www.domain.com/a/",//
"False, http://www.domain.com/a/bloh/foo.html",//
"True, http://www.domain.com/b",//
"False, http://www.domain.com/b/a",//
"False, http://www.domain.com/b/a/index.html",//
"True, http://www.domain.com/b/b/foo.html",//
"True, http://www.domain.com/c",//
"True, http://www.domain.com/c/a",//
"True, http://www.domain.com/c/a/index.html",//
"True, http://www.domain.com/c/b/foo.html",//
"True, http://www.domain.com/d",//
"True, http://www.domain.com/d/a",//
"True, http://www.domain.com/e/a/index.html",//
"True, http://www.domain.com/e/d",//
"True, http://www.domain.com/e/d/foo.html",//
"True, http://www.domain.com/e/doh.html",//
"True, http://www.domain.com/f/index.html",//
"True, http://www.domain.com/foo/bar/baz.html",//
"True, http://www.domain.com/f/" })
void testNutchCases(boolean isAllowed, String urlStr) {
@CsvSource({
"False, False, http://www.domain.com/a",//
"False, False, http://www.domain.com/a/",//
"False, False, http://www.domain.com/a/bloh/foo.html",//
"True, True, http://www.domain.com/b",//
"False, False, http://www.domain.com/b/a",//
"False, False, http://www.domain.com/b/a/index.html",//
"True, True, http://www.domain.com/b/b/foo.html",//
"True, True, http://www.domain.com/c",//
"True, True, http://www.domain.com/c/a",//
"True, True, http://www.domain.com/c/a/index.html",//
"True, True, http://www.domain.com/c/b/foo.html",//
"True, False, http://www.domain.com/d",//
"True, False, http://www.domain.com/d/a",//
"True, True, http://www.domain.com/e/a/index.html",//
"True, True, http://www.domain.com/e/d",//
"True, False, http://www.domain.com/e/d/foo.html",//
"True, True, http://www.domain.com/e/doh.html",//
"True, True, http://www.domain.com/f/index.html",//
"True, False, http://www.domain.com/foo/bar/baz.html",//
"True, True, http://www.domain.com/f/" })
void testNutchCases(boolean isAllowed, boolean isMergeAllowed, String urlStr) {
// Run through the Nutch test cases.
final String nutchRobotsTxt = "User-Agent: Agent1 #foo" + CR + "Disallow: /a" + CR + "Disallow: /b/a" + CR + "#Disallow: /c" + CR + "" + CR + "" + CR + "User-Agent: Agent2 Agent3#foo" + CR
+ "User-Agent: Agent4" + CR + "Disallow: /d" + CR + "Disallow: /e/d/" + CR + "" + CR + "User-Agent: *" + CR + "Disallow: /foo/bar/" + CR;
@ -317,11 +318,11 @@ public class SimpleRobotRulesParserTest {
rules = createRobotRules("Agent1", nutchRobotsTxt);
assertEquals(isAllowed, rules.isAllowed(urlStr));
// Note that the SimpleRobotRulesParser only parses the rule set of the
// first matching agent name. For the following example, the parser
// returns only the rules matching 'Agent1'.
// Note that SimpleRobotRulesParser now merges all matching user agent rules.
rules = createRobotRules("Agent5,Agent2,Agent1,Agent3,*", nutchRobotsTxt);
assertEquals(isAllowed, rules.isAllowed(urlStr));
assertEquals(isMergeAllowed, rules.isAllowed(urlStr));
}
@ParameterizedTest
@ -477,7 +478,13 @@ public class SimpleRobotRulesParserTest {
@Test
void testMultiWildcard() {
// Make sure we only take the first wildcard entry.
final String simpleRobotsTxt = "User-agent: *" + CRLF + "Disallow: /index.html" + CRLF + "Allow: /" + CRLF + CRLF + "User-agent: *" + CRLF + "Disallow: /";
final String simpleRobotsTxt =
"User-agent: *" + CRLF +
"Disallow: /index.html" + CRLF +
"Allow: /" + CRLF +
CRLF +
"User-agent: *" + CRLF +
"Disallow: /";
BaseRobotRules rules = createRobotRules("Any-darn-crawler", simpleRobotsTxt);
assertFalse(rules.isAllowed("http://www.domain.com/index.html"));
@ -713,6 +720,31 @@ public class SimpleRobotRulesParserTest {
assertFalse(rules.isAllowed("http://domain.com/bot-trap/"), "many-user-agents");
}
@Test
void testManyUserAgentsMergeRules() throws Exception {
BaseRobotRules rules = createRobotRules("wget", readFile("/robots/merge-rules.txt"));
assertTrue(rules.isAllowed("http://domain.com/foo/bar/bar.txt"), "merge-rules");
assertFalse(rules.isAllowed("http://domain.com/tmp/bar/bar.txt"), "merge-rules");
assertTrue(rules.isAllowed("http://domain.com/chk/bar/bar.txt"), "merge-rules");
}
@Test
void testPrecedenceOfRules() throws Exception {
BaseRobotRules rules = createRobotRules("wget", readFile("/robots/precedence-of-rules.txt"));
assertTrue(rules.isAllowed("http://domain.com/"), "precedence-of-rules");
assertTrue(rules.isAllowed("http://domain.com/foo/bar"), "precedence-of-rules");
assertFalse(rules.isAllowed("http://domain.com/bar/foo"), "precedence-of-rules");
rules = createRobotRules("testy", readFile("/robots/precedence-of-rules.txt"));
assertTrue(rules.isAllowed("http://domain.com/"), "precedence-of-rules");
assertTrue(rules.isAllowed("http://domain.com/foo/bar"), "precedence-of-rules");
assertFalse(rules.isAllowed("http://domain.com/foo/blue"), "precedence-of-rules");
assertFalse(rules.isAllowed("http://domain.com/foo/"), "precedence-of-rules");
assertFalse(rules.isAllowed("http://domain.com/foo/tar"), "precedence-of-rules");
}
@Test
void testMalformedPathInRobotsFile() throws Exception {
BaseRobotRules rules = createRobotRules("bot1", readFile("/robots/malformed-path.txt"));

@ -0,0 +1,16 @@
User-Agent: testy
Disallow: /
User-Agent: curl
Disallow: /
User-Agent: wget
Allow: /foo/bar
User-Agent: *
User-Agent: wget
Disallow: /foo/bar
Disallow: /tmp/bar
Allow: /ckh/bar

@ -0,0 +1,15 @@
User-Agent: testy
Disallow: /foo
User-Agent: wget
Allow: /
Allow: /foo
Allow: /index.html
Disallow: /*
User-Agent: testy
Allow: /foo/bar
Disallow: /foo/b*