From 6c0d91e40bbaeb53b80983c55f04429a438c90fd Mon Sep 17 00:00:00 2001
From: Sebastian Nagel
Date: Fri, 9 Jun 2023 10:10:06 +0200
Subject: [PATCH] [Robots.txt] Deduplicate robots rules before matching (#416)

* [Robots.txt] Deduplicate robots rules before matching
  - update SimpleRobotRules documentation: add references to RFC 9309

* [Robots.txt] Deduplicate robots rules before matching

* SimpleRobotRules: add missing Override annotation
---
 .../robots/SimpleRobotRules.java | 45 ++++++++++++++-----
 1 file changed, 33 insertions(+), 12 deletions(-)

diff --git a/src/main/java/crawlercommons/robots/SimpleRobotRules.java b/src/main/java/crawlercommons/robots/SimpleRobotRules.java
index 4a74fc6..8204912 100644
--- a/src/main/java/crawlercommons/robots/SimpleRobotRules.java
+++ b/src/main/java/crawlercommons/robots/SimpleRobotRules.java
@@ -19,19 +19,28 @@ package crawlercommons.robots;
 
 import java.io.Serializable;
 import java.net.URL;
 import java.util.ArrayList;
-import java.util.Collections;
 import java.util.List;
+import java.util.stream.Collectors;
 
 import crawlercommons.filters.basic.BasicURLNormalizer;
 
 /**
- * Result from parsing a single robots.txt file - which means we get a set of
- * rules, and an optional crawl-delay, and an optional sitemap URL. Note that we
- * support Google's extensions (Allow directive and '$'/'*' special chars) plus
- * the more widely used Sitemap directive.
+ * Result from parsing a single robots.txt file - a set of rules, and
+ * optionally a crawl-delay and sitemap URLs. The Robots Exclusion Protocol
+ * (RFC 9309) is fully supported. This includes Google's robots.txt extensions
+ * to the original RFC draft: the Allow directive, the $/* special characters,
+ * and the precedence of more specific patterns.
  *
- * See https://en.wikipedia.org/wiki/Robots_exclusion_standard See
- * https://developers.google.com/search/reference/robots_txt
+ * See also:
+ * <a href="https://en.wikipedia.org/wiki/Robots_exclusion_standard">Robots
+ * Exclusion on Wikipedia</a>
  */
 @SuppressWarnings("serial")
@@ -62,9 +71,9 @@ public class SimpleRobotRules extends BaseRobotRules {
             return this._prefix;
         }
 
-        // Sort from longest to shortest rules.
         @Override
         public int compareTo(RobotRule o) {
+            // order from longest to shortest path prefixes/patterns
             if (_prefix.length() < o._prefix.length()) {
                 return 1;
             } else if (_prefix.length() > o._prefix.length()) {
@@ -158,6 +167,7 @@ public class SimpleRobotRules extends BaseRobotRules {
         return this._rules;
     }
 
+    @Override
     public boolean isAllowed(String url) {
         if (_mode == RobotRulesMode.ALLOW_NONE) {
             return false;
         }
@@ -241,7 +251,7 @@
         }
 
         /*
-         * We used to lower-case the path, but Google says we need to do
+         * We used to lower-case the path, but Google and RFC 9309 require
          * case-sensitive matching.
          *
          * However, we need to properly decode percent-encoded characters,
@@ -346,11 +356,22 @@
     }
 
     /**
-     * In order to match up with Google's convention, we want to match rules
-     * from longest to shortest. So sort the rules.
+     * Sort and deduplicate robot rules. This method must be called after the
+     * robots.txt has been processed and before rule matching.
+     *
+     * The ordering is implemented in {@link RobotRule#compareTo(RobotRule)} and
+     * defined by RFC 9309, section 2.2.2:
+     *
+     * <blockquote>
+     * The most specific match found MUST be used. The most specific match is
+     * the match that has the most octets. Duplicate rules in a group MAY be
+     * deduplicated. </blockquote>
      */
     public void sortRules() {
-        Collections.sort(_rules);
+        if (_rules.size() > 1) {
+            _rules = new ArrayList<>(_rules.stream().sorted().distinct().collect(Collectors.toList()));
+        }
     }
 
     /**
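
Below is a minimal, self-contained sketch of the sort-and-deduplicate step, for illustration. PathRule, SortRulesSketch and the sample path prefixes are hypothetical stand-ins rather than crawler-commons classes; only the ordering (longest pattern first, as in RobotRule#compareTo) and the stream-based deduplication follow what the patch does.

import java.util.Arrays;
import java.util.List;
import java.util.Objects;
import java.util.stream.Collectors;

// Hypothetical stand-in for SimpleRobotRules.RobotRule, for illustration only.
class PathRule implements Comparable<PathRule> {
    final String prefix;
    final boolean allow;

    PathRule(String prefix, boolean allow) {
        this.prefix = prefix;
        this.allow = allow;
    }

    @Override
    public int compareTo(PathRule o) {
        // Longest (most specific) pattern first, per RFC 9309, section 2.2.2.
        return Integer.compare(o.prefix.length(), prefix.length());
    }

    // Stream.distinct() uses equals()/hashCode() to drop duplicate rules.
    @Override
    public boolean equals(Object o) {
        return o instanceof PathRule && ((PathRule) o).allow == allow && ((PathRule) o).prefix.equals(prefix);
    }

    @Override
    public int hashCode() {
        return Objects.hash(prefix, allow);
    }

    @Override
    public String toString() {
        return (allow ? "Allow: " : "Disallow: ") + prefix;
    }
}

public class SortRulesSketch {
    public static void main(String[] args) {
        List<PathRule> rules = Arrays.asList(
            new PathRule("/", false),
            new PathRule("/public/", true),
            new PathRule("/public/", true),            // duplicate, removed by distinct()
            new PathRule("/public/archive/", false));

        // Same shape as the patched sortRules(): sort, then deduplicate.
        List<PathRule> sorted = rules.stream().sorted().distinct().collect(Collectors.toList());

        // Prints the most specific rule first:
        //   Disallow: /public/archive/
        //   Allow: /public/
        //   Disallow: /
        sorted.forEach(System.out::println);
    }
}

Because Stream.distinct() compares elements via equals()/hashCode(), the deduplication in sortRules() presumes RobotRule provides an equality definition that identifies duplicate rules; the sketch makes that assumption explicit.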