Mirror of https://github.com/crawler-commons/crawler-commons
[Robots.txt] Deduplicate robots rules before matching (#416)
* [Robots.txt] Deduplicate robots rules before matching - update SimpleRobotRules documentation: add references to RFC 9309
* [Robots.txt] Deduplicate robots rules before matching
* SimpleRobotRules: add missing Override annotation
parent bfb5b9b067
commit 6c0d91e40b
```diff
@@ -19,19 +19,28 @@ package crawlercommons.robots;

 import java.io.Serializable;
 import java.net.URL;
 import java.util.ArrayList;
-import java.util.Collections;
 import java.util.List;
+import java.util.stream.Collectors;

 import crawlercommons.filters.basic.BasicURLNormalizer;

 /**
- * Result from parsing a single robots.txt file - which means we get a set of
- * rules, and an optional crawl-delay, and an optional sitemap URL. Note that we
- * support Google's extensions (Allow directive and '$'/'*' special chars) plus
- * the more widely used Sitemap directive.
+ * Result from parsing a single robots.txt file - set of rules, and optionally a
+ * <a href="https://en.wikipedia.org/wiki/Robots.txt#Crawl-delay_directive">crawl-delay</a>
+ * and <a href="https://www.sitemaps.org/protocol.html#submit_robots">sitemap</a> URLs.
+ * The <a href="https://www.rfc-editor.org/rfc/rfc9309.html">Robots Exclusion
+ * Protocol RFC 9309</a> is fully supported, including
+ * <a href="https://developers.google.com/search/reference/robots_txt">Google's
+ * robots.txt extensions</a> to the
+ * <a href="http://www.robotstxt.org/robotstxt.html">original RFC draft</a>:
+ * the <code>Allow</code> directive, <code>$</code>/<code>*</code> special
+ * characters, and precedence of more specific patterns.
  *
- * See https://en.wikipedia.org/wiki/Robots_exclusion_standard See
- * https://developers.google.com/search/reference/robots_txt
+ * See also: <a
+ * href="https://en.wikipedia.org/wiki/Robots_exclusion_standard">Robots
+ * Exclusion on Wikipedia</a>
 */

 @SuppressWarnings("serial")
```
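For context on how these rules are consumed, here is a minimal usage sketch (not part of the commit). It assumes crawler-commons' `SimpleRobotRulesParser` with the `parseContent(url, content, contentType, robotNames)` overload; the robots.txt content and agent name are made up:

```java
import java.nio.charset.StandardCharsets;

import crawlercommons.robots.BaseRobotRules;
import crawlercommons.robots.SimpleRobotRulesParser;

public class RobotsCheckExample {
    public static void main(String[] args) {
        // Made-up robots.txt: '*' and '$' are the Google extensions
        // that RFC 9309 later standardized.
        byte[] content = ("User-agent: *\n"
                + "Disallow: /private/\n"
                + "Allow: /private/readme*.html$\n").getBytes(StandardCharsets.UTF_8);

        SimpleRobotRulesParser parser = new SimpleRobotRulesParser();
        BaseRobotRules rules = parser.parseContent(
                "https://example.com/robots.txt", content, "text/plain", "mybot");

        // The longer Allow pattern is more specific than "Disallow: /private/",
        // so it takes precedence for the paths it matches.
        System.out.println(rules.isAllowed("https://example.com/private/readme2.html")); // true
        System.out.println(rules.isAllowed("https://example.com/private/secret.html"));  // false
    }
}
```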
```diff
@@ -62,9 +71,9 @@ public class SimpleRobotRules extends BaseRobotRules {
             return this._prefix;
         }

-        // Sort from longest to shortest rules.
         @Override
         public int compareTo(RobotRule o) {
+            // order from longest to shortest path prefixes/patterns
             if (_prefix.length() < o._prefix.length()) {
                 return 1;
             } else if (_prefix.length() > o._prefix.length()) {
```
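This hunk only moves the comment, but the comparator it documents is the hook for the new deduplication: sorting places longer, hence more specific, patterns first so they are tried first during matching. Since `RobotRule` is a private inner class, here is a standalone sketch of the same ordering rule applied to plain pattern strings:

```java
import java.util.Arrays;
import java.util.Comparator;
import java.util.List;

public class RuleOrderSketch {
    public static void main(String[] args) {
        // Longest (most specific) pattern first, mirroring RobotRule.compareTo.
        Comparator<String> longestFirst =
                Comparator.comparingInt(String::length).reversed();

        List<String> patterns = Arrays.asList("/foo", "/foo/bar/baz", "/foo/bar");
        patterns.sort(longestFirst);

        System.out.println(patterns); // [/foo/bar/baz, /foo/bar, /foo]
    }
}
```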
```diff
@@ -158,6 +167,7 @@ public class SimpleRobotRules extends BaseRobotRules {
         return this._rules;
     }

+    @Override
     public boolean isAllowed(String url) {
         if (_mode == RobotRulesMode.ALLOW_NONE) {
             return false;
```
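The added `@Override` makes explicit that `isAllowed` implements the abstract method declared in `BaseRobotRules`. The `ALLOW_NONE` short-circuit visible in the context lines matters for failure handling; a small sketch, assuming the public `SimpleRobotRules(RobotRulesMode)` constructor this class provides:

```java
import crawlercommons.robots.SimpleRobotRules;
import crawlercommons.robots.SimpleRobotRules.RobotRulesMode;

public class AllowNoneSketch {
    public static void main(String[] args) {
        // ALLOW_NONE is the typical stance when robots.txt could not be
        // fetched in a way that should block crawling (e.g. repeated 5xx).
        SimpleRobotRules rules = new SimpleRobotRules(RobotRulesMode.ALLOW_NONE);

        // isAllowed() returns false before any rule matching happens.
        System.out.println(rules.isAllowed("https://example.com/any/page")); // false
    }
}
```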
```diff
@@ -241,7 +251,7 @@ public class SimpleRobotRules extends BaseRobotRules {
         }

         /*
-         * We used to lower-case the path, but Google says we need to do
+         * We used to lower-case the path, but Google and RFC 9309 require
          * case-sensitive matching.
          *
          * However, we need to properly decode percent-encoded characters,
```
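To illustrate the distinction the comment draws: letter case in the path is significant under RFC 9309, while the hex digits of percent-escapes are not. The normalizer below is a hypothetical stand-in for illustration only (the library uses its own `BasicURLNormalizer` and escaping logic):

```java
public class PathMatchSketch {

    // Hypothetical helper: uppercase the hex digits of percent-escapes so that
    // "/a%2fb" and "/a%2Fb" compare equal, without touching the (significant)
    // letter case of the path itself.
    static String normalizeEscapes(String path) {
        StringBuilder sb = new StringBuilder(path.length());
        for (int i = 0; i < path.length(); i++) {
            char c = path.charAt(i);
            if (c == '%' && i + 2 < path.length()) {
                sb.append('%');
                sb.append(Character.toUpperCase(path.charAt(++i)));
                sb.append(Character.toUpperCase(path.charAt(++i)));
            } else {
                sb.append(c);
            }
        }
        return sb.toString();
    }

    public static void main(String[] args) {
        // Letter case in the path is significant ...
        System.out.println("/Private/x".startsWith("/private/")); // false

        // ... but the case of percent-escape hex digits is not.
        System.out.println(normalizeEscapes("/a%2fb")
                .equals(normalizeEscapes("/a%2Fb")));             // true
    }
}
```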
```diff
@@ -346,11 +356,22 @@ public class SimpleRobotRules extends BaseRobotRules {
     }

     /**
-     * In order to match up with Google's convention, we want to match rules
-     * from longest to shortest. So sort the rules.
+     * Sort and deduplicate robot rules. This method must be called after the
+     * robots.txt has been processed and before rule matching.
+     *
+     * The ordering is implemented in {@link RobotRule#compareTo(RobotRule)} and
+     * defined by <a
+     * href="https://www.rfc-editor.org/rfc/rfc9309.html#section-2.2.2">RFC
+     * 9309, section 2.2.2</a>:
+     *
+     * <blockquote>The most specific match found MUST be used. The most specific
+     * match is the match that has the most octets. Duplicate rules in a group
+     * MAY be deduplicated.</blockquote>
      */
     public void sortRules() {
-        Collections.sort(_rules);
+        if (_rules.size() > 1) {
+            _rules = new ArrayList<>(_rules.stream().sorted().distinct().collect(Collectors.toList()));
+        }
     }

     /**
```
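The rewritten `sortRules()` combines ordering and the RFC's optional deduplication in one pass. Note that `distinct()` compares elements with `equals()`, so this relies on `RobotRule` defining equality consistently with its sort order. A self-contained sketch of the same sort-then-deduplicate pattern on plain strings:

```java
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.stream.Collectors;

public class SortDedupSketch {
    public static void main(String[] args) {
        // Duplicate directives are common in real robots.txt files; RFC 9309
        // says duplicates MAY be removed, and sorting longest-first makes the
        // most specific match win.
        List<String> rules = new ArrayList<>(Arrays.asList(
                "/foo", "/foo/bar", "/foo", "/foo/bar/baz"));

        // Same shape as the new sortRules(): sort, then drop duplicates.
        List<String> sorted = rules.stream()
                .sorted((a, b) -> Integer.compare(b.length(), a.length()))
                .distinct()
                .collect(Collectors.toList());

        System.out.println(sorted); // [/foo/bar/baz, /foo/bar, /foo]
    }
}
```

Wrapping the stream result in `new ArrayList<>(...)` presumably guarantees that `_rules` stays a concrete, serializable, mutable list rather than whatever implementation `Collectors.toList()` happens to return.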