mirror of
https://github.com/crawler-commons/crawler-commons
synced 2024-05-18 01:56:06 +02:00
[Robots.txt] Update Javadoc to document changes in Robots.txt classes
related to RFC 9309 compliance - document effect of rules merging in combination with multiple agent names, fixes #423 - document that rules addressed to the wildcard agent are followed if none of the passed agent names matches - without any need to pass the wildcard agent name as one of the agent names - complete documentation - use @inheritDoc to avoid duplicated documentation - strip doc strings where inherited automatically by @Override annotations
This commit is contained in:
parent
0eb2c74294
commit
a5bd9645fa
|
@ -22,8 +22,11 @@ import java.util.LinkedHashSet;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Result from parsing a single robots.txt file - which means we get a set of
|
* Result from parsing a single robots.txt file – a set of allow/disallow rules
|
||||||
* rules, and a crawl-delay.
|
* to check whether a given URL is allowed, and optionally a <a href=
|
||||||
|
* "https://en.wikipedia.org/wiki/Robots.txt#Crawl-delay_directive"
|
||||||
|
* >Crawl-delay</a> and <a
|
||||||
|
* href="https://www.sitemaps.org/protocol.html#submit_robots">Sitemap</a> URLs.
|
||||||
*/
|
*/
|
||||||
@SuppressWarnings("serial")
|
@SuppressWarnings("serial")
|
||||||
public abstract class BaseRobotRules implements Serializable {
|
public abstract class BaseRobotRules implements Serializable {
|
||||||
|
@ -52,10 +55,17 @@ public abstract class BaseRobotRules implements Serializable {
|
||||||
_crawlDelay = crawlDelay;
|
_crawlDelay = crawlDelay;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @return whether to defer visits to the server
|
||||||
|
*/
|
||||||
public boolean isDeferVisits() {
|
public boolean isDeferVisits() {
|
||||||
return _deferVisits;
|
return _deferVisits;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Indicate whether to defer visits to the server, e.g. to wait until the robots.txt
|
||||||
|
* becomes available.
|
||||||
|
*/
|
||||||
public void setDeferVisits(boolean deferVisits) {
|
public void setDeferVisits(boolean deferVisits) {
|
||||||
_deferVisits = deferVisits;
|
_deferVisits = deferVisits;
|
||||||
}
|
}
|
||||||
|
@ -103,7 +113,7 @@ public abstract class BaseRobotRules implements Serializable {
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Returns a string with the crawl delay as well as a list of sitemaps if
|
* Returns a string with the crawl delay as well as a list of sitemaps if
|
||||||
* they exist (and aren't more than 10)
|
* they exist (and aren't more than 10).
|
||||||
*/
|
*/
|
||||||
@Override
|
@Override
|
||||||
public String toString() {
|
public String toString() {
|
||||||
|
|
|
@ -19,15 +19,16 @@ package crawlercommons.robots;
|
||||||
import java.io.Serializable;
|
import java.io.Serializable;
|
||||||
import java.util.Collection;
|
import java.util.Collection;
|
||||||
|
|
||||||
|
/** Robots.txt parser definition. */
|
||||||
@SuppressWarnings("serial")
|
@SuppressWarnings("serial")
|
||||||
public abstract class BaseRobotsParser implements Serializable {
|
public abstract class BaseRobotsParser implements Serializable {
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Parse the robots.txt file in <i>content</i>, and return rules appropriate
|
* Parse the robots.txt file in <i>content</i>, and return rules appropriate
|
||||||
* for processing paths by <i>userAgent</i>. Note that multiple agent names
|
* for processing paths by <i>userAgent</i>. Note that multiple agent names
|
||||||
* may be provided as comma-separated values; the order of these shouldn't
|
* may be provided as comma-separated values. How agent names are matched
|
||||||
* matter, as the file is parsed in order, and each agent name found in the
|
* against user-agent lines in the robots.txt depends on the implementing
|
||||||
* file will be compared to every agent name found in robotNames.
|
* class.
|
||||||
*
|
*
|
||||||
* Also note that names are lower-cased before comparison, and that any
|
* Also note that names are lower-cased before comparison, and that any
|
||||||
* robot name you pass shouldn't contain commas or spaces; if the name has
|
* robot name you pass shouldn't contain commas or spaces; if the name has
|
||||||
|
@ -76,7 +77,7 @@ public abstract class BaseRobotsParser implements Serializable {
|
||||||
* @param content
|
* @param content
|
||||||
* raw bytes from the site's robots.txt file
|
* raw bytes from the site's robots.txt file
|
||||||
* @param contentType
|
* @param contentType
|
||||||
* HTTP response header (mime-type)
|
* content type (MIME type) from HTTP response header
|
||||||
* @param robotNames
|
* @param robotNames
|
||||||
* name(s) of crawler, used to select rules from the robots.txt
|
* name(s) of crawler, used to select rules from the robots.txt
|
||||||
* file by matching the names against the user-agent lines in the
|
* file by matching the names against the user-agent lines in the
|
||||||
|
|
|
@ -25,22 +25,24 @@ import java.util.stream.Collectors;
|
||||||
import crawlercommons.filters.basic.BasicURLNormalizer;
|
import crawlercommons.filters.basic.BasicURLNormalizer;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Result from parsing a single robots.txt file - set of rules, and optionally a
|
* {@inheritDoc}
|
||||||
* <a href=
|
*
|
||||||
* "https://en.wikipedia.org/wiki/Robots.txt#Crawl-delay_directive">crawl
|
* <p>
|
||||||
* -delay</a> and <a
|
* Allow/disallow rules are matched following the <a
|
||||||
* href="https://www.sitemaps.org/protocol.html#submit_robots">sitemap</a> URLs.
|
* href="https://www.rfc-editor.org/rfc/rfc9309.html">Robots Exclusion Protocol
|
||||||
* The <a href="https://www.rfc-editor.org/rfc/rfc9309.html">Robots Exclusion
|
* RFC 9309</a>. This includes <a href=
|
||||||
* Protocol RFC 9309</a> is fully supported. This includes <a href=
|
|
||||||
* "https://developers.google.com/search/reference/robots_txt">Google's
|
* "https://developers.google.com/search/reference/robots_txt">Google's
|
||||||
* robots.txt extensions</a> to the <a
|
* robots.txt extensions</a> to the <a
|
||||||
* href="http://www.robotstxt.org/robotstxt.html">original RFC draft</a> are
|
* href="http://www.robotstxt.org/robotstxt.html">original RFC draft</a>: the
|
||||||
* covered: the <code>Allow</code> directive, <code>$</code>/<code>*</code>
|
* <code>Allow</code> directive, <code>$</code>/<code>*</code> special
|
||||||
* special characters and precedence of more specific patterns
|
* characters and precedence of longer (more specific) patterns.
|
||||||
|
* </p>
|
||||||
*
|
*
|
||||||
|
* <p>
|
||||||
* See also: <a
|
* See also: <a
|
||||||
* href="https://en.wikipedia.org/wiki/Robots_exclusion_standard">Robots
|
* href="https://en.wikipedia.org/wiki/Robots_exclusion_standard">Robots
|
||||||
* Exclusion on Wikipedia</a>
|
* Exclusion on Wikipedia</a>
|
||||||
|
* </p>
|
||||||
*/
|
*/
|
||||||
|
|
||||||
@SuppressWarnings("serial")
|
@SuppressWarnings("serial")
|
||||||
|
@ -58,6 +60,10 @@ public class SimpleRobotRules extends BaseRobotRules {
|
||||||
String _prefix;
|
String _prefix;
|
||||||
boolean _allow;
|
boolean _allow;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* An allow/disallow rule: a path prefix or pattern and whether it is
|
||||||
|
* allowed or disallowed.
|
||||||
|
*/
|
||||||
public RobotRule(String prefix, boolean allow) {
|
public RobotRule(String prefix, boolean allow) {
|
||||||
_prefix = prefix;
|
_prefix = prefix;
|
||||||
_allow = allow;
|
_allow = allow;
|
||||||
|
@ -88,11 +94,6 @@ public class SimpleRobotRules extends BaseRobotRules {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
|
||||||
* (non-Javadoc)
|
|
||||||
*
|
|
||||||
* @see java.lang.Object#hashCode()
|
|
||||||
*/
|
|
||||||
@Override
|
@Override
|
||||||
public int hashCode() {
|
public int hashCode() {
|
||||||
final int prime = 31;
|
final int prime = 31;
|
||||||
|
@ -102,11 +103,6 @@ public class SimpleRobotRules extends BaseRobotRules {
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
|
||||||
* (non-Javadoc)
|
|
||||||
*
|
|
||||||
* @see java.lang.Object#equals(java.lang.Object)
|
|
||||||
*/
|
|
||||||
@Override
|
@Override
|
||||||
public boolean equals(Object obj) {
|
public boolean equals(Object obj) {
|
||||||
if (this == obj)
|
if (this == obj)
|
||||||
|
@ -163,6 +159,9 @@ public class SimpleRobotRules extends BaseRobotRules {
|
||||||
_rules.add(new RobotRule(prefix, allow));
|
_rules.add(new RobotRule(prefix, allow));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @return the list of allow/disallow rules
|
||||||
|
*/
|
||||||
public List<RobotRule> getRobotRules() {
|
public List<RobotRule> getRobotRules() {
|
||||||
return this._rules;
|
return this._rules;
|
||||||
}
|
}
|
||||||
|
@ -377,6 +376,11 @@ public class SimpleRobotRules extends BaseRobotRules {
|
||||||
/**
|
/**
|
||||||
* Is our ruleset set up to allow all access?
|
* Is our ruleset set up to allow all access?
|
||||||
*
|
*
|
||||||
|
* <p>
|
||||||
|
* Note: This is decided only based on the {@link RobotRulesMode} without
|
||||||
|
* inspecting the set of allow/disallow rules.
|
||||||
|
* </p>
|
||||||
|
*
|
||||||
* @return true if all URLs are allowed.
|
* @return true if all URLs are allowed.
|
||||||
*/
|
*/
|
||||||
@Override
|
@Override
|
||||||
|
@ -387,6 +391,11 @@ public class SimpleRobotRules extends BaseRobotRules {
|
||||||
/**
|
/**
|
||||||
* Is our ruleset set up to disallow all access?
|
* Is our ruleset set up to disallow all access?
|
||||||
*
|
*
|
||||||
|
* <p>
|
||||||
|
* Note: This is decided only based on the {@link RobotRulesMode} without
|
||||||
|
* inspecting the set of allow/disallow rules.
|
||||||
|
* </p>
|
||||||
|
*
|
||||||
* @return true if no URLs are allowed.
|
* @return true if no URLs are allowed.
|
||||||
*/
|
*/
|
||||||
@Override
|
@Override
|
||||||
|
@ -394,11 +403,6 @@ public class SimpleRobotRules extends BaseRobotRules {
|
||||||
return _mode == RobotRulesMode.ALLOW_NONE;
|
return _mode == RobotRulesMode.ALLOW_NONE;
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
|
||||||
* (non-Javadoc)
|
|
||||||
*
|
|
||||||
* @see java.lang.Object#hashCode()
|
|
||||||
*/
|
|
||||||
@Override
|
@Override
|
||||||
public int hashCode() {
|
public int hashCode() {
|
||||||
final int prime = 31;
|
final int prime = 31;
|
||||||
|
@ -408,11 +412,6 @@ public class SimpleRobotRules extends BaseRobotRules {
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
|
||||||
* (non-Javadoc)
|
|
||||||
*
|
|
||||||
* @see java.lang.Object#equals(java.lang.Object)
|
|
||||||
*/
|
|
||||||
@Override
|
@Override
|
||||||
public boolean equals(Object obj) {
|
public boolean equals(Object obj) {
|
||||||
if (this == obj)
|
if (this == obj)
|
||||||
|
@ -433,9 +432,10 @@ public class SimpleRobotRules extends BaseRobotRules {
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/**
|
||||||
* (non-Javadoc)
|
* {@inheritDoc}
|
||||||
*
|
*
|
||||||
* @see java.lang.Object#equals(java.lang.Object)
|
* In addition, the number of allow/disallow rules and the most specific
|
||||||
|
* rules by pattern length are shown (ten rules, at maximum).
|
||||||
*/
|
*/
|
||||||
@Override
|
@Override
|
||||||
public String toString() {
|
public String toString() {
|
||||||
|
|
|
@ -41,37 +41,62 @@ import org.slf4j.LoggerFactory;
|
||||||
import crawlercommons.robots.SimpleRobotRules.RobotRulesMode;
|
import crawlercommons.robots.SimpleRobotRules.RobotRulesMode;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
* Robots.txt parser following RFC 9309, supporting the Sitemap and Crawl-delay
|
||||||
|
* extensions.
|
||||||
|
*
|
||||||
* <p>
|
* <p>
|
||||||
* This implementation of {@link BaseRobotsParser} retrieves a set of
|
* This implementation of {@link BaseRobotsParser} retrieves a set of
|
||||||
* {@link SimpleRobotRules rules} for an agent with the given name from the
|
* {@link SimpleRobotRules rules} for an agent with the given name from the
|
||||||
* <code>robots.txt</code> file of a given domain. The implementation follows
|
* <code>robots.txt</code> file of a given domain. The implementation follows
|
||||||
* <a href="https://www.rfc-editor.org/rfc/rfc9309.html#name-access-method">RFC
|
* <a href="https://www.rfc-editor.org/rfc/rfc9309.html#name-access-method">RFC
|
||||||
* 9309</a>.
|
* 9309</a>. The following robots.txt extensions are supported:</p>
|
||||||
* </p>
|
* <ul>
|
||||||
|
* <li>the <a href=
|
||||||
|
* "https://en.wikipedia.org/wiki/Robots.txt#Crawl-delay_directive">Crawl-delay</a>
|
||||||
|
* directive</li>
|
||||||
|
* <li>the
|
||||||
|
* <a href="https://www.sitemaps.org/protocol.html#submit_robots">Sitemap
|
||||||
|
* directive</a></li>
|
||||||
|
* </ul>
|
||||||
*
|
*
|
||||||
* <p>
|
* <p>
|
||||||
* The class fulfills two tasks. The first one is the parsing of the
|
* The class fulfills two tasks. The first one is the parsing of the
|
||||||
* <code>robots.txt</code> file done in
|
* <code>robots.txt</code> file, done in
|
||||||
* {@link #parseContent(String, byte[], String, String)}. During the parsing
|
* {@link #parseContent(String, byte[], String, Collection)}. During the parsing
|
||||||
* process the parser searches for the provided agent name(s). If the parser
|
* process the parser searches for the provided agent name(s). If the parser
|
||||||
* finds a matching name, the set of rules for this name is parsed and returned
|
* finds a matching name, the set of rules for this name is parsed and added to
|
||||||
* as the result. <b>Note</b> that if more than one agent name is given to the
|
* the list of rules returned later. If another group of rules is matched in the
|
||||||
* parser, it parses the rules for the first matching agent name inside the file
|
* robots.txt file, the rules of this group are also added to the returned list
|
||||||
* and skips all following user agent groups. It doesn't matter which of the
|
* of rules. See
|
||||||
* other given agent names would match additional rules inside the file. Thus,
|
* <a href="https://www.rfc-editor.org/rfc/rfc9309.html#section-2.2.1">RFC 9309,
|
||||||
* if more than one agent name is given to the parser, the result can be
|
* section 2.2.1</a> about the merging of groups of rules.
|
||||||
* influenced by the order of rule sets inside the <code>robots.txt</code> file.
|
|
||||||
* </p>
|
* </p>
|
||||||
*
|
*
|
||||||
* <p>
|
* <p>
|
||||||
* Note that the parser always parses the entire file, even if a matching agent
|
* By default and following
|
||||||
* name group has been found, as it needs to collect all of the sitemap
|
* <a href="https://www.rfc-editor.org/rfc/rfc9309.html#section-2.2.1">RFC 9309,
|
||||||
* directives.
|
* section 2.2.1</a>, the user-agent name ("product token") is matched
|
||||||
|
* literally but case-insensitively over the full name. See
|
||||||
|
* {@link #setExactUserAgentMatching(boolean)} for details of agent name
|
||||||
|
* matching and a legacy substring prefix matching mode.
|
||||||
* </p>
|
* </p>
|
||||||
*
|
*
|
||||||
* <p>
|
* <p>
|
||||||
* If no rule set matches any of the provided agent names, the rule set for the
|
* {@link SimpleRobotRulesParser} allows passing multiple agent names as a
|
||||||
* <code>'*'</code> agent is returned. If there is no such rule set inside the
|
* collection. All rule groups matching any of the agent names are followed and
|
||||||
|
* merged into one set of rules. The order of the agent names in the collection
|
||||||
|
* does not affect the selection of rules.
|
||||||
|
* </p>
|
||||||
|
*
|
||||||
|
* <p>
|
||||||
|
* The parser always parses the entire file to select all matching rule groups,
|
||||||
|
* and also to collect all of the sitemap directives.
|
||||||
|
* </p>
|
||||||
|
*
|
||||||
|
* <p>
|
||||||
|
* If no rule set matches any of the provided user-agent names, or if an empty
|
||||||
|
* collection of agent names is passed, the rule set for the <code>'*'</code>
|
||||||
|
* agent is returned. If there is no such rule set inside the
|
||||||
* <code>robots.txt</code> file, a rule set allowing all resources to be crawled
|
* <code>robots.txt</code> file, a rule set allowing all resources to be crawled
|
||||||
* is returned.
|
* is returned.
|
||||||
* </p>
|
* </p>
|
||||||
|
@ -96,6 +121,14 @@ import crawlercommons.robots.SimpleRobotRules.RobotRulesMode;
|
||||||
* or 5xx) the parser assumes a temporary error and a set of rules prohibiting
|
* or 5xx) the parser assumes a temporary error and a set of rules prohibiting
|
||||||
* any crawling is returned.
|
* any crawling is returned.
|
||||||
* </p>
|
* </p>
|
||||||
|
*
|
||||||
|
* <p>
|
||||||
|
* Note that fetching of the robots.txt file is outside the scope of this class.
|
||||||
|
* It must be implemented in the calling code, including the following of
|
||||||
|
* "at least five consecutive redirects" as required by
|
||||||
|
* <a href="https://www.rfc-editor.org/rfc/rfc9309.html#name-redirects">RFC
|
||||||
|
* 9309, section 2.3.1.2</a>.
|
||||||
|
* </p>
|
||||||
*/
|
*/
|
||||||
@SuppressWarnings("serial")
|
@SuppressWarnings("serial")
|
||||||
public class SimpleRobotRulesParser extends BaseRobotsParser {
|
public class SimpleRobotRulesParser extends BaseRobotsParser {
|
||||||
|
@ -402,6 +435,23 @@ public class SimpleRobotRulesParser extends BaseRobotsParser {
|
||||||
return userAgent != null && USER_AGENT_PRODUCT_TOKEN_MATCHER.matcher(userAgent).matches();
|
return userAgent != null && USER_AGENT_PRODUCT_TOKEN_MATCHER.matcher(userAgent).matches();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* {@inheritDoc}
|
||||||
|
*
|
||||||
|
* <p>
|
||||||
|
* Set rules for response status codes following <a href=
|
||||||
|
* "https://www.rfc-editor.org/rfc/rfc9309.html#name-access-results">RFC
|
||||||
|
* 9309, section 2.3.1</a>:</p>
|
||||||
|
* <ul>
|
||||||
|
* <li>Success (HTTP 200-299): throws {@link IllegalStateException} because
|
||||||
|
* the response content needs to be parsed</li>
|
||||||
|
* <li>"Unavailable" Status (HTTP 400-499): allow all</li>
|
||||||
|
* <li>"Unreachable" Status (HTTP 500-599): disallow all</li>
|
||||||
|
* <li>every other HTTP status code is treated as "allow all", but further
|
||||||
|
* visits on the server are deferred (see
|
||||||
|
* {@link SimpleRobotRules#setDeferVisits(boolean)})</li>
|
||||||
|
* </ul>
|
||||||
|
*/
|
||||||
@Override
|
@Override
|
||||||
public SimpleRobotRules failedFetch(int httpStatusCode) {
|
public SimpleRobotRules failedFetch(int httpStatusCode) {
|
||||||
SimpleRobotRules result;
|
SimpleRobotRules result;
|
||||||
|
@ -459,25 +509,37 @@ public class SimpleRobotRulesParser extends BaseRobotsParser {
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Parse the robots.txt file in <i>content</i>, and return rules appropriate
|
* Parse the robots.txt file in <i>content</i>, and return rules appropriate
|
||||||
* for processing paths by <i>userAgent</i>. Multiple agent names are
|
* for processing paths by <i>userAgent</i>.
|
||||||
* provided as collection. See {@link #setExactUserAgentMatching(boolean)}
|
*
|
||||||
* for details how agent names are matched.
|
* <p>
|
||||||
|
* Multiple agent names can be passed as a collection. See
|
||||||
|
* {@link #setExactUserAgentMatching(boolean)} for details on how agent names
|
||||||
|
* are matched. If multiple agent names are passed, all matching rule groups
|
||||||
|
* are followed and merged into one set of rules. The order of the agent
|
||||||
|
* names in the collection does not affect the selection of rules.
|
||||||
|
* </p>
|
||||||
|
*
|
||||||
|
* <p>
|
||||||
|
* If none of the provided agent names is matched, rules addressed to the
|
||||||
|
* wildcard user-agent (<code>*</code>) are selected.
|
||||||
|
* </p>
|
||||||
*
|
*
|
||||||
* @param url
|
* @param url
|
||||||
* URL that robots.txt content was fetched from. A complete and
|
* {@inheritDoc}
|
||||||
* valid URL (e.g., https://example.com/robots.txt) is expected.
|
|
||||||
* Used to resolve relative sitemap URLs and for
|
|
||||||
* logging/reporting purposes.
|
|
||||||
* @param content
|
* @param content
|
||||||
* raw bytes from the site's robots.txt file
|
* {@inheritDoc}
|
||||||
* @param contentType
|
* @param contentType
|
||||||
* HTTP response header (mime-type)
|
* {@inheritDoc}
|
||||||
* @param robotNames
|
* @param robotNames
|
||||||
* name(s) of crawler, used to select rules from the robots.txt
|
* crawler (user-agent) name(s), used to select rules from the
|
||||||
* file by matching the names against the user-agent lines in the
|
* robots.txt file by matching the names against the user-agent
|
||||||
* robots.txt file. Robot names should be single token names, w/o
|
* lines in the robots.txt file. Robot names should be single
|
||||||
* version or other parts. Names should be lower-case, as the
|
* token names, without version or other parts. Names must be
|
||||||
* user-agent line is also converted to lower-case for matching.
|
* lower-case, as the user-agent line is also converted to
|
||||||
|
* lower-case for matching. If the collection is empty, the rules
|
||||||
|
* for the wildcard user-agent (<code>*</code>) are selected. The
|
||||||
|
* wildcard user-agent name should not be contained in
|
||||||
|
* <i>robotNames</i>.
|
||||||
* @return robot rules.
|
* @return robot rules.
|
||||||
*/
|
*/
|
||||||
@Override
|
@Override
|
||||||
|
|
Loading…
Reference in New Issue