[Robots.txt] Update Javadoc to document changes in Robots.txt classes related to RFC 9309 compliance

- document effect of rules merging in combination with multiple agent names, fixes #423
- document that rules addressed to the wildcard agent are followed if none of the passed agent names matches (without any need to pass the wildcard agent name as one of the agent names)
- complete documentation
- use @inheritDoc to avoid duplicated documentation
- strip doc strings where inherited automatically by @Override annotations
parent 0eb2c74294
commit a5bd9645fa
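Before the diff, a usage sketch of the behavior this commit documents: rule groups matching any of the passed agent names are merged, and the wildcard group applies when nothing matches. The parseContent signature is the one shown in the diff below; the robots.txt content, agent names, and URLs are invented for illustration.

    import java.nio.charset.StandardCharsets;
    import java.util.List;

    import crawlercommons.robots.BaseRobotRules;
    import crawlercommons.robots.SimpleRobotRulesParser;

    public class MergedRulesExample {
        public static void main(String[] args) {
            // Two groups match the two agent names below; the third is the wildcard.
            String robotsTxt = "User-agent: examplebot\n"
                            + "Disallow: /private/\n\n"
                            + "User-agent: examplebot-news\n"
                            + "Disallow: /archive/\n\n"
                            + "User-agent: *\n"
                            + "Disallow: /\n";
            byte[] content = robotsTxt.getBytes(StandardCharsets.UTF_8);

            SimpleRobotRulesParser parser = new SimpleRobotRulesParser();

            // Both names match, so both groups are merged (RFC 9309, section 2.2.1):
            BaseRobotRules rules = parser.parseContent("https://www.example.com/robots.txt",
                            content, "text/plain", List.of("examplebot", "examplebot-news"));
            System.out.println(rules.isAllowed("https://www.example.com/private/a")); // false
            System.out.println(rules.isAllowed("https://www.example.com/archive/a")); // false
            System.out.println(rules.isAllowed("https://www.example.com/news/a"));    // true

            // No name matches: the wildcard group is followed automatically,
            // without passing "*" as an agent name.
            BaseRobotRules fallback = parser.parseContent("https://www.example.com/robots.txt",
                            content, "text/plain", List.of("otherbot"));
            System.out.println(fallback.isAllowed("https://www.example.com/news/a")); // false
        }
    }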
BaseRobotRules.java

@@ -22,8 +22,11 @@ import java.util.LinkedHashSet;
 import java.util.List;
 
 /**
- * Result from parsing a single robots.txt file - which means we get a set of
- * rules, and a crawl-delay.
+ * Result from parsing a single robots.txt file – a set of allow/disallow rules
+ * to check whether a given URL is allowed, and optionally a <a href=
+ * "https://en.wikipedia.org/wiki/Robots.txt#Crawl-delay_directive"
+ * >Crawl-delay</a> and <a
+ * href="https://www.sitemaps.org/protocol.html#submit_robots">Sitemap</a> URLs.
  */
 @SuppressWarnings("serial")
 public abstract class BaseRobotRules implements Serializable {
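The rewritten class comment above points to optional Crawl-delay and Sitemap data on the parsed rules. A minimal sketch of reading them, assuming the usual crawler-commons accessors getCrawlDelay() and getSitemaps() (not shown in this diff); the robots.txt content and agent name are invented:

    import java.nio.charset.StandardCharsets;
    import java.util.List;

    import crawlercommons.robots.BaseRobotRules;
    import crawlercommons.robots.SimpleRobotRulesParser;

    public class SitemapDelayExample {
        public static void main(String[] args) {
            String robotsTxt = "User-agent: examplebot\n"
                            + "Crawl-delay: 5\n"
                            + "Sitemap: https://www.example.com/sitemap.xml\n";

            BaseRobotRules rules = new SimpleRobotRulesParser().parseContent(
                            "https://www.example.com/robots.txt",
                            robotsTxt.getBytes(StandardCharsets.UTF_8), "text/plain",
                            List.of("examplebot"));

            // crawler-commons stores the delay in milliseconds (assumption: 5 -> 5000)
            System.out.println(rules.getCrawlDelay());
            System.out.println(rules.getSitemaps()); // [https://www.example.com/sitemap.xml]
        }
    }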
@@ -52,10 +55,17 @@ public abstract class BaseRobotRules implements Serializable {
         _crawlDelay = crawlDelay;
     }
 
+    /**
+     * @return whether to defer visits to the server
+     */
     public boolean isDeferVisits() {
         return _deferVisits;
     }
 
+    /**
+     * Indicate to defer visits to the server, e.g. to wait until the robots.txt
+     * becomes available.
+     */
     public void setDeferVisits(boolean deferVisits) {
         _deferVisits = deferVisits;
     }
@@ -103,7 +113,7 @@ public abstract class BaseRobotRules implements Serializable {
 
     /**
      * Returns a string with the crawl delay as well as a list of sitemaps if
-     * they exist (and aren't more than 10)
+     * they exist (and aren't more than 10).
     */
     @Override
     public String toString() {
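The defer-visits flag documented above is only a getter/setter pair; acting on it is left to the crawler. A hypothetical scheduling helper (the class and method names here are invented, not crawler-commons API) might consult it like this:

    import crawlercommons.robots.BaseRobotRules;

    public class CrawlGate {
        /**
         * Hypothetical helper, not part of crawler-commons: decide whether a
         * URL may be fetched now, given the host's parsed robots rules.
         */
        public static boolean mayFetchNow(BaseRobotRules rules, String url) {
            if (rules.isDeferVisits()) {
                // robots.txt temporarily unavailable: postpone this host and
                // retry fetching its robots.txt later
                return false;
            }
            return rules.isAllowed(url);
        }
    }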
BaseRobotsParser.java

@@ -19,15 +19,16 @@ package crawlercommons.robots;
 import java.io.Serializable;
 import java.util.Collection;
 
+/** Robots.txt parser definition. */
 @SuppressWarnings("serial")
 public abstract class BaseRobotsParser implements Serializable {
 
     /**
      * Parse the robots.txt file in <i>content</i>, and return rules appropriate
      * for processing paths by <i>userAgent</i>. Note that multiple agent names
-     * may be provided as comma-separated values; the order of these shouldn't
-     * matter, as the file is parsed in order, and each agent name found in the
-     * file will be compared to every agent name found in robotNames.
+     * may be provided as comma-separated values. How agent names are matched
+     * against user-agent lines in the robots.txt depends on the implementing
+     * class.
      *
      * Also note that names are lower-cased before comparison, and that any
      * robot name you pass shouldn't contain commas or spaces; if the name has
@@ -76,7 +77,7 @@ public abstract class BaseRobotsParser implements Serializable {
      * @param content
      *            raw bytes from the site's robots.txt file
      * @param contentType
-     *            HTTP response header (mime-type)
+     *            content type (MIME type) from HTTP response header
      * @param robotNames
      *            name(s) of crawler, used to select rules from the robots.txt
      *            file by matching the names against the user-agent lines in the
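The robotNames constraints described above (single lower-case tokens, no commas or spaces) suggest deriving the token from the crawler's full HTTP User-Agent string. An illustrative helper, not part of the library:

    import java.util.Locale;

    public class RobotNames {
        /**
         * Illustrative only, not crawler-commons API:
         * "ExampleBot/2.1 (+https://example.com/bot)" -> "examplebot"
         */
        public static String token(String fullUserAgent) {
            String name = fullUserAgent.split("[/ ]", 2)[0]; // cut at "/" or space
            return name.toLowerCase(Locale.ROOT);            // names are lower-cased
        }
    }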
SimpleRobotRules.java

@@ -25,22 +25,24 @@ import java.util.stream.Collectors;
 import crawlercommons.filters.basic.BasicURLNormalizer;
 
 /**
- * Result from parsing a single robots.txt file - set of rules, and optionally a
- * <a href=
- * "https://en.wikipedia.org/wiki/Robots.txt#Crawl-delay_directive">crawl
- * -delay</a> and <a
- * href="https://www.sitemaps.org/protocol.html#submit_robots">sitemap</a> URLs.
- * The <a href="https://www.rfc-editor.org/rfc/rfc9309.html">Robots Exclusion
- * Protocol RFC 9309</a> is fully supported. This includes <a href=
+ * {@inheritDoc}
+ *
+ * <p>
+ * Allow/disallow rules are matched following the <a
+ * href="https://www.rfc-editor.org/rfc/rfc9309.html">Robots Exclusion Protocol
+ * RFC 9309</a>. This includes <a href=
  * "https://developers.google.com/search/reference/robots_txt">Google's
  * robots.txt extensions</a> to the <a
- * href="http://www.robotstxt.org/robotstxt.html">original RFC draft</a> are
- * covered: the <code>Allow</code> directive, <code>$</code>/<code>*</code>
- * special characters and precedence of more specific patterns
+ * href="http://www.robotstxt.org/robotstxt.html">original RFC draft</a>: the
+ * <code>Allow</code> directive, <code>$</code>/<code>*</code> special
+ * characters and precedence of longer (more specific) patterns.
+ * </p>
  *
  * <p>
  * See also: <a
  * href="https://en.wikipedia.org/wiki/Robots_exclusion_standard">Robots
  * Exclusion on Wikipedia</a>
  * </p>
  */
 
 @SuppressWarnings("serial")
@@ -58,6 +60,10 @@ public class SimpleRobotRules extends BaseRobotRules {
         String _prefix;
         boolean _allow;
 
+        /**
+         * An allow/disallow rule: a path prefix or pattern and whether it is
+         * allowed or disallowed.
+         */
         public RobotRule(String prefix, boolean allow) {
             _prefix = prefix;
             _allow = allow;
@@ -88,11 +94,6 @@ public class SimpleRobotRules extends BaseRobotRules {
         }
     }
 
-    /*
-     * (non-Javadoc)
-     *
-     * @see java.lang.Object#hashCode()
-     */
     @Override
     public int hashCode() {
         final int prime = 31;
@@ -102,11 +103,6 @@ public class SimpleRobotRules extends BaseRobotRules {
         return result;
     }
 
-    /*
-     * (non-Javadoc)
-     *
-     * @see java.lang.Object#equals(java.lang.Object)
-     */
     @Override
     public boolean equals(Object obj) {
         if (this == obj)
@@ -163,6 +159,9 @@ public class SimpleRobotRules extends BaseRobotRules {
         _rules.add(new RobotRule(prefix, allow));
     }
 
+    /**
+     * @return the list of allow/disallow rules
+     */
     public List<RobotRule> getRobotRules() {
         return this._rules;
     }
@@ -377,6 +376,11 @@ public class SimpleRobotRules extends BaseRobotRules {
     /**
      * Is our ruleset set up to allow all access?
+     *
+     * <p>
+     * Note: This is decided only based on the {@link RobotRulesMode} without
+     * inspecting the set of allow/disallow rules.
+     * </p>
      *
      * @return true if all URLs are allowed.
      */
     @Override
@@ -387,6 +391,11 @@ public class SimpleRobotRules extends BaseRobotRules {
     /**
      * Is our ruleset set up to disallow all access?
+     *
+     * <p>
+     * Note: This is decided only based on the {@link RobotRulesMode} without
+     * inspecting the set of allow/disallow rules.
+     * </p>
      *
      * @return true if no URLs are allowed.
      */
     @Override
@@ -394,11 +403,6 @@ public class SimpleRobotRules extends BaseRobotRules {
         return _mode == RobotRulesMode.ALLOW_NONE;
     }
 
-    /*
-     * (non-Javadoc)
-     *
-     * @see java.lang.Object#hashCode()
-     */
     @Override
     public int hashCode() {
         final int prime = 31;
@@ -408,11 +412,6 @@ public class SimpleRobotRules extends BaseRobotRules {
         return result;
     }
 
-    /*
-     * (non-Javadoc)
-     *
-     * @see java.lang.Object#equals(java.lang.Object)
-     */
     @Override
     public boolean equals(Object obj) {
         if (this == obj)
@@ -433,9 +432,10 @@ public class SimpleRobotRules extends BaseRobotRules {
     }
 
     /*
-     * (non-Javadoc)
+     * {@inheritDoc}
      *
-     * @see java.lang.Object#equals(java.lang.Object)
+     * In addition, the number of allow/disallow rules and the most specific
+     * rules by pattern length are shown (ten rules, at maximum).
      */
     @Override
     public String toString() {
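The precedence behavior documented above (Allow directive, $ and * wildcards, longer pattern wins) can be shown end-to-end. A sketch with invented robots.txt content and agent name, using the parseContent signature from this diff:

    import java.nio.charset.StandardCharsets;
    import java.util.List;

    import crawlercommons.robots.BaseRobotRules;
    import crawlercommons.robots.SimpleRobotRulesParser;

    public class PrecedenceExample {
        public static void main(String[] args) {
            // Longer (more specific) patterns take precedence.
            String robotsTxt = "User-agent: examplebot\n"
                            + "Disallow: /shop/\n"
                            + "Allow: /shop/public/\n"
                            + "Disallow: /*.gif$\n";

            BaseRobotRules rules = new SimpleRobotRulesParser().parseContent(
                            "https://www.example.com/robots.txt",
                            robotsTxt.getBytes(StandardCharsets.UTF_8), "text/plain",
                            List.of("examplebot"));

            System.out.println(rules.isAllowed("https://www.example.com/shop/item"));        // false
            System.out.println(rules.isAllowed("https://www.example.com/shop/public/item")); // true: "Allow" pattern is longer
            System.out.println(rules.isAllowed("https://www.example.com/images/logo.gif"));  // false: "$" anchors at the end
        }
    }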
SimpleRobotRulesParser.java

@@ -41,37 +41,62 @@ import org.slf4j.LoggerFactory;
 import crawlercommons.robots.SimpleRobotRules.RobotRulesMode;
 
 /**
+ * Robots.txt parser following RFC 9309, supporting the Sitemap and Crawl-delay
+ * extensions.
+ *
+ * <p>
  * This implementation of {@link BaseRobotsParser} retrieves a set of
  * {@link SimpleRobotRules rules} for an agent with the given name from the
  * <code>robots.txt</code> file of a given domain. The implementation follows
  * <a href="https://www.rfc-editor.org/rfc/rfc9309.html#name-access-method">RFC
- * 9309</a>.
- * </p>
+ * 9309</a>. The following robots.txt extensions are supported:</p>
+ * <ul>
+ * <li>the <a href=
+ * "https://en.wikipedia.org/wiki/Robots.txt#Crawl-delay_directive">Crawl-delay</a>
+ * directive</li>
+ * <li>the
+ * <a href="https://www.sitemaps.org/protocol.html#submit_robots">Sitemap
+ * directive</a></li>
+ * </ul>
  *
  * <p>
  * The class fulfills two tasks. The first one is the parsing of the
- * <code>robots.txt</code> file done in
- * {@link #parseContent(String, byte[], String, String)}. During the parsing
+ * <code>robots.txt</code> file, done in
+ * {@link #parseContent(String, byte[], String, Collection)}. During the parsing
  * process the parser searches for the provided agent name(s). If the parser
- * finds a matching name, the set of rules for this name is parsed and returned
- * as the result. <b>Note</b> that if more than one agent name is given to the
- * parser, it parses the rules for the first matching agent name inside the file
- * and skips all following user agent groups. It doesn't matter which of the
- * other given agent names would match additional rules inside the file. Thus,
- * if more than one agent name is given to the parser, the result can be
- * influenced by the order of rule sets inside the <code>robots.txt</code> file.
+ * finds a matching name, the set of rules for this name is parsed and added to
+ * the list of rules returned later. If another group of rules is matched in the
+ * robots.txt file, also the rules of this group are added to the returned list
+ * of rules. See
+ * <a href="https://www.rfc-editor.org/rfc/rfc9309.html#section-2.2.1">RFC 9309,
+ * section 2.2.1</a> about the merging of groups of rules.
  * </p>
 *
 * <p>
- * Note that the parser always parses the entire file, even if a matching agent
- * name group has been found, as it needs to collect all of the sitemap
- * directives.
+ * By default and following
+ * <a href="https://www.rfc-editor.org/rfc/rfc9309.html#section-2.2.1">RFC 9309,
+ * section 2.2.1</a>, the user-agent name ("product token") is matched
+ * literally but case-insensitive over the full name. See
+ * {@link #setExactUserAgentMatching(boolean)} for details of agent name
+ * matching and a legacy substring prefix matching mode.
  * </p>
 *
 * <p>
- * If no rule set matches any of the provided agent names, the rule set for the
- * <code>'*'</code> agent is returned. If there is no such rule set inside the
+ * {@link SimpleRobotRulesParser} allows passing multiple agent names as a
+ * collection. All rule groups matching any of the agent names are followed and
+ * merged into one set of rules. The order of the agent names in the collection
+ * does not affect the selection of rules.
+ * </p>
+ *
+ * <p>
+ * The parser always parses the entire file to select all matching rule groups,
+ * and also to collect all of the sitemap directives.
+ * </p>
+ *
+ * <p>
+ * If no rule set matches any of the provided user-agent names, or if an empty
+ * collection of agent names is passed, the rule set for the <code>'*'</code>
+ * agent is returned. If there is no such rule set inside the
 * <code>robots.txt</code> file, a rule set allowing all resources to be crawled
 * is returned.
 * </p>
@@ -96,6 +121,14 @@ import crawlercommons.robots.SimpleRobotRules.RobotRulesMode;
  * or 5xx) the parser assumes a temporary error and a set of rules prohibiting
  * any crawling is returned.
  * </p>
+ *
+ * <p>
+ * Note that fetching of the robots.txt file is outside the scope of this class.
+ * It must be implemented in the calling code, including the following of
+ * "at least five consecutive redirects" as required by
+ * <a href="https://www.rfc-editor.org/rfc/rfc9309.html#name-redirects">RFC
+ * 9309, section 2.3.1.2</a>.
+ * </p>
 */
 @SuppressWarnings("serial")
 public class SimpleRobotRulesParser extends BaseRobotsParser {
@@ -402,6 +435,23 @@ public class SimpleRobotRulesParser extends BaseRobotsParser {
         return userAgent != null && USER_AGENT_PRODUCT_TOKEN_MATCHER.matcher(userAgent).matches();
     }
 
+    /**
+     * {@inheritDoc}
+     *
+     * <p>
+     * Set rules for response status codes following <a href=
+     * "https://www.rfc-editor.org/rfc/rfc9309.html#name-access-results">RFC
+     * 9309, section 2.3.1</a>:</p>
+     * <ul>
+     * <li>Success (HTTP 200-299): throws {@link IllegalStateException} because
+     * the response content needs to be parsed</li>
+     * <li>"Unavailable" Status (HTTP 400-499): allow all</li>
+     * <li>"Unreachable" Status (HTTP 500-599): disallow all</li>
+     * <li>every other HTTP status code is treated as "allow all", but further
+     * visits on the server are deferred (see
+     * {@link SimpleRobotRules#setDeferVisits(boolean)})</li>
+     * </ul>
+     */
     @Override
     public SimpleRobotRules failedFetch(int httpStatusCode) {
         SimpleRobotRules result;
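To make the status-code mapping above concrete, a short sketch; failedFetch() and the isAllowAll()/isAllowNone() accessors are the ones shown in the hunks of this commit, and the status values follow the list just documented:

    import crawlercommons.robots.BaseRobotRules;
    import crawlercommons.robots.SimpleRobotRulesParser;

    public class FailedFetchExample {
        public static void main(String[] args) {
            SimpleRobotRulesParser parser = new SimpleRobotRulesParser();

            BaseRobotRules unavailable = parser.failedFetch(404); // 4xx: allow all
            BaseRobotRules unreachable = parser.failedFetch(500); // 5xx: disallow all

            System.out.println(unavailable.isAllowAll());  // true
            System.out.println(unreachable.isAllowNone()); // true
            // A 2xx status throws IllegalStateException: parse the content instead.
        }
    }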
@@ -459,25 +509,37 @@ public class SimpleRobotRulesParser extends BaseRobotsParser {
 
     /**
      * Parse the robots.txt file in <i>content</i>, and return rules appropriate
-     * for processing paths by <i>userAgent</i>. Multiple agent names are
-     * provided as collection. See {@link #setExactUserAgentMatching(boolean)}
-     * for details how agent names are matched.
+     * for processing paths by <i>userAgent</i>.
+     *
+     * <p>
+     * Multiple agent names can be passed as a collection. See
+     * {@link #setExactUserAgentMatching(boolean)} for details on how agent names
+     * are matched. If multiple agent names are passed, all matching rule groups
+     * are followed and merged into one set of rules. The order of the agent
+     * names in the collection does not affect the selection of rules.
+     * </p>
+     *
+     * <p>
+     * If none of the provided agent names is matched, rules addressed to the
+     * wildcard user-agent (<code>*</code>) are selected.
+     * </p>
+     *
      * @param url
-     *            URL that robots.txt content was fetched from. A complete and
-     *            valid URL (e.g., https://example.com/robots.txt) is expected.
-     *            Used to resolve relative sitemap URLs and for
-     *            logging/reporting purposes.
+     *            {@inheritDoc}
      * @param content
-     *            raw bytes from the site's robots.txt file
+     *            {@inheritDoc}
      * @param contentType
-     *            HTTP response header (mime-type)
+     *            {@inheritDoc}
      * @param robotNames
-     *            name(s) of crawler, used to select rules from the robots.txt
-     *            file by matching the names against the user-agent lines in the
-     *            robots.txt file. Robot names should be single token names, w/o
-     *            version or other parts. Names should be lower-case, as the
-     *            user-agent line is also converted to lower-case for matching.
+     *            crawler (user-agent) name(s), used to select rules from the
+     *            robots.txt file by matching the names against the user-agent
+     *            lines in the robots.txt file. Robot names should be single
+     *            token names, without version or other parts. Names must be
+     *            lower-case, as the user-agent line is also converted to
+     *            lower-case for matching. If the collection is empty, the rules
+     *            for the wildcard user-agent (<code>*</code>) are selected. The
+     *            wildcard user-agent name should not be contained in
+     *            <i>robotNames</i>.
      * @return robot rules.
      */
     @Override
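Finally, a sketch of the default exact-but-case-insensitive agent-name matching and the legacy toggle referenced above. The robots.txt content and names are invented; the precise semantics of the legacy prefix mode are left to the method's own documentation:

    import java.nio.charset.StandardCharsets;
    import java.util.Set;

    import crawlercommons.robots.BaseRobotRules;
    import crawlercommons.robots.SimpleRobotRulesParser;

    public class MatchingModeExample {
        public static void main(String[] args) {
            String robotsTxt = "User-agent: ExampleBot\nDisallow: /private/\n";

            SimpleRobotRulesParser parser = new SimpleRobotRulesParser();
            // Default: exact, case-insensitive product-token matching (RFC 9309),
            // so "examplebot" matches the "ExampleBot" group above.
            BaseRobotRules rules = parser.parseContent("https://www.example.com/robots.txt",
                            robotsTxt.getBytes(StandardCharsets.UTF_8), "text/plain",
                            Set.of("examplebot"));
            System.out.println(rules.isAllowed("https://www.example.com/private/page")); // false

            // Legacy substring/prefix matching can be re-enabled if needed:
            parser.setExactUserAgentMatching(false);
        }
    }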