
SimpleRobotRulesParser: Expose MAX_CRAWL_DELAY #194

- Makes MAX_CRAWL_DELAY configurable through the class constructor
Author: Aecio Santos, 2018-05-13 20:10:55 -04:00
Parent: e25309d26c
Commit: fd1e7fcffe
2 changed files with 20 additions and 8 deletions
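For orientation, here is a minimal usage sketch of the constructor this commit introduces. The class name, robots.txt content, crawler name, and the cap/warning values are illustrative only; the sketch assumes the existing parseContent(String url, byte[] content, String contentType, String robotNames) entry point and that crawl delays are handled in milliseconds, consistent with the DEFAULT_MAX_CRAWL_DELAY = 300000 constant in the diff below.

```java
import java.nio.charset.StandardCharsets;

import crawlercommons.robots.BaseRobotRules;
import crawlercommons.robots.SimpleRobotRulesParser;

public class ConfigurableCrawlDelaySketch {
    public static void main(String[] args) {
        // Hypothetical robots.txt content; a real crawler would fetch this over HTTP.
        byte[] robotsTxt = "User-agent: *\nDisallow: /private/\nCrawl-delay: 30\n"
                        .getBytes(StandardCharsets.UTF_8);

        // New constructor: crawl-delay cap of 15 minutes (in ms) and up to 20 logged
        // warnings per robots.txt file (both values chosen purely for illustration).
        SimpleRobotRulesParser parser = new SimpleRobotRulesParser(15 * 60 * 1000L, 20);

        BaseRobotRules rules = parser.parseContent("http://www.example.com/robots.txt",
                        robotsTxt, "text/plain", "mycrawler");

        System.out.println(rules.isAllowed("http://www.example.com/index.html")); // expected: true
        System.out.println(rules.getCrawlDelay()); // expected: 30000 (ms), well under the cap
    }
}
```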


@@ -13,6 +13,7 @@ Current Development 0.10-SNAPSHOT (yyyy-mm-dd)
 - Handle new suffixes in PaidLevelDomain (kkrugler) #183
 - Remove Tika dependency (kkrugler) #199
 - Improve MIME detection for sitemaps (sebastian-nagel) #200
+- SimpleRobotRulesParser: Expose MAX_CRAWL_DELAY #194
 Release 0.9 (2017-10-27)
 - [Sitemaps] Removed DOM-based sitemap parser (jnioche) #177


@@ -69,8 +69,9 @@ import crawlercommons.robots.SimpleRobotRules.RobotRulesMode;
  * <p>
  * The crawl delay is parsed and added to the rules. Note that if the crawl
  * delay inside the file exceeds a maximum value, the crawling of all resources
- * is prohibited. The maximum value is defined with {@link #MAX_CRAWL_DELAY}=
- * {@value #MAX_CRAWL_DELAY}.
+ * is prohibited. The default maximum value is defined with
+ * {@link #DEFAULT_MAX_CRAWL_DELAY}={@value #DEFAULT_MAX_CRAWL_DELAY}. The
+ * default value can be changed using the constructor.
  * </p>
  *
  * <p>
@@ -341,14 +342,24 @@ public class SimpleRobotRulesParser extends BaseRobotsParser {
     private static final Pattern USER_AGENT_PATTERN = Pattern.compile("(?i)user-agent:");
 
     // Max # of warnings during parse of any one robots.txt file.
-    private static final int MAX_WARNINGS = 5;
+    private static final int DEFAULT_MAX_WARNINGS = 5;
 
     // Max value for crawl delay we'll use from robots.txt file. If the value is
-    // greater
-    // than this, we'll skip all pages.
-    private static final long MAX_CRAWL_DELAY = 300000;
+    // greater than this, we'll skip all pages.
+    private static final long DEFAULT_MAX_CRAWL_DELAY = 300000;
 
     private int _numWarnings;
+    private int _maxWarnings;
+    private long _maxCrawlDelay;
+
+    public SimpleRobotRulesParser() {
+        this(DEFAULT_MAX_CRAWL_DELAY, DEFAULT_MAX_WARNINGS);
+    }
+
+    public SimpleRobotRulesParser(long maxCrawlDelay, int maxWarnings) {
+        this._maxCrawlDelay = maxCrawlDelay;
+        this._maxWarnings = maxWarnings;
+    }
 
     @Override
     public BaseRobotRules failedFetch(int httpStatusCode) {
@@ -522,7 +533,7 @@ public class SimpleRobotRulesParser extends BaseRobotsParser {
         }
 
         SimpleRobotRules result = parseState.getRobotRules();
-        if (result.getCrawlDelay() > MAX_CRAWL_DELAY) {
+        if (result.getCrawlDelay() > _maxCrawlDelay) {
             // Some evil sites use a value like 3600 (seconds) for the crawl
             // delay, which would
             // cause lots of problems for us.
@@ -541,7 +552,7 @@ public class SimpleRobotRulesParser extends BaseRobotsParser {
             LOGGER.warn("Problem processing robots.txt for {}", url);
         }
 
-        if (_numWarnings < MAX_WARNINGS) {
+        if (_numWarnings < _maxWarnings) {
             LOGGER.warn("\t {}", msg);
         }
     }
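To round out the change, a short sketch of the effect of the configurable cap, assuming that (as the comments in the diff describe) a crawl delay above the cap causes all pages to be skipped, observable through BaseRobotRules#isAllowNone(), and that crawl-delay seconds are stored as milliseconds. The robots.txt content, crawler name, and cap value are made up for illustration.

```java
import java.nio.charset.StandardCharsets;

import crawlercommons.robots.BaseRobotRules;
import crawlercommons.robots.SimpleRobotRulesParser;

public class CrawlDelayCapSketch {
    public static void main(String[] args) {
        // A crawl delay of 3600 s, like the "evil sites" case mentioned in the diff.
        byte[] robotsTxt = "User-agent: *\nCrawl-delay: 3600\n".getBytes(StandardCharsets.UTF_8);

        // Default cap (300000 ms): 3600 s exceeds it, so every page is skipped.
        BaseRobotRules strict = new SimpleRobotRulesParser().parseContent(
                        "http://www.example.com/robots.txt", robotsTxt, "text/plain", "mycrawler");
        System.out.println(strict.isAllowNone()); // expected: true

        // Cap raised to two hours: the delay is kept and pages remain crawlable.
        BaseRobotRules lenient = new SimpleRobotRulesParser(2 * 60 * 60 * 1000L, 5).parseContent(
                        "http://www.example.com/robots.txt", robotsTxt, "text/plain", "mycrawler");
        System.out.println(lenient.isAllowed("http://www.example.com/page.html")); // expected: true
        System.out.println(lenient.getCrawlDelay()); // expected: 3600000 (ms)
    }
}
```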