From fd1e7fcffe65676d5c0d333a3c1162eaf796d3a9 Mon Sep 17 00:00:00 2001
From: Aecio Santos
Date: Sun, 13 May 2018 20:10:55 -0400
Subject: [PATCH] SimpleRobotRulesParser: Expose MAX_CRAWL_DELAY #194

- Makes MAX_CRAWL_DELAY configurable through class constructor
---
 CHANGES.txt                            |  1 +
 .../robots/SimpleRobotRulesParser.java | 27 +++++++++++++------
 2 files changed, 20 insertions(+), 8 deletions(-)

diff --git a/CHANGES.txt b/CHANGES.txt
index 4f09713..06a7991 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -13,6 +13,7 @@ Current Development 0.10-SNAPSHOT (yyyy-mm-dd)
 - Handle new suffixes in PaidLevelDomain (kkrugler) #183
 - Remove Tika dependency (kkrugler) #199
 - Improve MIME detection for sitemaps (sebastian-nagel) #200
+- SimpleRobotRulesParser: Expose MAX_CRAWL_DELAY #194
 
 Release 0.9 (2017-10-27)
 - [Sitemaps] Removed DOM-based sitemap parser (jnioche) #177
diff --git a/src/main/java/crawlercommons/robots/SimpleRobotRulesParser.java b/src/main/java/crawlercommons/robots/SimpleRobotRulesParser.java
index 4354f41..d49b157 100644
--- a/src/main/java/crawlercommons/robots/SimpleRobotRulesParser.java
+++ b/src/main/java/crawlercommons/robots/SimpleRobotRulesParser.java
@@ -69,8 +69,9 @@ import crawlercommons.robots.SimpleRobotRules.RobotRulesMode;
  * <p>
  * The crawl delay is parsed and added to the rules. Note that if the crawl
  * delay inside the file exceeds a maximum value, the crawling of all resources
- * is prohibited. The maximum value is defined with {@link #MAX_CRAWL_DELAY}=
- * {@value #MAX_CRAWL_DELAY}.
+ * is prohibited. The default maximum value is defined with
+ * {@link #DEFAULT_MAX_CRAWL_DELAY}={@value #DEFAULT_MAX_CRAWL_DELAY}. The
+ * default value can be changed using the constructor.
  * </p>
  *
  * <p>
@@ -341,14 +342,24 @@ public class SimpleRobotRulesParser extends BaseRobotsParser {
     private static final Pattern USER_AGENT_PATTERN = Pattern.compile("(?i)user-agent:");
 
     // Max # of warnings during parse of any one robots.txt file.
-    private static final int MAX_WARNINGS = 5;
+    private static final int DEFAULT_MAX_WARNINGS = 5;
 
     // Max value for crawl delay we'll use from robots.txt file. If the value is
-    // greater
-    // than this, we'll skip all pages.
-    private static final long MAX_CRAWL_DELAY = 300000;
+    // greater than this, we'll skip all pages.
+    private static final long DEFAULT_MAX_CRAWL_DELAY = 300000;
 
     private int _numWarnings;
+    private int _maxWarnings;
+    private long _maxCrawlDelay;
+
+    public SimpleRobotRulesParser() {
+        this(DEFAULT_MAX_CRAWL_DELAY, DEFAULT_MAX_WARNINGS);
+    }
+
+    public SimpleRobotRulesParser(long maxCrawlDelay, int maxWarnings) {
+        this._maxCrawlDelay = maxCrawlDelay;
+        this._maxWarnings = maxWarnings;
+    }
 
     @Override
     public BaseRobotRules failedFetch(int httpStatusCode) {
@@ -522,7 +533,7 @@ public class SimpleRobotRulesParser extends BaseRobotsParser {
         }
 
         SimpleRobotRules result = parseState.getRobotRules();
-        if (result.getCrawlDelay() > MAX_CRAWL_DELAY) {
+        if (result.getCrawlDelay() > _maxCrawlDelay) {
             // Some evil sites use a value like 3600 (seconds) for the crawl
             // delay, which would
             // cause lots of problems for us.
@@ -541,7 +552,7 @@ public class SimpleRobotRulesParser extends BaseRobotsParser {
             }
             LOGGER.warn("Problem processing robots.txt for {}", url);
         }
-        if (_numWarnings < MAX_WARNINGS) {
+        if (_numWarnings < _maxWarnings) {
             LOGGER.warn("\t {}", msg);
         }
     }
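
For reviewers who want to try the change locally, here is a minimal usage sketch of the new constructor. It assumes the existing parseContent(url, content, contentType, robotNames) entry point inherited from BaseRobotsParser; the robots.txt body, URLs, and robot name below are made-up illustration values, not part of this patch.

import java.nio.charset.StandardCharsets;

import crawlercommons.robots.BaseRobotRules;
import crawlercommons.robots.SimpleRobotRulesParser;

public class MaxCrawlDelayDemo {
    public static void main(String[] args) {
        // Hypothetical robots.txt asking for a 600-second crawl delay, which is
        // above the 300,000 ms (5 minute) default cap but below the raised cap used here.
        byte[] robotsTxt = "User-agent: *\nCrawl-delay: 600\n".getBytes(StandardCharsets.UTF_8);

        // Default parser: keeps the old behavior (DEFAULT_MAX_CRAWL_DELAY, DEFAULT_MAX_WARNINGS).
        SimpleRobotRulesParser defaultParser = new SimpleRobotRulesParser();

        // Parser with a raised cap of 900,000 ms (15 minutes) and the same 5-warning limit.
        SimpleRobotRulesParser lenientParser = new SimpleRobotRulesParser(900000L, 5);

        BaseRobotRules strict = defaultParser.parseContent("http://www.example.com/robots.txt", robotsTxt, "text/plain", "mybot");
        BaseRobotRules lenient = lenientParser.parseContent("http://www.example.com/robots.txt", robotsTxt, "text/plain", "mybot");

        // With the default cap, the oversized delay trips the "disallow everything" rule;
        // with the raised cap, the URL stays crawlable and the delay is kept (reported in milliseconds).
        System.out.println("default cap, allowed: " + strict.isAllowed("http://www.example.com/page.html"));
        System.out.println("raised cap, allowed:  " + lenient.isAllowed("http://www.example.com/page.html"));
        System.out.println("raised cap, delay ms: " + lenient.getCrawlDelay());
    }
}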