
SimpleRobotRulesParser: Expose MAX_CRAWL_DELAY #194

- Makes MAX_CRAWL_DELAY configurable through the class constructor
Author: Aecio Santos, 2018-05-13 20:10:55 -04:00
Parent: e25309d26c
Commit: fd1e7fcffe
2 changed files with 20 additions and 8 deletions
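For orientation, here is a minimal usage sketch of the constructor this commit introduces. The class name, robots.txt content, crawler name, and the cap/warning values are illustrative only; the sketch assumes the existing parseContent(String url, byte[] content, String contentType, String robotNames) entry point and that crawl delays are handled in milliseconds, consistent with the DEFAULT_MAX_CRAWL_DELAY = 300000 constant in the diff below.

```java
import java.nio.charset.StandardCharsets;

import crawlercommons.robots.BaseRobotRules;
import crawlercommons.robots.SimpleRobotRulesParser;

public class ConfigurableCrawlDelaySketch {
    public static void main(String[] args) {
        // Hypothetical robots.txt content; a real crawler would fetch this over HTTP.
        byte[] robotsTxt = "User-agent: *\nDisallow: /private/\nCrawl-delay: 30\n"
                        .getBytes(StandardCharsets.UTF_8);

        // New constructor: crawl-delay cap of 15 minutes (in ms) and up to 20 logged
        // warnings per robots.txt file (both values chosen purely for illustration).
        SimpleRobotRulesParser parser = new SimpleRobotRulesParser(15 * 60 * 1000L, 20);

        BaseRobotRules rules = parser.parseContent("http://www.example.com/robots.txt",
                        robotsTxt, "text/plain", "mycrawler");

        System.out.println(rules.isAllowed("http://www.example.com/index.html")); // expected: true
        System.out.println(rules.getCrawlDelay()); // expected: 30000 (ms), well under the cap
    }
}
```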


@@ -13,6 +13,7 @@ Current Development 0.10-SNAPSHOT (yyyy-mm-dd)
 - Handle new suffixes in PaidLevelDomain (kkrugler) #183
 - Remove Tika dependency (kkrugler) #199
 - Improve MIME detection for sitemaps (sebastian-nagel) #200
+- SimpleRobotRulesParser: Expose MAX_CRAWL_DELAY #194
 Release 0.9 (2017-10-27)
 - [Sitemaps] Removed DOM-based sitemap parser (jnioche) #177


@@ -69,8 +69,9 @@ import crawlercommons.robots.SimpleRobotRules.RobotRulesMode;
  * <p>
  * The crawl delay is parsed and added to the rules. Note that if the crawl
  * delay inside the file exceeds a maximum value, the crawling of all resources
- * is prohibited. The maximum value is defined with {@link #MAX_CRAWL_DELAY}=
- * {@value #MAX_CRAWL_DELAY}.
+ * is prohibited. The default maximum value is defined with
+ * {@link #DEFAULT_MAX_CRAWL_DELAY}={@value #DEFAULT_MAX_CRAWL_DELAY}. The
+ * default value can be changed using the constructor.
  * </p>
  *
  * <p>
@@ -341,14 +342,24 @@ public class SimpleRobotRulesParser extends BaseRobotsParser {
     private static final Pattern USER_AGENT_PATTERN = Pattern.compile("(?i)user-agent:");
 
     // Max # of warnings during parse of any one robots.txt file.
-    private static final int MAX_WARNINGS = 5;
+    private static final int DEFAULT_MAX_WARNINGS = 5;
 
     // Max value for crawl delay we'll use from robots.txt file. If the value is
-    // greater
-    // than this, we'll skip all pages.
-    private static final long MAX_CRAWL_DELAY = 300000;
+    // greater than this, we'll skip all pages.
+    private static final long DEFAULT_MAX_CRAWL_DELAY = 300000;
 
     private int _numWarnings;
+    private int _maxWarnings;
+    private long _maxCrawlDelay;
+
+    public SimpleRobotRulesParser() {
+        this(DEFAULT_MAX_CRAWL_DELAY, DEFAULT_MAX_WARNINGS);
+    }
+
+    public SimpleRobotRulesParser(long maxCrawlDelay, int maxWarnings) {
+        this._maxCrawlDelay = maxCrawlDelay;
+        this._maxWarnings = maxWarnings;
+    }
 
     @Override
     public BaseRobotRules failedFetch(int httpStatusCode) {
@@ -522,7 +533,7 @@ public class SimpleRobotRulesParser extends BaseRobotsParser {
         }
 
         SimpleRobotRules result = parseState.getRobotRules();
-        if (result.getCrawlDelay() > MAX_CRAWL_DELAY) {
+        if (result.getCrawlDelay() > _maxCrawlDelay) {
             // Some evil sites use a value like 3600 (seconds) for the crawl
             // delay, which would
             // cause lots of problems for us.
@@ -541,7 +552,7 @@ public class SimpleRobotRulesParser extends BaseRobotsParser {
             LOGGER.warn("Problem processing robots.txt for {}", url);
         }
 
-        if (_numWarnings < MAX_WARNINGS) {
+        if (_numWarnings < _maxWarnings) {
             LOGGER.warn("\t {}", msg);
         }
     }
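To round out the change, a short sketch of the effect of the configurable cap, assuming that (as the comments in the diff describe) a crawl delay above the cap causes all pages to be skipped, observable through BaseRobotRules#isAllowNone(), and that crawl-delay seconds are stored as milliseconds. The robots.txt content, crawler name, and cap value are made up for illustration.

```java
import java.nio.charset.StandardCharsets;

import crawlercommons.robots.BaseRobotRules;
import crawlercommons.robots.SimpleRobotRulesParser;

public class CrawlDelayCapSketch {
    public static void main(String[] args) {
        // A crawl delay of 3600 s, like the "evil sites" case mentioned in the diff.
        byte[] robotsTxt = "User-agent: *\nCrawl-delay: 3600\n".getBytes(StandardCharsets.UTF_8);

        // Default cap (300000 ms): 3600 s exceeds it, so every page is skipped.
        BaseRobotRules strict = new SimpleRobotRulesParser().parseContent(
                        "http://www.example.com/robots.txt", robotsTxt, "text/plain", "mycrawler");
        System.out.println(strict.isAllowNone()); // expected: true

        // Cap raised to two hours: the delay is kept and pages remain crawlable.
        BaseRobotRules lenient = new SimpleRobotRulesParser(2 * 60 * 60 * 1000L, 5).parseContent(
                        "http://www.example.com/robots.txt", robotsTxt, "text/plain", "mycrawler");
        System.out.println(lenient.isAllowed("http://www.example.com/page.html")); // expected: true
        System.out.println(lenient.getCrawlDelay()); // expected: 3600000 (ms)
    }
}
```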