Mirror of https://github.com/crawler-commons/crawler-commons (synced 2024-09-25 01:50:39 +02:00)
SimpleRobotRulesParser: Expose MAX_CRAWL_DELAY #194
- Makes MAX_CRAWL_DELAY configurable through the class constructor
parent e25309d26c, commit fd1e7fcffe
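
For context, a minimal sketch of how a crawler might use the new two-argument constructor to raise the crawl-delay cap. The constructor signature and the DEFAULT_MAX_CRAWL_DELAY=300000 / DEFAULT_MAX_WARNINGS=5 defaults are taken from the diff below; the parseContent(url, content, contentType, robotName) call, the example URL and robot name, and the assumption that the cap is expressed in milliseconds are illustrative and not part of this change.

import java.nio.charset.StandardCharsets;

import crawlercommons.robots.BaseRobotRules;
import crawlercommons.robots.SimpleRobotRulesParser;

public class ConfigurableCrawlDelayExample {

    public static void main(String[] args) {
        // Accept crawl delays of up to 600000 ms (10 minutes) instead of the
        // default cap of DEFAULT_MAX_CRAWL_DELAY (300000 ms); keep the default
        // limit of 5 parser warnings.
        SimpleRobotRulesParser parser = new SimpleRobotRulesParser(600000L, 5);

        byte[] robotsTxt = ("User-agent: *\n" + "Crawl-delay: 420\n")
                        .getBytes(StandardCharsets.UTF_8);

        // Hypothetical URL and robot name, for illustration only.
        BaseRobotRules rules = parser.parseContent("https://www.example.com/robots.txt",
                        robotsTxt, "text/plain", "mycrawler");

        // With the stock cap a 420 s delay would disallow everything; with the
        // raised cap the delay is simply recorded on the returned rules.
        System.out.println("crawl delay: " + rules.getCrawlDelay());
        System.out.println("allowed: " + rules.isAllowed("https://www.example.com/page.html"));
    }
}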
@@ -13,6 +13,7 @@ Current Development 0.10-SNAPSHOT (yyyy-mm-dd)
 - Handle new suffixes in PaidLevelDomain (kkrugler) #183
 - Remove Tika dependency (kkrugler) #199
 - Improve MIME detection for sitemaps (sebastian-nagel) #200
+- SimpleRobotRulesParser: Expose MAX_CRAWL_DELAY #194
 
 Release 0.9 (2017-10-27)
 - [Sitemaps] Removed DOM-based sitemap parser (jnioche) #177
@@ -69,8 +69,9 @@ import crawlercommons.robots.SimpleRobotRules.RobotRulesMode;
  * <p>
  * The crawl delay is parsed and added to the rules. Note that if the crawl
  * delay inside the file exceeds a maximum value, the crawling of all resources
- * is prohibited. The maximum value is defined with {@link #MAX_CRAWL_DELAY}=
- * {@value #MAX_CRAWL_DELAY}.
+ * is prohibited. The default maximum value is defined with
+ * {@link #DEFAULT_MAX_CRAWL_DELAY}={@value #DEFAULT_MAX_CRAWL_DELAY}. The
+ * default value can be changed using the constructor.
  * </p>
  *
  * <p>
@@ -341,14 +342,24 @@ public class SimpleRobotRulesParser extends BaseRobotsParser {
     private static final Pattern USER_AGENT_PATTERN = Pattern.compile("(?i)user-agent:");
 
     // Max # of warnings during parse of any one robots.txt file.
-    private static final int MAX_WARNINGS = 5;
+    private static final int DEFAULT_MAX_WARNINGS = 5;
 
     // Max value for crawl delay we'll use from robots.txt file. If the value is
-    // greater
-    // than this, we'll skip all pages.
-    private static final long MAX_CRAWL_DELAY = 300000;
+    // greater than this, we'll skip all pages.
+    private static final long DEFAULT_MAX_CRAWL_DELAY = 300000;
 
     private int _numWarnings;
+    private int _maxWarnings;
+    private long _maxCrawlDelay;
+
+    public SimpleRobotRulesParser() {
+        this(DEFAULT_MAX_CRAWL_DELAY, DEFAULT_MAX_WARNINGS);
+    }
+
+    public SimpleRobotRulesParser(long maxCrawlDelay, int maxWarnings) {
+        this._maxCrawlDelay = maxCrawlDelay;
+        this._maxWarnings = maxWarnings;
+    }
 
     @Override
     public BaseRobotRules failedFetch(int httpStatusCode) {
@@ -522,7 +533,7 @@ public class SimpleRobotRulesParser extends BaseRobotsParser {
         }
 
         SimpleRobotRules result = parseState.getRobotRules();
-        if (result.getCrawlDelay() > MAX_CRAWL_DELAY) {
+        if (result.getCrawlDelay() > _maxCrawlDelay) {
             // Some evil sites use a value like 3600 (seconds) for the crawl
             // delay, which would
             // cause lots of problems for us.
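
The check above is what the javadoc's "crawling of all resources is prohibited" refers to: when the parsed delay exceeds the configured cap, the returned rules allow nothing. A hedged sketch of that behavior from the caller's side, again assuming the parseContent(url, content, contentType, robotName) signature and that Crawl-delay seconds are compared against the cap in milliseconds:

import java.nio.charset.StandardCharsets;

import crawlercommons.robots.BaseRobotRules;
import crawlercommons.robots.SimpleRobotRulesParser;

public class CrawlDelayCapExample {

    public static void main(String[] args) {
        // Default parser: cap of DEFAULT_MAX_CRAWL_DELAY (300000 ms, i.e. 300 s).
        SimpleRobotRulesParser parser = new SimpleRobotRulesParser();

        // 3600 s is the "evil" value from the comment above and exceeds the cap.
        byte[] robotsTxt = ("User-agent: *\n" + "Crawl-delay: 3600\n")
                        .getBytes(StandardCharsets.UTF_8);
        BaseRobotRules rules = parser.parseContent("https://www.example.com/robots.txt",
                        robotsTxt, "text/plain", "mycrawler");

        // Expected: the delay exceeds the cap, so no URL is allowed.
        System.out.println(rules.isAllowed("https://www.example.com/index.html"));
    }
}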
@@ -541,7 +552,7 @@ public class SimpleRobotRulesParser extends BaseRobotsParser {
             LOGGER.warn("Problem processing robots.txt for {}", url);
         }
 
-        if (_numWarnings < MAX_WARNINGS) {
+        if (_numWarnings < _maxWarnings) {
             LOGGER.warn("\t {}", msg);
         }
     }