mirror of
https://github.com/crawler-commons/crawler-commons
synced 2024-05-24 19:36:06 +02:00
Merge pull request #208 from aecio/issue-134
Make RobotRules accessible #134
This commit is contained in:
commit
993c4fdb9e
|
@ -13,6 +13,7 @@ Current Development 0.10-SNAPSHOT (yyyy-mm-dd)
|
|||
- Handle new suffixes in PaidLevelDomain (kkrugler) #183
|
||||
- Remove Tika dependency (kkrugler) #199
|
||||
- Improve MIME detection for sitemaps (sebastian-nagel) #200
|
||||
- Make RobotRules accessible (aecio via kkrugler) #134
|
||||
- SimpleRobotRulesParser: Expose MAX_WARNINGS and MAX_CRAWL_DELAY (aecio via kkrugler) #194
|
||||
|
||||
Release 0.9 (2017-10-27)
|
||||
|
|
|
@ -21,6 +21,7 @@ import java.net.URL;
|
|||
import java.net.URLDecoder;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
|
||||
/**
|
||||
* Result from parsing a single robots.txt file - which means we get a set of
|
||||
|
@ -38,6 +39,7 @@ public class SimpleRobotRules extends BaseRobotRules {
|
|||
* Single rule that maps from a path prefix to an allow flag.
|
||||
*/
|
||||
public static class RobotRule implements Comparable<RobotRule>, Serializable {
|
||||
|
||||
String _prefix;
|
||||
boolean _allow;
|
||||
|
||||
|
@ -46,6 +48,14 @@ public class SimpleRobotRules extends BaseRobotRules {
|
|||
_allow = allow;
|
||||
}
|
||||
|
||||
/**
 * Returns the allow flag of this rule.
 *
 * @return the current value of the {@code _allow} field
 */
public boolean isAllow() {
|
||||
return this._allow;
|
||||
}
|
||||
|
||||
/**
 * Returns the path prefix of this rule.
 *
 * @return the current value of the {@code _prefix} field
 */
public String getPrefix() {
|
||||
return this._prefix;
|
||||
}
|
||||
|
||||
// Sort from longest to shortest rules.
|
||||
@Override
|
||||
public int compareTo(RobotRule o) {
|
||||
|
@ -103,8 +113,8 @@ public class SimpleRobotRules extends BaseRobotRules {
|
|||
|
||||
}
|
||||
|
||||
private ArrayList<RobotRule> _rules;
|
||||
private RobotRulesMode _mode;
|
||||
protected ArrayList<RobotRule> _rules;
|
||||
protected RobotRulesMode _mode;
|
||||
|
||||
public SimpleRobotRules() {
|
||||
this(RobotRulesMode.ALLOW_SOME);
|
||||
|
@ -131,6 +141,10 @@ public class SimpleRobotRules extends BaseRobotRules {
|
|||
_rules.add(new RobotRule(prefix, allow));
|
||||
}
|
||||
|
||||
/**
 * Returns the rules held by this object.
 *
 * The {@code _rules} list itself is returned, not a copy, so any change a
 * caller makes to the returned list is also seen by this object.
 *
 * @return the list referenced by the {@code _rules} field
 */
public List<RobotRule> getRobotRules() {
|
||||
return this._rules;
|
||||
}
|
||||
|
||||
public boolean isAllowed(String url) {
|
||||
if (_mode == RobotRulesMode.ALLOW_NONE) {
|
||||
return false;
|
||||
|
|
|
@ -362,7 +362,7 @@ public class SimpleRobotRulesParser extends BaseRobotsParser {
|
|||
}
|
||||
|
||||
@Override
|
||||
public BaseRobotRules failedFetch(int httpStatusCode) {
|
||||
public SimpleRobotRules failedFetch(int httpStatusCode) {
|
||||
SimpleRobotRules result;
|
||||
|
||||
if ((httpStatusCode >= 200) && (httpStatusCode < 300)) {
|
||||
|
@ -397,7 +397,7 @@ public class SimpleRobotRulesParser extends BaseRobotsParser {
|
|||
* byte[], java.lang.String, java.lang.String)
|
||||
*/
|
||||
@Override
|
||||
public BaseRobotRules parseContent(String url, byte[] content, String contentType, String robotNames) {
|
||||
public SimpleRobotRules parseContent(String url, byte[] content, String contentType, String robotNames) {
|
||||
_numWarnings = 0;
|
||||
|
||||
// If there's nothing there, treat it like we have no restrictions.
|
||||
|
|
Loading…
Reference in New Issue