
Merge pull request #208 from aecio/issue-134

Make RobotRules accessible #134
Ken Krugler authored 2018-05-14 11:21:42 -07:00, committed by GitHub
commit 993c4fdb9e
Signed by: GitHub (GPG Key ID: 4AEE18F83AFDEB23)
3 changed files with 19 additions and 4 deletions

CHANGES.txt

@@ -13,6 +13,7 @@ Current Development 0.10-SNAPSHOT (yyyy-mm-dd)
 - Handle new suffixes in PaidLevelDomain (kkrugler) #183
 - Remove Tika dependency (kkrugler) #199
 - Improve MIME detection for sitemaps (sebastian-nagel) #200
+- Make RobotRules accessible (aecio via kkrugler) #134
 - SimpleRobotRulesParser: Expose MAX_WARNINGS and MAX_CRAWL_DELAY (aecio via kkrugler) #194
 
 Release 0.9 (2017-10-27)

src/main/java/crawlercommons/robots/SimpleRobotRules.java

@@ -21,6 +21,7 @@ import java.net.URL;
 import java.net.URLDecoder;
 import java.util.ArrayList;
 import java.util.Collections;
+import java.util.List;
 
 /**
  * Result from parsing a single robots.txt file - which means we get a set of
@@ -38,6 +39,7 @@ public class SimpleRobotRules extends BaseRobotRules {
      * Single rule that maps from a path prefix to an allow flag.
      */
     public static class RobotRule implements Comparable<RobotRule>, Serializable {
+
         String _prefix;
         boolean _allow;
@@ -46,6 +48,14 @@ public class SimpleRobotRules extends BaseRobotRules {
             _allow = allow;
         }
 
+        public boolean isAllow() {
+            return this._allow;
+        }
+
+        public String getPrefix() {
+            return this._prefix;
+        }
+
         // Sort from longest to shortest rules.
         @Override
         public int compareTo(RobotRule o) {
@@ -103,8 +113,8 @@ public class SimpleRobotRules extends BaseRobotRules {
         }
 
-    private ArrayList<RobotRule> _rules;
-    private RobotRulesMode _mode;
+    protected ArrayList<RobotRule> _rules;
+    protected RobotRulesMode _mode;
 
     public SimpleRobotRules() {
         this(RobotRulesMode.ALLOW_SOME);
@@ -131,6 +141,10 @@ public class SimpleRobotRules extends BaseRobotRules {
         _rules.add(new RobotRule(prefix, allow));
     }
 
+    public List<RobotRule> getRobotRules() {
+        return this._rules;
+    }
+
     public boolean isAllowed(String url) {
         if (_mode == RobotRulesMode.ALLOW_NONE) {
             return false;
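
Taken together, these changes open the parsed rules up for inspection: RobotRule gains isAllow() and getPrefix(), getRobotRules() returns the rule list, and the now-protected fields let subclasses reuse them. A minimal sketch of the new accessors, assuming addRule(String, boolean) is the public method whose body appears in the hunk above; the rule values are made up for illustration:

    import crawlercommons.robots.SimpleRobotRules;
    import crawlercommons.robots.SimpleRobotRules.RobotRule;
    import crawlercommons.robots.SimpleRobotRules.RobotRulesMode;

    public class RuleAccessorSketch {
        public static void main(String[] args) {
            SimpleRobotRules rules = new SimpleRobotRules(RobotRulesMode.ALLOW_SOME);
            rules.addRule("/private/", false);        // illustrative disallow rule
            rules.addRule("/private/faq.html", true); // illustrative allow rule

            // getRobotRules() now exposes the individual parsed rules.
            for (RobotRule rule : rules.getRobotRules()) {
                System.out.println((rule.isAllow() ? "Allow " : "Disallow ") + rule.getPrefix());
            }
        }
    }

Note that getRobotRules() hands back the internal list directly, which keeps the change minimal but lets callers mutate it; wrapping it in Collections.unmodifiableList would be the defensive alternative.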

src/main/java/crawlercommons/robots/SimpleRobotRulesParser.java

@@ -362,7 +362,7 @@ public class SimpleRobotRulesParser extends BaseRobotsParser {
     }
 
     @Override
-    public BaseRobotRules failedFetch(int httpStatusCode) {
+    public SimpleRobotRules failedFetch(int httpStatusCode) {
         SimpleRobotRules result;
 
         if ((httpStatusCode >= 200) && (httpStatusCode < 300)) {
@@ -397,7 +397,7 @@ public class SimpleRobotRulesParser extends BaseRobotsParser {
      * byte[], java.lang.String, java.lang.String)
      */
     @Override
-    public BaseRobotRules parseContent(String url, byte[] content, String contentType, String robotNames) {
+    public SimpleRobotRules parseContent(String url, byte[] content, String contentType, String robotNames) {
         _numWarnings = 0;
 
         // If there's nothing there, treat it like we have no restrictions.
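
Because both overrides now declare the covariant SimpleRobotRules return type, callers reach the new accessors without downcasting from BaseRobotRules. A short sketch; the URL, agent name, and robots.txt content are illustrative:

    import java.nio.charset.StandardCharsets;

    import crawlercommons.robots.SimpleRobotRules;
    import crawlercommons.robots.SimpleRobotRulesParser;

    public class ParserReturnTypeSketch {
        public static void main(String[] args) {
            SimpleRobotRulesParser parser = new SimpleRobotRulesParser();
            byte[] content = "User-agent: *\nDisallow: /tmp/\n".getBytes(StandardCharsets.UTF_8);

            // parseContent(...) now declares SimpleRobotRules, so no cast is
            // needed before calling getRobotRules().
            SimpleRobotRules rules = parser.parseContent("https://example.com/robots.txt",
                            content, "text/plain", "mycrawler");
            System.out.println(rules.isAllowed("https://example.com/tmp/page.html")); // false
            System.out.println(rules.getRobotRules().size());                         // 1

            // failedFetch(...) gets the same treatment; what the returned rules
            // allow depends on the HTTP status code passed in.
            SimpleRobotRules onMissing = parser.failedFetch(404);
            System.out.println(onMissing.isAllowed("https://example.com/"));
        }
    }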