
Make RobotRules accessible #134

- Makes the SimpleRobotRules._rules and _mode properties protected
  and adds getters for SimpleRobotRules._rules and for RobotRule's
  properties
- Changes the return types of SimpleRobotRulesParser's failedFetch()
  and parseContent() from BaseRobotRules to SimpleRobotRules, allowing
  access to the concrete class without type casts while still obeying
  the superclass contract
Author: Aecio Santos 2018-05-13 20:06:21 -04:00
parent e25309d26c
commit 7bef14d386
3 changed files with 19 additions and 4 deletions
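
In practice, the change lets callers reach the parsed rules without a downcast. A minimal usage sketch, assuming crawler-commons with this commit applied (the demo class, robots.txt body, and agent name are illustrative, not part of the commit):

import java.nio.charset.StandardCharsets;

import crawlercommons.robots.SimpleRobotRules;
import crawlercommons.robots.SimpleRobotRules.RobotRule;
import crawlercommons.robots.SimpleRobotRulesParser;

public class RobotRulesDemo {
    public static void main(String[] args) {
        byte[] content = ("User-agent: *\n"
                        + "Disallow: /private/\n"
                        + "Allow: /private/index.html\n").getBytes(StandardCharsets.UTF_8);

        SimpleRobotRulesParser parser = new SimpleRobotRulesParser();
        // parseContent() now returns SimpleRobotRules, so no cast from
        // BaseRobotRules is needed to reach the concrete class.
        SimpleRobotRules rules = parser.parseContent("http://example.com/robots.txt",
                        content, "text/plain", "mybot");

        // The new getters expose each parsed rule for inspection.
        for (RobotRule rule : rules.getRobotRules()) {
            System.out.println((rule.isAllow() ? "Allow: " : "Disallow: ") + rule.getPrefix());
        }
    }
}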

CHANGES.txt

@@ -13,6 +13,7 @@ Current Development 0.10-SNAPSHOT (yyyy-mm-dd)
 - Handle new suffixes in PaidLevelDomain (kkrugler) #183
 - Remove Tika dependency (kkrugler) #199
 - Improve MIME detection for sitemaps (sebastian-nagel) #200
+- Make RobotRules accessible #134
 
 Release 0.9 (2017-10-27)
 - [Sitemaps] Removed DOM-based sitemap parser (jnioche) #177

src/main/java/crawlercommons/robots/SimpleRobotRules.java

@@ -21,6 +21,7 @@ import java.net.URL;
 import java.net.URLDecoder;
 import java.util.ArrayList;
 import java.util.Collections;
+import java.util.List;
 
 /**
  * Result from parsing a single robots.txt file - which means we get a set of
@@ -38,6 +39,7 @@ public class SimpleRobotRules extends BaseRobotRules {
      * Single rule that maps from a path prefix to an allow flag.
      */
     public static class RobotRule implements Comparable<RobotRule>, Serializable {
 
         String _prefix;
         boolean _allow;
@@ -46,6 +48,14 @@ public class SimpleRobotRules extends BaseRobotRules {
             _allow = allow;
         }
 
+        public boolean isAllow() {
+            return this._allow;
+        }
+
+        public String getPrefix() {
+            return this._prefix;
+        }
+
         // Sort from longest to shortest rules.
         @Override
         public int compareTo(RobotRule o) {
@@ -103,8 +113,8 @@ public class SimpleRobotRules extends BaseRobotRules {
     }
 
-    private ArrayList<RobotRule> _rules;
-    private RobotRulesMode _mode;
+    protected ArrayList<RobotRule> _rules;
+    protected RobotRulesMode _mode;
 
     public SimpleRobotRules() {
         this(RobotRulesMode.ALLOW_SOME);
@@ -131,6 +141,10 @@ public class SimpleRobotRules extends BaseRobotRules {
         _rules.add(new RobotRule(prefix, allow));
     }
 
+    public List<RobotRule> getRobotRules() {
+        return this._rules;
+    }
+
     public boolean isAllowed(String url) {
         if (_mode == RobotRulesMode.ALLOW_NONE) {
             return false;

src/main/java/crawlercommons/robots/SimpleRobotRulesParser.java

@@ -351,7 +351,7 @@ public class SimpleRobotRulesParser extends BaseRobotsParser {
     private int _numWarnings;
 
     @Override
-    public BaseRobotRules failedFetch(int httpStatusCode) {
+    public SimpleRobotRules failedFetch(int httpStatusCode) {
         SimpleRobotRules result;
 
         if ((httpStatusCode >= 200) && (httpStatusCode < 300)) {
@@ -386,7 +386,7 @@ public class SimpleRobotRulesParser extends BaseRobotsParser {
      * byte[], java.lang.String, java.lang.String)
      */
     @Override
-    public BaseRobotRules parseContent(String url, byte[] content, String contentType, String robotNames) {
+    public SimpleRobotRules parseContent(String url, byte[] content, String contentType, String robotNames) {
         _numWarnings = 0;
 
         // If there's nothing there, treat it like we have no restrictions.
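
For reference, the narrowed return types still satisfy the BaseRobotsParser contract because Java permits covariant return types: an overriding method may declare a subtype of the inherited return type. A stripped-down sketch of the pattern (class bodies reduced to stubs, not the real crawler-commons code):

class BaseRobotRules { }

class SimpleRobotRules extends BaseRobotRules { }

abstract class BaseRobotsParser {
    public abstract BaseRobotRules failedFetch(int httpStatusCode);
}

class SimpleRobotRulesParser extends BaseRobotsParser {
    // An override may narrow the declared return type. Callers typed against
    // BaseRobotsParser compile unchanged, while callers holding the concrete
    // parser get a SimpleRobotRules without any cast.
    @Override
    public SimpleRobotRules failedFetch(int httpStatusCode) {
        return new SimpleRobotRules();
    }
}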