From 7bef14d386cdc17f315e55d5303ed2cbbe56a6b8 Mon Sep 17 00:00:00 2001
From: Aecio Santos
Date: Sun, 13 May 2018 20:06:21 -0400
Subject: [PATCH] Make RobotRules accessible #134

- Makes the SimpleRobotRules._rules property protected and adds getters
  for SimpleRobotRules._rules and RobotRule's properties
- Changes the SimpleRobotRulesParser return types (failedFetch and
  parseContent) from BaseRobotRules to SimpleRobotRules to allow access
  to the concrete class without type casts while still obeying the
  superclass contract
---
 CHANGES.txt                            |  1 +
 .../robots/SimpleRobotRules.java       | 18 ++++++++++++++++--
 .../robots/SimpleRobotRulesParser.java |  4 ++--
 3 files changed, 19 insertions(+), 4 deletions(-)

diff --git a/CHANGES.txt b/CHANGES.txt
index 4f09713..4394880 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -13,6 +13,7 @@ Current Development 0.10-SNAPSHOT (yyyy-mm-dd)
 - Handle new suffixes in PaidLevelDomain (kkrugler) #183
 - Remove Tika dependency (kkrugler) #199
 - Improve MIME detection for sitemaps (sebastian-nagel) #200
+- Make RobotRules accessible #134
 
 Release 0.9 (2017-10-27)
 - [Sitemaps] Removed DOM-based sitemap parser (jnioche) #177
diff --git a/src/main/java/crawlercommons/robots/SimpleRobotRules.java b/src/main/java/crawlercommons/robots/SimpleRobotRules.java
index efe761c..c9d0e93 100644
--- a/src/main/java/crawlercommons/robots/SimpleRobotRules.java
+++ b/src/main/java/crawlercommons/robots/SimpleRobotRules.java
@@ -21,6 +21,7 @@ import java.net.URL;
 import java.net.URLDecoder;
 import java.util.ArrayList;
 import java.util.Collections;
+import java.util.List;
 
 /**
  * Result from parsing a single robots.txt file - which means we get a set of
@@ -38,6 +39,7 @@ public class SimpleRobotRules extends BaseRobotRules {
      * Single rule that maps from a path prefix to an allow flag.
      */
     public static class RobotRule implements Comparable<RobotRule>, Serializable {
+
         String _prefix;
         boolean _allow;
 
@@ -46,6 +48,14 @@ public class SimpleRobotRules extends BaseRobotRules {
             _allow = allow;
         }
 
+        public boolean isAllow() {
+            return this._allow;
+        }
+
+        public String getPrefix() {
+            return this._prefix;
+        }
+
         // Sort from longest to shortest rules.
         @Override
         public int compareTo(RobotRule o) {
@@ -103,8 +113,8 @@ public class SimpleRobotRules extends BaseRobotRules {
     }
 
-    private ArrayList<RobotRule> _rules;
-    private RobotRulesMode _mode;
+    protected ArrayList<RobotRule> _rules;
+    protected RobotRulesMode _mode;
 
     public SimpleRobotRules() {
         this(RobotRulesMode.ALLOW_SOME);
     }
@@ -131,6 +141,10 @@ public class SimpleRobotRules extends BaseRobotRules {
         _rules.add(new RobotRule(prefix, allow));
     }
 
+    public List<RobotRule> getRobotRules() {
+        return this._rules;
+    }
+
     public boolean isAllowed(String url) {
         if (_mode == RobotRulesMode.ALLOW_NONE) {
             return false;
diff --git a/src/main/java/crawlercommons/robots/SimpleRobotRulesParser.java b/src/main/java/crawlercommons/robots/SimpleRobotRulesParser.java
index 4354f41..5f0be53 100644
--- a/src/main/java/crawlercommons/robots/SimpleRobotRulesParser.java
+++ b/src/main/java/crawlercommons/robots/SimpleRobotRulesParser.java
@@ -351,7 +351,7 @@
     private int _numWarnings;
 
     @Override
-    public BaseRobotRules failedFetch(int httpStatusCode) {
+    public SimpleRobotRules failedFetch(int httpStatusCode) {
         SimpleRobotRules result;
 
         if ((httpStatusCode >= 200) && (httpStatusCode < 300)) {
@@ -386,7 +386,7 @@
      * byte[], java.lang.String, java.lang.String)
      */
     @Override
-    public BaseRobotRules parseContent(String url, byte[] content, String contentType, String robotNames) {
+    public SimpleRobotRules parseContent(String url, byte[] content, String contentType, String robotNames) {
         _numWarnings = 0;
 
         // If there's nothing there, treat it like we have no restrictions.
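
Usage sketch (illustrative, not part of the patch): the snippet below shows how
a client could use the API after this change; the robots.txt content, URL, and
robot name are made up for the example. Since parseContent() now returns the
concrete SimpleRobotRules type, no (SimpleRobotRules) cast is needed before
calling the new getRobotRules(), getPrefix(), and isAllow() accessors.

    import java.nio.charset.StandardCharsets;

    import crawlercommons.robots.SimpleRobotRules;
    import crawlercommons.robots.SimpleRobotRules.RobotRule;
    import crawlercommons.robots.SimpleRobotRulesParser;

    public class RobotRulesExample {
        public static void main(String[] args) {
            // Made-up robots.txt content for the example.
            byte[] content = ("User-agent: *\n"
                            + "Disallow: /private/\n"
                            + "Allow: /\n").getBytes(StandardCharsets.UTF_8);

            // parseContent() now returns SimpleRobotRules directly,
            // so no cast from BaseRobotRules is required.
            SimpleRobotRulesParser parser = new SimpleRobotRulesParser();
            SimpleRobotRules rules = parser.parseContent("http://example.com/robots.txt", content, "text/plain", "mybot");

            // The new getters expose the parsed rules and their properties.
            for (RobotRule rule : rules.getRobotRules()) {
                System.out.println(rule.getPrefix() + " -> allow=" + rule.isAllow());
            }
        }
    }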