1
0
Fork 0
mirror of https://github.com/crawler-commons/crawler-commons synced 2024-05-07 07:16:02 +02:00
crawler-commons/src/main/java/crawlercommons/robots/SimpleRobotRules.java
Sebastian Nagel 6c0d91e40b
[Robots.txt] Deduplicate robots rules before matching (#416)
* [Robots.txt] Deduplicate robots rules before matching
- update SimpleRobotRules documentation: add references
  to RFC 9309

* [Robots.txt] Deduplicate robots rules before matching

* SimpleRobotRules: add missing Override annotation
2023-06-09 09:10:06 +01:00

465 lines
16 KiB
Java

/**
* Copyright 2016 Crawler-Commons
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package crawlercommons.robots;
import java.io.Serializable;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import java.util.stream.Collectors;
import crawlercommons.filters.basic.BasicURLNormalizer;
/**
* Result from parsing a single robots.txt file - a set of rules, and
* optionally a <a href=
* "https://en.wikipedia.org/wiki/Robots.txt#Crawl-delay_directive">crawl-delay</a>
* and <a
* href="https://www.sitemaps.org/protocol.html#submit_robots">sitemap</a> URLs.
* The <a href="https://www.rfc-editor.org/rfc/rfc9309.html">Robots Exclusion
* Protocol RFC 9309</a> is fully supported. This includes <a href=
* "https://developers.google.com/search/reference/robots_txt">Google's
* robots.txt extensions</a> to the <a
* href="http://www.robotstxt.org/robotstxt.html">original RFC draft</a>: the
* <code>Allow</code> directive, the <code>$</code>/<code>*</code> special
* characters and the precedence of more specific patterns.
*
* See also: <a
* href="https://en.wikipedia.org/wiki/Robots_exclusion_standard">Robots
* Exclusion on Wikipedia</a>
*/
@SuppressWarnings("serial")
public class SimpleRobotRules extends BaseRobotRules {
/**
 * Overall access policy of a rule set: everything allowed, nothing allowed,
 * or access decided by matching the individual rules.
 */
public enum RobotRulesMode {
    /** Every URL is allowed, rules are not consulted. */
    ALLOW_ALL,
    /** No URL is allowed, rules are not consulted. */
    ALLOW_NONE,
    /** Access is decided by matching the URL against the rules. */
    ALLOW_SOME
}
/**
 * Single rule that maps from a path prefix (or path pattern) to an allow
 * flag.
 *
 * The natural ordering sorts rules from the longest to the shortest prefix,
 * allow rules before disallow rules of the same length, and finally by the
 * prefix itself. The ordering is consistent with {@link #equals(Object)}
 * for rules with a non-null prefix.
 */
public static class RobotRule implements Comparable<RobotRule>, Serializable {
    String _prefix;
    boolean _allow;

    /**
     * @param prefix
     *            path prefix or pattern of the rule
     * @param allow
     *            true for an allow rule, false for a disallow rule
     */
    public RobotRule(String prefix, boolean allow) {
        _prefix = prefix;
        _allow = allow;
    }

    /** @return true if this is an allow rule */
    public boolean isAllow() {
        return this._allow;
    }

    /** @return the path prefix or pattern of this rule */
    public String getPrefix() {
        return this._prefix;
    }

    /**
     * Orders rules from longest to shortest prefix, then allow before
     * disallow, then lexicographically by prefix.
     *
     * The final tie-breaker makes the result zero only for equal rules.
     * Without it, distinct rules of the same length and type compare as
     * equal, and de-duplicating a sorted stream (which only looks at
     * neighbouring elements) may miss duplicates that are not adjacent.
     */
    @Override
    public int compareTo(RobotRule o) {
        // order from longest to shortest path prefixes/patterns
        int byLength = Integer.compare(o._prefix.length(), _prefix.length());
        if (byLength != 0) {
            return byLength;
        }
        // Allow comes before disallow
        int byType = Boolean.compare(o._allow, _allow);
        if (byType != 0) {
            return byType;
        }
        return _prefix.compareTo(o._prefix);
    }

    /**
     * Hash code computed from the allow flag and the prefix.
     *
     * @see java.lang.Object#hashCode()
     */
    @Override
    public int hashCode() {
        final int prime = 31;
        int result = 1;
        result = prime * result + (_allow ? 1231 : 1237);
        result = prime * result + ((_prefix == null) ? 0 : _prefix.hashCode());
        return result;
    }

    /**
     * Two rules are equal if they are of the same class and have the same
     * allow flag and the same prefix.
     *
     * @see java.lang.Object#equals(java.lang.Object)
     */
    @Override
    public boolean equals(Object obj) {
        if (this == obj) {
            return true;
        }
        if (obj == null || getClass() != obj.getClass()) {
            return false;
        }
        RobotRule other = (RobotRule) obj;
        if (_allow != other._allow) {
            return false;
        }
        return (_prefix == null) ? other._prefix == null : _prefix.equals(other._prefix);
    }
}
/** Rules of this rule set, consulted when the mode is ALLOW_SOME. */
protected ArrayList<RobotRule> _rules;

/** Overall access policy of this rule set. */
protected RobotRulesMode _mode;

/**
 * Special characters which require percent-encoding for path matching,
 * indexed by ASCII code: asterisk and dollar.
 */
protected static final boolean[] specialCharactersPathMatching = new boolean[128];

static {
    specialCharactersPathMatching['$'] = true;
    specialCharactersPathMatching['*'] = true;
}
/**
 * Creates an empty rule set in mode {@link RobotRulesMode#ALLOW_SOME}.
 */
public SimpleRobotRules() {
    this(RobotRulesMode.ALLOW_SOME);
}
/**
 * Creates an empty rule set with the given mode.
 *
 * @param mode
 *            overall access policy of the rule set
 */
public SimpleRobotRules(RobotRulesMode mode) {
    // the superclass constructor is invoked implicitly
    _rules = new ArrayList<>();
    _mode = mode;
}
/**
 * Removes all rules from this rule set. The mode is left unchanged.
 */
public void clearRules() {
    this._rules.clear();
}
/**
 * Appends a rule to this rule set.
 *
 * @param prefix
 *            path prefix or pattern of the rule
 * @param allow
 *            true for an allow rule, false for a disallow rule
 */
public void addRule(String prefix, boolean allow) {
    // The old-style "disallow: <nothing>" means the same as the newer
    // "allow: <nothing>", so an empty disallow is stored as an allow rule.
    boolean effectiveAllow = allow || prefix.isEmpty();
    RobotRule rule = new RobotRule(prefix, effectiveAllow);
    _rules.add(rule);
}
/**
 * Returns the rules of this rule set. The returned list is the internal
 * list, not a copy.
 *
 * @return list of rules
 */
public List<RobotRule> getRobotRules() {
    return _rules;
}
/**
 * Checks whether the given URL is allowed by this rule set.
 *
 * In mode ALLOW_NONE or ALLOW_ALL the answer is fixed. Otherwise the path
 * (including the query) of the URL is matched against all rules: the rule
 * with the longest matching pattern decides, an allow rule wins over a
 * disallow rule of the same length, and a URL matched by no rule is
 * allowed. The path <code>/robots.txt</code> is always allowed.
 *
 * @param url
 *            URL to check
 * @return true if the URL is allowed
 */
@Override
public boolean isAllowed(String url) {
    if (_mode == RobotRulesMode.ALLOW_NONE) {
        return false;
    } else if (_mode == RobotRulesMode.ALLOW_ALL) {
        return true;
    }

    String pathWithQuery = getPath(url, true);

    // Always allow robots.txt
    if (pathWithQuery.equals("/robots.txt")) {
        return true;
    }

    boolean isAllowed = true;
    int longestRuleMatch = Integer.MIN_VALUE;
    for (RobotRule rule : _rules) {
        int matchLength = ruleMatches(pathWithQuery, rule._prefix);
        if (matchLength == -1) {
            // See precedence-of-rules test case for an example
            // Some webmasters expect behavior close to google's, and
            // this block is equivalent to:
            // https://github.com/google/robotstxt/blob/02bc6cdfa32db50d42563180c42aeb47042b4f0c/robots.cc#L605-L618
            // There are example robots.txt in the wild that benefit
            // from this.
            // As of 2/7/2022, https://venmo.com/robots.txt for
            // instance.
            String prefix = rule._prefix;
            if (!prefix.endsWith("index.htm") && !prefix.endsWith("index.html")) {
                continue;
            }
            // Cut off the trailing "index.htm(l)": the last occurrence is
            // the one at the end of the prefix, an earlier occurrence
            // inside the path must not be used as the cut position.
            String directory = prefix.substring(0, prefix.lastIndexOf("index.htm"));
            matchLength = ruleMatches(pathWithQuery, directory + "$");
            if (matchLength == -1) {
                continue;
            }
        }

        if (longestRuleMatch < matchLength) {
            longestRuleMatch = matchLength;
            isAllowed = rule.isAllow();
        } else if (longestRuleMatch == matchLength) {
            // Same specificity: allow wins over disallow
            isAllowed |= rule.isAllow();
        }
        // else we've already got a more specific rule, and this match
        // doesn't matter
    }

    return isAllowed;
}
/**
 * Normalizes the percent-encoding of a URL path and query: the input is
 * first unescaped and then escaped again, both steps are delegated to
 * {@link BasicURLNormalizer}.
 *
 * @param urlPathQuery
 *            path and query component of the URL
 * @param additionalEncodedBytes
 *            boolean array to request bytes (ASCII characters) to be
 *            percent-encoded in addition to other characters requiring
 *            encoding (Unicode/non-ASCII and characters not allowed in
 *            URLs).
 * @return properly percent-encoded URL path and query
 */
public static String escapePath(String urlPathQuery, boolean[] additionalEncodedBytes) {
    String unescaped = BasicURLNormalizer.unescapePath(urlPathQuery);
    return BasicURLNormalizer.escapePath(unescaped, additionalEncodedBytes);
}
/**
 * Extracts the path, optionally followed by the query, from a URL and
 * normalizes its percent-encoding for rule matching.
 *
 * @param url
 *            URL to extract the path from
 * @param getWithQuery
 *            if true, the query (if any) is appended to the path
 * @return the normalized path, or <code>/</code> if the URL has no path
 *         or cannot be processed
 */
private String getPath(String url, boolean getWithQuery) {
    try {
        URL parsedUrl = new URL(url);

        String rawPath = parsedUrl.getPath();
        boolean noPath = (rawPath == null) || rawPath.isEmpty();
        StringBuilder pathAndQuery = new StringBuilder(noPath ? "/" : rawPath);

        String rawQuery = parsedUrl.getQuery();
        if (getWithQuery && rawQuery != null) {
            pathAndQuery.append('?').append(rawQuery);
        }

        /*
         * The path is not lower-cased: Google and RFC 9309 require
         * case-sensitive matching.
         *
         * Percent-encoding is normalized by escapePath. In addition,
         * asterisk `*` and dollar `$` are requested to be
         * percent-encoded, because the path matching implementation
         * gives both characters a special meaning in patterns.
         */
        return escapePath(pathAndQuery.toString(), specialCharactersPathMatching);
    } catch (Exception e) {
        // An invalid URL is not a concern here because fetching it will
        // fail anyway, so the root path is returned.
        return "/";
    }
}
/**
 * Matches a URL path against a robots.txt path pattern.
 *
 * The pattern is matched from the start of the text. An asterisk
 * <code>*</code> in the pattern matches any sequence of characters
 * (including none). A dollar <code>$</code> as last character of the
 * pattern requires that the match ends at the end of the text, otherwise
 * it is sufficient that the pattern matches a prefix of the text.
 *
 * @param text
 *            URL path (and query) to match
 * @param pattern
 *            path prefix or pattern of a rule
 * @return the length of the pattern if it matches, -1 otherwise
 */
private int ruleMatches(String text, String pattern) {
    int patternEnd = pattern.length();
    int textEnd = text.length();

    boolean containsEndChar = pattern.endsWith("$");
    if (containsEndChar) {
        patternEnd -= 1;
    }

    int wildcardPos = pattern.indexOf('*');
    if (wildcardPos == -1) {
        // No wildcard: plain prefix match, or exact match if anchored
        boolean matched = text.regionMatches(0, pattern, 0, patternEnd) && (!containsEndChar || textEnd == patternEnd);
        return matched ? pattern.length() : -1;
    }

    // The literal in front of the first wildcard must be a prefix of the
    // text.
    if (!text.regionMatches(0, pattern, 0, wildcardPos)) {
        return -1;
    }

    // Pieces enclosed by two wildcards: the leftmost occurrence in the
    // remaining text leaves the most text for the following pieces.
    // Two '*' in a row enclose an empty piece which matches in place.
    int textPos = wildcardPos;
    int patternPos = wildcardPos + 1;
    int nextWildcardPos;
    while ((nextWildcardPos = pattern.indexOf('*', patternPos)) != -1) {
        String piece = pattern.substring(patternPos, nextWildcardPos);
        int found = text.indexOf(piece, textPos);
        if (found == -1) {
            return -1;
        }
        textPos = found + piece.length();
        patternPos = nextWildcardPos + 1;
    }

    // Piece after the last wildcard
    int lastPieceLen = patternEnd - patternPos;
    if (lastPieceLen == 0) {
        // Pattern ends with '*' (optionally followed by '$'), the
        // wildcard matches the remaining text.
        return pattern.length();
    }

    boolean matched;
    if (containsEndChar) {
        // The last piece must end exactly at the end of the text. Using
        // its leftmost occurrence would wrongly reject texts which
        // contain the piece more than once, e.g. "/*.php$" must match
        // "/a.php.php".
        int start = textEnd - lastPieceLen;
        matched = (start >= textPos) && text.regionMatches(start, pattern, patternPos, lastPieceLen);
    } else {
        matched = text.indexOf(pattern.substring(patternPos, patternEnd), textPos) != -1;
    }
    return matched ? pattern.length() : -1;
}
/**
 * Sort and deduplicate robot rules. This method must be called after the
 * robots.txt has been processed and before rule matching.
 *
 * The ordering is implemented in {@link RobotRule#compareTo(RobotRule)} and
 * defined by <a
 * href="https://www.rfc-editor.org/rfc/rfc9309.html#section-2.2.2">RFC
 * 9309, section 2.2.2</a>:
 *
 * <blockquote>The most specific match found MUST be used. The most specific
 * match is the match that has the most octets. Duplicate rules in a group
 * MAY be deduplicated.</blockquote>
 *
 * Duplicates are removed before sorting, based on
 * {@link RobotRule#equals(Object)}. Removing duplicates from the sorted
 * stream would only compare neighbouring rules and therefore depend on the
 * sort order placing equal rules next to each other.
 */
public void sortRules() {
    if (_rules.size() > 1) {
        _rules = _rules.stream().distinct().sorted().collect(Collectors.toCollection(ArrayList::new));
    }
}
/**
 * Tells whether this rule set is in mode {@link RobotRulesMode#ALLOW_ALL}.
 *
 * @return true if all URLs are allowed.
 */
@Override
public boolean isAllowAll() {
    return RobotRulesMode.ALLOW_ALL == _mode;
}
/**
 * Tells whether this rule set is in mode {@link RobotRulesMode#ALLOW_NONE}.
 *
 * @return true if no URLs are allowed.
 */
@Override
public boolean isAllowNone() {
    return RobotRulesMode.ALLOW_NONE == _mode;
}
/**
 * Hash code combining the hash code of the superclass with those of the
 * mode and the list of rules.
 *
 * @see java.lang.Object#hashCode()
 */
@Override
public int hashCode() {
    int modeHash = (_mode != null) ? _mode.hashCode() : 0;
    int rulesHash = (_rules != null) ? _rules.hashCode() : 0;
    return 31 * (31 * super.hashCode() + modeHash) + rulesHash;
}
/**
 * Two rule sets are equal if the superclass considers them equal, they are
 * of the same class, and they have the same mode and equal lists of rules.
 *
 * @see java.lang.Object#equals(java.lang.Object)
 */
@Override
public boolean equals(Object obj) {
    if (this == obj) {
        return true;
    }
    if (!super.equals(obj) || getClass() != obj.getClass()) {
        return false;
    }
    SimpleRobotRules that = (SimpleRobotRules) obj;
    if (_mode != that._mode) {
        return false;
    }
    return (_rules == null) ? that._rules == null : _rules.equals(that._rules);
}
/**
 * Textual summary: the string representation of the superclass followed by
 * the number of rules and at most the first ten rules, one per line. If
 * there are no rules, the mode is noted for "allow none" and "allow all".
 *
 * @see java.lang.Object#toString()
 */
@Override
public String toString() {
    StringBuilder out = new StringBuilder();
    out.append(super.toString());

    int ruleCount = _rules.size();
    if (ruleCount != 0) {
        out.append(" - number of rules: ").append(ruleCount).append('\n');
        // list no more than the first ten rules
        for (RobotRule rule : _rules.subList(0, Math.min(ruleCount, 10))) {
            out.append(rule._allow ? " A" : " Disa").append("llow: ").append(rule._prefix).append('\n');
        }
        return out.toString();
    }

    out.append(" - no rules");
    if (isAllowNone()) {
        out.append(" (allow none)");
    } else if (isAllowAll()) {
        out.append(" (allow all)");
    }
    out.append('\n');
    return out.toString();
}
}