/**
* Copyright 2016 Crawler-Commons
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package crawlercommons.robots;
import java.io.IOException;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.net.URLDecoder;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashMap;
import java.util.LinkedHashSet;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.StringTokenizer;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.io.IOUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import crawlercommons.robots.SimpleRobotRules.RobotRulesMode;
/**
* Robots.txt parser following RFC 9309, supporting the Sitemap and Crawl-delay
* extensions.
*
* <p>
* This implementation of {@link BaseRobotsParser} retrieves a set of
* {@link SimpleRobotRules rules} for an agent with the given name from the
* <code>robots.txt</code> file of a given domain. The implementation follows
* <a href="https://www.rfc-editor.org/rfc/rfc9309.html#name-access-method">RFC
* 9309</a>. The following robots.txt extensions are supported:</p>
* <ul>
* <li>the <a href=
* "https://en.wikipedia.org/wiki/Robots.txt#Crawl-delay_directive">Crawl-delay</a>
* directive</li>
* <li>the
* <a href="https://www.sitemaps.org/protocol.html#submit_robots">Sitemap
* directive</a></li>
* </ul>
*
* <p>
* The class fulfills two tasks. The first one is the parsing of the
* <code>robots.txt</code> file, done in
* {@link #parseContent(String, byte[], String, Collection)}. During the parsing
* process the parser searches for the provided agent name(s). If the parser
* finds a matching name, the set of rules for this name is parsed and added to
* the list of rules returned later. If another group of rules is matched in the
* robots.txt file, also the rules of this group are added to the returned list
* of rules. See
* <a href="https://www.rfc-editor.org/rfc/rfc9309.html#section-2.2.1">RFC 9309,
* section 2.2.1</a> about the merging of groups of rules.
* </p>
*
* <p>
* By default and following
* <a href="https://www.rfc-editor.org/rfc/rfc9309.html#section-2.2.1">RFC 9309,
* section 2.2.1</a>, the user-agent name (&quot;product token&quot;) is matched
* literally but case-insensitive over the full name. See
* {@link #setExactUserAgentMatching(boolean)} for details of agent name
* matching and a legacy substring prefix matching mode.
* </p>
*
* <p>
 * {@link SimpleRobotRulesParser} allows multiple agent names to be passed as a
* collection. All rule groups matching any of the agent names are followed and
* merged into one set of rules. The order of the agent names in the collection
* does not affect the selection of rules.
* </p>
*
* <p>
* The parser always parses the entire file to select all matching rule groups,
* and also to collect all of the sitemap directives.
* </p>
*
* <p>
* If no rule set matches any of the provided user-agent names, or if an empty
* collection of agent names is passed, the rule set for the <code>'*'</code>
* agent is returned. If there is no such rule set inside the
 * <code>robots.txt</code> file, a rule set allowing all resources to be crawled
* is returned.
* </p>
*
* <p>
* The crawl-delay is parsed and added to the rules. Note that if the crawl
* delay inside the file exceeds a maximum value, the crawling of all resources
* is prohibited. The default maximum value is defined with
* {@link #DEFAULT_MAX_CRAWL_DELAY}={@value #DEFAULT_MAX_CRAWL_DELAY}
* milliseconds. The default value can be changed using the constructor
 * ({@link #SimpleRobotRulesParser(long, int)}) or via
* {@link #setMaxCrawlDelay(long)}.
* </p>
*
* <p>
* The second task of this class is to generate a set of rules if the fetching
* of the <code>robots.txt</code> file fails. The {@link #failedFetch(int)}
* method returns a predefined set of rules based on the given error code. If
 * the status code indicates a client error (4xx), we can assume that the
 * <code>robots.txt</code> file is not there and crawling of all resources is
 * allowed. For any other error code (3xx or 5xx), the parser assumes a
 * temporary error and returns a set of rules prohibiting any crawling.
* </p>
*
* <p>
* Note that fetching of the robots.txt file is outside the scope of this class.
* It must be implemented in the calling code, including the following of
* &quot;at least five consecutive redirects&quot; as required by
* <a href="https://www.rfc-editor.org/rfc/rfc9309.html#name-redirects">RFC
* 9309, section 2.3.1.2</a>.
* </p>
*/
@SuppressWarnings("serial")
public class SimpleRobotRulesParser extends BaseRobotsParser {
private static final Logger LOGGER = LoggerFactory.getLogger(SimpleRobotRulesParser.class);
private enum RobotDirective {
USER_AGENT, DISALLOW,
ALLOW, CRAWL_DELAY, SITEMAP,
/**
* The &quot;Host&quot; directive was used by Yandex to indicate the
* main or canonical host of a set of mirrored web sites, see <a href=
* "https://web.archive.org/web/20130509230548/http://help.yandex.com/webmaster/?id=1113851#1113856">Yandex'
* Using robots.txt (archived version)</a>
*/
HOST,
// Google extension
NO_INDEX,
/**
* <a href=
* "https://en.wikipedia.org/wiki/Automated_Content_Access_Protocol">Automated
* Content Access Protocol</a> directives all start with
* <code>ACAP-</code>.
*/
ACAP_(true, false),
// Extensions to the standard
REQUEST_RATE, VISIT_TIME, ROBOT_VERSION, COMMENT,
// Line starts with http:, which we treat as a sitemap directive.
HTTP,
// Line has no known directive on it.
UNKNOWN(false, true),
// Line has no directive on it
MISSING(false, true);
private boolean _prefix;
private boolean _special;
private RobotDirective() {
_prefix = false;
_special = false;
}
private RobotDirective(boolean isPrefix, boolean isSpecial) {
_prefix = isPrefix;
_special = isSpecial;
}
public boolean isSpecial() {
return _special;
}
public boolean isPrefix() {
return _prefix;
}
}
private static class ParseState {
/**
* Flag indicating whether the given name of the agent matched.
*/
private boolean _matchedRealName;
/**
 * Flag that is true if the wildcard agent (<code>*</code>) matched rather
 * than the given agent name.
*/
private boolean _matchedWildcard;
/**
* Flag that is true as long as it is allowed to add rules for the given
* agent.
*/
private boolean _addingRules;
/**
 * Flag that is true as long as it is allowed to add a Crawl-delay for the given
* agent.
*/
private boolean _addingCrawlDelay;
/**
 * Flag indicating whether all consecutive agent fields have been seen. It
 * is set to false when an agent field is found and back to true when a
 * rule (allow/disallow) line is found.
*/
private boolean _finishedAgentFields;
/**
* Flag indicating whether the Crawl-delay was set for the given agent
* name (false means either not set at all or set for the wildcard
* agent).
*/
private boolean _crawlDelaySetRealName;
/*
* Counter of warnings reporting invalid rules/lines in the robots.txt
* file. The counter is used to limit the number of logged warnings.
*/
private int _numWarnings;
private String _url;
private Collection<String> _targetNames;
private SimpleRobotRules _curRules;
public ParseState(String url, Collection<String> targetNames) {
_url = url;
_targetNames = targetNames;
_curRules = new SimpleRobotRules();
}
public Collection<String> getTargetNames() {
return _targetNames;
}
public boolean isMatchedRealName() {
return _matchedRealName;
}
public void setMatchedRealName(boolean matchedRealName) {
_matchedRealName = matchedRealName;
}
public boolean isMatchedWildcard() {
return _matchedWildcard;
}
public void setMatchedWildcard(boolean matchedWildcard) {
_matchedWildcard = matchedWildcard;
}
public boolean isAddingRules() {
return _addingRules;
}
public void setAddingRules(boolean addingRules) {
_addingRules = addingRules;
}
public boolean isAddingCrawlDelay() {
return _addingCrawlDelay;
}
public void setAddingCrawlDelay(boolean addingCrawlDelay) {
_addingCrawlDelay = addingCrawlDelay;
}
public boolean isFinishedAgentFields() {
return _finishedAgentFields;
}
public void setFinishedAgentFields(boolean finishedAgentFields) {
_finishedAgentFields = finishedAgentFields;
}
public void clearRules() {
_curRules.clearRules();
}
public void addRule(String prefix, boolean allow) {
_curRules.addRule(prefix, allow);
}
/**
* Set the Crawl-delay given the current parse state.
*
* <p>
* The <a href=
* "https://en.wikipedia.org/wiki/Robots.txt#Crawl-delay_directive">Crawl
 * -delay</a> is not part of RFC 9309, which treats it (like the Sitemap
 * directive) as <a href=
* "https://www.rfc-editor.org/rfc/rfc9309.html#name-other-records">other
* records</a> and specifies: <cite>Parsing of other records MUST NOT
* interfere with the parsing of explicitly defined records</cite>.
* </p>
*
 * This method sets the Crawl-delay in the robots rules
* ({@link BaseRobotRules#setCrawlDelay(long)})
* <ul>
* <li>always if the given agent name was matched in the current
* group</li>
* <li>for wildcard agent groups, only if the crawl-delay wasn't already
* defined for the given agent</li>
* </ul>
*
* <p>
 * If the Crawl-delay is defined multiple times, the conflict is resolved by
 * keeping the shortest value. However, a Crawl-delay defined for a specific
 * agent is never overridden by a value defined for the wildcard agent.
* </p>
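 *
 * <p>
 * Example (a hypothetical robots.txt, with &quot;mycrawler&quot; passed as
 * the agent name):
 * </p>
 *
 * <pre>
 * User-agent: *
 * Disallow: /private/
 * Crawl-delay: 10
 *
 * User-agent: mycrawler
 * Disallow: /private/
 * Crawl-delay: 2
 * </pre>
 *
 * <p>
 * The resulting Crawl-delay is 2000 milliseconds: the value defined for the
 * matched agent name overrides the value defined for the wildcard agent.
 * </p>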
*/
public void setCrawlDelay(long delay) {
if (_matchedRealName) {
// set crawl-delay for the given agent name
if (_crawlDelaySetRealName && _addingCrawlDelay) {
/*
* Crawl-delay defined twice for the same agent or defined
* for more than one of multiple agent names.
*
* Resolve conflict and select shortest Crawl-delay.
*/
if (_curRules.getCrawlDelay() > delay) {
_curRules.setCrawlDelay(delay);
}
} else {
_curRules.setCrawlDelay(delay);
}
_crawlDelaySetRealName = true;
} else if (_curRules.getCrawlDelay() == BaseRobotRules.UNSET_CRAWL_DELAY) {
// not set yet
_curRules.setCrawlDelay(delay);
} else if (_curRules.getCrawlDelay() > delay) {
// Crawl-delay defined twice for the wildcard agent.
// Resolve conflict and select shortest Crawl-delay.
_curRules.setCrawlDelay(delay);
}
}
public void clearCrawlDelay() {
_curRules.setCrawlDelay(BaseRobotRules.UNSET_CRAWL_DELAY);
}
public SimpleRobotRules getRobotRules() {
return _curRules;
}
public String getUrl() {
return _url;
}
public void addSitemap(String sitemap) {
_curRules.addSitemap(sitemap);
}
}
private static class RobotToken {
private RobotDirective _directive;
private String _data;
public RobotToken(RobotDirective directive, String data) {
_directive = directive;
_data = data;
}
public RobotDirective getDirective() {
return _directive;
}
public String getData() {
return _data;
}
}
private static Map<String, RobotDirective> DIRECTIVE_PREFIX = new HashMap<String, RobotDirective>();
static {
for (RobotDirective directive : RobotDirective.values()) {
if (!directive.isSpecial()) {
String prefix = directive.name().toLowerCase(Locale.ROOT).replaceAll("_", "-");
DIRECTIVE_PREFIX.put(prefix, directive);
}
}
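// Also map variants and common misspellings observed in real-world
// robots.txt files to the directive they most likely stand for.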
DIRECTIVE_PREFIX.put("useragent", RobotDirective.USER_AGENT);
DIRECTIVE_PREFIX.put("useg-agent", RobotDirective.USER_AGENT);
DIRECTIVE_PREFIX.put("ser-agent", RobotDirective.USER_AGENT);
DIRECTIVE_PREFIX.put("users-agent", RobotDirective.USER_AGENT);
DIRECTIVE_PREFIX.put("user agent", RobotDirective.USER_AGENT);
DIRECTIVE_PREFIX.put("user-agnet", RobotDirective.USER_AGENT);
DIRECTIVE_PREFIX.put("user-agents", RobotDirective.USER_AGENT);
DIRECTIVE_PREFIX.put("desallow", RobotDirective.DISALLOW);
DIRECTIVE_PREFIX.put("dissallow", RobotDirective.DISALLOW);
DIRECTIVE_PREFIX.put("dissalow", RobotDirective.DISALLOW);
DIRECTIVE_PREFIX.put("disalow", RobotDirective.DISALLOW);
DIRECTIVE_PREFIX.put("dssalow", RobotDirective.DISALLOW);
DIRECTIVE_PREFIX.put("dsallow", RobotDirective.DISALLOW);
DIRECTIVE_PREFIX.put("diasllow", RobotDirective.DISALLOW);
DIRECTIVE_PREFIX.put("disallaw", RobotDirective.DISALLOW);
DIRECTIVE_PREFIX.put("diallow", RobotDirective.DISALLOW);
DIRECTIVE_PREFIX.put("disallows", RobotDirective.DISALLOW);
DIRECTIVE_PREFIX.put("disllow", RobotDirective.DISALLOW);
DIRECTIVE_PREFIX.put("crawl delay", RobotDirective.CRAWL_DELAY);
DIRECTIVE_PREFIX.put("clawl-delay", RobotDirective.CRAWL_DELAY);
DIRECTIVE_PREFIX.put("craw-delay", RobotDirective.CRAWL_DELAY);
DIRECTIVE_PREFIX.put("crawl-deley", RobotDirective.CRAWL_DELAY);
DIRECTIVE_PREFIX.put("sitemaps", RobotDirective.SITEMAP);
DIRECTIVE_PREFIX.put("https", RobotDirective.HTTP);
}
// separator is either one or more spaces/tabs, or a colon
private static final Pattern COLON_DIRECTIVE_DELIMITER = Pattern.compile("[ \t]*:[ \t]*(.*)");
private static final Pattern BLANK_DIRECTIVE_DELIMITER = Pattern.compile("[ \t]+(.*)");
// match the rest of the directive, up until whitespace or colon
private static final Pattern DIRECTIVE_SUFFIX_PATTERN = Pattern.compile("[^: \t]+(.*)");
// split pattern for robot names
private static final Pattern ROBOT_NAMES_SPLIT = Pattern.compile("\\s*,\\s*|\\s+");
/**
 * Figure out the directive on a line of text from a robots.txt file. The
 * line is lower-cased internally before the directive name is matched.
*
* @param line
* @return robot command found on line
*/
private static RobotToken tokenize(String line) {
String lowerLine = line.toLowerCase(Locale.ROOT);
for (String prefix : DIRECTIVE_PREFIX.keySet()) {
int prefixLength = prefix.length();
if (lowerLine.startsWith(prefix)) {
RobotDirective directive = DIRECTIVE_PREFIX.get(prefix);
String dataPortion = line.substring(prefixLength);
if (directive.isPrefix()) {
Matcher m = DIRECTIVE_SUFFIX_PATTERN.matcher(dataPortion);
if (m.matches()) {
dataPortion = m.group(1);
} else {
continue;
}
}
Matcher m = COLON_DIRECTIVE_DELIMITER.matcher(dataPortion);
if (!m.matches()) {
m = BLANK_DIRECTIVE_DELIMITER.matcher(dataPortion);
}
if (m.matches()) {
return new RobotToken(directive, m.group(1).trim());
}
}
}
Matcher m = COLON_DIRECTIVE_DELIMITER.matcher(lowerLine);
if (m.matches()) {
return new RobotToken(RobotDirective.UNKNOWN, line);
} else {
return new RobotToken(RobotDirective.MISSING, line);
}
}
private static final Pattern SIMPLE_HTML_PATTERN = Pattern.compile("(?is)<(html|head|body)\\s*>");
private static final Pattern USER_AGENT_PATTERN = Pattern.compile("(?i)user-agent:");
/**
 * Pattern to match valid user-agent product tokens as defined in
* <a href="https://www.rfc-editor.org/rfc/rfc9309.html#section-2.2.1">RFC
* 9309, section 2.2.1</a>
*/
protected static final Pattern USER_AGENT_PRODUCT_TOKEN_MATCHER = Pattern.compile("[a-zA-Z_-]+");
/**
* Default max number of warnings logged during parse of any one robots.txt
* file, see {@link #setMaxWarnings(int)}
*/
public static final int DEFAULT_MAX_WARNINGS = 5;
/**
* Default max Crawl-Delay in milliseconds, see
* {@link #setMaxCrawlDelay(long)}
*/
public static final long DEFAULT_MAX_CRAWL_DELAY = 300000;
// number of warnings found in the latest processed robots.txt file
private ThreadLocal<Integer> _numWarningsDuringLastParse = new ThreadLocal<>();
private int _maxWarnings;
private long _maxCrawlDelay;
private boolean _exactUserAgentMatching;
public SimpleRobotRulesParser() {
this(DEFAULT_MAX_CRAWL_DELAY, DEFAULT_MAX_WARNINGS);
}
/**
* @param maxCrawlDelay
* see {@link #setMaxCrawlDelay(long)}
* @param maxWarnings
* see {@link #setMaxWarnings(int)}
*/
public SimpleRobotRulesParser(long maxCrawlDelay, int maxWarnings) {
this._maxCrawlDelay = maxCrawlDelay;
this._maxWarnings = maxWarnings;
this._exactUserAgentMatching = true;
}
/**
* Validate a user-agent product token as defined in
* <a href="https://www.rfc-editor.org/rfc/rfc9309.html#section-2.2.1">RFC
* 9309, section 2.2.1</a>
*
* @param userAgent
* user-agent token to verify
* @return true if the product token is valid
*/
protected static boolean isValidUserAgentToObey(String userAgent) {
return userAgent != null && USER_AGENT_PRODUCT_TOKEN_MATCHER.matcher(userAgent).matches();
}
/**
* {@inheritDoc}
*
* <p>
* Set rules for response status codes following <a href=
* "https://www.rfc-editor.org/rfc/rfc9309.html#name-access-results">RFC
 * 9309, section 2.3.1</a>:</p>
* <ul>
* <li>Success (HTTP 200-299): throws {@link IllegalStateException} because
* the response content needs to be parsed</li>
* <li>"Unavailable" Status (HTTP 400-499): allow all</li>
* <li>"Unreachable" Status (HTTP 500-599): disallow all</li>
 * <li>every other HTTP status code (e.g. redirects) is treated as a
 * temporary failure: crawling is disallowed and further visits on the
 * server are deferred (see
 * {@link SimpleRobotRules#setDeferVisits(boolean)})</li>
* </ul>
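 *
 * <p>
 * A short usage sketch (the status codes are examples):
 * </p>
 *
 * <pre>{@code
 * SimpleRobotRulesParser parser = new SimpleRobotRulesParser();
 * SimpleRobotRules notFound = parser.failedFetch(404); // no robots.txt, allow all
 * SimpleRobotRules serverError = parser.failedFetch(503); // temporary failure, disallow all and defer visits
 * }</pre>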
*/
@Override
public SimpleRobotRules failedFetch(int httpStatusCode) {
SimpleRobotRules result;
if ((httpStatusCode >= 200) && (httpStatusCode < 300)) {
throw new IllegalStateException("Can't use status code constructor with 2xx response");
} else if ((httpStatusCode >= 300) && (httpStatusCode < 400)) {
// Should only happen if we're getting endless redirects (more than
// our follow limit), so treat it as a temporary failure.
result = new SimpleRobotRules(RobotRulesMode.ALLOW_NONE);
result.setDeferVisits(true);
} else if ((httpStatusCode >= 400) && (httpStatusCode < 500)) {
// Some sites return 410 (gone) instead of 404 (not found), so treat
// as the same. Actually treat all (including forbidden) as "no
// robots.txt", as that's what Google and other search engines do.
result = new SimpleRobotRules(RobotRulesMode.ALLOW_ALL);
} else {
// Treat all other status codes as a temporary failure.
result = new SimpleRobotRules(RobotRulesMode.ALLOW_NONE);
result.setDeferVisits(true);
}
return result;
}
/*
* (non-Javadoc)
*
* @see
* crawlercommons.robots.BaseRobotsParser#parseContent(java.lang.String,
* byte[], java.lang.String, java.lang.String)
*/
@Deprecated
@Override
public SimpleRobotRules parseContent(String url, byte[] content, String contentType, String robotNames) {
return parseContent(url, content, contentType, new LinkedHashSet<>(Arrays.asList(splitRobotNames(robotNames))), false);
}
/**
* Split a string listing user-agent / robot names into tokens.
*
 * Splitting is done at commas and/or whitespace; the tokens are converted to
* lower-case.
*
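 * For example, {@code "MyBot/1.0, OtherBot"} is split into the two tokens
 * {@code "mybot/1.0"} and {@code "otherbot"}.
 *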
* @param robotNames
* robot / user-agent string
* @return array of user-agent / robot tokens
*/
protected String[] splitRobotNames(String robotNames) {
// lower case and trim
robotNames = robotNames.toLowerCase(Locale.ROOT).trim();
// split at commas and whitespace
return ROBOT_NAMES_SPLIT.split(robotNames);
}
/**
* Parse the robots.txt file in <i>content</i>, and return rules appropriate
* for processing paths by <i>userAgent</i>.
*
* <p>
* Multiple agent names can be passed as a collection. See
* {@link #setExactUserAgentMatching(boolean)} for details how agent names
* are matched. If multiple agent names are passed, all matching rule groups
* are followed and merged into one set of rules. The order of the agent
* names in the collection does not affect the selection of rules.
* </p>
*
* <p>
* If none of the provided agent names is matched, rules addressed to the
* wildcard user-agent (<code>*</code>) are selected.
* </p>
*
* @param url
* {@inheritDoc}
* @param content
* {@inheritDoc}
* @param contentType
* {@inheritDoc}
* @param robotNames
* crawler (user-agent) name(s), used to select rules from the
* robots.txt file by matching the names against the user-agent
* lines in the robots.txt file. Robot names should be single
* token names, without version or other parts. Names must be
* lower-case, as the user-agent line is also converted to
* lower-case for matching. If the collection is empty, the rules
* for the wildcard user-agent (<code>*</code>) are selected. The
* wildcard user-agent name should not be contained in
* <i>robotNames</i>.
* @return robot rules.
*/
@Override
public SimpleRobotRules parseContent(String url, byte[] content, String contentType, Collection<String> robotNames) {
return parseContent(url, content, contentType, robotNames, isExactUserAgentMatching());
}
private SimpleRobotRules parseContent(String url, byte[] content, String contentType, Collection<String> robotNames, boolean exactUserAgentMatching) {
// If there's nothing there, treat it like we have no restrictions.
if ((content == null) || (content.length == 0)) {
return new SimpleRobotRules(RobotRulesMode.ALLOW_ALL);
}
int bytesLen = content.length;
int offset = 0;
/*
 * RFC 9309 requires that the robots.txt file is "UTF-8 encoded" (<a href=
* "https://www.rfc-editor.org/rfc/rfc9309.html#name-access-method"> RFC
* 9309, section 2.3 Access Method</a>), but
* "Implementors MAY bridge encoding mismatches if they detect that the robots.txt file is not UTF-8 encoded."
* (<a href=
* "https://www.rfc-editor.org/rfc/rfc9309.html#name-the-allow-and-disallow-line"
* > RFC 9309, section 2.2.2. The "Allow" and "Disallow" Lines</a>)
*/
Charset encoding = StandardCharsets.UTF_8;
// Check for a UTF-8 BOM at the beginning (EF BB BF)
if ((bytesLen >= 3) && (content[0] == (byte) 0xEF) && (content[1] == (byte) 0xBB) && (content[2] == (byte) 0xBF)) {
offset = 3;
bytesLen -= 3;
encoding = StandardCharsets.UTF_8;
}
// Check for UTF-16LE BOM at the beginning (FF FE)
else if ((bytesLen >= 2) && (content[0] == (byte) 0xFF) && (content[1] == (byte) 0xFE)) {
offset = 2;
bytesLen -= 2;
encoding = StandardCharsets.UTF_16LE;
}
// Check for UTF-16BE BOM at the beginning (FE FF)
else if ((bytesLen >= 2) && (content[0] == (byte) 0xFE) && (content[1] == (byte) 0xFF)) {
offset = 2;
bytesLen -= 2;
encoding = StandardCharsets.UTF_16BE;
}
String contentAsStr;
contentAsStr = new String(content, offset, bytesLen, encoding);
// Decide if we need to do special HTML processing.
boolean isHtmlType = ((contentType != null) && contentType.toLowerCase(Locale.ROOT).startsWith("text/html"));
/*
* If it looks like it contains HTML, but doesn't have a user agent
 * field, then assume somebody messed up and returned us a
* random HTML page instead of a robots.txt file.
*/
boolean hasHTML = false;
if (isHtmlType || SIMPLE_HTML_PATTERN.matcher(contentAsStr).find()) {
if (!USER_AGENT_PATTERN.matcher(contentAsStr).find()) {
LOGGER.trace("Found non-robots.txt HTML file: " + url);
return new SimpleRobotRules(RobotRulesMode.ALLOW_ALL);
} else {
// We'll try to strip out HTML tags below.
if (isHtmlType) {
LOGGER.debug("HTML content type returned for robots.txt file: " + url);
} else {
LOGGER.debug("Found HTML in robots.txt file: " + url);
}
hasHTML = true;
}
}
// Break on anything that might be used as a line ending. Since
// tokenizer doesn't return empty tokens, a \r\n sequence still
// works since it looks like an empty string between the \r and \n.
StringTokenizer lineParser = new StringTokenizer(contentAsStr, "\n\r\u0085\u2028\u2029");
ParseState parseState = new ParseState(url, robotNames);
while (lineParser.hasMoreTokens()) {
String line = lineParser.nextToken();
/*
* Get rid of HTML markup, in case some brain-dead webmaster has
* created an HTML page for robots.txt. We could do more
* sophisticated processing here to better handle bad HTML, but
* that's a very tiny percentage of all robots.txt files.
*/
if (hasHTML) {
line = line.replaceAll("<[^>]+>", "");
}
// trim out comments and whitespace
int hashPos = line.indexOf("#");
if (hashPos >= 0) {
line = line.substring(0, hashPos);
}
line = line.trim();
if (line.length() == 0) {
continue;
}
RobotToken token = tokenize(line);
switch (token.getDirective()) {
case USER_AGENT:
handleUserAgent(parseState, token);
break;
case DISALLOW:
parseState.setFinishedAgentFields(true);
handleDisallow(parseState, token);
break;
case ALLOW:
parseState.setFinishedAgentFields(true);
handleAllow(parseState, token);
break;
case CRAWL_DELAY:
handleCrawlDelay(parseState, token);
parseState.setAddingCrawlDelay(false);
break;
case SITEMAP:
handleSitemap(parseState, token);
break;
case HTTP:
handleHttp(parseState, token);
break;
case UNKNOWN:
reportWarning(parseState, "Unknown directive in robots.txt file: {}", line);
break;
case MISSING:
reportWarning(parseState, "Unknown line in robots.txt file (size {}): {}", content.length, line);
break;
default:
// All others we just ignore
break;
}
}
this._numWarningsDuringLastParse.set(parseState._numWarnings);
SimpleRobotRules result = parseState.getRobotRules();
if (result.getCrawlDelay() > _maxCrawlDelay) {
// Some evil sites use a value like 3600 (seconds) for the crawl
// delay, which would cause lots of problems for us.
LOGGER.debug("Crawl delay exceeds max value - so disallowing all URLs: {}", url);
return new SimpleRobotRules(RobotRulesMode.ALLOW_NONE);
} else {
result.sortRules();
return result;
}
}
private void reportWarning(ParseState state, String msg, Object... args) {
state._numWarnings += 1;
if (state._numWarnings == 1) {
LOGGER.warn("Problem processing robots.txt for {}", state._url);
}
if (state._numWarnings < _maxWarnings) {
for (int i = 0; i < args.length; i++) {
if (args[i] instanceof String && ((String) args[i]).length() > 1024) {
// clip overlong strings to keep log messages from overflowing
args[i] = ((String) args[i]).substring(0, 1024) + " ...";
}
}
LOGGER.warn("\t " + msg, args);
}
}
/**
* Check whether user-agent line starts with a valid user-agent product
* token, but continues with additional characters to be ignored e.g. "foo"
* matches "User-agent: foo/1.2" or "butterfly" matches
* "User-agent: Butterfly/1.0"
*
* See https://www.rfc-editor.org/rfc/rfc9309.html#section-2.3.1.5
* "Crawlers MUST try to parse each line of the robots.txt file. Crawlers MUST use the parseable rules."
* and https://github.com/google/robotstxt/issues/56
*
* This method may be overridden to implement non-standard user-agent matching.
*
* @param agentName user-agent, found in the <a href=
* "https://www.rfc-editor.org/rfc/rfc9309.html#name-the-user-agent-line">
* robots.txt user-agent line</a>
*
* @param targetTokens collection of user-agent product tokens we're looking
* for. Product tokens are expected to be lowercase.
*
 * @return true if the agent name's leading product token matches one of the
 * target tokens
*/
protected boolean userAgentProductTokenPartialMatch(String agentName, Collection<String> targetTokens) {
Matcher m = USER_AGENT_PRODUCT_TOKEN_MATCHER.matcher(agentName);
return m.lookingAt() && targetTokens.contains(m.group());
}
/**
* Handle the user-agent: directive
*
* @param state
* current parsing state
* @param token
* data for directive
*/
private void handleUserAgent(ParseState state, RobotToken token) {
// If we are adding rules, and had already finished reading user agent
// fields, then we are in a new section, hence stop adding rules.
if (state.isAddingRules() && state.isFinishedAgentFields()) {
state.setAddingRules(false);
state.setAddingCrawlDelay(false);
}
// Clearly we've encountered a new user agent directive, hence we need to start
// processing until we are finished.
state.setFinishedAgentFields(false);
// Handle the case when multiple target names are passed.
// We assume we should do case-insensitive comparison of every target name.
Collection<String> targetNames = state.getTargetNames();
if (isExactUserAgentMatching()) {
String agentName = token.getData().trim().toLowerCase(Locale.ROOT);
if (agentName.isEmpty()) {
// Ignore empty names
} else if (agentName.equals("*") && !state.isMatchedRealName()) {
state.setMatchedWildcard(true);
state.setAddingRules(true);
state.setAddingCrawlDelay(true);
} else if (targetNames.contains(agentName) || (!isValidUserAgentToObey(agentName) && userAgentProductTokenPartialMatch(agentName, targetNames))) {
if (state.isMatchedWildcard()) {
// Clear rules and Crawl-delay of the wildcard user-agent
// found before the non-wildcard user-agent match.
state.clearRules();
state.clearCrawlDelay();
}
state.setMatchedRealName(true);
state.setAddingRules(true);
state.setAddingCrawlDelay(true);
state.setMatchedWildcard(false);
}
} else {
/*
* prefix matching on user-agent words - backward-compatibility with
* the old and deprecated API if "User-Agent" HTTP request header
* strings are passed as param instead of single-word/token
* user-agent names, e.g. if the robot name is "WebCrawler/1.0" and is
 * expected to match the robots.txt directive "User-agent: webcrawler"
 * or also "User-agent: web".
*/
String agentNameFull = token.getData().trim().toLowerCase(Locale.ROOT);
boolean matched = false;
if (agentNameFull.equals("*") && !state.isMatchedRealName()) {
state.setMatchedWildcard(true);
state.setAddingRules(true);
state.setAddingCrawlDelay(true);
} else if (userAgentProductTokenPartialMatch(agentNameFull, targetNames)) {
// match "butterfly" in the line "User-agent: Butterfly/1.0"
matched = true;
} else {
String[] agentNames = ROBOT_NAMES_SPLIT.split(agentNameFull);
if (agentNames.length > 1) {
LOGGER.debug("Multiple agent names in user-agent line: {}", token.getData());
}
for (String agentName : agentNames) {
for (String targetName : targetNames) {
if (targetName.startsWith(agentName)) {
matched = true;
break;
}
}
}
}
if (matched) {
if (state.isMatchedWildcard()) {
// Clear rules and Crawl-delay of the wildcard user-agent
// found before the non-wildcard user-agent match.
state.clearRules();
state.clearCrawlDelay();
}
state.setMatchedRealName(true);
state.setAddingRules(true);
state.setAddingCrawlDelay(true);
state.setMatchedWildcard(false);
}
}
}
/**
 * Clean up and normalize the path of a rule directive:
* <ul>
* <li>trim leading and trailing white space and control characters not
* handled by the tokenizer</li>
* <li>properly percent-encode all characters where necessary</li>
* <li>but make sure that characters with special semantics for path
* matching (asterisk <code>*</code>, slash <code>/</code>, dollar
 * <code>$</code>, etc.) are left as is (do not decode if percent-encoded).</li>
* </ul>
*
* This method uses {@link SimpleRobotRules#escapePath(String, boolean[])}
* to normalize the URL path before matching against allow/disallow rules.
*
* @param path
* @return clean and encoded path
*/
private String normalizePathDirective(String path) {
return SimpleRobotRules.escapePath(path.trim(), null);
}
/**
* Handle the disallow: directive
*
* @param state
* current parsing state
* @param token
* data for directive
*/
private void handleDisallow(ParseState state, RobotToken token) {
if (!state.isAddingRules()) {
return;
}
String path = token.getData();
try {
path = normalizePathDirective(path);
if (path.length() == 0) {
/*
* "Disallow: <nothing>" meant "allow all" in the 1996 REP RFC
* draft because there was no "Allow" directive.
*
* RFC 9309 allows empty patterns in the formal syntax
* (https://www.rfc-editor.org/rfc/rfc9309.html#section-2.2). No
 * specific handling of empty patterns is defined; being the shortest
 * possible pattern, they are matched with the lowest priority.
*
* Adding an extra rule with an empty pattern would have no
* effect, because an "allow all" with lowest priority means
* "allow everything else" which is the default anyway. So, we
* ignore the empty disallow statement.
*/
} else {
state.addRule(path, false);
}
} catch (Exception e) {
reportWarning(state, "Error parsing robots rules - can't decode path: {}", path);
}
}
/**
* Handle the allow: directive
*
* @param state
* current parsing state
* @param token
* data for directive
*/
private void handleAllow(ParseState state, RobotToken token) {
if (!state.isAddingRules()) {
return;
}
String path = token.getData();
try {
path = normalizePathDirective(path);
} catch (Exception e) {
reportWarning(state, "Error parsing robots rules - can't decode path: {}", path);
}
if (path.length() == 0) {
/*
* Allow: <nothing> => allow all.
*
* See handleDisallow(...): We ignore the empty allow statement.
*/
} else {
state.addRule(path, true);
}
}
/**
* Handle the crawl-delay: directive
*
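 * Values are given in seconds, as integer or floating point numbers; for
 * example, {@code Crawl-delay: 0.5} results in a delay of 500 milliseconds.
 *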
* @param state
* current parsing state
* @param token
* data for directive
*/
private void handleCrawlDelay(ParseState state, RobotToken token) {
if (!state.isAddingCrawlDelay()) {
return;
}
String delayString = token.getData();
if (delayString.length() > 0) {
try {
// Some sites use values like 0.5 for the delay.
if (delayString.indexOf('.') != -1) {
double delayValue = Double.parseDouble(delayString) * 1000.0;
state.setCrawlDelay(Math.round(delayValue));
} else {
// seconds to milliseconds
long delayValue = Integer.parseInt(delayString) * 1000L;
state.setCrawlDelay(delayValue);
}
} catch (Exception e) {
reportWarning(state, "Error parsing robots rules - can't decode crawl delay: {}", delayString);
}
}
state.setAddingCrawlDelay(false);
}
/**
* Handle the sitemap: directive
*
* @param state
* current parsing state
* @param token
* data for directive
*/
private void handleSitemap(ParseState state, RobotToken token) {
String sitemap = token.getData();
try {
URL sitemapUrl;
URL base = null;
try {
base = new URL(state.getUrl());
} catch (MalformedURLException e) {
// must try without base URL
}
if (base != null) {
sitemapUrl = new URL(base, sitemap);
} else {
sitemapUrl = new URL(sitemap);
}
String hostname = sitemapUrl.getHost();
if ((hostname != null) && (hostname.length() > 0)) {
state.addSitemap(sitemapUrl.toExternalForm());
}
} catch (Exception e) {
reportWarning(state, "Invalid URL with sitemap directive: {}", sitemap);
}
}
/**
* Handle a line that starts with http: and contains "sitemap", which we
 * treat as a sitemap directive whose <code>sitemap:</code> prefix is missing.
*
* @param state
* current parsing state
* @param token
* data for directive
*/
private void handleHttp(ParseState state, RobotToken token) {
String urlFragment = token.getData();
if (urlFragment.contains("sitemap")) {
RobotToken fixedToken = new RobotToken(RobotDirective.SITEMAP, "http:" + token.getData());
handleSitemap(state, fixedToken);
} else {
reportWarning(state, "Found raw non-sitemap URL: http:{}", urlFragment);
}
}
/**
* Get the number of warnings due to invalid rules/lines in the latest
* processed robots.txt file (see
 * {@link #parseContent(String, byte[], String, String)}).
*
 * Note: an incorrect value may be returned if the robots.txt was processed
 * in a different thread than the current one.
*
* @return number of warnings
*/
public int getNumWarnings() {
return _numWarningsDuringLastParse.get();
}
/** Get max number of logged warnings per robots.txt */
public int getMaxWarnings() {
return _maxWarnings;
}
/**
* Set the max number of warnings about parse errors logged per robots.txt
*/
public void setMaxWarnings(int maxWarnings) {
_maxWarnings = maxWarnings;
}
/**
* Get configured max crawl delay.
*
* @return the configured max. crawl delay, see
* {@link #setMaxCrawlDelay(long)}
*/
public long getMaxCrawlDelay() {
return _maxCrawlDelay;
}
/**
* Set the max value in milliseconds accepted for the <a href=
* "https://en.wikipedia.org/wiki/Robots_exclusion_standard#Crawl-delay_directive">Crawl-Delay</a>
* directive. If the value in the robots.txt is greater than the max. value,
 * all pages are skipped to prevent overly long Crawl-Delays from blocking
 * fetch queues and slowing down the crawl. Note: the value is in milliseconds
 * because some sites use floating point numbers to define the delay.
*/
public void setMaxCrawlDelay(long maxCrawlDelay) {
_maxCrawlDelay = maxCrawlDelay;
}
/**
* Set how the user-agent names in the robots.txt (<code>User-agent:</code>
* lines) are matched with the provided robot names:
* <ul>
* <li>(with exact matching) follow the
* <a href= "https://datatracker.ietf.org/doc/rfc9309/">Robots Exclusion
* Protocol RFC 9309</a> and match user agent <b>literally but
* case-insensitive over the full string length</b>:
*
* <blockquote>Crawlers set their own name, which is called a product token,
* to find relevant groups. The product token MUST contain only upper and
* lowercase letters ("a-z" and "A-Z"), underscores ("_"), and hyphens
* ("-"). [...] Crawlers MUST use case-insensitive matching to find the
* group that matches the product token and then obey the rules of the
* group.</blockquote></li>
*
* <li>(without exact matching) split the user-agent and robot names at
* whitespace into words and perform a prefix match (one of the user-agent
 * words must be a prefix of one of the robot words, e.g. the robot name
* <code>WebCrawler/3.0</code> matches the robots.txt directive
*
* <pre>
* User-agent: webcrawler
* </pre>
*
 * This prefix matching on words allows crawler developers to lazily reuse
 * the HTTP User-Agent string for the robots.txt parser. It does not
 * cover the case where the full HTTP User-Agent string is used in the
 * robots.txt file.</li>
* </ul>
*
* @param exactMatching
* if true, configure exact user-agent name matching. If false,
* disable exact matching and do prefix matching on user-agent
* words.
*/
public void setExactUserAgentMatching(boolean exactMatching) {
_exactUserAgentMatching = exactMatching;
}
/**
* @return whether exact user-agent matching is configured, see
* {@link #setExactUserAgentMatching(boolean)}
*/
public boolean isExactUserAgentMatching() {
return _exactUserAgentMatching;
}
public static void main(String[] args) throws MalformedURLException, IOException {
if (args.length < 1) {
System.err.println("SimpleRobotRulesParser <robots.txt> [[<agentname>] <URL>...]");
System.err.println();
System.err.println("Parse a robots.txt file");
System.err.println(" <robots.txt>\tURL pointing to robots.txt file.");
System.err.println(" \tMax. five HTTP redirects are followed.");
System.err.println(" \tTo read a local file use a file:// URL");
System.err.println(" \t(parsed as http://example.com/robots.txt)");
System.err.println(" <agentname> \tuser agent name to check for exclusion rules,");
System.err.println(" \ta single 'product token' as per RFC 9309.");
System.err.println(" \tIf not defined check with '*'");
System.err.println(" <URL> \tcheck URL whether allowed or forbidden.");
System.err.println(" \tIf no URL is given show the robots.txt rules.");
System.exit(1);
}
String url = args[0];
// empty collection to select rules for wildcard user-agent (*)
Collection<String> agentNames = Set.of();
String agentName = "*"; // for logging
if (args.length >= 2) {
agentName = args[1].trim().toLowerCase(Locale.ROOT);
agentNames = Set.of(agentName);
}
SimpleRobotRulesParser parser = new SimpleRobotRulesParser();
BaseRobotRules rules = null;
URL u = new URL(url);
URLConnection connection = u.openConnection();
if (!agentNames.isEmpty()) {
connection.setRequestProperty("User-Agent", agentName);
}
try {
if (connection instanceof HttpURLConnection) {
HttpURLConnection httpConnection = (HttpURLConnection) connection;
int redirects = 0;
int maxRedirects = 5; // "five consecutive redirects" (RFC 9309)
while (redirects <= maxRedirects) {
httpConnection.setInstanceFollowRedirects(false);
int code = httpConnection.getResponseCode();
switch (code) {
case HttpURLConnection.HTTP_OK:
System.out.println("Successfully fetched robots.txt");
byte[] content = IOUtils.toByteArray(httpConnection);
rules = parser.parseContent(url, content, httpConnection.getContentType(), agentNames);
break;
case HttpURLConnection.HTTP_MOVED_PERM:
case HttpURLConnection.HTTP_MOVED_TEMP:
case HttpURLConnection.HTTP_SEE_OTHER:
redirects++;
String location = httpConnection.getHeaderField("Location");
if (location == null) {
System.out.println("Redirect without Location header");
rules = parser.failedFetch(code);
break;
}
location = URLDecoder.decode(location, "UTF-8");
u = new URL(u, location);
if (redirects == maxRedirects) {
System.out.println("Reached maximum of " + maxRedirects + " redirects, not following redirect to " + u.toString());
rules = parser.failedFetch(code);
break;
}
System.out.println("Following redirect to " + u.toString());
httpConnection = (HttpURLConnection) u.openConnection();
if (!agentNames.isEmpty()) {
httpConnection.setRequestProperty("User-Agent", agentName);
}
continue; // continue redirecting
default:
System.out.println("Fetch of " + url + " failed with HTTP status code " + code);
rules = parser.failedFetch(code);
break;
}
break;
}
} else {
// not an HTTP URL, maybe file://
byte[] content = IOUtils.toByteArray(connection);
rules = parser.parseContent(url, content, "text/plain", agentNames);
// use artificial URL to avoid problems resolving relative
// sitemap paths for file:/ URLs
url = "http://example.com/robots.txt";
}
} catch (IOException e) {
System.out.println("Fetch of " + url + " failed with: " + e.getMessage());
throw e;
}
if (args.length < 3) {
// no URL(s) given, print rules and exit
System.out.println("Robot rules for user agentname '" + agentName + "':");
System.out.println(rules.toString());
} else {
System.out.println("Checking URLs:");
for (int i = 2; i < args.length; i++) {
System.out.println((rules.isAllowed(args[i]) ? "allowed " : "forbidden") + "\t" + args[i]);
}
}
}
}