1
0
Fork 0
mirror of https://github.com/crawler-commons/crawler-commons synced 2024-05-22 02:56:03 +02:00

Fix line wrapping in comments

This commit is contained in:
Sebastian Nagel 2023-04-24 15:55:37 +02:00
parent 2c2cb3bf7a
commit be2d5c24d3

View File

@@ -529,11 +529,11 @@ public class SimpleRobotRulesParser extends BaseRobotsParser {
// Decide if we need to do special HTML processing.
boolean isHtmlType = ((contentType != null) && contentType.toLowerCase(Locale.ROOT).startsWith("text/html"));
// If it looks like it contains HTML, but doesn't have a user agent
// field, then
// assume somebody messed up and returned back to us a random HTML page
// instead
// of a robots.txt file.
/*
* If it looks like it contains HTML, but doesn't have a user agent
* field, then assume somebody messed up and returned back to us a
* random HTML page instead of a robots.txt file.
*/
boolean hasHTML = false;
if (isHtmlType || SIMPLE_HTML_PATTERN.matcher(contentAsStr).find()) {
if (!USER_AGENT_PATTERN.matcher(contentAsStr).find()) {
@@ -560,12 +560,12 @@ public class SimpleRobotRulesParser extends BaseRobotsParser {
while (lineParser.hasMoreTokens()) {
String line = lineParser.nextToken();
// Get rid of HTML markup, in case some brain-dead webmaster has
// created an HTML
// page for robots.txt. We could do more sophisticated processing
// here to better
// handle bad HTML, but that's a very tiny percentage of all
// robots.txt files.
/*
* Get rid of HTML markup, in case some brain-dead webmaster has
* created an HTML page for robots.txt. We could do more
* sophisticated processing here to better handle bad HTML, but
* that's a very tiny percentage of all robots.txt files.
*/
if (hasHTML) {
line = line.replaceAll("<[^>]+>", "");
}
@@ -865,9 +865,8 @@ public class SimpleRobotRulesParser extends BaseRobotsParser {
double delayValue = Double.parseDouble(delayString) * 1000.0;
state.setCrawlDelay(Math.round(delayValue));
} else {
long delayValue = Integer.parseInt(delayString) * 1000L; // sec
// to
// millisec
// seconds to milliseconds
long delayValue = Integer.parseInt(delayString) * 1000L;
state.setCrawlDelay(delayValue);
}
} catch (Exception e) {