crawler-commons (mirror of https://github.com/crawler-commons/crawler-commons)

Upgrade to JDK 1.7 compiler version and introduce Maven forbidden API's plugin

Lewis John McGibbney 2015-09-06 13:55:26 -04:00
parent 827b073d12
commit ba5906ec40
18 changed files with 234 additions and 136 deletions

pom.xml

@@ -1,4 +1,22 @@
<?xml version="1.0" encoding="UTF-8"?>
<!--
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership. The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing,
software distributed under the License is distributed on an
"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
KIND, either express or implied. See the License for the
specific language governing permissions and limitations
under the License.
-->
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
<parent>
@@ -31,7 +49,7 @@
<connection>scm:git:git://github.com/crawler-commons/crawler-commons.git</connection>
<developerConnection>scm:git:git@github.com:crawler-commons/crawler-commons.git</developerConnection>
<tag>HEAD</tag>
</scm>
</scm>
<distributionManagement>
<repository>
@@ -49,7 +67,7 @@
<mailingLists>
<mailingList>
<name>Project Mailing List</name>
<post>crawler-commons [at] googlecode [dot] com</post>
<post>crawler-commons [at] googlegroups [dot] com</post>
</mailingList>
</mailingLists>
@@ -135,6 +153,32 @@
<!--autoVersionSubmodules>true</autoVersionSubmodules -->
</configuration>
</plugin>
<!--This plugin's configuration is used to store Eclipse m2e settings only. It has no influence on the Maven build itself.-->
<plugin>
<groupId>org.eclipse.m2e</groupId>
<artifactId>lifecycle-mapping</artifactId>
<version>1.0.0</version>
<configuration>
<lifecycleMappingMetadata>
<pluginExecutions>
<pluginExecution>
<pluginExecutionFilter>
<groupId>de.thetaphi</groupId>
<artifactId>forbiddenapis</artifactId>
<versionRange>[1.8,)</versionRange>
<goals>
<goal>testCheck</goal>
<goal>check</goal>
</goals>
</pluginExecutionFilter>
<action>
<ignore></ignore>
</action>
</pluginExecution>
</pluginExecutions>
</lifecycleMappingMetadata>
</configuration>
</plugin>
</plugins>
</pluginManagement>
@@ -183,7 +227,34 @@
<configFile>${project.basedir}/doc/eclipse-formatter.xml</configFile>
</configuration>
</plugin>
<plugin>
<groupId>de.thetaphi</groupId>
<artifactId>forbiddenapis</artifactId>
<version>1.8</version>
<configuration>
<!-- disallow undocumented classes like sun.misc.Unsafe: -->
<internalRuntimeForbidden>true</internalRuntimeForbidden>
<!--
if the used Java version is too new,
don't fail, just do nothing:
-->
<failOnUnsupportedJava>false</failOnUnsupportedJava>
<bundledSignatures>
<bundledSignature>jdk-unsafe</bundledSignature>
<bundledSignature>jdk-deprecated</bundledSignature>
<bundledSignature>jdk-system-out</bundledSignature>
<!--bundledSignature>commons-io-unsafe-${commons-io.version}</bundledSignature-->
</bundledSignatures>
</configuration>
<executions>
<execution>
<goals>
<goal>check</goal>
<goal>testCheck</goal>
</goals>
</execution>
</executions>
</plugin>
</plugins>
</build>
@@ -300,11 +371,12 @@
<!-- General Properties -->
<implementation.build>${scmBranch}@r${buildNumber}</implementation.build>
<javac.src.version>1.6</javac.src.version>
<javac.target.version>1.6</javac.target.version>
<javac.src.version>1.7</javac.src.version>
<javac.target.version>1.7</javac.target.version>
<maven.compiler.target>1.7</maven.compiler.target>
<maven.build.timestamp.format>yyyy-MM-dd HH:mm:ssZ</maven.build.timestamp.format>
<skipTests>false</skipTests>
<assembly.finalName>apache-${project.build.finalName}</assembly.finalName>
<assembly.finalName>${project.build.finalName}</assembly.finalName>
</properties>
<dependencies>
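
The bundled signatures above account for most of the source changes that follow: jdk-unsafe flags calls that silently rely on the platform default charset or locale, jdk-deprecated flags deprecated JDK APIs, and jdk-system-out flags writes to System.out/System.err. A minimal sketch of each violation and the compliant form this commit adopts, using a hypothetical class (SLF4J, which the project already uses, is assumed on the classpath):

import java.nio.charset.Charset;
import java.util.Locale;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class ForbiddenApisDemo {
    private static final Logger LOG = LoggerFactory.getLogger(ForbiddenApisDemo.class);

    public static void main(String[] args) {
        String name = "crawler-commons";
        // jdk-unsafe: name.getBytes() would use the default charset implicitly;
        // passing the charset explicitly satisfies the check.
        byte[] bytes = name.getBytes(Charset.defaultCharset());
        // jdk-unsafe: String.format(...) without a Locale argument is also flagged.
        String msg = String.format(Locale.getDefault(), "%s is %d bytes", name, bytes.length);
        // jdk-system-out: System.out.println(msg) would be flagged; log instead.
        LOG.info(msg);
    }
}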

BaseFetchException.java

@@ -133,7 +133,7 @@ public abstract class BaseFetchException extends Exception {
@Override
public void printStackTrace() {
_exception.printStackTrace();
_exception.getMessage();
}
@Override

FetchedResult.java

@@ -17,6 +17,7 @@
package crawlercommons.fetcher;
import java.nio.charset.Charset;
import java.security.InvalidParameterException;
import java.util.Arrays;
@@ -174,7 +175,7 @@ public class FetchedResult {
report.append(" FetchedUrl : " + getFetchedUrl() + "\n");
report.append(" ContentType : " + getContentType() + "\n");
report.append(" ContentLength : " + getContentLength() + "\n");
report.append(" Content : " + new String(getContent()) + "\n"); // byte
report.append(" Content : " + new String(getContent(), Charset.defaultCharset()) + "\n"); // byte
// array
// to
// string

SimpleHttpFetcher.java

@@ -31,6 +31,7 @@ import java.security.NoSuchAlgorithmException;
import java.security.cert.CertificateException;
import java.security.cert.X509Certificate;
import java.util.HashSet;
import java.util.Locale;
import java.util.Set;
import java.util.concurrent.TimeUnit;
@@ -519,7 +520,7 @@ public class SimpleHttpFetcher extends BaseHttpFetcher {
} catch (HttpFetchException e) {
// Don't bother generating a trace for a 404 (not found)
if (LOGGER.isTraceEnabled() && (e.getHttpStatus() != HttpStatus.SC_NOT_FOUND)) {
LOGGER.trace(String.format("Exception fetching %s (%s)", url, e.getMessage()));
LOGGER.trace(String.format(Locale.getDefault(), "Exception fetching %s (%s)", url, e.getMessage()));
}
throw e;
@@ -527,11 +528,11 @@
// Don't bother reporting that we bailed because the mime-type
// wasn't one that we wanted.
if (e.getAbortReason() != AbortedFetchReason.INVALID_MIMETYPE) {
LOGGER.debug(String.format("Exception fetching %s (%s)", url, e.getMessage()));
LOGGER.debug(String.format(Locale.getDefault(), "Exception fetching %s (%s)", url, e.getMessage()));
}
throw e;
} catch (BaseFetchException e) {
LOGGER.debug(String.format("Exception fetching %s (%s)", url, e.getMessage()));
LOGGER.debug(String.format(Locale.getDefault(), "Exception fetching %s (%s)", url, e.getMessage()));
throw e;
}
}
@@ -547,7 +548,7 @@
return doRequest(request, url, payload);
} catch (BaseFetchException e) {
if (LOGGER.isTraceEnabled()) {
LOGGER.trace(String.format("Exception fetching %s", url), e);
LOGGER.trace(String.format(Locale.getDefault(), "Exception fetching %s", url), e);
}
throw e;
}
@@ -675,7 +676,7 @@
throw new RedirectFetchException(url, redirectUrl, mre.getReason());
} else if (e.getCause() instanceof RedirectException) {
e.printStackTrace();
LOGGER.error(e.getMessage());
throw new RedirectFetchException(url, extractRedirectedUrl(url, localContext), RedirectExceptionReason.TOO_MANY_REDIRECTS);
} else {
throw new IOFetchException(url, e);

UserAgent.java

@@ -18,6 +18,7 @@
package crawlercommons.fetcher.http;
import java.io.Serializable;
import java.util.Locale;
import crawlercommons.CrawlerCommons;
@@ -103,6 +104,6 @@ public class UserAgent implements Serializable {
public String getUserAgentString() {
// Mozilla/5.0 (compatible; mycrawler/1.0; +http://www.mydomain.com;
// mycrawler@mydomain.com)
return String.format("%s (compatible; %s%s; +%s; %s)", _browserVersion, getAgentName(), _crawlerVersion, _webAddress, _emailAddress);
return String.format(Locale.getDefault(), "%s (compatible; %s%s; +%s; %s)", _browserVersion, getAgentName(), _crawlerVersion, _webAddress, _emailAddress);
}
}

SimpleRobotRulesParser.java

@@ -18,10 +18,10 @@
package crawlercommons.robots;
import java.io.UnsupportedEncodingException;
import java.net.URI;
import java.net.URL;
import java.net.URLDecoder;
import java.util.HashMap;
import java.util.Locale;
import java.util.Map;
import java.util.StringTokenizer;
import java.util.regex.Matcher;
@@ -188,7 +188,7 @@ public class SimpleRobotRulesParser extends BaseRobotsParser {
static {
for (RobotDirective directive : RobotDirective.values()) {
if (!directive.isSpecial()) {
String prefix = directive.name().toLowerCase().replaceAll("_", "-");
String prefix = directive.name().toLowerCase(Locale.getDefault()).replaceAll("_", "-");
DIRECTIVE_PREFIX.put(prefix, directive);
}
}
@@ -220,7 +220,7 @@
* @return robot command found on line
*/
private static RobotToken tokenize(String line) {
String lowerLine = line.toLowerCase();
String lowerLine = line.toLowerCase(Locale.getDefault());
for (String prefix : DIRECTIVE_PREFIX.keySet()) {
int prefixLength = prefix.length();
if (lowerLine.startsWith(prefix)) {
@@ -336,7 +336,7 @@
}
// Decide if we need to do special HTML processing.
boolean isHtmlType = ((contentType != null) && contentType.toLowerCase().startsWith("text/html"));
boolean isHtmlType = ((contentType != null) && contentType.toLowerCase(Locale.getDefault()).startsWith("text/html"));
// If it looks like it contains HTML, but doesn't have a user agent
// field, then
@@ -366,7 +366,7 @@
// an empty
// string between the \r and \n.
StringTokenizer lineParser = new StringTokenizer(contentAsStr, "\n\r\u0085\u2028\u2029");
ParseState parseState = new ParseState(url, robotName.toLowerCase());
ParseState parseState = new ParseState(url, robotName.toLowerCase(Locale.getDefault()));
boolean keepGoing = true;
while (keepGoing && lineParser.hasMoreTokens()) {
@@ -425,7 +425,7 @@
break;
case MISSING:
reportWarning(String.format("Unknown line in robots.txt file (size %d): %s", content.length, line), url);
reportWarning(String.format(Locale.getDefault(), "Unknown line in robots.txt file (size %d): %s", content.length, line), url);
parseState.setFinishedAgentFields(true);
break;
@@ -496,7 +496,7 @@
// Handle the case when there are multiple target names are passed
// TODO should we do lowercase comparison of target name? Assuming yes.
String[] targetNames = state.getTargetName().toLowerCase().split(",");
String[] targetNames = state.getTargetName().toLowerCase(Locale.getDefault()).split(",");
for (int count = 0; count < targetNames.length; count++) {
// Extract possible match names from our target agent name, since it
@@ -508,7 +508,7 @@
String[] agentNames = token.getData().split("[ \t,]");
for (String agentName : agentNames) {
// TODO should we do case-insensitive matching? Probably yes.
agentName = agentName.trim().toLowerCase();
agentName = agentName.trim().toLowerCase(Locale.getDefault());
if (agentName.isEmpty()) {
// Ignore empty names
} else if (agentName.equals("*") && !state.isMatchedWildcard()) {
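
A note on the toLowerCase(Locale.getDefault()) changes in this file: the explicit locale satisfies the jdk-unsafe signature, but robots.txt directives are locale-independent tokens, and under a Turkish default locale 'I' still folds to the dotless 'ı'. Locale.ROOT would sidestep that; a standalone illustration, not part of this commit:

import java.util.Locale;

public class TurkishLowercase {
    public static void main(String[] args) {
        String directive = "DISALLOW";
        // Turkish case rules map 'I' to '\u0131' (dotless i), so the result
        // is "dısallow" and no longer matches the expected token.
        System.out.println(directive.toLowerCase(new Locale("tr")));
        // Locale.ROOT applies locale-neutral rules and always yields "disallow".
        System.out.println(directive.toLowerCase(Locale.ROOT));
    }
}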

AbstractSiteMap.java

@@ -21,8 +21,8 @@ import java.net.URL;
import java.text.DateFormat;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Collection;
import java.util.Date;
import java.util.Locale;
import java.util.TimeZone;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
@@ -42,14 +42,14 @@ public abstract class AbstractSiteMap {
private static final ThreadLocal<DateFormat> W3C_NO_SECONDS_FORMAT = new ThreadLocal<DateFormat>() {
protected DateFormat initialValue() {
return new SimpleDateFormat("yyyy-MM-dd'T'HH:mmZ");
return new SimpleDateFormat("yyyy-MM-dd'T'HH:mmZ", Locale.getDefault());
}
};
private static final ThreadLocal<DateFormat> W3C_FULLDATE_FORMAT = new ThreadLocal<DateFormat>() {
protected DateFormat initialValue() {
SimpleDateFormat result = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssZ");
SimpleDateFormat result = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssZ", Locale.getDefault());
result.setTimeZone(TimeZone.getTimeZone("UTC"));
return result;
}
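
The ThreadLocal wrapping here is the established pattern for SimpleDateFormat, which holds mutable parse/format state and is not thread-safe; each thread lazily builds its own formatter in initialValue(). A condensed sketch of the pattern with a hypothetical class name:

import java.text.DateFormat;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.Locale;
import java.util.TimeZone;

public class PerThreadDateFormat {
    // A single shared SimpleDateFormat is unsafe across threads;
    // ThreadLocal gives each thread its own instance.
    private static final ThreadLocal<DateFormat> W3C_FULLDATE = new ThreadLocal<DateFormat>() {
        @Override
        protected DateFormat initialValue() {
            SimpleDateFormat fmt = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssZ", Locale.getDefault());
            fmt.setTimeZone(TimeZone.getTimeZone("UTC"));
            return fmt;
        }
    };

    public static String format(Date date) {
        return W3C_FULLDATE.get().format(date);
    }
}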

SiteMapTester.java

@@ -5,17 +5,21 @@ import java.net.URL;
import java.util.Collection;
import org.apache.commons.io.IOUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* Sitemap Tool for recursively fetching all URL's from a sitemap (and all of
* it's children)
**/
public class SiteMapTester {
private static final Logger LOG = LoggerFactory.getLogger(SiteMapTester.class);
private static SiteMapParser parser = new SiteMapParser(false);
public static void main(String[] args) throws IOException, UnknownFormatException {
if (args.length < 1) {
System.err.println("Usage: SiteMapTester <URL_TO_TEST> [MIME_TYPE]");
LOG.error("Usage: SiteMapTester <URL_TO_TEST> [MIME_TYPE]");
} else {
URL url = new URL(args[0]);
String mt = (args.length > 1) ? args[1] : null;
@@ -47,7 +51,7 @@ public class SiteMapTester {
} else {
Collection<SiteMapURL> links = ((SiteMap) sm).getSiteMapUrls();
for (SiteMapURL smu : links) {
System.out.println(smu.getUrl());
LOG.info(smu.getUrl().toString());
}
}
}

SiteMapURL.java

@@ -23,6 +23,7 @@ import org.slf4j.LoggerFactory;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.Date;
import java.util.Locale;
/**
* The SitemapUrl class represents a URL found in a Sitemap.
@@ -226,7 +227,7 @@ public class SiteMapURL {
public void setChangeFrequency(String changeFreq) {
if (changeFreq != null) {
changeFreq = changeFreq.toUpperCase();
changeFreq = changeFreq.toUpperCase(Locale.getDefault());
if (changeFreq.contains("ALWAYS")) {
this.changeFreq = ChangeFrequency.ALWAYS;

EffectiveTldFinder.java

@@ -25,6 +25,8 @@ import java.net.IDN;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Map;
import java.util.Locale;
import java.nio.charset.Charset;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -79,7 +81,7 @@ public class EffectiveTldFinder {
if (null == effective_tld_data_stream && null != this.getClass().getResource(ETLD_DATA)) {
effective_tld_data_stream = this.getClass().getResourceAsStream(ETLD_DATA);
}
BufferedReader input = new BufferedReader(new InputStreamReader(effective_tld_data_stream));
BufferedReader input = new BufferedReader(new InputStreamReader(effective_tld_data_stream, Charset.defaultCharset()));
String line = null;
while (null != (line = input.readLine())) {
if (line.length() == 0 || (line.length() > 1 && line.startsWith(COMMENT))) {
@@ -148,8 +150,8 @@
*/
public static String getAssignedDomain(String hostname) {
EffectiveTLD etld = getEffectiveTLD(hostname);
if (null == etld || etld.getDomain() == hostname.toLowerCase()) {
return hostname.toLowerCase();
if (null == etld || etld.getDomain() == hostname.toLowerCase(Locale.getDefault())) {
return hostname.toLowerCase(Locale.getDefault());
}
String domain = hostname.replaceFirst(".*?([^.]+\\.)" + etld.getDomain() + "$", "$1" + etld.getDomain());
return domain;
@@ -201,7 +203,7 @@
private String asciiConvert(String str) {
if (isAscii(str)) {
return str.toLowerCase();
return str.toLowerCase(Locale.getDefault());
}
return IDN.toASCII(str);
}

PaidLevelDomain.java

@@ -20,6 +20,7 @@ package crawlercommons.url;
import java.net.URL;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Locale;
import java.util.Set;
import java.util.regex.Pattern;
@@ -76,7 +77,7 @@ public class PaidLevelDomain {
}
int firstHostPiece = 0;
if (ccTLDs.contains(subNames[numPieces - 1].toLowerCase())) {
if (ccTLDs.contains(subNames[numPieces - 1].toLowerCase(Locale.getDefault()))) {
// We have a country code at the end. See if the preceding piece is
// either
// a two-letter name (country code or funky short gTLD), or one of
@@ -85,15 +86,15 @@
if (subNames[numPieces - 2].length() <= 2) {
// Must be xxx.co.jp format
firstHostPiece = numPieces - 3;
} else if (gTLDs.contains(subNames[numPieces - 2].toLowerCase())) {
} else if (gTLDs.contains(subNames[numPieces - 2].toLowerCase(Locale.getDefault()))) {
// Must be xxx.com.mx format
firstHostPiece = numPieces - 3;
} else {
// Must be xxx.it format
firstHostPiece = numPieces - 2;
}
} else if (gTLDs.contains(subNames[numPieces - 1].toLowerCase())) {
if (ccTLDs.contains(subNames[numPieces - 2].toLowerCase())) {
} else if (gTLDs.contains(subNames[numPieces - 1].toLowerCase(Locale.getDefault()))) {
if (ccTLDs.contains(subNames[numPieces - 2].toLowerCase(Locale.getDefault()))) {
// Must be xxx.de.com format
firstHostPiece = numPieces - 3;
} else {

FetchedResultTest.java

@@ -16,22 +16,29 @@
*/
package crawlercommons.fetcher;
import java.io.UnsupportedEncodingException;
import org.apache.tika.metadata.Metadata;
import org.junit.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* @author lmcgibbn
*
*/
public class FetchedResultTest {
private static final Logger LOG = LoggerFactory.getLogger(FetchedResultTest.class);
/**
* Test method for {@link crawlercommons.fetcher.FetchedResult#report()}.
* This does not actually test anything but simply allows us to see what a
* generated report would look like.
* @throws UnsupportedEncodingException
*/
@Test
public void testPrintReport() {
public void testPrintReport() throws UnsupportedEncodingException {
Metadata headerMetadata = new Metadata();
headerMetadata.add(Metadata.CONTENT_DISPOSITION, "This is content disposition");
headerMetadata.add(Metadata.CONTENT_ENCODING, "This is the encoding");
@@ -52,8 +59,8 @@ public class FetchedResultTest {
"http://en.wikipedia.org/wiki/Glasgow", // redirectedUrl
System.currentTimeMillis(), // fetchTime
headerMetadata, new String("Glasgow (/ˈɡlɑːzɡoʊ, ˈɡlæz-/;[4] Scots: Glesca; Scottish Gaelic: Glaschu) "
+ "is the largest city in Scotland, and the third largest in the United Kingdom.").getBytes(), "ScotsText", 2014, load, "http://en.wikipedia.org/wiki/Glasgow",
+ "is the largest city in Scotland, and the third largest in the United Kingdom.").getBytes("UTF-8"), "ScotsText", 2014, load, "http://en.wikipedia.org/wiki/Glasgow",
0, "wikipedia.org", 200, "");
System.out.println(result.report());
LOG.error(result.report());
}
}

SimpleHttpFetcherTest.java

@@ -25,6 +25,7 @@ import static org.junit.Assert.fail;
import java.io.IOException;
import java.net.ConnectException;
import java.nio.charset.Charset;
import java.util.HashSet;
import java.util.Set;
@@ -118,7 +119,7 @@ public class SimpleHttpFetcherTest {
String content = "redirected";
response.setContentLength(content.length());
response.getOutputStream().write(content.getBytes());
response.getOutputStream().write(content.getBytes("UTF-8"));
}
}
}
@@ -148,7 +149,7 @@
response.setContentType("text/plain");
response.setContentLength(content.length());
response.getOutputStream().write(content.getBytes());
response.getOutputStream().write(content.getBytes("UTF-8"));
}
}
@@ -170,7 +171,7 @@
}
response.setContentLength(content.length());
response.getOutputStream().write(content.getBytes());
response.getOutputStream().write(content.getBytes("UTF-8"));
}
}
@@ -380,7 +381,7 @@
BaseFetcher fetcher = new SimpleHttpFetcher(1, TestUtils.CC_TEST_AGENT);
String url = "http://localhost:8089/";
FetchedResult result = fetcher.get(url);
String contentStr = new String(result.getContent(), 0, result.getContentLength());
String contentStr = new String(result.getContent(), 0, result.getContentLength(), Charset.defaultCharset());
assertTrue(englishContent.equals(contentStr));
}

RobotUtilsTest.java

@@ -114,7 +114,7 @@ public class RobotUtilsTest {
BaseHttpFetcher fetcher = Mockito.mock(BaseHttpFetcher.class);
FetchedResult result = Mockito.mock(FetchedResult.class);
Mockito.when(result.getContent()).thenReturn(simpleRobotsTxt.getBytes());
Mockito.when(result.getContent()).thenReturn(simpleRobotsTxt.getBytes("UTF-8"));
Mockito.when(fetcher.get(Mockito.any(String.class))).thenReturn(result);
UserAgent userAgent = new UserAgent("testAgent", "crawler@domain.com", "http://www.domain.com");
Mockito.when(fetcher.getUserAgent()).thenReturn(userAgent);

SimpleRobotRulesParserTest.java

@@ -26,6 +26,7 @@ import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import java.util.Arrays;
import java.util.Locale;
import javax.servlet.http.HttpServletResponse;
@@ -45,8 +46,8 @@ public class SimpleRobotRulesParserTest {
}
@Test
public void testEmptyRules() throws MalformedURLException {
BaseRobotRules rules = createRobotRules("Any-darn-crawler", "".getBytes());
public void testEmptyRules() throws MalformedURLException, UnsupportedEncodingException {
BaseRobotRules rules = createRobotRules("Any-darn-crawler", "".getBytes("UTF-8"));
assertTrue(rules.isAllowed("http://www.domain.com/anypage.html"));
}
@@ -54,7 +55,7 @@
public void testQueryParamInDisallow() throws Exception {
final String simpleRobotsTxt = "User-agent: *" + CRLF + "Disallow: /index.cfm?fuseaction=sitesearch.results*";
BaseRobotRules rules = createRobotRules("Any-darn-crawler", simpleRobotsTxt.getBytes());
BaseRobotRules rules = createRobotRules("Any-darn-crawler", simpleRobotsTxt.getBytes("UTF-8"));
assertFalse(rules.isAllowed("http://searchservice.domain.com/index.cfm?fuseaction=sitesearch.results&type=People&qry=california&pg=2"));
}
@@ -64,7 +65,7 @@
// Test for /fish
final String simpleRobotsTxt1 = "User-agent: *" + CRLF + "Disallow: /fish" + CRLF;
BaseRobotRules rule1 = createRobotRules("Any-darn-crawler", simpleRobotsTxt1.getBytes());
BaseRobotRules rule1 = createRobotRules("Any-darn-crawler", simpleRobotsTxt1.getBytes("UTF-8"));
assertFalse(rule1.isAllowed("http://www.fict.com/fish"));
assertFalse(rule1.isAllowed("http://www.fict.com/fish.html"));
assertFalse(rule1.isAllowed("http://www.fict.com/fish/salmon.html"));
@@ -80,7 +81,7 @@
// Test for /fish*
final String simpleRobotsTxt2 = "User-agent: *" + CRLF + "Disallow: /fish*" + CRLF;
BaseRobotRules rule2 = createRobotRules("Any-darn-crawler", simpleRobotsTxt2.getBytes());
BaseRobotRules rule2 = createRobotRules("Any-darn-crawler", simpleRobotsTxt2.getBytes("UTF-8"));
assertFalse(rule2.isAllowed("http://www.fict.com/fish"));
assertFalse(rule2.isAllowed("http://www.fict.com/fish.html"));
assertFalse(rule2.isAllowed("http://www.fict.com/fish/salmon.html"));
@@ -96,7 +97,7 @@
// Test for /fish/
final String simpleRobotsTxt3 = "User-agent: *" + CRLF + "Disallow: /fish/" + CRLF;
BaseRobotRules rule3 = createRobotRules("Any-darn-crawler", simpleRobotsTxt3.getBytes());
BaseRobotRules rule3 = createRobotRules("Any-darn-crawler", simpleRobotsTxt3.getBytes("UTF-8"));
assertFalse(rule3.isAllowed("http://www.fict.com/fish/"));
assertFalse(rule3.isAllowed("http://www.fict.com/fish/?id=anything"));
assertFalse(rule3.isAllowed("http://www.fict.com/fish/salmon.htm"));
@@ -108,7 +109,7 @@
// Test for /*.php
final String simpleRobotsTxt4 = "User-agent: *" + CRLF + "Disallow: /*.php" + CRLF;
BaseRobotRules rule4 = createRobotRules("Any-darn-crawler", simpleRobotsTxt4.getBytes());
BaseRobotRules rule4 = createRobotRules("Any-darn-crawler", simpleRobotsTxt4.getBytes("UTF-8"));
assertFalse(rule4.isAllowed("http://www.fict.com/filename.php"));
assertFalse(rule4.isAllowed("http://www.fict.com/folder/filename.php"));
assertFalse(rule4.isAllowed("http://www.fict.com/folder/filename.php?parameters"));
@@ -121,7 +122,7 @@
// Test for /*.php$
final String simpleRobotsTxt5 = "User-agent: *" + CRLF + "Disallow: /*.php$" + CRLF;
BaseRobotRules rule5 = createRobotRules("Any-darn-crawler", simpleRobotsTxt5.getBytes());
BaseRobotRules rule5 = createRobotRules("Any-darn-crawler", simpleRobotsTxt5.getBytes("UTF-8"));
assertFalse(rule5.isAllowed("http://www.fict.com/filename.php"));
assertFalse(rule5.isAllowed("http://www.fict.com/folder/filename.php"));
@@ -133,7 +134,7 @@
// Test for /fish*.php
final String simpleRobotsTxt6 = "User-agent: *" + CRLF + "Disallow: /fish*.php" + CRLF;
BaseRobotRules rule6 = createRobotRules("Any-darn-crawler", simpleRobotsTxt6.getBytes());
BaseRobotRules rule6 = createRobotRules("Any-darn-crawler", simpleRobotsTxt6.getBytes("UTF-8"));
assertFalse(rule6.isAllowed("http://www.fict.com/fish.php"));
assertFalse(rule6.isAllowed("http://www.fict.com/fishheads/catfish.php?parameters"));
@@ -142,35 +143,35 @@
// Test rule with multiple '*' characters
final String simpleRobotsTxt7 = "User-agent: *" + CRLF + "Disallow: /*fish*.php" + CRLF;
BaseRobotRules rule7 = createRobotRules("Any-darn-crawler", simpleRobotsTxt7.getBytes());
BaseRobotRules rule7 = createRobotRules("Any-darn-crawler", simpleRobotsTxt7.getBytes("UTF-8"));
assertFalse(rule7.isAllowed("http://www.fict.com/fish.php"));
assertFalse(rule7.isAllowed("http://www.fict.com/superfishheads/catfish.php?parameters"));
assertTrue(rule7.isAllowed("http://www.fict.com/fishheads/catfish.htm"));
}
@Test
public void testCommentedOutLines() throws MalformedURLException {
public void testCommentedOutLines() throws MalformedURLException, UnsupportedEncodingException {
final String simpleRobotsTxt = "#user-agent: testAgent" + LF + LF + "#allow: /index.html" + LF + "#allow: /test" + LF + LF + "#user-agent: test" + LF + LF + "#allow: /index.html" + LF
+ "#disallow: /test" + LF + LF + "#user-agent: someAgent" + LF + LF + "#disallow: /index.html" + LF + "#disallow: /test" + LF + LF;
BaseRobotRules rules = createRobotRules("Any-darn-crawler", simpleRobotsTxt.getBytes());
BaseRobotRules rules = createRobotRules("Any-darn-crawler", simpleRobotsTxt.getBytes("UTF-8"));
Assert.assertTrue(rules.isAllowed("http://www.domain.com/anypage.html"));
}
@Test
public void testRobotsTxtAlwaysAllowed() throws MalformedURLException {
public void testRobotsTxtAlwaysAllowed() throws MalformedURLException, UnsupportedEncodingException {
final String simpleRobotsTxt = "User-agent: *" + CRLF + "Disallow: /";
BaseRobotRules rules = createRobotRules("any-darn-crawler", simpleRobotsTxt.getBytes());
BaseRobotRules rules = createRobotRules("any-darn-crawler", simpleRobotsTxt.getBytes("UTF-8"));
assertTrue(rules.isAllowed("http://www.domain.com/robots.txt"));
}
@Test
public void testAgentNotListed() throws MalformedURLException {
public void testAgentNotListed() throws MalformedURLException, UnsupportedEncodingException {
// Access is assumed to be allowed, if no rules match an agent.
final String simpleRobotsTxt = "User-agent: crawler1" + CRLF + "Disallow: /index.html" + CRLF + "Allow: /" + CRLF + CRLF + "User-agent: crawler2" + CRLF + "Disallow: /";
BaseRobotRules rules = createRobotRules("crawler3", simpleRobotsTxt.getBytes());
BaseRobotRules rules = createRobotRules("crawler3", simpleRobotsTxt.getBytes("UTF-8"));
assertTrue(rules.isAllowed("http://www.domain.com/anypage.html"));
assertTrue(rules.isAllowed("http://www.domain.com/index.html"));
}
@@ -184,26 +185,26 @@
}
@Test
public void testSimplestAllowAll() throws MalformedURLException {
public void testSimplestAllowAll() throws MalformedURLException, UnsupportedEncodingException {
final String simpleRobotsTxt = "User-agent: *" + CRLF + "Disallow:";
BaseRobotRules rules = createRobotRules("Any-darn-crawler", simpleRobotsTxt.getBytes());
BaseRobotRules rules = createRobotRules("Any-darn-crawler", simpleRobotsTxt.getBytes("UTF-8"));
assertTrue(rules.isAllowed("http://www.domain.com/anypage.html"));
}
@Test
public void testMixedEndings() throws MalformedURLException {
public void testMixedEndings() throws MalformedURLException, UnsupportedEncodingException {
final String mixedEndingsRobotsTxt = "# /robots.txt for http://www.fict.org/" + CRLF + "# comments to webmaster@fict.org" + CR + LF + "User-agent: unhipbot" + LF + "Disallow: /" + CR + ""
+ CRLF + "User-agent: webcrawler" + LF + "User-agent: excite" + CR + "Disallow: " + "\u0085" + CR + "User-agent: *" + CRLF + "Disallow: /org/plans.html" + LF + "Allow: /org/"
+ CR + "Allow: /serv" + CRLF + "Allow: /~mak" + LF + "Disallow: /" + CRLF;
BaseRobotRules rules;
rules = createRobotRules("WebCrawler/3.0", mixedEndingsRobotsTxt.getBytes());
rules = createRobotRules("WebCrawler/3.0", mixedEndingsRobotsTxt.getBytes("UTF-8"));
assertTrue(rules.isAllowed("http://www.fict.org/"));
assertTrue(rules.isAllowed("http://www.fict.org/index.html"));
rules = createRobotRules("Unknown/1.0", mixedEndingsRobotsTxt.getBytes());
rules = createRobotRules("Unknown/1.0", mixedEndingsRobotsTxt.getBytes("UTF-8"));
assertFalse(rules.isAllowed("http://www.fict.org/"));
assertFalse(rules.isAllowed("http://www.fict.org/index.html"));
assertTrue(rules.isAllowed("http://www.fict.org/robots.txt"));
@@ -219,7 +220,7 @@
}
@Test
public void testRfpCases() throws MalformedURLException {
public void testRfpCases() throws MalformedURLException, UnsupportedEncodingException {
// Run through all of the tests that are part of the robots.txt RFP
// http://www.robotstxt.org/norobots-rfc.txt
final String rfpExampleRobotsTxt = "# /robots.txt for http://www.fict.org/" + CRLF + "# comments to webmaster@fict.org" + CRLF + CRLF + "User-agent: unhipbot" + CRLF + "Disallow: /" + CRLF
@@ -228,7 +229,7 @@
BaseRobotRules rules;
rules = createRobotRules("UnhipBot/0.1", rfpExampleRobotsTxt.getBytes());
rules = createRobotRules("UnhipBot/0.1", rfpExampleRobotsTxt.getBytes("UTF-8"));
assertFalse(rules.isAllowed("http://www.fict.org/"));
assertFalse(rules.isAllowed("http://www.fict.org/index.html"));
assertTrue(rules.isAllowed("http://www.fict.org/robots.txt"));
@@ -241,7 +242,7 @@
assertFalse(rules.isAllowed("http://www.fict.org/%7Ejim/jim.html"));
assertFalse(rules.isAllowed("http://www.fict.org/%7Emak/mak.html"));
rules = createRobotRules("WebCrawler/3.0", rfpExampleRobotsTxt.getBytes());
rules = createRobotRules("WebCrawler/3.0", rfpExampleRobotsTxt.getBytes("UTF-8"));
assertTrue(rules.isAllowed("http://www.fict.org/"));
assertTrue(rules.isAllowed("http://www.fict.org/index.html"));
assertTrue(rules.isAllowed("http://www.fict.org/robots.txt"));
@@ -254,7 +255,7 @@
assertTrue(rules.isAllowed("http://www.fict.org/%7Ejim/jim.html"));
assertTrue(rules.isAllowed("http://www.fict.org/%7Emak/mak.html"));
rules = createRobotRules("Excite/1.0", rfpExampleRobotsTxt.getBytes());
rules = createRobotRules("Excite/1.0", rfpExampleRobotsTxt.getBytes("UTF-8"));
assertTrue(rules.isAllowed("http://www.fict.org/"));
assertTrue(rules.isAllowed("http://www.fict.org/index.html"));
assertTrue(rules.isAllowed("http://www.fict.org/robots.txt"));
@@ -267,7 +268,7 @@
assertTrue(rules.isAllowed("http://www.fict.org/%7Ejim/jim.html"));
assertTrue(rules.isAllowed("http://www.fict.org/%7Emak/mak.html"));
rules = createRobotRules("Unknown/1.0", rfpExampleRobotsTxt.getBytes());
rules = createRobotRules("Unknown/1.0", rfpExampleRobotsTxt.getBytes("UTF-8"));
assertFalse(rules.isAllowed("http://www.fict.org/"));
assertFalse(rules.isAllowed("http://www.fict.org/index.html"));
assertTrue(rules.isAllowed("http://www.fict.org/robots.txt"));
@@ -282,7 +283,7 @@
}
@Test
public void testNutchCases() throws MalformedURLException {
public void testNutchCases() throws MalformedURLException, UnsupportedEncodingException {
// Run through the Nutch test cases.
final String nutchRobotsTxt = "User-Agent: Agent1 #foo" + CR + "Disallow: /a" + CR + "Disallow: /b/a" + CR + "#Disallow: /c" + CR + "" + CR + "" + CR + "User-Agent: Agent2 Agent3#foo" + CR
@@ -290,7 +291,7 @@
BaseRobotRules rules;
rules = createRobotRules("Agent1", nutchRobotsTxt.getBytes());
rules = createRobotRules("Agent1", nutchRobotsTxt.getBytes("UTF-8"));
assertFalse(rules.isAllowed("http://www.domain.com/a"));
assertFalse(rules.isAllowed("http://www.domain.com/a/"));
assertFalse(rules.isAllowed("http://www.domain.com/a/bloh/foo.html"));
@@ -312,7 +313,7 @@
assertTrue(rules.isAllowed("http://www.domain.com/foo/bar/baz.html"));
assertTrue(rules.isAllowed("http://www.domain.com/f/"));
rules = createRobotRules("Agent2", nutchRobotsTxt.getBytes());
rules = createRobotRules("Agent2", nutchRobotsTxt.getBytes("UTF-8"));
assertTrue(rules.isAllowed("http://www.domain.com/a"));
assertTrue(rules.isAllowed("http://www.domain.com/a/"));
assertTrue(rules.isAllowed("http://www.domain.com/a/bloh/foo.html"));
@@ -334,7 +335,7 @@
assertTrue(rules.isAllowed("http://www.domain.com/foo/bar/baz.html"));
assertTrue(rules.isAllowed("http://www.domain.com/f/"));
rules = createRobotRules("Agent3", nutchRobotsTxt.getBytes());
rules = createRobotRules("Agent3", nutchRobotsTxt.getBytes("UTF-8"));
assertTrue(rules.isAllowed("http://www.domain.com/a"));
assertTrue(rules.isAllowed("http://www.domain.com/a/"));
assertTrue(rules.isAllowed("http://www.domain.com/a/bloh/foo.html"));
@@ -356,7 +357,7 @@
assertTrue(rules.isAllowed("http://www.domain.com/foo/bar/baz.html"));
assertTrue(rules.isAllowed("http://www.domain.com/f/"));
rules = createRobotRules("Agent4", nutchRobotsTxt.getBytes());
rules = createRobotRules("Agent4", nutchRobotsTxt.getBytes("UTF-8"));
assertTrue(rules.isAllowed("http://www.domain.com/a"));
assertTrue(rules.isAllowed("http://www.domain.com/a/"));
assertTrue(rules.isAllowed("http://www.domain.com/a/bloh/foo.html"));
@@ -378,7 +379,7 @@
assertTrue(rules.isAllowed("http://www.domain.com/foo/bar/baz.html"));
assertTrue(rules.isAllowed("http://www.domain.com/f/"));
rules = createRobotRules("Agent5", nutchRobotsTxt.getBytes());
rules = createRobotRules("Agent5", nutchRobotsTxt.getBytes("UTF-8"));
assertTrue(rules.isAllowed("http://www.domain.com/a"));
assertTrue(rules.isAllowed("http://www.domain.com/a/"));
assertTrue(rules.isAllowed("http://www.domain.com/a/bloh/foo.html"));
@@ -400,7 +401,7 @@
assertFalse(rules.isAllowed("http://www.domain.com/foo/bar/baz.html"));
assertTrue(rules.isAllowed("http://www.domain.com/f/"));
rules = createRobotRules("Agent5,Agent2,Agent1,Agent3,*", nutchRobotsTxt.getBytes());
rules = createRobotRules("Agent5,Agent2,Agent1,Agent3,*", nutchRobotsTxt.getBytes("UTF-8"));
assertFalse(rules.isAllowed("http://www.domain.com/a"));
assertFalse(rules.isAllowed("http://www.domain.com/a/"));
assertFalse(rules.isAllowed("http://www.domain.com/a/bloh/foo.html"));
@@ -424,18 +425,18 @@
}
@Test
public void testHtmlMarkupInRobotsTxt() throws MalformedURLException {
public void testHtmlMarkupInRobotsTxt() throws MalformedURLException, UnsupportedEncodingException {
final String htmlRobotsTxt = "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 3.2 Final//EN\"><HTML>\n" + "<HEAD>\n" + "<TITLE>/robots.txt</TITLE>\n" + "</HEAD>\n" + "<BODY>\n"
+ "User-agent: anybot<BR>\n" + "Disallow: <BR>\n" + "Crawl-Delay: 10<BR>\n" + "\n" + "User-agent: *<BR>\n" + "Disallow: /<BR>\n" + "Crawl-Delay: 30<BR>\n" + "\n" + "</BODY>\n"
+ "</HTML>\n";
BaseRobotRules rules;
rules = createRobotRules("anybot", htmlRobotsTxt.getBytes());
rules = createRobotRules("anybot", htmlRobotsTxt.getBytes("UTF-8"));
assertTrue(rules.isAllowed("http://www.domain.com/index.html"));
assertEquals(10000, rules.getCrawlDelay());
rules = createRobotRules("bogusbot", htmlRobotsTxt.getBytes());
rules = createRobotRules("bogusbot", htmlRobotsTxt.getBytes("UTF-8"));
assertFalse(rules.isAllowed("http://www.domain.com/index.html"));
assertEquals(30000, rules.getCrawlDelay());
}
@@ -450,39 +451,39 @@
}
@Test
public void testHeritrixCases() throws MalformedURLException {
public void testHeritrixCases() throws MalformedURLException, UnsupportedEncodingException {
final String heritrixRobotsTxt = "User-agent: *\n" + "Disallow: /cgi-bin/\n" + "Disallow: /details/software\n" + "\n" + "User-agent: denybot\n" + "Disallow: /\n" + "\n"
+ "User-agent: allowbot1\n" + "Disallow: \n" + "\n" + "User-agent: allowbot2\n" + "Disallow: /foo\n" + "Allow: /\n" + "\n" + "User-agent: delaybot\n" + "Disallow: /\n"
+ "Crawl-Delay: 20\n" + "Allow: /images/\n";
BaseRobotRules rules;
rules = createRobotRules("Mozilla allowbot1 99.9", heritrixRobotsTxt.getBytes());
rules = createRobotRules("Mozilla allowbot1 99.9", heritrixRobotsTxt.getBytes("UTF-8"));
assertTrue(rules.isAllowed("http://www.domain.com/path"));
assertTrue(rules.isAllowed("http://www.domain.com/"));
rules = createRobotRules("Mozilla allowbot2 99.9", heritrixRobotsTxt.getBytes());
rules = createRobotRules("Mozilla allowbot2 99.9", heritrixRobotsTxt.getBytes("UTF-8"));
assertTrue(rules.isAllowed("http://www.domain.com/path"));
assertTrue(rules.isAllowed("http://www.domain.com/"));
assertFalse(rules.isAllowed("http://www.domain.com/foo"));
rules = createRobotRules("Mozilla denybot 99.9", heritrixRobotsTxt.getBytes());
rules = createRobotRules("Mozilla denybot 99.9", heritrixRobotsTxt.getBytes("UTF-8"));
assertFalse(rules.isAllowed("http://www.domain.com/path"));
assertFalse(rules.isAllowed("http://www.domain.com/"));
assertEquals(BaseRobotRules.UNSET_CRAWL_DELAY, rules.getCrawlDelay());
rules = createRobotRules("Mozilla anonbot 99.9", heritrixRobotsTxt.getBytes());
rules = createRobotRules("Mozilla anonbot 99.9", heritrixRobotsTxt.getBytes("UTF-8"));
assertTrue(rules.isAllowed("http://www.domain.com/path"));
assertFalse(rules.isAllowed("http://www.domain.com/cgi-bin/foo.pl"));
rules = createRobotRules("Mozilla delaybot 99.9", heritrixRobotsTxt.getBytes());
rules = createRobotRules("Mozilla delaybot 99.9", heritrixRobotsTxt.getBytes("UTF-8"));
assertEquals(20000, rules.getCrawlDelay());
}
@Test
public void testCaseSensitivePaths() throws MalformedURLException {
public void testCaseSensitivePaths() throws MalformedURLException, UnsupportedEncodingException {
final String simpleRobotsTxt = "User-agent: *" + CRLF + "Allow: /AnyPage.html" + CRLF + "Allow: /somepage.html" + CRLF + "Disallow: /";
BaseRobotRules rules = createRobotRules("Any-darn-crawler", simpleRobotsTxt.getBytes());
BaseRobotRules rules = createRobotRules("Any-darn-crawler", simpleRobotsTxt.getBytes("UTF-8"));
assertTrue(rules.isAllowed("http://www.domain.com/AnyPage.html"));
assertFalse(rules.isAllowed("http://www.domain.com/anypage.html"));
assertTrue(rules.isAllowed("http://www.domain.com/somepage.html"));
@@ -490,76 +491,76 @@
}
@Test
public void testEmptyDisallow() throws MalformedURLException {
public void testEmptyDisallow() throws MalformedURLException, UnsupportedEncodingException {
final String simpleRobotsTxt = "User-agent: *" + CRLF + "Disallow:";
BaseRobotRules rules = createRobotRules("Any-darn-crawler", simpleRobotsTxt.getBytes());
BaseRobotRules rules = createRobotRules("Any-darn-crawler", simpleRobotsTxt.getBytes("UTF-8"));
assertTrue(rules.isAllowed("http://www.domain.com/anypage.html"));
}
@Test
public void testEmptyAllow() throws MalformedURLException {
public void testEmptyAllow() throws MalformedURLException, UnsupportedEncodingException {
final String simpleRobotsTxt = "User-agent: *" + CRLF + "Allow:";
BaseRobotRules rules = createRobotRules("Any-darn-crawler", simpleRobotsTxt.getBytes());
BaseRobotRules rules = createRobotRules("Any-darn-crawler", simpleRobotsTxt.getBytes("UTF-8"));
assertTrue(rules.isAllowed("http://www.domain.com/anypage.html"));
}
@Test
public void testMultiWildcard() throws MalformedURLException {
public void testMultiWildcard() throws MalformedURLException, UnsupportedEncodingException {
// Make sure we only take the first wildcard entry.
final String simpleRobotsTxt = "User-agent: *" + CRLF + "Disallow: /index.html" + CRLF + "Allow: /" + CRLF + CRLF + "User-agent: *" + CRLF + "Disallow: /";
BaseRobotRules rules = createRobotRules("Any-darn-crawler", simpleRobotsTxt.getBytes());
BaseRobotRules rules = createRobotRules("Any-darn-crawler", simpleRobotsTxt.getBytes("UTF-8"));
assertFalse(rules.isAllowed("http://www.domain.com/index.html"));
assertTrue(rules.isAllowed("http://www.domain.com/anypage.html"));
}
@Test
public void testMultiMatches() throws MalformedURLException {
public void testMultiMatches() throws MalformedURLException, UnsupportedEncodingException {
// Make sure we only take the first record that matches.
final String simpleRobotsTxt = "User-agent: crawlerbot" + CRLF + "Disallow: /index.html" + CRLF + "Allow: /" + CRLF + CRLF + "User-agent: crawler" + CRLF + "Disallow: /";
BaseRobotRules rules = createRobotRules("crawlerbot", simpleRobotsTxt.getBytes());
BaseRobotRules rules = createRobotRules("crawlerbot", simpleRobotsTxt.getBytes("UTF-8"));
assertFalse(rules.isAllowed("http://www.domain.com/index.html"));
assertTrue(rules.isAllowed("http://www.domain.com/anypage.html"));
}
@Test
public void testMultiAgentNames() throws MalformedURLException {
public void testMultiAgentNames() throws MalformedURLException, UnsupportedEncodingException {
// When there are more than one agent name on a line.
final String simpleRobotsTxt = "User-agent: crawler1 crawler2" + CRLF + "Disallow: /index.html" + CRLF + "Allow: /";
BaseRobotRules rules = createRobotRules("crawler2", simpleRobotsTxt.getBytes());
BaseRobotRules rules = createRobotRules("crawler2", simpleRobotsTxt.getBytes("UTF-8"));
assertFalse(rules.isAllowed("http://www.domain.com/index.html"));
assertTrue(rules.isAllowed("http://www.domain.com/anypage.html"));
}
@Test
public void testMultiWordAgentName() throws MalformedURLException {
public void testMultiWordAgentName() throws MalformedURLException, UnsupportedEncodingException {
// When the user agent name has a space in it.
final String simpleRobotsTxt = "User-agent: Download Ninja" + CRLF + "Disallow: /index.html" + CRLF + "Allow: /";
BaseRobotRules rules = createRobotRules("Download Ninja", simpleRobotsTxt.getBytes());
BaseRobotRules rules = createRobotRules("Download Ninja", simpleRobotsTxt.getBytes("UTF-8"));
assertFalse(rules.isAllowed("http://www.domain.com/index.html"));
assertTrue(rules.isAllowed("http://www.domain.com/anypage.html"));
}
@Test
public void testUnsupportedFields() throws MalformedURLException {
public void testUnsupportedFields() throws MalformedURLException, UnsupportedEncodingException {
// When we have a new field type that we don't know about.
final String simpleRobotsTxt = "User-agent: crawler1" + CRLF + "Disallow: /index.html" + CRLF + "Allow: /" + CRLF + "newfield: 234" + CRLF + "User-agent: crawler2" + CRLF + "Disallow: /";
BaseRobotRules rules = createRobotRules("crawler2", simpleRobotsTxt.getBytes());
BaseRobotRules rules = createRobotRules("crawler2", simpleRobotsTxt.getBytes("UTF-8"));
assertFalse(rules.isAllowed("http://www.domain.com/anypage.html"));
}
@Test
public void testAcapFields() throws MalformedURLException {
public void testAcapFields() throws MalformedURLException, UnsupportedEncodingException {
final String robotsTxt = "acap-crawler: *" + CRLF + "acap-disallow-crawl: /ultima_ora/";
SimpleRobotRulesParser parser = new SimpleRobotRulesParser();
parser.parseContent("url", robotsTxt.getBytes(), "text/plain", "foobot");
parser.parseContent("url", robotsTxt.getBytes("UTF-8"), "text/plain", "foobot");
assertEquals(0, parser.getNumWarnings());
}
@@ -597,34 +598,34 @@
}
@Test
public void testCrawlDelay() {
public void testCrawlDelay() throws UnsupportedEncodingException {
final String delayRules1RobotsTxt = "User-agent: bixo" + CR + "Crawl-delay: 10" + CR + "User-agent: foobot" + CR + "Crawl-delay: 20" + CR + "User-agent: *" + CR + "Disallow:/baz" + CR;
BaseRobotRules rules = createRobotRules("bixo", delayRules1RobotsTxt.getBytes());
BaseRobotRules rules = createRobotRules("bixo", delayRules1RobotsTxt.getBytes("UTF-8"));
long crawlDelay = rules.getCrawlDelay();
assertEquals("testing crawl delay for agent bixo - rule 1", 10000, crawlDelay);
final String delayRules2RobotsTxt = "User-agent: foobot" + CR + "Crawl-delay: 20" + CR + "User-agent: *" + CR + "Disallow:/baz" + CR;
rules = createRobotRules("bixo", delayRules2RobotsTxt.getBytes());
rules = createRobotRules("bixo", delayRules2RobotsTxt.getBytes("UTF-8"));
crawlDelay = rules.getCrawlDelay();
assertEquals("testing crawl delay for agent bixo - rule 2", BaseRobotRules.UNSET_CRAWL_DELAY, crawlDelay);
}
@Test
public void testBigCrawlDelay() throws MalformedURLException {
public void testBigCrawlDelay() throws MalformedURLException, UnsupportedEncodingException {
final String robotsTxt = "User-agent: *" + CR + "Crawl-delay: 3600" + CR + "Disallow:" + CR;
BaseRobotRules rules = createRobotRules("bixo", robotsTxt.getBytes());
BaseRobotRules rules = createRobotRules("bixo", robotsTxt.getBytes("UTF-8"));
assertFalse("disallow all if huge crawl delay", rules.isAllowed("http://www.domain.com/"));
}
@Test
public void testBrokenKrugleRobotsTxtFile() throws MalformedURLException {
public void testBrokenKrugleRobotsTxtFile() throws MalformedURLException, UnsupportedEncodingException {
final String krugleRobotsTxt = "User-agent: *" + CR + "Disallow: /maintenance.html" + CR + "Disallow: /perl/" + CR + "Disallow: /cgi-bin/" + CR + "Disallow: /examples/" + CR
+ "Crawl-delay: 3" + CR + "" + CR + "User-agent: googlebot" + CR + "Crawl-delay: 1" + CR + "" + CR + "User-agent: qihoobot" + CR + "Disallow: /";
BaseRobotRules rules = createRobotRules("googlebot/2.1", krugleRobotsTxt.getBytes());
BaseRobotRules rules = createRobotRules("googlebot/2.1", krugleRobotsTxt.getBytes("UTF-8"));
assertTrue(rules.isAllowed("http://www.krugle.com/examples/index.html"));
}
@@ -647,10 +648,10 @@
}
@Test
public void testFloatingPointCrawlDelay() throws MalformedURLException {
public void testFloatingPointCrawlDelay() throws MalformedURLException, UnsupportedEncodingException {
final String robotsTxt = "User-agent: *" + CR + "Crawl-delay: 0.5" + CR + "Disallow:" + CR;
BaseRobotRules rules = createRobotRules("bixo", robotsTxt.getBytes());
BaseRobotRules rules = createRobotRules("bixo", robotsTxt.getBytes("UTF-8"));
assertEquals(500, rules.getCrawlDelay());
}
@@ -703,7 +704,7 @@
assertEquals("Found sitemap", 3, rules.getSitemaps().size());
// check that the last one is not lowercase only
String url = rules.getSitemaps().get(2);
boolean lowercased = url.equals(url.toLowerCase());
boolean lowercased = url.equals(url.toLowerCase(Locale.getDefault()));
assertFalse("Sitemap case check", lowercased);
}
@@ -749,7 +750,7 @@
public void testAllowBeforeDisallow() throws Exception {
final String simpleRobotsTxt = "User-agent: *" + CRLF + "Disallow: /fish" + CRLF + "Allow: /fish" + CRLF;
BaseRobotRules rules = createRobotRules("Any-darn-crawler", simpleRobotsTxt.getBytes());
BaseRobotRules rules = createRobotRules("Any-darn-crawler", simpleRobotsTxt.getBytes("UTF-8"));
assertTrue(rules.isAllowed("http://www.fict.com/fish"));
}
@@ -758,16 +759,16 @@
public void testSpacesInMultipleUserAgentNames() throws Exception {
final String simpleRobotsTxt = "User-agent: One, Two, Three" + CRLF + "Disallow: /" + CRLF + "" + CRLF + "User-agent: *" + CRLF + "Allow: /" + CRLF;
BaseRobotRules rules = createRobotRules("One", simpleRobotsTxt.getBytes());
BaseRobotRules rules = createRobotRules("One", simpleRobotsTxt.getBytes("UTF-8"));
assertFalse(rules.isAllowed("http://www.fict.com/fish"));
rules = createRobotRules("Two", simpleRobotsTxt.getBytes());
rules = createRobotRules("Two", simpleRobotsTxt.getBytes("UTF-8"));
assertFalse(rules.isAllowed("http://www.fict.com/fish"));
rules = createRobotRules("Three", simpleRobotsTxt.getBytes());
rules = createRobotRules("Three", simpleRobotsTxt.getBytes("UTF-8"));
assertFalse(rules.isAllowed("http://www.fict.com/fish"));
rules = createRobotRules("Any-darn-crawler", simpleRobotsTxt.getBytes());
rules = createRobotRules("Any-darn-crawler", simpleRobotsTxt.getBytes("UTF-8"));
assertTrue(rules.isAllowed("http://www.fict.com/fish"));
}
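
All of the getBytes("UTF-8") calls above name the charset by string, which is why the checked UnsupportedEncodingException now appears in the test signatures. Since this commit raises the compiler level to 1.7, java.nio.charset.StandardCharsets (new in Java 7) would be a possible follow-up that drops both the string literal and the checked exception; a minimal sketch, not part of this commit:

import java.nio.charset.StandardCharsets;

public class CharsetConstants {
    public static void main(String[] args) {
        String robotsTxt = "User-agent: *\r\nDisallow:";
        // StandardCharsets.UTF_8 is a Charset constant, so no
        // UnsupportedEncodingException is declared or thrown.
        byte[] bytes = robotsTxt.getBytes(StandardCharsets.UTF_8);
        String roundTrip = new String(bytes, StandardCharsets.UTF_8);
        System.out.println(roundTrip.equals(robotsTxt)); // true
    }
}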

AbstractSiteMapTest.java

@@ -3,6 +3,7 @@ package crawlercommons.sitemaps;
import static org.junit.Assert.*;
import java.text.SimpleDateFormat;
import java.util.Locale;
import java.util.TimeZone;
import org.junit.Test;
@@ -14,7 +15,7 @@
assertNull(AbstractSiteMap.convertToDate("blah"));
assertNull(AbstractSiteMap.convertToDate(null));
SimpleDateFormat isoFormatNoTimezone = new SimpleDateFormat("yyyyMMdd");
SimpleDateFormat isoFormatNoTimezone = new SimpleDateFormat("yyyyMMdd", Locale.getDefault());
// For formats where there's no time zone information, the time zone is
// undefined, so we can
@@ -23,7 +24,7 @@
assertEquals("20140601", isoFormatNoTimezone.format(AbstractSiteMap.convertToDate("2014-06")));
assertEquals("20140603", isoFormatNoTimezone.format(AbstractSiteMap.convertToDate("2014-06-03")));
SimpleDateFormat isoFormat = new SimpleDateFormat("yyyyMMdd'T'HHmmss");
SimpleDateFormat isoFormat = new SimpleDateFormat("yyyyMMdd'T'HHmmss", Locale.getDefault());
isoFormat.setTimeZone(TimeZone.getTimeZone("UTC"));
// Complete date plus hours and minutes
@@ -38,7 +39,7 @@
// Complete date plus hours, minutes, seconds and a decimal fraction of
// a second
SimpleDateFormat isoFormatWithFractionSeconds = new SimpleDateFormat("yyyyMMdd'T'HHmmss.S");
SimpleDateFormat isoFormatWithFractionSeconds = new SimpleDateFormat("yyyyMMdd'T'HHmmss.S", Locale.getDefault());
isoFormatWithFractionSeconds.setTimeZone(TimeZone.getTimeZone("UTC"));
assertEquals("20140603T103045.820", isoFormatWithFractionSeconds.format(AbstractSiteMap.convertToDate("2014-06-03T10:30:45.82+00:00")));

SiteMapParserTest.java

@@ -21,10 +21,11 @@ import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.net.URL;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.TimeZone;
import java.util.Locale;
import org.apache.commons.io.IOUtils;
import org.junit.After;
@@ -32,11 +33,15 @@ import org.junit.Before;
import org.junit.Test;
import org.junit.runner.RunWith;
import org.junit.runners.JUnit4;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import static org.junit.Assert.*;
@RunWith(JUnit4.class)
public class SiteMapParserTest {
private static final Logger LOG = LoggerFactory.getLogger(SiteMapParserTest.class);
@Before
public void setUp() throws Exception {
@@ -79,11 +84,10 @@ public class SiteMapParserTest {
@Test
public void testFullDateFormat() {
SimpleDateFormat format = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm+hh:00");
SimpleDateFormat format = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm+hh:00", Locale.getDefault());
Date date = new Date();
System.out.println(format.format(date));
System.out.println(SiteMap.getFullDateFormat().format(date));
LOG.info(format.format(date));
LOG.info(SiteMap.getFullDateFormat().format(date));
}
@Test
@@ -91,7 +95,7 @@
SiteMapParser parser = new SiteMapParser();
String contentType = "text/plain";
String scontent = "http://www.example.com/catalog?item=1\nhttp://www.example.com/catalog?item=11";
byte[] content = scontent.getBytes();
byte[] content = scontent.getBytes("UTF-8");
URL url = new URL("http://www.example.com/sitemap.txt");
AbstractSiteMap asm = parser.parseSiteMap(contentType, content, url);
@@ -106,7 +110,7 @@
public void testSitemapTXTWithXMLExt() throws UnknownFormatException, IOException {
SiteMapParser parser = new SiteMapParser();
String scontent = "http://www.example.com/catalog?item=1\nhttp://www.example.com/catalog?item=11";
byte[] content = scontent.getBytes();
byte[] content = scontent.getBytes("UTF-8");
URL url = new URL("http://www.example.com/sitemap.xml");
String contentType = "text/plain";
@@ -160,7 +164,7 @@
scontent.append("<?xml version=\"1.0\" encoding=\"UTF-8\"?>").append("<urlset xmlns=\"http://www.sitemaps.org/schemas/sitemap/0.9\">")
.append("<url><!-- This file is not a valid XML file --></url>").append("<url><loc> http://cs.harding.edu/fmccown/sitemaps/something.html</loc>")
.append("</url><!-- missing opening url tag --></url></urlset>");
byte[] content = scontent.toString().getBytes();
byte[] content = scontent.toString().getBytes("UTF-8");
URL url = new URL("http://www.example.com/sitemapindex.xml");
parser.parseSiteMap(contentType, content, url); // This Sitemap contains
@@ -224,7 +228,7 @@
StringBuilder scontent = new StringBuilder(1024);
scontent.append("<?xml version=\"1.0\" encoding=\"UTF-8\"?>").append("<urlset xmlns=\"http://www.sitemaps.org/schemas/sitemap/0.9\">").append("<url>")
.append("<loc>http://www.example.com/</loc>").append("</url>").append("</urlset>");
byte[] content = scontent.toString().getBytes();
byte[] content = scontent.toString().getBytes("UTF-8");
URL url = new URL("http://www.example.com/subsection/sitemap.xml");
AbstractSiteMap asm = parser.parseSiteMap(contentType, content, url);
@@ -245,8 +249,9 @@
assertFalse(sm.getSiteMapUrls().iterator().next().isValid());
}
/** Returns a good simple default XML sitemap as a byte array */
private byte[] getXMLSitemapAsBytes() {
/** Returns a good simple default XML sitemap as a byte array
* @throws UnsupportedEncodingException */
private byte[] getXMLSitemapAsBytes() throws UnsupportedEncodingException {
StringBuilder scontent = new StringBuilder(1024);
scontent.append("<?xml version=\"1.0\" encoding=\"UTF-8\"?>").append("<urlset xmlns=\"http://www.sitemaps.org/schemas/sitemap/0.9\">").append("<url>")
.append(" <loc>http://www.example.com/</loc>").append(" <lastmod>2005-01-01</lastmod>").append(" <changefreq>monthly</changefreq>").append(" <priority>0.8</priority>")
@@ -257,6 +262,6 @@
.append(" <loc><url><![CDATA[http://www.example.com/catalog?item=83&amp;desc=vacation_usa]]></url></loc>").append(" <lastmod>2004-11-23</lastmod>").append("</url>")
.append("</urlset>");
return scontent.toString().getBytes();
return scontent.toString().getBytes("UTF-8");
}
}

RedirectResponseHandler.java

@@ -56,14 +56,14 @@ public class RedirectResponseHandler extends AbstractHttpHandler {
String content = "redirected content";
response.setContentLength(content.length());
response.getOutputStream().write(content.getBytes());
response.getOutputStream().write(content.getBytes("UTF-8"));
} else {
response.setStatus(HttpStatus.SC_OK);
response.setContentType("text/plain");
String content = "other content";
response.setContentLength(content.length());
response.getOutputStream().write(content.getBytes());
response.getOutputStream().write(content.getBytes("UTF-8"));
}
}
}