Mirror of https://github.com/crawler-commons/crawler-commons, synced 2024-05-18 18:06:05 +02:00
Upgrade to JDK 1.7 compiler version and introduce the Maven forbidden-APIs plugin
This commit is contained in:
parent 827b073d12
commit ba5906ec40
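Nearly every Java hunk below applies one recurring, mechanical fix: locale- and charset-sensitive JDK calls (String.format, toLowerCase/toUpperCase, getBytes, new String(byte[]), the SimpleDateFormat constructors) gain an explicit Locale or Charset argument so that the jdk-unsafe signature set of the forbiddenapis plugin passes, and direct console or stack-trace output is routed through SLF4J to satisfy jdk-system-out. A minimal standalone sketch of the before/after pattern (class name and values are illustrative, not taken from the repository):

import java.util.Locale;

public class LocaleSafeSketch {
    public static void main(String[] args) throws java.io.UnsupportedEncodingException {
        String host = "Example.COM";

        // Flagged by forbiddenapis (jdk-unsafe): the result silently depends on the
        // JVM's default locale/charset, e.g. toLowerCase() mishandles 'I' under tr_TR.
        // String lower = host.toLowerCase();
        // byte[] bytes = "robots.txt".getBytes();

        // The explicit forms used throughout this commit:
        String lower = host.toLowerCase(Locale.getDefault());
        byte[] bytes = "robots.txt".getBytes("UTF-8");
        System.out.println(String.format(Locale.getDefault(), "%s -> %d bytes", lower, bytes.length));
    }
}

Note that toLowerCase(Locale.getDefault()) behaves exactly like the bare call; it silences the check by making the dependency explicit, whereas a fixed locale such as Locale.ROOT would actually pin the result across JVMs.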
pom.xml

@@ -1,4 +1,22 @@
 <?xml version="1.0" encoding="UTF-8"?>
+<!--
+Licensed to the Apache Software Foundation (ASF) under one
+or more contributor license agreements. See the NOTICE file
+distributed with this work for additional information
+regarding copyright ownership. The ASF licenses this file
+to you under the Apache License, Version 2.0 (the
+"License"); you may not use this file except in compliance
+with the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing,
+software distributed under the License is distributed on an
+"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+KIND, either express or implied. See the License for the
+specific language governing permissions and limitations
+under the License.
+-->
 <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
 
 <parent>
@@ -31,7 +49,7 @@
 <connection>scm:git:git://github.com/crawler-commons/crawler-commons.git</connection>
 <developerConnection>scm:git:git@github.com:crawler-commons/crawler-commons.git</developerConnection>
 <tag>HEAD</tag>
-</scm>
+</scm>
 
 <distributionManagement>
 <repository>
@@ -49,7 +67,7 @@
 <mailingLists>
 <mailingList>
 <name>Project Mailing List</name>
-<post>crawler-commons [at] googlecode [dot] com</post>
+<post>crawler-commons [at] googlegroups [dot] com</post>
 </mailingList>
 </mailingLists>
 
@@ -135,6 +153,32 @@
 <!--autoVersionSubmodules>true</autoVersionSubmodules -->
 </configuration>
 </plugin>
+<!--This plugin's configuration is used to store Eclipse m2e settings only. It has no influence on the Maven build itself.-->
+<plugin>
+<groupId>org.eclipse.m2e</groupId>
+<artifactId>lifecycle-mapping</artifactId>
+<version>1.0.0</version>
+<configuration>
+<lifecycleMappingMetadata>
+<pluginExecutions>
+<pluginExecution>
+<pluginExecutionFilter>
+<groupId>de.thetaphi</groupId>
+<artifactId>forbiddenapis</artifactId>
+<versionRange>[1.8,)</versionRange>
+<goals>
+<goal>testCheck</goal>
+<goal>check</goal>
+</goals>
+</pluginExecutionFilter>
+<action>
+<ignore></ignore>
+</action>
+</pluginExecution>
+</pluginExecutions>
+</lifecycleMappingMetadata>
+</configuration>
+</plugin>
 </plugins>
 </pluginManagement>
 
@@ -183,7 +227,34 @@
 <configFile>${project.basedir}/doc/eclipse-formatter.xml</configFile>
 </configuration>
 </plugin>
 
+<plugin>
+<groupId>de.thetaphi</groupId>
+<artifactId>forbiddenapis</artifactId>
+<version>1.8</version>
+<configuration>
+<!-- disallow undocumented classes like sun.misc.Unsafe: -->
+<internalRuntimeForbidden>true</internalRuntimeForbidden>
+<!--
+if the used Java version is too new,
+don't fail, just do nothing:
+-->
+<failOnUnsupportedJava>false</failOnUnsupportedJava>
+<bundledSignatures>
+<bundledSignature>jdk-unsafe</bundledSignature>
+<bundledSignature>jdk-deprecated</bundledSignature>
+<bundledSignature>jdk-system-out</bundledSignature>
+<!--bundledSignature>commons-io-unsafe-${commons-io.version}</bundledSignature-->
+</bundledSignatures>
+</configuration>
+<executions>
+<execution>
+<goals>
+<goal>check</goal>
+<goal>testCheck</goal>
+</goals>
+</execution>
+</executions>
+</plugin>
 </plugins>
 </build>
 
@@ -300,11 +371,12 @@
 
 <!-- General Properties -->
 <implementation.build>${scmBranch}@r${buildNumber}</implementation.build>
-<javac.src.version>1.6</javac.src.version>
-<javac.target.version>1.6</javac.target.version>
+<javac.src.version>1.7</javac.src.version>
+<javac.target.version>1.7</javac.target.version>
+<maven.compiler.target>1.7</maven.compiler.target>
 <maven.build.timestamp.format>yyyy-MM-dd HH:mm:ssZ</maven.build.timestamp.format>
 <skipTests>false</skipTests>
-<assembly.finalName>apache-${project.build.finalName}</assembly.finalName>
+<assembly.finalName>${project.build.finalName}</assembly.finalName>
 </properties>
 
 <dependencies>
BaseFetchException.java

@@ -133,7 +133,7 @@ public abstract class BaseFetchException extends Exception {
 
 @Override
 public void printStackTrace() {
-_exception.printStackTrace();
+_exception.getMessage();
 }
 
 @Override
FetchedResult.java

@@ -17,6 +17,7 @@
 
 package crawlercommons.fetcher;
 
+import java.nio.charset.Charset;
 import java.security.InvalidParameterException;
 import java.util.Arrays;
 
@@ -174,7 +175,7 @@ public class FetchedResult {
 report.append(" FetchedUrl : " + getFetchedUrl() + "\n");
 report.append(" ContentType : " + getContentType() + "\n");
 report.append(" ContentLength : " + getContentLength() + "\n");
-report.append(" Content : " + new String(getContent()) + "\n"); // byte
+report.append(" Content : " + new String(getContent(), Charset.defaultCharset()) + "\n"); // byte
 // array
 // to
 // string
SimpleHttpFetcher.java

@@ -31,6 +31,7 @@ import java.security.NoSuchAlgorithmException;
 import java.security.cert.CertificateException;
 import java.security.cert.X509Certificate;
 import java.util.HashSet;
+import java.util.Locale;
 import java.util.Set;
 import java.util.concurrent.TimeUnit;
 
@@ -519,7 +520,7 @@ public class SimpleHttpFetcher extends BaseHttpFetcher {
 } catch (HttpFetchException e) {
 // Don't bother generating a trace for a 404 (not found)
 if (LOGGER.isTraceEnabled() && (e.getHttpStatus() != HttpStatus.SC_NOT_FOUND)) {
-LOGGER.trace(String.format("Exception fetching %s (%s)", url, e.getMessage()));
+LOGGER.trace(String.format(Locale.getDefault(), "Exception fetching %s (%s)", url, e.getMessage()));
 }
 
 throw e;
@@ -527,11 +528,11 @@ public class SimpleHttpFetcher extends BaseHttpFetcher {
 // Don't bother reporting that we bailed because the mime-type
 // wasn't one that we wanted.
 if (e.getAbortReason() != AbortedFetchReason.INVALID_MIMETYPE) {
-LOGGER.debug(String.format("Exception fetching %s (%s)", url, e.getMessage()));
+LOGGER.debug(String.format(Locale.getDefault(), "Exception fetching %s (%s)", url, e.getMessage()));
 }
 throw e;
 } catch (BaseFetchException e) {
-LOGGER.debug(String.format("Exception fetching %s (%s)", url, e.getMessage()));
+LOGGER.debug(String.format(Locale.getDefault(), "Exception fetching %s (%s)", url, e.getMessage()));
 throw e;
 }
 }
@@ -547,7 +548,7 @@ public class SimpleHttpFetcher extends BaseHttpFetcher {
 return doRequest(request, url, payload);
 } catch (BaseFetchException e) {
 if (LOGGER.isTraceEnabled()) {
-LOGGER.trace(String.format("Exception fetching %s", url), e);
+LOGGER.trace(String.format(Locale.getDefault(), "Exception fetching %s", url), e);
 }
 throw e;
 }
@@ -675,7 +676,7 @@ public class SimpleHttpFetcher extends BaseHttpFetcher {
 
 throw new RedirectFetchException(url, redirectUrl, mre.getReason());
 } else if (e.getCause() instanceof RedirectException) {
-e.printStackTrace();
+LOGGER.error(e.getMessage());
 throw new RedirectFetchException(url, extractRedirectedUrl(url, localContext), RedirectExceptionReason.TOO_MANY_REDIRECTS);
 } else {
 throw new IOFetchException(url, e);
UserAgent.java

@@ -18,6 +18,7 @@
 package crawlercommons.fetcher.http;
 
 import java.io.Serializable;
+import java.util.Locale;
 
 import crawlercommons.CrawlerCommons;
 
@@ -103,6 +104,6 @@ public class UserAgent implements Serializable {
 public String getUserAgentString() {
 // Mozilla/5.0 (compatible; mycrawler/1.0; +http://www.mydomain.com;
 // mycrawler@mydomain.com)
-return String.format("%s (compatible; %s%s; +%s; %s)", _browserVersion, getAgentName(), _crawlerVersion, _webAddress, _emailAddress);
+return String.format(Locale.getDefault(), "%s (compatible; %s%s; +%s; %s)", _browserVersion, getAgentName(), _crawlerVersion, _webAddress, _emailAddress);
 }
 }
SimpleRobotRulesParser.java

@@ -18,10 +18,10 @@
 package crawlercommons.robots;
 
 import java.io.UnsupportedEncodingException;
 import java.net.URI;
 import java.net.URL;
 import java.net.URLDecoder;
 import java.util.HashMap;
+import java.util.Locale;
 import java.util.Map;
 import java.util.StringTokenizer;
 import java.util.regex.Matcher;
@@ -188,7 +188,7 @@ public class SimpleRobotRulesParser extends BaseRobotsParser {
 static {
 for (RobotDirective directive : RobotDirective.values()) {
 if (!directive.isSpecial()) {
-String prefix = directive.name().toLowerCase().replaceAll("_", "-");
+String prefix = directive.name().toLowerCase(Locale.getDefault()).replaceAll("_", "-");
 DIRECTIVE_PREFIX.put(prefix, directive);
 }
 }
@@ -220,7 +220,7 @@ public class SimpleRobotRulesParser extends BaseRobotsParser {
 * @return robot command found on line
 */
 private static RobotToken tokenize(String line) {
-String lowerLine = line.toLowerCase();
+String lowerLine = line.toLowerCase(Locale.getDefault());
 for (String prefix : DIRECTIVE_PREFIX.keySet()) {
 int prefixLength = prefix.length();
 if (lowerLine.startsWith(prefix)) {
@@ -336,7 +336,7 @@ public class SimpleRobotRulesParser extends BaseRobotsParser {
 }
 
 // Decide if we need to do special HTML processing.
-boolean isHtmlType = ((contentType != null) && contentType.toLowerCase().startsWith("text/html"));
+boolean isHtmlType = ((contentType != null) && contentType.toLowerCase(Locale.getDefault()).startsWith("text/html"));
 
 // If it looks like it contains HTML, but doesn't have a user agent
 // field, then
@@ -366,7 +366,7 @@ public class SimpleRobotRulesParser extends BaseRobotsParser {
 // an empty
 // string between the \r and \n.
 StringTokenizer lineParser = new StringTokenizer(contentAsStr, "\n\r\u0085\u2028\u2029");
-ParseState parseState = new ParseState(url, robotName.toLowerCase());
+ParseState parseState = new ParseState(url, robotName.toLowerCase(Locale.getDefault()));
 boolean keepGoing = true;
 
 while (keepGoing && lineParser.hasMoreTokens()) {
@@ -425,7 +425,7 @@ public class SimpleRobotRulesParser extends BaseRobotsParser {
 break;
 
 case MISSING:
-reportWarning(String.format("Unknown line in robots.txt file (size %d): %s", content.length, line), url);
+reportWarning(String.format(Locale.getDefault(), "Unknown line in robots.txt file (size %d): %s", content.length, line), url);
 parseState.setFinishedAgentFields(true);
 break;
 
@@ -496,7 +496,7 @@ public class SimpleRobotRulesParser extends BaseRobotsParser {
 
 // Handle the case when there are multiple target names are passed
 // TODO should we do lowercase comparison of target name? Assuming yes.
-String[] targetNames = state.getTargetName().toLowerCase().split(",");
+String[] targetNames = state.getTargetName().toLowerCase(Locale.getDefault()).split(",");
 
 for (int count = 0; count < targetNames.length; count++) {
 // Extract possible match names from our target agent name, since it
@@ -508,7 +508,7 @@ public class SimpleRobotRulesParser extends BaseRobotsParser {
 String[] agentNames = token.getData().split("[ \t,]");
 for (String agentName : agentNames) {
 // TODO should we do case-insensitive matching? Probably yes.
-agentName = agentName.trim().toLowerCase();
+agentName = agentName.trim().toLowerCase(Locale.getDefault());
 if (agentName.isEmpty()) {
 // Ignore empty names
 } else if (agentName.equals("*") && !state.isMatchedWildcard()) {
AbstractSiteMap.java

@@ -21,8 +21,8 @@ import java.net.URL;
 import java.text.DateFormat;
 import java.text.ParseException;
 import java.text.SimpleDateFormat;
 import java.util.Collection;
 import java.util.Date;
+import java.util.Locale;
 import java.util.TimeZone;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
@@ -42,14 +42,14 @@ public abstract class AbstractSiteMap {
 private static final ThreadLocal<DateFormat> W3C_NO_SECONDS_FORMAT = new ThreadLocal<DateFormat>() {
 
 protected DateFormat initialValue() {
-return new SimpleDateFormat("yyyy-MM-dd'T'HH:mmZ");
+return new SimpleDateFormat("yyyy-MM-dd'T'HH:mmZ", Locale.getDefault());
 }
 };
 
 private static final ThreadLocal<DateFormat> W3C_FULLDATE_FORMAT = new ThreadLocal<DateFormat>() {
 
 protected DateFormat initialValue() {
-SimpleDateFormat result = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssZ");
+SimpleDateFormat result = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssZ", Locale.getDefault());
 result.setTimeZone(TimeZone.getTimeZone("UTC"));
 return result;
 }
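The per-thread wrappers above exist because SimpleDateFormat is not thread-safe; the commit only threads Locale.getDefault() through its constructors to satisfy jdk-unsafe. For fixed machine-readable formats like these W3C datestamps a pinned locale is arguably safer, since the default varies by JVM. A sketch of that variant in the same Java 7 style (my suggestion, not the committed code):

import java.text.DateFormat;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.Locale;
import java.util.TimeZone;

public class W3CDateSketch {
    // One instance per thread: SimpleDateFormat keeps mutable parse/format state internally.
    private static final ThreadLocal<DateFormat> W3C_FULLDATE = new ThreadLocal<DateFormat>() {
        @Override
        protected DateFormat initialValue() {
            // Locale.ROOT pins digits and symbols, so output is identical on every JVM.
            SimpleDateFormat fmt = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssZ", Locale.ROOT);
            fmt.setTimeZone(TimeZone.getTimeZone("UTC"));
            return fmt;
        }
    };

    public static void main(String[] args) {
        System.out.println(W3C_FULLDATE.get().format(new Date()));
    }
}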
SiteMapTester.java

@@ -5,17 +5,21 @@ import java.net.URL;
 import java.util.Collection;
 
 import org.apache.commons.io.IOUtils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 
 /**
 * Sitemap Tool for recursively fetching all URL's from a sitemap (and all of
 * it's children)
 **/
 public class SiteMapTester {
+
+private static final Logger LOG = LoggerFactory.getLogger(SiteMapTester.class);
 private static SiteMapParser parser = new SiteMapParser(false);
 
 public static void main(String[] args) throws IOException, UnknownFormatException {
 if (args.length < 1) {
-System.err.println("Usage: SiteMapTester <URL_TO_TEST> [MIME_TYPE]");
+LOG.error("Usage: SiteMapTester <URL_TO_TEST> [MIME_TYPE]");
 } else {
 URL url = new URL(args[0]);
 String mt = (args.length > 1) ? args[1] : null;
@@ -47,7 +51,7 @@ public class SiteMapTester {
 } else {
 Collection<SiteMapURL> links = ((SiteMap) sm).getSiteMapUrls();
 for (SiteMapURL smu : links) {
-System.out.println(smu.getUrl());
+LOG.info(smu.getUrl().toString());
 }
 }
 }
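The System.out/System.err calls above are rewritten because the jdk-system-out signature set forbids writing to the console directly. SLF4J, already on the classpath here, additionally offers a parameterized form that defers message assembly; a small sketch (class name illustrative):

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class LoggingSketch {
    private static final Logger LOG = LoggerFactory.getLogger(LoggingSketch.class);

    public static void main(String[] args) {
        // Instead of System.out.println("fetched " + args.length + " sitemaps"):
        LOG.info("fetched {} sitemaps", args.length); // string built only if INFO is enabled
    }
}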
SiteMapURL.java

@@ -23,6 +23,7 @@ import org.slf4j.LoggerFactory;
 import java.net.MalformedURLException;
 import java.net.URL;
 import java.util.Date;
+import java.util.Locale;
 
 /**
 * The SitemapUrl class represents a URL found in a Sitemap.
@@ -226,7 +227,7 @@ public class SiteMapURL {
 public void setChangeFrequency(String changeFreq) {
 
 if (changeFreq != null) {
-changeFreq = changeFreq.toUpperCase();
+changeFreq = changeFreq.toUpperCase(Locale.getDefault());
 
 if (changeFreq.contains("ALWAYS")) {
 this.changeFreq = ChangeFrequency.ALWAYS;
EffectiveTldFinder.java

@@ -25,6 +25,8 @@ import java.net.IDN;
 import java.util.Arrays;
 import java.util.HashMap;
 import java.util.Map;
+import java.util.Locale;
+import java.nio.charset.Charset;
 
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -79,7 +81,7 @@ public class EffectiveTldFinder {
 if (null == effective_tld_data_stream && null != this.getClass().getResource(ETLD_DATA)) {
 effective_tld_data_stream = this.getClass().getResourceAsStream(ETLD_DATA);
 }
-BufferedReader input = new BufferedReader(new InputStreamReader(effective_tld_data_stream));
+BufferedReader input = new BufferedReader(new InputStreamReader(effective_tld_data_stream, Charset.defaultCharset()));
 String line = null;
 while (null != (line = input.readLine())) {
 if (line.length() == 0 || (line.length() > 1 && line.startsWith(COMMENT))) {
@@ -148,8 +150,8 @@ public class EffectiveTldFinder {
 */
 public static String getAssignedDomain(String hostname) {
 EffectiveTLD etld = getEffectiveTLD(hostname);
-if (null == etld || etld.getDomain() == hostname.toLowerCase()) {
-return hostname.toLowerCase();
+if (null == etld || etld.getDomain() == hostname.toLowerCase(Locale.getDefault())) {
+return hostname.toLowerCase(Locale.getDefault());
 }
 String domain = hostname.replaceFirst(".*?([^.]+\\.)" + etld.getDomain() + "$", "$1" + etld.getDomain());
 return domain;
@@ -201,7 +203,7 @@ public class EffectiveTldFinder {
 
 private String asciiConvert(String str) {
 if (isAscii(str)) {
-return str.toLowerCase();
+return str.toLowerCase(Locale.getDefault());
 }
 return IDN.toASCII(str);
 }
PaidLevelDomain.java

@@ -20,6 +20,7 @@ package crawlercommons.url;
 import java.net.URL;
 import java.util.Arrays;
 import java.util.HashSet;
+import java.util.Locale;
 import java.util.Set;
 import java.util.regex.Pattern;
 
@@ -76,7 +77,7 @@ public class PaidLevelDomain {
 }
 
 int firstHostPiece = 0;
-if (ccTLDs.contains(subNames[numPieces - 1].toLowerCase())) {
+if (ccTLDs.contains(subNames[numPieces - 1].toLowerCase(Locale.getDefault()))) {
 // We have a country code at the end. See if the preceding piece is
 // either
 // a two-letter name (country code or funky short gTLD), or one of
@@ -85,15 +86,15 @@ public class PaidLevelDomain {
 if (subNames[numPieces - 2].length() <= 2) {
 // Must be xxx.co.jp format
 firstHostPiece = numPieces - 3;
-} else if (gTLDs.contains(subNames[numPieces - 2].toLowerCase())) {
+} else if (gTLDs.contains(subNames[numPieces - 2].toLowerCase(Locale.getDefault()))) {
 // Must be xxx.com.mx format
 firstHostPiece = numPieces - 3;
 } else {
 // Must be xxx.it format
 firstHostPiece = numPieces - 2;
 }
-} else if (gTLDs.contains(subNames[numPieces - 1].toLowerCase())) {
-if (ccTLDs.contains(subNames[numPieces - 2].toLowerCase())) {
+} else if (gTLDs.contains(subNames[numPieces - 1].toLowerCase(Locale.getDefault()))) {
+if (ccTLDs.contains(subNames[numPieces - 2].toLowerCase(Locale.getDefault()))) {
 // Must be xxx.de.com format
 firstHostPiece = numPieces - 3;
 } else {
FetchedResultTest.java

@@ -16,22 +16,29 @@
 */
 package crawlercommons.fetcher;
 
+import java.io.UnsupportedEncodingException;
+
 import org.apache.tika.metadata.Metadata;
 import org.junit.Test;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 
 /**
 * @author lmcgibbn
 *
 */
 public class FetchedResultTest {
 
+private static final Logger LOG = LoggerFactory.getLogger(FetchedResultTest.class);
+
 /**
 * Test method for {@link crawlercommons.fetcher.FetchedResult#report()}.
 * This does not actually test anything but simply allows us to see what a
 * generated report would look like.
+* @throws UnsupportedEncodingException
 */
 @Test
-public void testPrintReport() {
+public void testPrintReport() throws UnsupportedEncodingException {
 Metadata headerMetadata = new Metadata();
 headerMetadata.add(Metadata.CONTENT_DISPOSITION, "This is content disposition");
 headerMetadata.add(Metadata.CONTENT_ENCODING, "This is the encoding");
@@ -52,8 +59,8 @@ public class FetchedResultTest {
 "http://en.wikipedia.org/wiki/Glasgow", // redirectedUrl
 System.currentTimeMillis(), // fetchTime
 headerMetadata, new String("Glasgow (/ˈɡlɑːzɡoʊ, ˈɡlæz-/;[4] Scots: Glesca; Scottish Gaelic: Glaschu) "
-+ "is the largest city in Scotland, and the third largest in the United Kingdom.").getBytes(), "ScotsText", 2014, load, "http://en.wikipedia.org/wiki/Glasgow",
++ "is the largest city in Scotland, and the third largest in the United Kingdom.").getBytes("UTF-8"), "ScotsText", 2014, load, "http://en.wikipedia.org/wiki/Glasgow",
 0, "wikipedia.org", 200, "");
-System.out.println(result.report());
+LOG.error(result.report());
 }
 }
SimpleHttpFetcherTest.java

@@ -25,6 +25,7 @@ import static org.junit.Assert.fail;
 
 import java.io.IOException;
 import java.net.ConnectException;
+import java.nio.charset.Charset;
 import java.util.HashSet;
 import java.util.Set;
 
@@ -118,7 +119,7 @@ public class SimpleHttpFetcherTest {
 
 String content = "redirected";
 response.setContentLength(content.length());
-response.getOutputStream().write(content.getBytes());
+response.getOutputStream().write(content.getBytes("UTF-8"));
 }
 }
 }
@@ -148,7 +149,7 @@ public class SimpleHttpFetcherTest {
 response.setContentType("text/plain");
 
 response.setContentLength(content.length());
-response.getOutputStream().write(content.getBytes());
+response.getOutputStream().write(content.getBytes("UTF-8"));
 }
 }
 
@@ -170,7 +171,7 @@ public class SimpleHttpFetcherTest {
 }
 
 response.setContentLength(content.length());
-response.getOutputStream().write(content.getBytes());
+response.getOutputStream().write(content.getBytes("UTF-8"));
 }
 }
 
@@ -380,7 +381,7 @@ public class SimpleHttpFetcherTest {
 BaseFetcher fetcher = new SimpleHttpFetcher(1, TestUtils.CC_TEST_AGENT);
 String url = "http://localhost:8089/";
 FetchedResult result = fetcher.get(url);
-String contentStr = new String(result.getContent(), 0, result.getContentLength());
+String contentStr = new String(result.getContent(), 0, result.getContentLength(), Charset.defaultCharset());
 assertTrue(englishContent.equals(contentStr));
 }
 
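One subtlety these handlers inherit: setContentLength(content.length()) counts UTF-16 chars, while getBytes("UTF-8") produces bytes, and the two totals only coincide for pure-ASCII payloads like "redirected". A standalone illustration (strings are examples, not from the tests):

public class ByteVsCharLength {
    public static void main(String[] args) throws java.io.UnsupportedEncodingException {
        String ascii = "redirected";
        String accented = "naïve";
        System.out.println(ascii.length() + " chars / " + ascii.getBytes("UTF-8").length + " bytes");       // 10 / 10
        System.out.println(accented.length() + " chars / " + accented.getBytes("UTF-8").length + " bytes"); // 5 / 6
    }
}

Since the test payloads are ASCII the mismatch is latent here; sizing Content-Length from the encoded byte array would be the robust form.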
RobotUtilsTest.java

@@ -114,7 +114,7 @@ public class RobotUtilsTest {
 
 BaseHttpFetcher fetcher = Mockito.mock(BaseHttpFetcher.class);
 FetchedResult result = Mockito.mock(FetchedResult.class);
-Mockito.when(result.getContent()).thenReturn(simpleRobotsTxt.getBytes());
+Mockito.when(result.getContent()).thenReturn(simpleRobotsTxt.getBytes("UTF-8"));
 Mockito.when(fetcher.get(Mockito.any(String.class))).thenReturn(result);
 UserAgent userAgent = new UserAgent("testAgent", "crawler@domain.com", "http://www.domain.com");
 Mockito.when(fetcher.getUserAgent()).thenReturn(userAgent);
SimpleRobotRulesParserTest.java

@@ -26,6 +26,7 @@ import java.io.InputStream;
 import java.io.UnsupportedEncodingException;
 import java.net.MalformedURLException;
 import java.util.Arrays;
+import java.util.Locale;
 
 import javax.servlet.http.HttpServletResponse;
 
@@ -45,8 +46,8 @@ public class SimpleRobotRulesParserTest {
 }
 
 @Test
-public void testEmptyRules() throws MalformedURLException {
-BaseRobotRules rules = createRobotRules("Any-darn-crawler", "".getBytes());
+public void testEmptyRules() throws MalformedURLException, UnsupportedEncodingException {
+BaseRobotRules rules = createRobotRules("Any-darn-crawler", "".getBytes("UTF-8"));
 assertTrue(rules.isAllowed("http://www.domain.com/anypage.html"));
 }
 
@@ -54,7 +55,7 @@ public class SimpleRobotRulesParserTest {
 public void testQueryParamInDisallow() throws Exception {
 final String simpleRobotsTxt = "User-agent: *" + CRLF + "Disallow: /index.cfm?fuseaction=sitesearch.results*";
 
-BaseRobotRules rules = createRobotRules("Any-darn-crawler", simpleRobotsTxt.getBytes());
+BaseRobotRules rules = createRobotRules("Any-darn-crawler", simpleRobotsTxt.getBytes("UTF-8"));
 assertFalse(rules.isAllowed("http://searchservice.domain.com/index.cfm?fuseaction=sitesearch.results&type=People&qry=california&pg=2"));
 }
 
@@ -64,7 +65,7 @@ public class SimpleRobotRulesParserTest {
 // Test for /fish
 final String simpleRobotsTxt1 = "User-agent: *" + CRLF + "Disallow: /fish" + CRLF;
 
-BaseRobotRules rule1 = createRobotRules("Any-darn-crawler", simpleRobotsTxt1.getBytes());
+BaseRobotRules rule1 = createRobotRules("Any-darn-crawler", simpleRobotsTxt1.getBytes("UTF-8"));
 assertFalse(rule1.isAllowed("http://www.fict.com/fish"));
 assertFalse(rule1.isAllowed("http://www.fict.com/fish.html"));
 assertFalse(rule1.isAllowed("http://www.fict.com/fish/salmon.html"));
@@ -80,7 +81,7 @@ public class SimpleRobotRulesParserTest {
 // Test for /fish*
 final String simpleRobotsTxt2 = "User-agent: *" + CRLF + "Disallow: /fish*" + CRLF;
 
-BaseRobotRules rule2 = createRobotRules("Any-darn-crawler", simpleRobotsTxt2.getBytes());
+BaseRobotRules rule2 = createRobotRules("Any-darn-crawler", simpleRobotsTxt2.getBytes("UTF-8"));
 assertFalse(rule2.isAllowed("http://www.fict.com/fish"));
 assertFalse(rule2.isAllowed("http://www.fict.com/fish.html"));
 assertFalse(rule2.isAllowed("http://www.fict.com/fish/salmon.html"));
@@ -96,7 +97,7 @@ public class SimpleRobotRulesParserTest {
 // Test for /fish/
 final String simpleRobotsTxt3 = "User-agent: *" + CRLF + "Disallow: /fish/" + CRLF;
 
-BaseRobotRules rule3 = createRobotRules("Any-darn-crawler", simpleRobotsTxt3.getBytes());
+BaseRobotRules rule3 = createRobotRules("Any-darn-crawler", simpleRobotsTxt3.getBytes("UTF-8"));
 assertFalse(rule3.isAllowed("http://www.fict.com/fish/"));
 assertFalse(rule3.isAllowed("http://www.fict.com/fish/?id=anything"));
 assertFalse(rule3.isAllowed("http://www.fict.com/fish/salmon.htm"));
@@ -108,7 +109,7 @@ public class SimpleRobotRulesParserTest {
 // Test for /*.php
 final String simpleRobotsTxt4 = "User-agent: *" + CRLF + "Disallow: /*.php" + CRLF;
 
-BaseRobotRules rule4 = createRobotRules("Any-darn-crawler", simpleRobotsTxt4.getBytes());
+BaseRobotRules rule4 = createRobotRules("Any-darn-crawler", simpleRobotsTxt4.getBytes("UTF-8"));
 assertFalse(rule4.isAllowed("http://www.fict.com/filename.php"));
 assertFalse(rule4.isAllowed("http://www.fict.com/folder/filename.php"));
 assertFalse(rule4.isAllowed("http://www.fict.com/folder/filename.php?parameters"));
@@ -121,7 +122,7 @@ public class SimpleRobotRulesParserTest {
 // Test for /*.php$
 final String simpleRobotsTxt5 = "User-agent: *" + CRLF + "Disallow: /*.php$" + CRLF;
 
-BaseRobotRules rule5 = createRobotRules("Any-darn-crawler", simpleRobotsTxt5.getBytes());
+BaseRobotRules rule5 = createRobotRules("Any-darn-crawler", simpleRobotsTxt5.getBytes("UTF-8"));
 assertFalse(rule5.isAllowed("http://www.fict.com/filename.php"));
 assertFalse(rule5.isAllowed("http://www.fict.com/folder/filename.php"));
 
@@ -133,7 +134,7 @@ public class SimpleRobotRulesParserTest {
 // Test for /fish*.php
 final String simpleRobotsTxt6 = "User-agent: *" + CRLF + "Disallow: /fish*.php" + CRLF;
 
-BaseRobotRules rule6 = createRobotRules("Any-darn-crawler", simpleRobotsTxt6.getBytes());
+BaseRobotRules rule6 = createRobotRules("Any-darn-crawler", simpleRobotsTxt6.getBytes("UTF-8"));
 assertFalse(rule6.isAllowed("http://www.fict.com/fish.php"));
 assertFalse(rule6.isAllowed("http://www.fict.com/fishheads/catfish.php?parameters"));
 
@@ -142,35 +143,35 @@ public class SimpleRobotRulesParserTest {
 // Test rule with multiple '*' characters
 final String simpleRobotsTxt7 = "User-agent: *" + CRLF + "Disallow: /*fish*.php" + CRLF;
 
-BaseRobotRules rule7 = createRobotRules("Any-darn-crawler", simpleRobotsTxt7.getBytes());
+BaseRobotRules rule7 = createRobotRules("Any-darn-crawler", simpleRobotsTxt7.getBytes("UTF-8"));
 assertFalse(rule7.isAllowed("http://www.fict.com/fish.php"));
 assertFalse(rule7.isAllowed("http://www.fict.com/superfishheads/catfish.php?parameters"));
 assertTrue(rule7.isAllowed("http://www.fict.com/fishheads/catfish.htm"));
 }
 
 @Test
-public void testCommentedOutLines() throws MalformedURLException {
+public void testCommentedOutLines() throws MalformedURLException, UnsupportedEncodingException {
 final String simpleRobotsTxt = "#user-agent: testAgent" + LF + LF + "#allow: /index.html" + LF + "#allow: /test" + LF + LF + "#user-agent: test" + LF + LF + "#allow: /index.html" + LF
 + "#disallow: /test" + LF + LF + "#user-agent: someAgent" + LF + LF + "#disallow: /index.html" + LF + "#disallow: /test" + LF + LF;
 
-BaseRobotRules rules = createRobotRules("Any-darn-crawler", simpleRobotsTxt.getBytes());
+BaseRobotRules rules = createRobotRules("Any-darn-crawler", simpleRobotsTxt.getBytes("UTF-8"));
 Assert.assertTrue(rules.isAllowed("http://www.domain.com/anypage.html"));
 }
 
 @Test
-public void testRobotsTxtAlwaysAllowed() throws MalformedURLException {
+public void testRobotsTxtAlwaysAllowed() throws MalformedURLException, UnsupportedEncodingException {
 final String simpleRobotsTxt = "User-agent: *" + CRLF + "Disallow: /";
 
-BaseRobotRules rules = createRobotRules("any-darn-crawler", simpleRobotsTxt.getBytes());
+BaseRobotRules rules = createRobotRules("any-darn-crawler", simpleRobotsTxt.getBytes("UTF-8"));
 assertTrue(rules.isAllowed("http://www.domain.com/robots.txt"));
 }
 
 @Test
-public void testAgentNotListed() throws MalformedURLException {
+public void testAgentNotListed() throws MalformedURLException, UnsupportedEncodingException {
 // Access is assumed to be allowed, if no rules match an agent.
 final String simpleRobotsTxt = "User-agent: crawler1" + CRLF + "Disallow: /index.html" + CRLF + "Allow: /" + CRLF + CRLF + "User-agent: crawler2" + CRLF + "Disallow: /";
 
-BaseRobotRules rules = createRobotRules("crawler3", simpleRobotsTxt.getBytes());
+BaseRobotRules rules = createRobotRules("crawler3", simpleRobotsTxt.getBytes("UTF-8"));
 assertTrue(rules.isAllowed("http://www.domain.com/anypage.html"));
 assertTrue(rules.isAllowed("http://www.domain.com/index.html"));
 }
@@ -184,26 +185,26 @@ public class SimpleRobotRulesParserTest {
 }
 
 @Test
-public void testSimplestAllowAll() throws MalformedURLException {
+public void testSimplestAllowAll() throws MalformedURLException, UnsupportedEncodingException {
 final String simpleRobotsTxt = "User-agent: *" + CRLF + "Disallow:";
 
-BaseRobotRules rules = createRobotRules("Any-darn-crawler", simpleRobotsTxt.getBytes());
+BaseRobotRules rules = createRobotRules("Any-darn-crawler", simpleRobotsTxt.getBytes("UTF-8"));
 assertTrue(rules.isAllowed("http://www.domain.com/anypage.html"));
 }
 
 @Test
-public void testMixedEndings() throws MalformedURLException {
+public void testMixedEndings() throws MalformedURLException, UnsupportedEncodingException {
 final String mixedEndingsRobotsTxt = "# /robots.txt for http://www.fict.org/" + CRLF + "# comments to webmaster@fict.org" + CR + LF + "User-agent: unhipbot" + LF + "Disallow: /" + CR + ""
 + CRLF + "User-agent: webcrawler" + LF + "User-agent: excite" + CR + "Disallow: " + "\u0085" + CR + "User-agent: *" + CRLF + "Disallow: /org/plans.html" + LF + "Allow: /org/"
 + CR + "Allow: /serv" + CRLF + "Allow: /~mak" + LF + "Disallow: /" + CRLF;
 
 BaseRobotRules rules;
 
-rules = createRobotRules("WebCrawler/3.0", mixedEndingsRobotsTxt.getBytes());
+rules = createRobotRules("WebCrawler/3.0", mixedEndingsRobotsTxt.getBytes("UTF-8"));
 assertTrue(rules.isAllowed("http://www.fict.org/"));
 assertTrue(rules.isAllowed("http://www.fict.org/index.html"));
 
-rules = createRobotRules("Unknown/1.0", mixedEndingsRobotsTxt.getBytes());
+rules = createRobotRules("Unknown/1.0", mixedEndingsRobotsTxt.getBytes("UTF-8"));
 assertFalse(rules.isAllowed("http://www.fict.org/"));
 assertFalse(rules.isAllowed("http://www.fict.org/index.html"));
 assertTrue(rules.isAllowed("http://www.fict.org/robots.txt"));
@@ -219,7 +220,7 @@ public class SimpleRobotRulesParserTest {
 }
 
 @Test
-public void testRfpCases() throws MalformedURLException {
+public void testRfpCases() throws MalformedURLException, UnsupportedEncodingException {
 // Run through all of the tests that are part of the robots.txt RFP
 // http://www.robotstxt.org/norobots-rfc.txt
 final String rfpExampleRobotsTxt = "# /robots.txt for http://www.fict.org/" + CRLF + "# comments to webmaster@fict.org" + CRLF + CRLF + "User-agent: unhipbot" + CRLF + "Disallow: /" + CRLF
@@ -228,7 +229,7 @@ public class SimpleRobotRulesParserTest {
 
 BaseRobotRules rules;
 
-rules = createRobotRules("UnhipBot/0.1", rfpExampleRobotsTxt.getBytes());
+rules = createRobotRules("UnhipBot/0.1", rfpExampleRobotsTxt.getBytes("UTF-8"));
 assertFalse(rules.isAllowed("http://www.fict.org/"));
 assertFalse(rules.isAllowed("http://www.fict.org/index.html"));
 assertTrue(rules.isAllowed("http://www.fict.org/robots.txt"));
@@ -241,7 +242,7 @@ public class SimpleRobotRulesParserTest {
 assertFalse(rules.isAllowed("http://www.fict.org/%7Ejim/jim.html"));
 assertFalse(rules.isAllowed("http://www.fict.org/%7Emak/mak.html"));
 
-rules = createRobotRules("WebCrawler/3.0", rfpExampleRobotsTxt.getBytes());
+rules = createRobotRules("WebCrawler/3.0", rfpExampleRobotsTxt.getBytes("UTF-8"));
 assertTrue(rules.isAllowed("http://www.fict.org/"));
 assertTrue(rules.isAllowed("http://www.fict.org/index.html"));
 assertTrue(rules.isAllowed("http://www.fict.org/robots.txt"));
@@ -254,7 +255,7 @@ public class SimpleRobotRulesParserTest {
 assertTrue(rules.isAllowed("http://www.fict.org/%7Ejim/jim.html"));
 assertTrue(rules.isAllowed("http://www.fict.org/%7Emak/mak.html"));
 
-rules = createRobotRules("Excite/1.0", rfpExampleRobotsTxt.getBytes());
+rules = createRobotRules("Excite/1.0", rfpExampleRobotsTxt.getBytes("UTF-8"));
 assertTrue(rules.isAllowed("http://www.fict.org/"));
 assertTrue(rules.isAllowed("http://www.fict.org/index.html"));
 assertTrue(rules.isAllowed("http://www.fict.org/robots.txt"));
@@ -267,7 +268,7 @@ public class SimpleRobotRulesParserTest {
 assertTrue(rules.isAllowed("http://www.fict.org/%7Ejim/jim.html"));
 assertTrue(rules.isAllowed("http://www.fict.org/%7Emak/mak.html"));
 
-rules = createRobotRules("Unknown/1.0", rfpExampleRobotsTxt.getBytes());
+rules = createRobotRules("Unknown/1.0", rfpExampleRobotsTxt.getBytes("UTF-8"));
 assertFalse(rules.isAllowed("http://www.fict.org/"));
 assertFalse(rules.isAllowed("http://www.fict.org/index.html"));
 assertTrue(rules.isAllowed("http://www.fict.org/robots.txt"));
@@ -282,7 +283,7 @@ public class SimpleRobotRulesParserTest {
 }
 
 @Test
-public void testNutchCases() throws MalformedURLException {
+public void testNutchCases() throws MalformedURLException, UnsupportedEncodingException {
 // Run through the Nutch test cases.
 
 final String nutchRobotsTxt = "User-Agent: Agent1 #foo" + CR + "Disallow: /a" + CR + "Disallow: /b/a" + CR + "#Disallow: /c" + CR + "" + CR + "" + CR + "User-Agent: Agent2 Agent3#foo" + CR
@@ -290,7 +291,7 @@ public class SimpleRobotRulesParserTest {
 
 BaseRobotRules rules;
 
-rules = createRobotRules("Agent1", nutchRobotsTxt.getBytes());
+rules = createRobotRules("Agent1", nutchRobotsTxt.getBytes("UTF-8"));
 assertFalse(rules.isAllowed("http://www.domain.com/a"));
 assertFalse(rules.isAllowed("http://www.domain.com/a/"));
 assertFalse(rules.isAllowed("http://www.domain.com/a/bloh/foo.html"));
@@ -312,7 +313,7 @@ public class SimpleRobotRulesParserTest {
 assertTrue(rules.isAllowed("http://www.domain.com/foo/bar/baz.html"));
 assertTrue(rules.isAllowed("http://www.domain.com/f/"));
 
-rules = createRobotRules("Agent2", nutchRobotsTxt.getBytes());
+rules = createRobotRules("Agent2", nutchRobotsTxt.getBytes("UTF-8"));
 assertTrue(rules.isAllowed("http://www.domain.com/a"));
 assertTrue(rules.isAllowed("http://www.domain.com/a/"));
 assertTrue(rules.isAllowed("http://www.domain.com/a/bloh/foo.html"));
@@ -334,7 +335,7 @@ public class SimpleRobotRulesParserTest {
 assertTrue(rules.isAllowed("http://www.domain.com/foo/bar/baz.html"));
 assertTrue(rules.isAllowed("http://www.domain.com/f/"));
 
-rules = createRobotRules("Agent3", nutchRobotsTxt.getBytes());
+rules = createRobotRules("Agent3", nutchRobotsTxt.getBytes("UTF-8"));
 assertTrue(rules.isAllowed("http://www.domain.com/a"));
 assertTrue(rules.isAllowed("http://www.domain.com/a/"));
 assertTrue(rules.isAllowed("http://www.domain.com/a/bloh/foo.html"));
@@ -356,7 +357,7 @@ public class SimpleRobotRulesParserTest {
 assertTrue(rules.isAllowed("http://www.domain.com/foo/bar/baz.html"));
 assertTrue(rules.isAllowed("http://www.domain.com/f/"));
 
-rules = createRobotRules("Agent4", nutchRobotsTxt.getBytes());
+rules = createRobotRules("Agent4", nutchRobotsTxt.getBytes("UTF-8"));
 assertTrue(rules.isAllowed("http://www.domain.com/a"));
 assertTrue(rules.isAllowed("http://www.domain.com/a/"));
 assertTrue(rules.isAllowed("http://www.domain.com/a/bloh/foo.html"));
@@ -378,7 +379,7 @@ public class SimpleRobotRulesParserTest {
 assertTrue(rules.isAllowed("http://www.domain.com/foo/bar/baz.html"));
 assertTrue(rules.isAllowed("http://www.domain.com/f/"));
 
-rules = createRobotRules("Agent5", nutchRobotsTxt.getBytes());
+rules = createRobotRules("Agent5", nutchRobotsTxt.getBytes("UTF-8"));
 assertTrue(rules.isAllowed("http://www.domain.com/a"));
 assertTrue(rules.isAllowed("http://www.domain.com/a/"));
 assertTrue(rules.isAllowed("http://www.domain.com/a/bloh/foo.html"));
@@ -400,7 +401,7 @@ public class SimpleRobotRulesParserTest {
 assertFalse(rules.isAllowed("http://www.domain.com/foo/bar/baz.html"));
 assertTrue(rules.isAllowed("http://www.domain.com/f/"));
 
-rules = createRobotRules("Agent5,Agent2,Agent1,Agent3,*", nutchRobotsTxt.getBytes());
+rules = createRobotRules("Agent5,Agent2,Agent1,Agent3,*", nutchRobotsTxt.getBytes("UTF-8"));
 assertFalse(rules.isAllowed("http://www.domain.com/a"));
 assertFalse(rules.isAllowed("http://www.domain.com/a/"));
 assertFalse(rules.isAllowed("http://www.domain.com/a/bloh/foo.html"));
@@ -424,18 +425,18 @@ public class SimpleRobotRulesParserTest {
 }
 
 @Test
-public void testHtmlMarkupInRobotsTxt() throws MalformedURLException {
+public void testHtmlMarkupInRobotsTxt() throws MalformedURLException, UnsupportedEncodingException {
 final String htmlRobotsTxt = "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 3.2 Final//EN\"><HTML>\n" + "<HEAD>\n" + "<TITLE>/robots.txt</TITLE>\n" + "</HEAD>\n" + "<BODY>\n"
 + "User-agent: anybot<BR>\n" + "Disallow: <BR>\n" + "Crawl-Delay: 10<BR>\n" + "\n" + "User-agent: *<BR>\n" + "Disallow: /<BR>\n" + "Crawl-Delay: 30<BR>\n" + "\n" + "</BODY>\n"
 + "</HTML>\n";
 
 BaseRobotRules rules;
 
-rules = createRobotRules("anybot", htmlRobotsTxt.getBytes());
+rules = createRobotRules("anybot", htmlRobotsTxt.getBytes("UTF-8"));
 assertTrue(rules.isAllowed("http://www.domain.com/index.html"));
 assertEquals(10000, rules.getCrawlDelay());
 
-rules = createRobotRules("bogusbot", htmlRobotsTxt.getBytes());
+rules = createRobotRules("bogusbot", htmlRobotsTxt.getBytes("UTF-8"));
 assertFalse(rules.isAllowed("http://www.domain.com/index.html"));
 assertEquals(30000, rules.getCrawlDelay());
 }
@@ -450,39 +451,39 @@ public class SimpleRobotRulesParserTest {
 }
 
 @Test
-public void testHeritrixCases() throws MalformedURLException {
+public void testHeritrixCases() throws MalformedURLException, UnsupportedEncodingException {
 final String heritrixRobotsTxt = "User-agent: *\n" + "Disallow: /cgi-bin/\n" + "Disallow: /details/software\n" + "\n" + "User-agent: denybot\n" + "Disallow: /\n" + "\n"
 + "User-agent: allowbot1\n" + "Disallow: \n" + "\n" + "User-agent: allowbot2\n" + "Disallow: /foo\n" + "Allow: /\n" + "\n" + "User-agent: delaybot\n" + "Disallow: /\n"
 + "Crawl-Delay: 20\n" + "Allow: /images/\n";
 
 BaseRobotRules rules;
-rules = createRobotRules("Mozilla allowbot1 99.9", heritrixRobotsTxt.getBytes());
+rules = createRobotRules("Mozilla allowbot1 99.9", heritrixRobotsTxt.getBytes("UTF-8"));
 assertTrue(rules.isAllowed("http://www.domain.com/path"));
 assertTrue(rules.isAllowed("http://www.domain.com/"));
 
-rules = createRobotRules("Mozilla allowbot2 99.9", heritrixRobotsTxt.getBytes());
+rules = createRobotRules("Mozilla allowbot2 99.9", heritrixRobotsTxt.getBytes("UTF-8"));
 assertTrue(rules.isAllowed("http://www.domain.com/path"));
 assertTrue(rules.isAllowed("http://www.domain.com/"));
 assertFalse(rules.isAllowed("http://www.domain.com/foo"));
 
-rules = createRobotRules("Mozilla denybot 99.9", heritrixRobotsTxt.getBytes());
+rules = createRobotRules("Mozilla denybot 99.9", heritrixRobotsTxt.getBytes("UTF-8"));
 assertFalse(rules.isAllowed("http://www.domain.com/path"));
 assertFalse(rules.isAllowed("http://www.domain.com/"));
 assertEquals(BaseRobotRules.UNSET_CRAWL_DELAY, rules.getCrawlDelay());
 
-rules = createRobotRules("Mozilla anonbot 99.9", heritrixRobotsTxt.getBytes());
+rules = createRobotRules("Mozilla anonbot 99.9", heritrixRobotsTxt.getBytes("UTF-8"));
 assertTrue(rules.isAllowed("http://www.domain.com/path"));
 assertFalse(rules.isAllowed("http://www.domain.com/cgi-bin/foo.pl"));
 
-rules = createRobotRules("Mozilla delaybot 99.9", heritrixRobotsTxt.getBytes());
+rules = createRobotRules("Mozilla delaybot 99.9", heritrixRobotsTxt.getBytes("UTF-8"));
 assertEquals(20000, rules.getCrawlDelay());
 }
 
 @Test
-public void testCaseSensitivePaths() throws MalformedURLException {
+public void testCaseSensitivePaths() throws MalformedURLException, UnsupportedEncodingException {
 final String simpleRobotsTxt = "User-agent: *" + CRLF + "Allow: /AnyPage.html" + CRLF + "Allow: /somepage.html" + CRLF + "Disallow: /";
 
-BaseRobotRules rules = createRobotRules("Any-darn-crawler", simpleRobotsTxt.getBytes());
+BaseRobotRules rules = createRobotRules("Any-darn-crawler", simpleRobotsTxt.getBytes("UTF-8"));
 assertTrue(rules.isAllowed("http://www.domain.com/AnyPage.html"));
 assertFalse(rules.isAllowed("http://www.domain.com/anypage.html"));
 assertTrue(rules.isAllowed("http://www.domain.com/somepage.html"));
@@ -490,76 +491,76 @@ public class SimpleRobotRulesParserTest {
 }
 
 @Test
-public void testEmptyDisallow() throws MalformedURLException {
+public void testEmptyDisallow() throws MalformedURLException, UnsupportedEncodingException {
 final String simpleRobotsTxt = "User-agent: *" + CRLF + "Disallow:";
 
-BaseRobotRules rules = createRobotRules("Any-darn-crawler", simpleRobotsTxt.getBytes());
+BaseRobotRules rules = createRobotRules("Any-darn-crawler", simpleRobotsTxt.getBytes("UTF-8"));
 assertTrue(rules.isAllowed("http://www.domain.com/anypage.html"));
 }
 
 @Test
-public void testEmptyAllow() throws MalformedURLException {
+public void testEmptyAllow() throws MalformedURLException, UnsupportedEncodingException {
 final String simpleRobotsTxt = "User-agent: *" + CRLF + "Allow:";
 
-BaseRobotRules rules = createRobotRules("Any-darn-crawler", simpleRobotsTxt.getBytes());
+BaseRobotRules rules = createRobotRules("Any-darn-crawler", simpleRobotsTxt.getBytes("UTF-8"));
 assertTrue(rules.isAllowed("http://www.domain.com/anypage.html"));
 }
 
 @Test
-public void testMultiWildcard() throws MalformedURLException {
+public void testMultiWildcard() throws MalformedURLException, UnsupportedEncodingException {
 // Make sure we only take the first wildcard entry.
 final String simpleRobotsTxt = "User-agent: *" + CRLF + "Disallow: /index.html" + CRLF + "Allow: /" + CRLF + CRLF + "User-agent: *" + CRLF + "Disallow: /";
 
-BaseRobotRules rules = createRobotRules("Any-darn-crawler", simpleRobotsTxt.getBytes());
+BaseRobotRules rules = createRobotRules("Any-darn-crawler", simpleRobotsTxt.getBytes("UTF-8"));
 assertFalse(rules.isAllowed("http://www.domain.com/index.html"));
 assertTrue(rules.isAllowed("http://www.domain.com/anypage.html"));
 }
 
 @Test
-public void testMultiMatches() throws MalformedURLException {
+public void testMultiMatches() throws MalformedURLException, UnsupportedEncodingException {
 // Make sure we only take the first record that matches.
 final String simpleRobotsTxt = "User-agent: crawlerbot" + CRLF + "Disallow: /index.html" + CRLF + "Allow: /" + CRLF + CRLF + "User-agent: crawler" + CRLF + "Disallow: /";
 
-BaseRobotRules rules = createRobotRules("crawlerbot", simpleRobotsTxt.getBytes());
+BaseRobotRules rules = createRobotRules("crawlerbot", simpleRobotsTxt.getBytes("UTF-8"));
 assertFalse(rules.isAllowed("http://www.domain.com/index.html"));
 assertTrue(rules.isAllowed("http://www.domain.com/anypage.html"));
 }
 
 @Test
-public void testMultiAgentNames() throws MalformedURLException {
+public void testMultiAgentNames() throws MalformedURLException, UnsupportedEncodingException {
 // When there are more than one agent name on a line.
 final String simpleRobotsTxt = "User-agent: crawler1 crawler2" + CRLF + "Disallow: /index.html" + CRLF + "Allow: /";
 
-BaseRobotRules rules = createRobotRules("crawler2", simpleRobotsTxt.getBytes());
+BaseRobotRules rules = createRobotRules("crawler2", simpleRobotsTxt.getBytes("UTF-8"));
 assertFalse(rules.isAllowed("http://www.domain.com/index.html"));
 assertTrue(rules.isAllowed("http://www.domain.com/anypage.html"));
 }
 
 @Test
-public void testMultiWordAgentName() throws MalformedURLException {
+public void testMultiWordAgentName() throws MalformedURLException, UnsupportedEncodingException {
 // When the user agent name has a space in it.
 final String simpleRobotsTxt = "User-agent: Download Ninja" + CRLF + "Disallow: /index.html" + CRLF + "Allow: /";
 
-BaseRobotRules rules = createRobotRules("Download Ninja", simpleRobotsTxt.getBytes());
+BaseRobotRules rules = createRobotRules("Download Ninja", simpleRobotsTxt.getBytes("UTF-8"));
 assertFalse(rules.isAllowed("http://www.domain.com/index.html"));
 assertTrue(rules.isAllowed("http://www.domain.com/anypage.html"));
 }
 
 @Test
-public void testUnsupportedFields() throws MalformedURLException {
+public void testUnsupportedFields() throws MalformedURLException, UnsupportedEncodingException {
 // When we have a new field type that we don't know about.
 final String simpleRobotsTxt = "User-agent: crawler1" + CRLF + "Disallow: /index.html" + CRLF + "Allow: /" + CRLF + "newfield: 234" + CRLF + "User-agent: crawler2" + CRLF + "Disallow: /";
 
-BaseRobotRules rules = createRobotRules("crawler2", simpleRobotsTxt.getBytes());
+BaseRobotRules rules = createRobotRules("crawler2", simpleRobotsTxt.getBytes("UTF-8"));
 assertFalse(rules.isAllowed("http://www.domain.com/anypage.html"));
 }
 
 @Test
-public void testAcapFields() throws MalformedURLException {
+public void testAcapFields() throws MalformedURLException, UnsupportedEncodingException {
 final String robotsTxt = "acap-crawler: *" + CRLF + "acap-disallow-crawl: /ultima_ora/";
 
 SimpleRobotRulesParser parser = new SimpleRobotRulesParser();
-parser.parseContent("url", robotsTxt.getBytes(), "text/plain", "foobot");
+parser.parseContent("url", robotsTxt.getBytes("UTF-8"), "text/plain", "foobot");
 assertEquals(0, parser.getNumWarnings());
 }
 
@@ -597,34 +598,34 @@ public class SimpleRobotRulesParserTest {
 }
 
 @Test
-public void testCrawlDelay() {
+public void testCrawlDelay() throws UnsupportedEncodingException {
 final String delayRules1RobotsTxt = "User-agent: bixo" + CR + "Crawl-delay: 10" + CR + "User-agent: foobot" + CR + "Crawl-delay: 20" + CR + "User-agent: *" + CR + "Disallow:/baz" + CR;
 
-BaseRobotRules rules = createRobotRules("bixo", delayRules1RobotsTxt.getBytes());
+BaseRobotRules rules = createRobotRules("bixo", delayRules1RobotsTxt.getBytes("UTF-8"));
 long crawlDelay = rules.getCrawlDelay();
 assertEquals("testing crawl delay for agent bixo - rule 1", 10000, crawlDelay);
 
 final String delayRules2RobotsTxt = "User-agent: foobot" + CR + "Crawl-delay: 20" + CR + "User-agent: *" + CR + "Disallow:/baz" + CR;
 
-rules = createRobotRules("bixo", delayRules2RobotsTxt.getBytes());
+rules = createRobotRules("bixo", delayRules2RobotsTxt.getBytes("UTF-8"));
 crawlDelay = rules.getCrawlDelay();
 assertEquals("testing crawl delay for agent bixo - rule 2", BaseRobotRules.UNSET_CRAWL_DELAY, crawlDelay);
 }
 
 @Test
-public void testBigCrawlDelay() throws MalformedURLException {
+public void testBigCrawlDelay() throws MalformedURLException, UnsupportedEncodingException {
 final String robotsTxt = "User-agent: *" + CR + "Crawl-delay: 3600" + CR + "Disallow:" + CR;
 
-BaseRobotRules rules = createRobotRules("bixo", robotsTxt.getBytes());
+BaseRobotRules rules = createRobotRules("bixo", robotsTxt.getBytes("UTF-8"));
 assertFalse("disallow all if huge crawl delay", rules.isAllowed("http://www.domain.com/"));
 }
 
 @Test
-public void testBrokenKrugleRobotsTxtFile() throws MalformedURLException {
+public void testBrokenKrugleRobotsTxtFile() throws MalformedURLException, UnsupportedEncodingException {
 final String krugleRobotsTxt = "User-agent: *" + CR + "Disallow: /maintenance.html" + CR + "Disallow: /perl/" + CR + "Disallow: /cgi-bin/" + CR + "Disallow: /examples/" + CR
 + "Crawl-delay: 3" + CR + "" + CR + "User-agent: googlebot" + CR + "Crawl-delay: 1" + CR + "" + CR + "User-agent: qihoobot" + CR + "Disallow: /";
 
-BaseRobotRules rules = createRobotRules("googlebot/2.1", krugleRobotsTxt.getBytes());
+BaseRobotRules rules = createRobotRules("googlebot/2.1", krugleRobotsTxt.getBytes("UTF-8"));
 assertTrue(rules.isAllowed("http://www.krugle.com/examples/index.html"));
 }
 
@@ -647,10 +648,10 @@ public class SimpleRobotRulesParserTest {
 }
 
 @Test
-public void testFloatingPointCrawlDelay() throws MalformedURLException {
+public void testFloatingPointCrawlDelay() throws MalformedURLException, UnsupportedEncodingException {
 final String robotsTxt = "User-agent: *" + CR + "Crawl-delay: 0.5" + CR + "Disallow:" + CR;
 
-BaseRobotRules rules = createRobotRules("bixo", robotsTxt.getBytes());
+BaseRobotRules rules = createRobotRules("bixo", robotsTxt.getBytes("UTF-8"));
 assertEquals(500, rules.getCrawlDelay());
 }
 
@@ -703,7 +704,7 @@ public class SimpleRobotRulesParserTest {
 assertEquals("Found sitemap", 3, rules.getSitemaps().size());
 // check that the last one is not lowercase only
 String url = rules.getSitemaps().get(2);
-boolean lowercased = url.equals(url.toLowerCase());
+boolean lowercased = url.equals(url.toLowerCase(Locale.getDefault()));
 assertFalse("Sitemap case check", lowercased);
 }
 
@@ -749,7 +750,7 @@ public class SimpleRobotRulesParserTest {
 public void testAllowBeforeDisallow() throws Exception {
 final String simpleRobotsTxt = "User-agent: *" + CRLF + "Disallow: /fish" + CRLF + "Allow: /fish" + CRLF;
 
-BaseRobotRules rules = createRobotRules("Any-darn-crawler", simpleRobotsTxt.getBytes());
+BaseRobotRules rules = createRobotRules("Any-darn-crawler", simpleRobotsTxt.getBytes("UTF-8"));
 
 assertTrue(rules.isAllowed("http://www.fict.com/fish"));
 }
@@ -758,16 +759,16 @@ public class SimpleRobotRulesParserTest {
 public void testSpacesInMultipleUserAgentNames() throws Exception {
 final String simpleRobotsTxt = "User-agent: One, Two, Three" + CRLF + "Disallow: /" + CRLF + "" + CRLF + "User-agent: *" + CRLF + "Allow: /" + CRLF;
 
-BaseRobotRules rules = createRobotRules("One", simpleRobotsTxt.getBytes());
+BaseRobotRules rules = createRobotRules("One", simpleRobotsTxt.getBytes("UTF-8"));
 assertFalse(rules.isAllowed("http://www.fict.com/fish"));
 
-rules = createRobotRules("Two", simpleRobotsTxt.getBytes());
+rules = createRobotRules("Two", simpleRobotsTxt.getBytes("UTF-8"));
 assertFalse(rules.isAllowed("http://www.fict.com/fish"));
 
-rules = createRobotRules("Three", simpleRobotsTxt.getBytes());
+rules = createRobotRules("Three", simpleRobotsTxt.getBytes("UTF-8"));
 assertFalse(rules.isAllowed("http://www.fict.com/fish"));
 
-rules = createRobotRules("Any-darn-crawler", simpleRobotsTxt.getBytes());
+rules = createRobotRules("Any-darn-crawler", simpleRobotsTxt.getBytes("UTF-8"));
 assertTrue(rules.isAllowed("http://www.fict.com/fish"));
 }
 
AbstractSiteMapTest.java

@@ -3,6 +3,7 @@ package crawlercommons.sitemaps;
 import static org.junit.Assert.*;
 
 import java.text.SimpleDateFormat;
+import java.util.Locale;
 import java.util.TimeZone;
 
 import org.junit.Test;
@@ -14,7 +15,7 @@ public class AbstractSiteMapTest {
 assertNull(AbstractSiteMap.convertToDate("blah"));
 assertNull(AbstractSiteMap.convertToDate(null));
 
-SimpleDateFormat isoFormatNoTimezone = new SimpleDateFormat("yyyyMMdd");
+SimpleDateFormat isoFormatNoTimezone = new SimpleDateFormat("yyyyMMdd", Locale.getDefault());
 
 // For formats where there's no time zone information, the time zone is
 // undefined, so we can
@@ -23,7 +24,7 @@ public class AbstractSiteMapTest {
 assertEquals("20140601", isoFormatNoTimezone.format(AbstractSiteMap.convertToDate("2014-06")));
 assertEquals("20140603", isoFormatNoTimezone.format(AbstractSiteMap.convertToDate("2014-06-03")));
 
-SimpleDateFormat isoFormat = new SimpleDateFormat("yyyyMMdd'T'HHmmss");
+SimpleDateFormat isoFormat = new SimpleDateFormat("yyyyMMdd'T'HHmmss", Locale.getDefault());
 isoFormat.setTimeZone(TimeZone.getTimeZone("UTC"));
 
 // Complete date plus hours and minutes
@@ -38,7 +39,7 @@ public class AbstractSiteMapTest {
 
 // Complete date plus hours, minutes, seconds and a decimal fraction of
 // a second
-SimpleDateFormat isoFormatWithFractionSeconds = new SimpleDateFormat("yyyyMMdd'T'HHmmss.S");
+SimpleDateFormat isoFormatWithFractionSeconds = new SimpleDateFormat("yyyyMMdd'T'HHmmss.S", Locale.getDefault());
 isoFormatWithFractionSeconds.setTimeZone(TimeZone.getTimeZone("UTC"));
 assertEquals("20140603T103045.820", isoFormatWithFractionSeconds.format(AbstractSiteMap.convertToDate("2014-06-03T10:30:45.82+00:00")));
 
SiteMapParserTest.java

@@ -21,10 +21,11 @@ import java.io.File;
 import java.io.FileInputStream;
 import java.io.IOException;
 import java.io.InputStream;
+import java.io.UnsupportedEncodingException;
 import java.net.URL;
 import java.text.SimpleDateFormat;
 import java.util.Date;
-import java.util.TimeZone;
+import java.util.Locale;
 
 import org.apache.commons.io.IOUtils;
 import org.junit.After;
@@ -32,11 +33,15 @@ import org.junit.Before;
 import org.junit.Test;
 import org.junit.runner.RunWith;
 import org.junit.runners.JUnit4;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 
 import static org.junit.Assert.*;
 
 @RunWith(JUnit4.class)
 public class SiteMapParserTest {
 
+private static final Logger LOG = LoggerFactory.getLogger(SiteMapParserTest.class);
+
 @Before
 public void setUp() throws Exception {
@@ -79,11 +84,10 @@ public class SiteMapParserTest {
 
 @Test
 public void testFullDateFormat() {
-SimpleDateFormat format = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm+hh:00");
-
+SimpleDateFormat format = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm+hh:00", Locale.getDefault());
 Date date = new Date();
-System.out.println(format.format(date));
-System.out.println(SiteMap.getFullDateFormat().format(date));
+LOG.info(format.format(date));
+LOG.info(SiteMap.getFullDateFormat().format(date));
 }
 
 @Test
@@ -91,7 +95,7 @@ public class SiteMapParserTest {
 SiteMapParser parser = new SiteMapParser();
 String contentType = "text/plain";
 String scontent = "http://www.example.com/catalog?item=1\nhttp://www.example.com/catalog?item=11";
-byte[] content = scontent.getBytes();
+byte[] content = scontent.getBytes("UTF-8");
 URL url = new URL("http://www.example.com/sitemap.txt");
 
 AbstractSiteMap asm = parser.parseSiteMap(contentType, content, url);
@@ -106,7 +110,7 @@ public class SiteMapParserTest {
 public void testSitemapTXTWithXMLExt() throws UnknownFormatException, IOException {
 SiteMapParser parser = new SiteMapParser();
 String scontent = "http://www.example.com/catalog?item=1\nhttp://www.example.com/catalog?item=11";
-byte[] content = scontent.getBytes();
+byte[] content = scontent.getBytes("UTF-8");
 URL url = new URL("http://www.example.com/sitemap.xml");
 String contentType = "text/plain";
 
@@ -160,7 +164,7 @@ public class SiteMapParserTest {
 scontent.append("<?xml version=\"1.0\" encoding=\"UTF-8\"?>").append("<urlset xmlns=\"http://www.sitemaps.org/schemas/sitemap/0.9\">")
 .append("<url><!-- This file is not a valid XML file --></url>").append("<url><loc> http://cs.harding.edu/fmccown/sitemaps/something.html</loc>")
 .append("</url><!-- missing opening url tag --></url></urlset>");
-byte[] content = scontent.toString().getBytes();
+byte[] content = scontent.toString().getBytes("UTF-8");
 URL url = new URL("http://www.example.com/sitemapindex.xml");
 
 parser.parseSiteMap(contentType, content, url); // This Sitemap contains
@@ -224,7 +228,7 @@ public class SiteMapParserTest {
 StringBuilder scontent = new StringBuilder(1024);
 scontent.append("<?xml version=\"1.0\" encoding=\"UTF-8\"?>").append("<urlset xmlns=\"http://www.sitemaps.org/schemas/sitemap/0.9\">").append("<url>")
 .append("<loc>http://www.example.com/</loc>").append("</url>").append("</urlset>");
-byte[] content = scontent.toString().getBytes();
+byte[] content = scontent.toString().getBytes("UTF-8");
 
 URL url = new URL("http://www.example.com/subsection/sitemap.xml");
 AbstractSiteMap asm = parser.parseSiteMap(contentType, content, url);
@@ -245,8 +249,9 @@ public class SiteMapParserTest {
 assertFalse(sm.getSiteMapUrls().iterator().next().isValid());
 }
 
-/** Returns a good simple default XML sitemap as a byte array */
-private byte[] getXMLSitemapAsBytes() {
+/** Returns a good simple default XML sitemap as a byte array
+* @throws UnsupportedEncodingException */
+private byte[] getXMLSitemapAsBytes() throws UnsupportedEncodingException {
 StringBuilder scontent = new StringBuilder(1024);
 scontent.append("<?xml version=\"1.0\" encoding=\"UTF-8\"?>").append("<urlset xmlns=\"http://www.sitemaps.org/schemas/sitemap/0.9\">").append("<url>")
 .append(" <loc>http://www.example.com/</loc>").append(" <lastmod>2005-01-01</lastmod>").append(" <changefreq>monthly</changefreq>").append(" <priority>0.8</priority>")
@@ -257,6 +262,6 @@ public class SiteMapParserTest {
 .append(" <loc><url><![CDATA[http://www.example.com/catalog?item=83&desc=vacation_usa]]></url></loc>").append(" <lastmod>2004-11-23</lastmod>").append("</url>")
 .append("</urlset>");
 
-return scontent.toString().getBytes();
+return scontent.toString().getBytes("UTF-8");
 }
 }
RedirectResponseHandler.java

@@ -56,14 +56,14 @@ public class RedirectResponseHandler extends AbstractHttpHandler {
 
 String content = "redirected content";
 response.setContentLength(content.length());
-response.getOutputStream().write(content.getBytes());
+response.getOutputStream().write(content.getBytes("UTF-8"));
 } else {
 response.setStatus(HttpStatus.SC_OK);
 response.setContentType("text/plain");
 
 String content = "other content";
 response.setContentLength(content.length());
-response.getOutputStream().write(content.getBytes());
+response.getOutputStream().write(content.getBytes("UTF-8"));
 }
 }
 }