
Robots parser to always handle absolute sitemap URL even without valid base URL,

fixes #240
Sebastian Nagel 2019-03-26 15:02:43 +01:00
parent ec2ad7c790
commit b449fdf024
4 changed files with 35 additions and 7 deletions
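For illustration, a minimal sketch of the behavior this commit changes (not part of the commit; the robots.txt body, the fetch URL "example.com", the bot name "mybot", and the class name are made-up values, while parseContent and getSitemaps are the parser API used in the tests below): when the URL passed to the parser is not a valid absolute URL, an absolute Sitemap directive is now still reported instead of being dropped with a warning.

    import java.nio.charset.StandardCharsets;

    import crawlercommons.robots.BaseRobotRules;
    import crawlercommons.robots.SimpleRobotRulesParser;

    public class SitemapFromInvalidBaseUrl {
        public static void main(String[] args) {
            String robotsTxt = "Sitemap: https://www.example.com/sitemap.xml";
            SimpleRobotRulesParser parser = new SimpleRobotRulesParser();
            // "example.com" has no protocol and is not a valid java.net.URL;
            // before this commit the sitemap directive was skipped with a warning,
            // afterwards the absolute sitemap URL is still recorded.
            BaseRobotRules rules = parser.parseContent("example.com",
                    robotsTxt.getBytes(StandardCharsets.UTF_8), "text/plain", "mybot");
            System.out.println(rules.getSitemaps()); // [https://www.example.com/sitemap.xml]
        }
    }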

CHANGES.txt

@@ -1,6 +1,7 @@
Crawler-Commons Change Log
Current Development 1.1-SNAPSHOT (yyyy-mm-dd)
+- [Robots] Robots parser to always handle absolute sitemap URL even without valid base URL (pr3mar, kkrugler, sebastian-nagel) #240
Release 1.0 (2019-03-19)
- [Sitemaps] Unit tests depend on system timezone (kkrugler, sebastian-nagel) #238

BaseRobotsParser.java

@@ -38,7 +38,10 @@ public abstract class BaseRobotsParser implements Serializable {
* lower-casing, and the prefix match rule.
*
* @param url
- *            URL that content was fetched from (for reporting purposes)
+ *            URL that robots.txt content was fetched from. A complete and
+ *            valid URL (e.g., https://example.com/robots.txt) is expected.
+ *            Used to resolve relative sitemap URLs and for
+ *            logging/reporting purposes.
* @param content
* raw bytes from the site's robots.txt file
* @param contentType
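As an aside (not part of the diff): a complete, valid URL matters because relative Sitemap directives are resolved against the robots.txt location. A minimal sketch with made-up values, using only java.net.URL:

    import java.net.URL;

    class ResolveRelativeSitemap {
        public static void main(String[] args) throws Exception {
            // A relative directive such as "Sitemap: /sitemap_index.xml" can only
            // be resolved against the URL the robots.txt file was fetched from.
            URL robotsTxtUrl = new URL("https://example.com/robots.txt");
            System.out.println(new URL(robotsTxtUrl, "/sitemap_index.xml"));
            // prints: https://example.com/sitemap_index.xml
        }
    }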

SimpleRobotRulesParser.java

@@ -737,10 +737,21 @@ public class SimpleRobotRulesParser extends BaseRobotsParser {
String sitemap = token.getData();
try {
-    URL sitemap_url = new URL(new URL(state.getUrl()), sitemap);
-    String hostname = sitemap_url.getHost();
+    URL sitemapUrl;
+    URL base = null;
+    try {
+        base = new URL(state.getUrl());
+    } catch (MalformedURLException e) {
+        // must try without base URL
+    }
+    if (base != null) {
+        sitemapUrl = new URL(base, sitemap);
+    } else {
+        sitemapUrl = new URL(sitemap);
+    }
+    String hostname = sitemapUrl.getHost();
    if ((hostname != null) && (hostname.length() > 0)) {
-        state.addSitemap(sitemap_url.toExternalForm());
+        state.addSitemap(sitemapUrl.toExternalForm());
    }
} catch (Exception e) {
    reportWarning("Invalid URL with sitemap directive: " + sitemap, state.getUrl());
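A short sketch of the java.net.URL behavior behind this change (values are illustrative): new URL(base, spec) ignores the base when spec is already absolute, but the old one-liner failed earlier, because constructing the base URL from an invalid state.getUrl() value threw before the sitemap URL was ever looked at.

    import java.net.MalformedURLException;
    import java.net.URL;

    class SitemapUrlResolution {
        public static void main(String[] args) throws Exception {
            // An absolute sitemap URL does not need the base at all:
            URL base = new URL("https://example.com/robots.txt");
            System.out.println(new URL(base, "https://www.example.com/sitemap.xml"));
            // prints: https://www.example.com/sitemap.xml

            // With an invalid base (e.g., a bare host name) the old code threw here,
            // before the absolute sitemap URL was considered:
            try {
                new URL(new URL("example.com"), "https://www.example.com/sitemap.xml");
            } catch (MalformedURLException e) {
                System.out.println(e.getMessage()); // no protocol: example.com
            }
        }
    }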

SimpleRobotRulesParserTest.java

@@ -29,8 +29,6 @@ import java.util.Locale;
import javax.servlet.http.HttpServletResponse;
-import junit.framework.Assert;
import org.junit.Test;
public class SimpleRobotRulesParserTest {
@@ -154,7 +152,7 @@ public class SimpleRobotRulesParserTest {
        + "#disallow: /test" + LF + LF + "#user-agent: someAgent" + LF + LF + "#disallow: /index.html" + LF + "#disallow: /test" + LF + LF;
BaseRobotRules rules = createRobotRules("Any-darn-crawler", simpleRobotsTxt.getBytes(UTF_8));
-Assert.assertTrue(rules.isAllowed("http://www.domain.com/anypage.html"));
+assertTrue(rules.isAllowed("http://www.domain.com/anypage.html"));
}
@Test
@@ -727,6 +725,21 @@ public class SimpleRobotRulesParserTest {
    assertEquals("Found sitemap", 1, rules.getSitemaps().size());
}

+@Test
+public void testSitemapInvalidBaseUrl() throws Exception {
+    // test https://github.com/crawler-commons/crawler-commons/issues/240
+    // - should handle absolute sitemap URL even if base URL isn't valid
+    final String simpleRobotsTxt = "Sitemap: https://www.example.com/sitemap.xml";
+    SimpleRobotRulesParser robotParser = new SimpleRobotRulesParser();
+    BaseRobotRules rules = robotParser.parseContent("example.com", simpleRobotsTxt.getBytes(UTF_8), "text/plain", "a");
+    assertEquals(1, rules.getSitemaps().size());
+    assertEquals("https://www.example.com/sitemap.xml", rules.getSitemaps().get(0));
assertEquals("Found sitemap", 1, rules.getSitemaps().size());
+}
@Test
public void testManyUserAgents() throws Exception {
BaseRobotRules rules = createRobotRules("wget", readFile("/robots/many-user-agents.txt"));