mirror of
https://github.com/crawler-commons/crawler-commons
synced 2024-05-11 08:16:04 +02:00
Robots parser to always handle absolute sitemap URLs even without a valid base URL,
fixes #240
This commit is contained in:
parent
ec2ad7c790
commit
b449fdf024
|
@ -1,6 +1,7 @@
|
|||
Crawler-Commons Change Log
|
||||
|
||||
Current Development 1.1-SNAPSHOT (yyyy-mm-dd)
|
||||
- [Robots] Robots parser to always handle absolute sitemap URLs even without a valid base URL (pr3mar, kkrugler, sebastian-nagel) #240
|
||||
|
||||
Release 1.0 (2019-03-19)
|
||||
- [Sitemaps] Unit tests depend on system timezone (kkrugler, sebastian-nagel) #238
|
||||
|
|
|
@ -38,7 +38,10 @@ public abstract class BaseRobotsParser implements Serializable {
|
|||
* lower-casing, and the prefix match rule.
|
||||
*
|
||||
* @param url
|
||||
* URL that content was fetched from (for reporting purposes)
|
||||
* URL that robots.txt content was fetched from. A complete and
|
||||
* valid URL (e.g., https://example.com/robots.txt) is expected.
|
||||
* Used to resolve relative sitemap URLs and for
|
||||
* logging/reporting purposes.
|
||||
* @param content
|
||||
* raw bytes from the site's robots.txt file
|
||||
* @param contentType
|
||||
|
|
|
@ -737,10 +737,21 @@ public class SimpleRobotRulesParser extends BaseRobotsParser {
|
|||
|
||||
String sitemap = token.getData();
|
||||
try {
|
||||
URL sitemap_url = new URL(new URL(state.getUrl()), sitemap);
|
||||
String hostname = sitemap_url.getHost();
|
||||
URL sitemapUrl;
|
||||
URL base = null;
|
||||
try {
|
||||
base = new URL(state.getUrl());
|
||||
} catch (MalformedURLException e) {
|
||||
// base URL is malformed: intentionally ignored, fall back to parsing the sitemap URL without a base
|
||||
}
|
||||
if (base != null) {
|
||||
sitemapUrl = new URL(base, sitemap);
|
||||
} else {
|
||||
sitemapUrl = new URL(sitemap);
|
||||
}
|
||||
String hostname = sitemapUrl.getHost();
|
||||
if ((hostname != null) && (hostname.length() > 0)) {
|
||||
state.addSitemap(sitemap_url.toExternalForm());
|
||||
state.addSitemap(sitemapUrl.toExternalForm());
|
||||
}
|
||||
} catch (Exception e) {
|
||||
reportWarning("Invalid URL with sitemap directive: " + sitemap, state.getUrl());
|
||||
|
|
|
@ -29,8 +29,6 @@ import java.util.Locale;
|
|||
|
||||
import javax.servlet.http.HttpServletResponse;
|
||||
|
||||
import junit.framework.Assert;
|
||||
|
||||
import org.junit.Test;
|
||||
|
||||
public class SimpleRobotRulesParserTest {
|
||||
|
@ -154,7 +152,7 @@ public class SimpleRobotRulesParserTest {
|
|||
+ "#disallow: /test" + LF + LF + "#user-agent: someAgent" + LF + LF + "#disallow: /index.html" + LF + "#disallow: /test" + LF + LF;
|
||||
|
||||
BaseRobotRules rules = createRobotRules("Any-darn-crawler", simpleRobotsTxt.getBytes(UTF_8));
|
||||
Assert.assertTrue(rules.isAllowed("http://www.domain.com/anypage.html"));
|
||||
assertTrue(rules.isAllowed("http://www.domain.com/anypage.html"));
|
||||
}
|
||||
|
||||
@Test
|
||||
|
@ -727,6 +725,21 @@ public class SimpleRobotRulesParserTest {
|
|||
assertEquals("Found sitemap", 1, rules.getSitemaps().size());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testSitemapInvalidBaseUrl() throws Exception {
|
||||
// test https://github.com/crawler-commons/crawler-commons/issues/240
|
||||
// - should handle absolute sitemap URL even if base URL isn't valid
|
||||
|
||||
final String simpleRobotsTxt = "Sitemap: https://www.example.com/sitemap.xml";
|
||||
|
||||
SimpleRobotRulesParser robotParser = new SimpleRobotRulesParser();
|
||||
BaseRobotRules rules = robotParser.parseContent("example.com", simpleRobotsTxt.getBytes(UTF_8), "text/plain", "a");
|
||||
|
||||
assertEquals(1, rules.getSitemaps().size());
|
||||
assertEquals("https://www.example.com/sitemap.xml", rules.getSitemaps().get(0));
|
||||
assertEquals("Found sitemap", 1, rules.getSitemaps().size());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testManyUserAgents() throws Exception {
|
||||
BaseRobotRules rules = createRobotRules("wget", readFile("/robots/many-user-agents.txt"));
|
||||
|
|
Loading…
Reference in New Issue