Robots parser to always handle absolute sitemap URL even without valid base URL, fixes #240

parent ec2ad7c790
commit b449fdf024
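Note (not part of the commit): a minimal sketch of the behavior this change targets, using the SimpleRobotRulesParser.parseContent() signature exercised in the new test below; the package name crawlercommons.robots, the robot name "mybot", and the literal arguments are assumptions for illustration.

    import java.nio.charset.StandardCharsets;

    import crawlercommons.robots.BaseRobotRules;
    import crawlercommons.robots.SimpleRobotRulesParser;

    public class SitemapFallbackDemo {
        public static void main(String[] args) {
            // robots.txt that declares an absolute sitemap URL
            String robotsTxt = "Sitemap: https://www.example.com/sitemap.xml";

            SimpleRobotRulesParser parser = new SimpleRobotRulesParser();
            // "example.com" is not a valid java.net.URL; before this fix the
            // resulting MalformedURLException caused the sitemap directive to be
            // dropped even though the sitemap URL itself is absolute and valid.
            BaseRobotRules rules = parser.parseContent("example.com",
                    robotsTxt.getBytes(StandardCharsets.UTF_8), "text/plain", "mybot");

            // With the fix, the absolute sitemap URL is still recorded.
            System.out.println(rules.getSitemaps()); // [https://www.example.com/sitemap.xml]
        }
    }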
@@ -1,6 +1,7 @@
 Crawler-Commons Change Log
 
 Current Development 1.1-SNAPSHOT (yyyy-mm-dd)
+- [Robots] Robots parser to always handle absolute sitemap URL even without valid base URL (pr3mar, kkrugler, sebastian-nagel) #240
 
 Release 1.0 (2019-03-19)
 - [Sitemaps] Unit tests depend on system timezone (kkrugler, sebastian-nagel) #238
@@ -38,7 +38,10 @@ public abstract class BaseRobotsParser implements Serializable {
      * lower-casing, and the prefix match rule.
      *
      * @param url
-     *            URL that content was fetched from (for reporting purposes)
+     *            URL that robots.txt content was fetched from. A complete and
+     *            valid URL (e.g., https://example.com/robots.txt) is expected.
+     *            Used to resolve relative sitemap URLs and for
+     *            logging/reporting purposes.
      * @param content
      *            raw bytes from the site's robots.txt file
      * @param contentType
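Note (not part of the commit): the expanded Javadoc above says the robots.txt URL is used to resolve relative sitemap URLs. A small standalone sketch of that resolution with java.net.URL, using made-up example URLs:

    import java.net.URL;

    public class RelativeSitemapResolution {
        public static void main(String[] args) throws Exception {
            // The robots.txt URL acts as the base URL.
            URL base = new URL("https://example.com/robots.txt");

            // A relative "Sitemap:" value resolves against that base ...
            System.out.println(new URL(base, "/sitemap.xml"));
            // -> https://example.com/sitemap.xml

            // ... while an absolute value ignores the base entirely.
            System.out.println(new URL(base, "https://cdn.example.org/sitemap.xml"));
            // -> https://cdn.example.org/sitemap.xml
        }
    }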
@@ -737,10 +737,21 @@ public class SimpleRobotRulesParser extends BaseRobotsParser {
 
                 String sitemap = token.getData();
                 try {
-                    URL sitemap_url = new URL(new URL(state.getUrl()), sitemap);
-                    String hostname = sitemap_url.getHost();
+                    URL sitemapUrl;
+                    URL base = null;
+                    try {
+                        base = new URL(state.getUrl());
+                    } catch (MalformedURLException e) {
+                        // must try without base URL
+                    }
+                    if (base != null) {
+                        sitemapUrl = new URL(base, sitemap);
+                    } else {
+                        sitemapUrl = new URL(sitemap);
+                    }
+                    String hostname = sitemapUrl.getHost();
                     if ((hostname != null) && (hostname.length() > 0)) {
-                        state.addSitemap(sitemap_url.toExternalForm());
+                        state.addSitemap(sitemapUrl.toExternalForm());
                     }
                 } catch (Exception e) {
                     reportWarning("Invalid URL with sitemap directive: " + sitemap, state.getUrl());
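Note (not part of the commit): the patched logic above can be read as "resolve against the robots.txt URL when it parses, otherwise require the sitemap value itself to be absolute". A hypothetical standalone helper mirroring that flow (resolveSitemap is not a method in the codebase):

    import java.net.MalformedURLException;
    import java.net.URL;

    public class SitemapUrlResolver {

        // Mirrors the patched logic: prefer resolution against the robots.txt
        // URL; if that URL is malformed, fall back to parsing the sitemap value
        // on its own, which only succeeds when it is absolute.
        static String resolveSitemap(String robotsTxtUrl, String sitemap) throws MalformedURLException {
            URL base = null;
            try {
                base = new URL(robotsTxtUrl);
            } catch (MalformedURLException e) {
                // must try without base URL
            }
            URL sitemapUrl = (base != null) ? new URL(base, sitemap) : new URL(sitemap);
            return sitemapUrl.toExternalForm();
        }

        public static void main(String[] args) throws MalformedURLException {
            // valid base, relative sitemap value
            System.out.println(resolveSitemap("https://example.com/robots.txt", "/sitemap.xml"));
            // invalid base, absolute sitemap value (the case fixed by #240)
            System.out.println(resolveSitemap("example.com", "https://www.example.com/sitemap.xml"));
        }
    }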
@@ -29,8 +29,6 @@ import java.util.Locale;
 
 import javax.servlet.http.HttpServletResponse;
 
-import junit.framework.Assert;
-
 import org.junit.Test;
 
 public class SimpleRobotRulesParserTest {
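Note (not part of the commit): dropping the junit.framework.Assert import and calling assertTrue without the Assert. prefix (next hunk) implies the test class relies on JUnit 4 static imports, assumed to be declared elsewhere in the file. A minimal sketch of that style:

    import static org.junit.Assert.assertTrue;

    import org.junit.Test;

    public class StaticImportSketch {
        @Test
        public void example() {
            // assertTrue resolves through the static import; no Assert. prefix needed
            assertTrue(1 + 1 == 2);
        }
    }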
@@ -154,7 +152,7 @@ public class SimpleRobotRulesParserTest {
                         + "#disallow: /test" + LF + LF + "#user-agent: someAgent" + LF + LF + "#disallow: /index.html" + LF + "#disallow: /test" + LF + LF;
 
         BaseRobotRules rules = createRobotRules("Any-darn-crawler", simpleRobotsTxt.getBytes(UTF_8));
-        Assert.assertTrue(rules.isAllowed("http://www.domain.com/anypage.html"));
+        assertTrue(rules.isAllowed("http://www.domain.com/anypage.html"));
     }
 
     @Test
@@ -727,6 +725,21 @@ public class SimpleRobotRulesParserTest {
         assertEquals("Found sitemap", 1, rules.getSitemaps().size());
     }
 
+    @Test
+    public void testSitemapInvalidBaseUrl() throws Exception {
+        // test https://github.com/crawler-commons/crawler-commons/issues/240
+        // - should handle absolute sitemap URL even if base URL isn't valid
+
+        final String simpleRobotsTxt = "Sitemap: https://www.example.com/sitemap.xml";
+
+        SimpleRobotRulesParser robotParser = new SimpleRobotRulesParser();
+        BaseRobotRules rules = robotParser.parseContent("example.com", simpleRobotsTxt.getBytes(UTF_8), "text/plain", "a");
+
+        assertEquals(1, rules.getSitemaps().size());
+        assertEquals("https://www.example.com/sitemap.xml", rules.getSitemaps().get(0));
+        assertEquals("Found sitemap", 1, rules.getSitemaps().size());
+    }
+
     @Test
     public void testManyUserAgents() throws Exception {
         BaseRobotRules rules = createRobotRules("wget", readFile("/robots/many-user-agents.txt"));