1
0
Fork 0
mirror of https://github.com/crawler-commons/crawler-commons synced 2024-05-07 07:16:02 +02:00

[Sitemaps] Disable support for DTDs in sitemaps by default

- update change log
- apply code formatting
- add support for parsing sitemaps with DTD in SiteMapTester
This commit is contained in:
Sebastian Nagel 2022-03-02 15:47:19 +01:00
parent 273ac6ac7e
commit 23ee0634dc
4 changed files with 38 additions and 29 deletions

View File

@ -1,6 +1,7 @@
Crawler-Commons Change Log Crawler-Commons Change Log
Current Development 1.3-SNAPSHOT (yyyy-mm-dd) Current Development 1.3-SNAPSHOT (yyyy-mm-dd)
- [Sitemaps] Disable support for DTDs in XML sitemaps and feeds by default (Kenneth Wong) #371
- Migrate Continuous Integration from Travis to GitHub Actions (Valery Yatsynovich) #333 - Migrate Continuous Integration from Travis to GitHub Actions (Valery Yatsynovich) #333
- Upgrade dependencies (dependabot, Richard Zowalla) #334, #339, #345, #346, #347, #350, #369 - Upgrade dependencies (dependabot, Richard Zowalla) #334, #339, #345, #346, #347, #350, #369
- Upgrade Maven plugins (dependabot, Richard Zowalla, sebastian-nagel) #328, #329, #330, #331, #335, #336, #337, #338, #340, #341, #343, #356, #363, #364, #366 - Upgrade Maven plugins (dependabot, Richard Zowalla, sebastian-nagel) #328, #329, #330, #331, #335, #336, #337, #338, #340, #341, #343, #356, #363, #364, #366

View File

@ -101,9 +101,9 @@ public class SiteMapParser {
protected Map<String, Extension> extensionNamespaces = new HashMap<>(); protected Map<String, Extension> extensionNamespaces = new HashMap<>();
private MimeTypeDetector mimeTypeDetector; private MimeTypeDetector mimeTypeDetector;
/** /**
* Option to allow DTD when parsing site map * Option to allow DTDs in sitemaps.
*/ */
private boolean allowDocTypeDefinitions = false; private boolean allowDocTypeDefinitions = false;
@ -145,6 +145,16 @@ public class SiteMapParser {
this.mimeTypeDetector = new MimeTypeDetector(); this.mimeTypeDetector = new MimeTypeDetector();
} }
/**
* Sets whether the parser allows a DTD in sitemaps or feeds.
*
* @param allowDocTypeDefinitions
* true if allowed. Default is false.
*/
public void setAllowDocTypeDefinitions(boolean allowDocTypeDefinitions) {
this.allowDocTypeDefinitions = allowDocTypeDefinitions;
}
/** /**
* @return whether invalid URLs will be rejected (where invalid means that * @return whether invalid URLs will be rejected (where invalid means that
* the URL is not under the base URL, see <a href= * the URL is not under the base URL, see <a href=
@ -169,7 +179,7 @@ public class SiteMapParser {
* specification, or any accepted namespace (see * specification, or any accepted namespace (see
* {@link #addAcceptedNamespace(String)}). Note enabling strict namespace * {@link #addAcceptedNamespace(String)}). Note enabling strict namespace
* checking always adds the namespace defined by the current sitemap * checking always adds the namespace defined by the current sitemap
* specificiation ({@link Namespace#SITEMAP}) to the list of accepted * specification ({@link Namespace#SITEMAP}) to the list of accepted
* namespaces. * namespaces.
* *
* @param s * @param s
@ -247,6 +257,7 @@ public class SiteMapParser {
public void setURLFilter(URLFilter filter) { public void setURLFilter(URLFilter filter) {
urlFilter = filter::filter; urlFilter = filter::filter;
} }
/** /**
* Returns a SiteMap or SiteMapIndex given an online sitemap URL * Returns a SiteMap or SiteMapIndex given an online sitemap URL
* *
@ -669,12 +680,4 @@ public class SiteMapParser {
public static boolean urlIsValid(String sitemapBaseUrl, String testUrl) { public static boolean urlIsValid(String sitemapBaseUrl, String testUrl) {
return testUrl.startsWith(sitemapBaseUrl); return testUrl.startsWith(sitemapBaseUrl);
} }
/**
* Set if the parser allow DTD
* @param allowDocTypeDefinitions true if allowed. Default is false.
*/
public void setAllowDocTypeDefinitions(boolean allowDocTypeDefinitions) {
this.allowDocTypeDefinitions = allowDocTypeDefinitions;
}
} }

View File

@ -46,6 +46,8 @@ public class SiteMapTester {
LOG.error("Java properties:"); LOG.error("Java properties:");
LOG.error(" sitemap.strictNamespace"); LOG.error(" sitemap.strictNamespace");
LOG.error(" if true sitemaps are required to use the standard namespace URI"); LOG.error(" if true sitemaps are required to use the standard namespace URI");
LOG.error(" sitemap.allow.dtd");
LOG.error(" if true sitemaps are allowed to include a DTD");
LOG.error(" sitemap.extensions"); LOG.error(" sitemap.extensions");
LOG.error(" if true enable sitemap extension parsing"); LOG.error(" if true enable sitemap extension parsing");
LOG.error(" sitemap.filter.urls"); LOG.error(" sitemap.filter.urls");
@ -71,6 +73,9 @@ public class SiteMapTester {
boolean strictNamespace = Boolean.getBoolean("sitemap.strictNamespace"); boolean strictNamespace = Boolean.getBoolean("sitemap.strictNamespace");
saxParser.setStrictNamespace(strictNamespace); saxParser.setStrictNamespace(strictNamespace);
boolean allowDTD = Boolean.getBoolean("sitemap.allow.dtd");
saxParser.setAllowDocTypeDefinitions(allowDTD);
boolean enableExtensions = Boolean.getBoolean("sitemap.extensions"); boolean enableExtensions = Boolean.getBoolean("sitemap.extensions");
if (enableExtensions) { if (enableExtensions) {
saxParser.enableExtensions(); saxParser.enableExtensions();

View File

@ -141,45 +141,45 @@ public class SiteMapParserTest {
Assertions.assertThrows(UnknownFormatException.class, Assertions.assertThrows(UnknownFormatException.class,
() -> parser.parseSiteMap(contentType, content, url)); () -> parser.parseSiteMap(contentType, content, url));
} }
@Test @Test
public void testSitemapXXEWithDocTypeAllowed() throws UnknownFormatException, IOException { public void testSitemapXXEWithDocTypeAllowed() throws UnknownFormatException, IOException {
// A file on disk that would be read if we were vulnerable to XXE // A file on disk that would be read if we were vulnerable to XXE
File doNotVisit = new File("src/test/resources/sitemaps/do-not-visit.txt"); File doNotVisit = new File("src/test/resources/sitemaps/do-not-visit.txt");
// Create a sitemap with an external entity referring to the local file // Create a sitemap with an external entity referring to the local file
SiteMapParser parser = new SiteMapParser(); SiteMapParser parser = new SiteMapParser();
parser.setAllowDocTypeDefinitions(true); parser.setAllowDocTypeDefinitions(true);
String contentType = "text/xml"; String contentType = "text/xml";
StringBuilder scontent = new StringBuilder(1024); StringBuilder scontent = new StringBuilder(1024);
scontent.append("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n") // scontent.append("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n") //
.append("<!DOCTYPE urlset [\n") // .append("<!DOCTYPE urlset [\n") //
.append(" <!ENTITY test SYSTEM \"file://" + doNotVisit.getAbsolutePath() + "\">\n") // .append(" <!ENTITY test SYSTEM \"file://" + doNotVisit.getAbsolutePath() + "\">\n") //
.append("]>\n") // .append("]>\n") //
.append("<urlset xmlns=\"http://www.sitemaps.org/schemas/sitemap/0.9\">\n") // .append("<urlset xmlns=\"http://www.sitemaps.org/schemas/sitemap/0.9\">\n") //
.append(" <url>\n") // .append(" <url>\n") //
.append(" <loc>http://www.example.com/visit-here</loc>\n") // .append(" <loc>http://www.example.com/visit-here</loc>\n") //
.append(" <lastmod>2019-06-19</lastmod>\n") // .append(" <lastmod>2019-06-19</lastmod>\n") //
.append(" </url>\n") // .append(" </url>\n") //
.append(" <url>\n") // .append(" <url>\n") //
.append(" <loc>&test;</loc>\n") // .append(" <loc>&test;</loc>\n") //
.append(" <lastmod>2019-06-19</lastmod>\n") // .append(" <lastmod>2019-06-19</lastmod>\n") //
.append(" </url>\n") // .append(" </url>\n") //
.append("</urlset>"); .append("</urlset>");
byte[] content = scontent.toString().getBytes(UTF_8); byte[] content = scontent.toString().getBytes(UTF_8);
URL url = new URL("http://www.example.com/sitemap.xxe.xml"); URL url = new URL("http://www.example.com/sitemap.xxe.xml");
AbstractSiteMap asm = parser.parseSiteMap(contentType, content, url); AbstractSiteMap asm = parser.parseSiteMap(contentType, content, url);
assertEquals(SitemapType.XML, asm.getType()); assertEquals(SitemapType.XML, asm.getType());
assertEquals(true, asm instanceof SiteMap); assertEquals(true, asm instanceof SiteMap);
assertEquals(true, asm.isProcessed()); assertEquals(true, asm.isProcessed());
SiteMap sm = (SiteMap) asm; SiteMap sm = (SiteMap) asm;
// Should only return a single valid URL and ignore the external entity // Should only return a single valid URL and ignore the external entity
assertEquals(1, sm.getSiteMapUrls().size()); assertEquals(1, sm.getSiteMapUrls().size());
assertEquals(new URL("http://www.example.com/visit-here"), sm.getSiteMapUrls().iterator().next().getUrl()); assertEquals(new URL("http://www.example.com/visit-here"), sm.getSiteMapUrls().iterator().next().getUrl());
} }
@Test @Test
public void testSitemapXIncludeDisabled() throws UnknownFormatException, IOException { public void testSitemapXIncludeDisabled() throws UnknownFormatException, IOException {
// A file on disk that would be read if we were vulnerable to XInclude // A file on disk that would be read if we were vulnerable to XInclude