From 23ee0634dc0ec8bf756f8f1240958ef84235d379 Mon Sep 17 00:00:00 2001 From: Sebastian Nagel Date: Wed, 2 Mar 2022 15:47:19 +0100 Subject: [PATCH] [Sitemaps] Disable support for DTDs in sitemaps by default - update change log - apply code formatting - add support for parsing sitemaps with DTD in SiteMapTester --- CHANGES.txt | 1 + .../sitemaps/SiteMapParser.java | 25 +++++++------ .../sitemaps/SiteMapTester.java | 5 +++ .../sitemaps/SiteMapParserTest.java | 36 +++++++++---------- 4 files changed, 38 insertions(+), 29 deletions(-) diff --git a/CHANGES.txt b/CHANGES.txt index 1161ff0..5e28a5e 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -1,6 +1,7 @@ Crawler-Commons Change Log Current Development 1.3-SNAPSHOT (yyyy-mm-dd) +- [Sitemaps] Disable support for DTDs in XML sitemaps and feeds by default (Kenneth Wong) #371 - Migrate Continuous Integration from Travis to GitHub Actions (Valery Yatsynovich) #333 - Upgrade dependencies (dependabot, Richard Zowalla) #334, #339, #345, #346, #347, #350, #369 - Upgrade Maven plugins (dependabot, Richard Zowalla, sebastian-nagel) #328, #329, #330, #331, #335, #336, #337, #338, #340, #341, #343, #356, #363. #364, #366 diff --git a/src/main/java/crawlercommons/sitemaps/SiteMapParser.java b/src/main/java/crawlercommons/sitemaps/SiteMapParser.java index 6ddcdde..da2e7e8 100644 --- a/src/main/java/crawlercommons/sitemaps/SiteMapParser.java +++ b/src/main/java/crawlercommons/sitemaps/SiteMapParser.java @@ -101,9 +101,9 @@ public class SiteMapParser { protected Map extensionNamespaces = new HashMap<>(); private MimeTypeDetector mimeTypeDetector; - + /** - * Option to allow DTD when parsing site map + * Option to allow DTDs in sitemaps. */ private boolean allowDocTypeDefinitions = false; @@ -145,6 +145,16 @@ public class SiteMapParser { this.mimeTypeDetector = new MimeTypeDetector(); } + /** + * Sets if the parser allows a DTD in sitemaps or feeds. + * + * @param allowDocTypeDefinitions + * true if allowed. Default is false. + */ + public void setAllowDocTypeDefinitions(boolean allowDocTypeDefinitions) { + this.allowDocTypeDefinitions = allowDocTypeDefinitions; + } + /** * @return whether invalid URLs will be rejected (where invalid means that * the URL is not under the base URL, see parser.parseSiteMap(contentType, content, url)); } - + @Test public void testSitemapXXEWithDocTypeAllowed() throws UnknownFormatException, IOException { // A file on disk that would be read if we were vulnerable to XXE File doNotVisit = new File("src/test/resources/sitemaps/do-not-visit.txt"); - + // Create a sitemap with an external entity referring to the local file SiteMapParser parser = new SiteMapParser(); parser.setAllowDocTypeDefinitions(true); String contentType = "text/xml"; StringBuilder scontent = new StringBuilder(1024); scontent.append("\n") // - .append("\n") // - .append("]>\n") // - .append("\n") // - .append(" \n") // - .append(" http://www.example.com/visit-here\n") // - .append(" 2019-06-19\n") // - .append(" \n") // - .append(" \n") // - .append(" &test;\n") // - .append(" 2019-06-19\n") // - .append(" \n") // - .append(""); + .append("\n") // + .append("]>\n") // + .append("\n") // + .append(" \n") // + .append(" http://www.example.com/visit-here\n") // + .append(" 2019-06-19\n") // + .append(" \n") // + .append(" \n") // + .append(" &test;\n") // + .append(" 2019-06-19\n") // + .append(" \n") // + .append(""); byte[] content = scontent.toString().getBytes(UTF_8); - + URL url = new URL("http://www.example.com/sitemap.xxe.xml"); AbstractSiteMap asm = parser.parseSiteMap(contentType, content, url); assertEquals(SitemapType.XML, asm.getType()); assertEquals(true, asm instanceof SiteMap); assertEquals(true, asm.isProcessed()); SiteMap sm = (SiteMap) asm; - + // Should only return a single valid URL and ignore the external entity assertEquals(1, sm.getSiteMapUrls().size()); assertEquals(new URL("http://www.example.com/visit-here"), sm.getSiteMapUrls().iterator().next().getUrl()); } - + @Test public void testSitemapXIncludeDisabled() throws UnknownFormatException, IOException { // A file on disk that would be read if we were vulnerable to XInclude