mirror of
https://github.com/crawler-commons/crawler-commons
synced 2024-05-07 07:16:02 +02:00
[Sitemaps] Disable support for DTDs in sitemaps by default
- update change log - apply code formatting - add support for parsing sitemaps with DTD in SiteMapTester
This commit is contained in:
parent
273ac6ac7e
commit
23ee0634dc
|
@ -1,6 +1,7 @@
|
||||||
Crawler-Commons Change Log
|
Crawler-Commons Change Log
|
||||||
|
|
||||||
Current Development 1.3-SNAPSHOT (yyyy-mm-dd)
|
Current Development 1.3-SNAPSHOT (yyyy-mm-dd)
|
||||||
|
- [Sitemaps] Disable support for DTDs in XML sitemaps and feeds by default (Kenneth Wong) #371
|
||||||
- Migrate Continuous Integration from Travis to GitHub Actions (Valery Yatsynovich) #333
|
- Migrate Continuous Integration from Travis to GitHub Actions (Valery Yatsynovich) #333
|
||||||
- Upgrade dependencies (dependabot, Richard Zowalla) #334, #339, #345, #346, #347, #350, #369
|
- Upgrade dependencies (dependabot, Richard Zowalla) #334, #339, #345, #346, #347, #350, #369
|
||||||
- Upgrade Maven plugins (dependabot, Richard Zowalla, sebastian-nagel) #328, #329, #330, #331, #335, #336, #337, #338, #340, #341, #343, #356, #363, #364, #366
|
- Upgrade Maven plugins (dependabot, Richard Zowalla, sebastian-nagel) #328, #329, #330, #331, #335, #336, #337, #338, #340, #341, #343, #356, #363, #364, #366
|
||||||
|
|
|
@ -101,9 +101,9 @@ public class SiteMapParser {
|
||||||
protected Map<String, Extension> extensionNamespaces = new HashMap<>();
|
protected Map<String, Extension> extensionNamespaces = new HashMap<>();
|
||||||
|
|
||||||
private MimeTypeDetector mimeTypeDetector;
|
private MimeTypeDetector mimeTypeDetector;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Option to allow DTD when parsing site map
|
* Option to allow DTDs in sitemaps.
|
||||||
*/
|
*/
|
||||||
private boolean allowDocTypeDefinitions = false;
|
private boolean allowDocTypeDefinitions = false;
|
||||||
|
|
||||||
|
@ -145,6 +145,16 @@ public class SiteMapParser {
|
||||||
this.mimeTypeDetector = new MimeTypeDetector();
|
this.mimeTypeDetector = new MimeTypeDetector();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Sets if the parser allows a DTD in sitemaps or feeds.
|
||||||
|
*
|
||||||
|
* @param allowDocTypeDefinitions
|
||||||
|
* true if allowed. Default is false.
|
||||||
|
*/
|
||||||
|
public void setAllowDocTypeDefinitions(boolean allowDocTypeDefinitions) {
|
||||||
|
this.allowDocTypeDefinitions = allowDocTypeDefinitions;
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @return whether invalid URLs will be rejected (where invalid means that
|
* @return whether invalid URLs will be rejected (where invalid means that
|
||||||
* the URL is not under the base URL, see <a href=
|
* the URL is not under the base URL, see <a href=
|
||||||
|
@ -169,7 +179,7 @@ public class SiteMapParser {
|
||||||
* specification, or any accepted namespace (see
|
* specification, or any accepted namespace (see
|
||||||
* {@link #addAcceptedNamespace(String)}). Note enabling strict namespace
|
* {@link #addAcceptedNamespace(String)}). Note enabling strict namespace
|
||||||
* checking always adds the namespace defined by the current sitemap
|
* checking always adds the namespace defined by the current sitemap
|
||||||
* specificiation ({@link Namespace#SITEMAP}) to the list of accepted
|
* specification ({@link Namespace#SITEMAP}) to the list of accepted
|
||||||
* namespaces.
|
* namespaces.
|
||||||
*
|
*
|
||||||
* @param s
|
* @param s
|
||||||
|
@ -247,6 +257,7 @@ public class SiteMapParser {
|
||||||
public void setURLFilter(URLFilter filter) {
|
public void setURLFilter(URLFilter filter) {
|
||||||
urlFilter = filter::filter;
|
urlFilter = filter::filter;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Returns a SiteMap or SiteMapIndex given an online sitemap URL
|
* Returns a SiteMap or SiteMapIndex given an online sitemap URL
|
||||||
*
|
*
|
||||||
|
@ -669,12 +680,4 @@ public class SiteMapParser {
|
||||||
public static boolean urlIsValid(String sitemapBaseUrl, String testUrl) {
|
public static boolean urlIsValid(String sitemapBaseUrl, String testUrl) {
|
||||||
return testUrl.startsWith(sitemapBaseUrl);
|
return testUrl.startsWith(sitemapBaseUrl);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
|
||||||
* Set if the parser allow DTD
|
|
||||||
* @param allowDocTypeDefinitions true if allowed. Default is false.
|
|
||||||
*/
|
|
||||||
public void setAllowDocTypeDefinitions(boolean allowDocTypeDefinitions) {
|
|
||||||
this.allowDocTypeDefinitions = allowDocTypeDefinitions;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -46,6 +46,8 @@ public class SiteMapTester {
|
||||||
LOG.error("Java properties:");
|
LOG.error("Java properties:");
|
||||||
LOG.error(" sitemap.strictNamespace");
|
LOG.error(" sitemap.strictNamespace");
|
||||||
LOG.error(" if true sitemaps are required to use the standard namespace URI");
|
LOG.error(" if true sitemaps are required to use the standard namespace URI");
|
||||||
|
LOG.error(" sitemap.allow.dtd");
|
||||||
|
LOG.error(" if true sitemaps are allowed to include a DTD");
|
||||||
LOG.error(" sitemap.extensions");
|
LOG.error(" sitemap.extensions");
|
||||||
LOG.error(" if true enable sitemap extension parsing");
|
LOG.error(" if true enable sitemap extension parsing");
|
||||||
LOG.error(" sitemap.filter.urls");
|
LOG.error(" sitemap.filter.urls");
|
||||||
|
@ -71,6 +73,9 @@ public class SiteMapTester {
|
||||||
boolean strictNamespace = Boolean.getBoolean("sitemap.strictNamespace");
|
boolean strictNamespace = Boolean.getBoolean("sitemap.strictNamespace");
|
||||||
saxParser.setStrictNamespace(strictNamespace);
|
saxParser.setStrictNamespace(strictNamespace);
|
||||||
|
|
||||||
|
boolean allowDTD = Boolean.getBoolean("sitemap.allow.dtd");
|
||||||
|
saxParser.setAllowDocTypeDefinitions(allowDTD);
|
||||||
|
|
||||||
boolean enableExtensions = Boolean.getBoolean("sitemap.extensions");
|
boolean enableExtensions = Boolean.getBoolean("sitemap.extensions");
|
||||||
if (enableExtensions) {
|
if (enableExtensions) {
|
||||||
saxParser.enableExtensions();
|
saxParser.enableExtensions();
|
||||||
|
|
|
@ -141,45 +141,45 @@ public class SiteMapParserTest {
|
||||||
Assertions.assertThrows(UnknownFormatException.class,
|
Assertions.assertThrows(UnknownFormatException.class,
|
||||||
() -> parser.parseSiteMap(contentType, content, url));
|
() -> parser.parseSiteMap(contentType, content, url));
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testSitemapXXEWithDocTypeAllowed() throws UnknownFormatException, IOException {
|
public void testSitemapXXEWithDocTypeAllowed() throws UnknownFormatException, IOException {
|
||||||
// A file on disk that would be read if we were vulnerable to XXE
|
// A file on disk that would be read if we were vulnerable to XXE
|
||||||
File doNotVisit = new File("src/test/resources/sitemaps/do-not-visit.txt");
|
File doNotVisit = new File("src/test/resources/sitemaps/do-not-visit.txt");
|
||||||
|
|
||||||
// Create a sitemap with an external entity referring to the local file
|
// Create a sitemap with an external entity referring to the local file
|
||||||
SiteMapParser parser = new SiteMapParser();
|
SiteMapParser parser = new SiteMapParser();
|
||||||
parser.setAllowDocTypeDefinitions(true);
|
parser.setAllowDocTypeDefinitions(true);
|
||||||
String contentType = "text/xml";
|
String contentType = "text/xml";
|
||||||
StringBuilder scontent = new StringBuilder(1024);
|
StringBuilder scontent = new StringBuilder(1024);
|
||||||
scontent.append("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n") //
|
scontent.append("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n") //
|
||||||
.append("<!DOCTYPE urlset [\n") //
|
.append("<!DOCTYPE urlset [\n") //
|
||||||
.append(" <!ENTITY test SYSTEM \"file://" + doNotVisit.getAbsolutePath() + "\">\n") //
|
.append(" <!ENTITY test SYSTEM \"file://" + doNotVisit.getAbsolutePath() + "\">\n") //
|
||||||
.append("]>\n") //
|
.append("]>\n") //
|
||||||
.append("<urlset xmlns=\"http://www.sitemaps.org/schemas/sitemap/0.9\">\n") //
|
.append("<urlset xmlns=\"http://www.sitemaps.org/schemas/sitemap/0.9\">\n") //
|
||||||
.append(" <url>\n") //
|
.append(" <url>\n") //
|
||||||
.append(" <loc>http://www.example.com/visit-here</loc>\n") //
|
.append(" <loc>http://www.example.com/visit-here</loc>\n") //
|
||||||
.append(" <lastmod>2019-06-19</lastmod>\n") //
|
.append(" <lastmod>2019-06-19</lastmod>\n") //
|
||||||
.append(" </url>\n") //
|
.append(" </url>\n") //
|
||||||
.append(" <url>\n") //
|
.append(" <url>\n") //
|
||||||
.append(" <loc>&test;</loc>\n") //
|
.append(" <loc>&test;</loc>\n") //
|
||||||
.append(" <lastmod>2019-06-19</lastmod>\n") //
|
.append(" <lastmod>2019-06-19</lastmod>\n") //
|
||||||
.append(" </url>\n") //
|
.append(" </url>\n") //
|
||||||
.append("</urlset>");
|
.append("</urlset>");
|
||||||
byte[] content = scontent.toString().getBytes(UTF_8);
|
byte[] content = scontent.toString().getBytes(UTF_8);
|
||||||
|
|
||||||
URL url = new URL("http://www.example.com/sitemap.xxe.xml");
|
URL url = new URL("http://www.example.com/sitemap.xxe.xml");
|
||||||
AbstractSiteMap asm = parser.parseSiteMap(contentType, content, url);
|
AbstractSiteMap asm = parser.parseSiteMap(contentType, content, url);
|
||||||
assertEquals(SitemapType.XML, asm.getType());
|
assertEquals(SitemapType.XML, asm.getType());
|
||||||
assertEquals(true, asm instanceof SiteMap);
|
assertEquals(true, asm instanceof SiteMap);
|
||||||
assertEquals(true, asm.isProcessed());
|
assertEquals(true, asm.isProcessed());
|
||||||
SiteMap sm = (SiteMap) asm;
|
SiteMap sm = (SiteMap) asm;
|
||||||
|
|
||||||
// Should only return a single valid URL and ignore the external entity
|
// Should only return a single valid URL and ignore the external entity
|
||||||
assertEquals(1, sm.getSiteMapUrls().size());
|
assertEquals(1, sm.getSiteMapUrls().size());
|
||||||
assertEquals(new URL("http://www.example.com/visit-here"), sm.getSiteMapUrls().iterator().next().getUrl());
|
assertEquals(new URL("http://www.example.com/visit-here"), sm.getSiteMapUrls().iterator().next().getUrl());
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testSitemapXIncludeDisabled() throws UnknownFormatException, IOException {
|
public void testSitemapXIncludeDisabled() throws UnknownFormatException, IOException {
|
||||||
// A file on disk that would be read if we were vulnerable to XInclude
|
// A file on disk that would be read if we were vulnerable to XInclude
|
||||||
|
|
Loading…
Reference in New Issue