mirror of
https://github.com/crawler-commons/crawler-commons
synced 2024-05-22 02:56:03 +02:00
Fix XXE vulnerability in Sitemap parser
- add unit test to verify that the parser is not vulnerable to XInclude attacks - apply code formatter - add changelog entry
This commit is contained in:
parent
2b66ad2060
commit
4841242390
|
@ -1,6 +1,7 @@
|
|||
Crawler-Commons Change Log
|
||||
|
||||
Current Development 1.2-SNAPSHOT (yyyy-mm-dd)
|
||||
- [Sitemaps] Fix XXE vulnerability in Sitemap parser (kovyrin) #323
|
||||
- [URLs] Sorting the Query Parameters (aecio) #246, #309
|
||||
- [URLs] Allows to (optionally) remove common irrelevant query parameters (aecio) #309
|
||||
- [Sitemaps] Allow to normalize URLs in sitemaps (murderinc, sebastian-nagel) #305
|
||||
|
|
|
@ -33,7 +33,7 @@ import java.util.Locale;
|
|||
|
||||
/** SiteMap or SiteMapIndex **/
|
||||
@SuppressWarnings("serial")
|
||||
public abstract class AbstractSiteMap implements Serializable {
|
||||
public abstract class AbstractSiteMap implements Serializable {
|
||||
|
||||
/** Various Sitemap types */
|
||||
public enum SitemapType {
|
||||
|
|
|
@ -581,8 +581,8 @@ public class SiteMapParser {
|
|||
|
||||
SAXParserFactory factory = SAXParserFactory.newInstance();
|
||||
|
||||
// disable validation and avoid that remote DTDs, schemas, etc. are
|
||||
// fetched
|
||||
// disable validation and avoid that DTDs, schemas, XML snippets, etc.
|
||||
// are fetched from remote servers or the local file system
|
||||
factory.setValidating(false);
|
||||
factory.setXIncludeAware(false);
|
||||
|
||||
|
|
|
@ -40,7 +40,8 @@ public class LinkAttributes extends ExtensionMetadata {
|
|||
public static final String HREF = "href";
|
||||
|
||||
/**
|
||||
* Specifies the prefix used when adding Link Attribute parameters to the Map returned by asMap
|
||||
* Specifies the prefix used when adding Link Attribute parameters to the
|
||||
* Map returned by asMap
|
||||
*/
|
||||
private static final String PARAMS_PREFIX = "params.%s";
|
||||
|
||||
|
|
|
@ -22,10 +22,11 @@ import java.util.HashMap;
|
|||
import java.util.Map;
|
||||
import java.util.Objects;
|
||||
|
||||
|
||||
/**
|
||||
* Data model for Google's extension to the sitemap protocol regarding news
|
||||
* indexing, as per <a href="http://www.google.com/schemas/sitemap-news/0.9">http://www.google.com/schemas/sitemap-news/0.9</a>.
|
||||
* indexing, as per <a
|
||||
* href="http://www.google.com/schemas/sitemap-news/0.9">http
|
||||
* ://www.google.com/schemas/sitemap-news/0.9</a>.
|
||||
*/
|
||||
@SuppressWarnings("serial")
|
||||
public class NewsAttributes extends ExtensionMetadata {
|
||||
|
@ -69,8 +70,9 @@ public class NewsAttributes extends ExtensionMetadata {
|
|||
private String title;
|
||||
|
||||
/**
|
||||
* News keywords found under news/keywords (optional)
|
||||
* See <a href="https://support.google.com/news/publisher/answer/116037">https://support.google.com/news/publisher/answer/116037</a> for examples
|
||||
* News keywords found under news/keywords (optional) See <a
|
||||
* href="https://support.google.com/news/publisher/answer/116037"
|
||||
* >https://support.google.com/news/publisher/answer/116037</a> for examples
|
||||
*/
|
||||
private String[] keywords;
|
||||
|
||||
|
|
|
@ -58,7 +58,7 @@ public class VideoAttributes extends ExtensionMetadata {
|
|||
public static final String RESTRICTED_PLATFORMS = "restricted_platforms";
|
||||
public static final String IS_LIVE = "is_live";
|
||||
public static final String DURATION = "duration";
|
||||
|
||||
|
||||
/**
|
||||
* Video thumbnail URL found under video/thumbnail_loc (required)
|
||||
*/
|
||||
|
|
|
@ -31,8 +31,6 @@ import java.io.IOException;
|
|||
import java.io.InputStream;
|
||||
import java.net.URL;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.time.ZonedDateTime;
|
||||
import java.time.temporal.ChronoField;
|
||||
import java.util.ArrayList;
|
||||
|
@ -119,24 +117,24 @@ public class SiteMapParserTest {
|
|||
// A file on disk that would be read if we were vulnerable to XXE
|
||||
File doNotVisit = new File("src/test/resources/sitemaps/do-not-visit.txt");
|
||||
|
||||
// Create a sitemap with an external entity referring to the local temporary file
|
||||
// Create a sitemap with an external entity referring to the local file
|
||||
SiteMapParser parser = new SiteMapParser();
|
||||
String contentType = "text/xml";
|
||||
StringBuilder scontent = new StringBuilder(1024);
|
||||
scontent.append("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n")
|
||||
.append("<!DOCTYPE urlset [\n")
|
||||
.append(" <!ENTITY test SYSTEM \"file://" + doNotVisit.toPath().toAbsolutePath() + "\">\n")
|
||||
.append("]>\n")
|
||||
.append("<urlset xmlns=\"http://www.sitemaps.org/schemas/sitemap/0.9\">\n")
|
||||
.append(" <url>\n")
|
||||
.append(" <loc>http://www.example.com/visit-here</loc>\n")
|
||||
.append(" <lastmod>2019-06-19</lastmod>\n")
|
||||
.append(" </url>\n")
|
||||
.append(" <url>\n")
|
||||
.append(" <loc>&test;</loc>\n")
|
||||
.append(" <lastmod>2019-06-19</lastmod>\n")
|
||||
.append(" </url>\n")
|
||||
.append("</urlset>");
|
||||
scontent.append("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n") //
|
||||
.append("<!DOCTYPE urlset [\n") //
|
||||
.append(" <!ENTITY test SYSTEM \"file://" + doNotVisit.getAbsolutePath() + "\">\n") //
|
||||
.append("]>\n") //
|
||||
.append("<urlset xmlns=\"http://www.sitemaps.org/schemas/sitemap/0.9\">\n") //
|
||||
.append(" <url>\n") //
|
||||
.append(" <loc>http://www.example.com/visit-here</loc>\n") //
|
||||
.append(" <lastmod>2019-06-19</lastmod>\n") //
|
||||
.append(" </url>\n") //
|
||||
.append(" <url>\n") //
|
||||
.append(" <loc>&test;</loc>\n") //
|
||||
.append(" <lastmod>2019-06-19</lastmod>\n") //
|
||||
.append(" </url>\n") //
|
||||
.append("</urlset>");
|
||||
byte[] content = scontent.toString().getBytes(UTF_8);
|
||||
|
||||
URL url = new URL("http://www.example.com/sitemap.xxe.xml");
|
||||
|
@ -146,7 +144,44 @@ public class SiteMapParserTest {
|
|||
assertEquals(true, asm.isProcessed());
|
||||
SiteMap sm = (SiteMap) asm;
|
||||
|
||||
// Should only return a single valid URL and ignore the external entity one
|
||||
// Should only return a single valid URL and ignore the external entity
|
||||
assertEquals(1, sm.getSiteMapUrls().size());
|
||||
assertEquals(new URL("http://www.example.com/visit-here"), sm.getSiteMapUrls().iterator().next().getUrl());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testSitemapXIncludeDisabled() throws UnknownFormatException, IOException {
|
||||
// A file on disk that would be read if we were vulnerable to XInclude
|
||||
// attacks
|
||||
File doNotVisit = new File("src/test/resources/sitemaps/do-not-visit.txt");
|
||||
|
||||
// Create a sitemap containing a XInclude referring to the local file
|
||||
SiteMapParser parser = new SiteMapParser();
|
||||
String contentType = "text/xml";
|
||||
StringBuilder scontent = new StringBuilder(1024);
|
||||
scontent.append("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n") //
|
||||
.append("<urlset xmlns=\"http://www.sitemaps.org/schemas/sitemap/0.9\"\n") //
|
||||
.append(" xmlns:xi=\"http://www.w3.org/2001/XInclude\">") //
|
||||
.append(" <url>\n") //
|
||||
.append(" <loc>http://www.example.com/visit-here</loc>\n") //
|
||||
.append(" <lastmod>2019-06-19</lastmod>\n") //
|
||||
.append(" </url>\n") //
|
||||
.append(" <url>\n") //
|
||||
.append(" <loc><xi:include href=\"file://" + doNotVisit.getAbsolutePath() + "\" parse=\"text\"/></loc>\n") //
|
||||
.append(" <lastmod>2019-06-19</lastmod>\n") //
|
||||
.append(" </url>\n") //
|
||||
.append("</urlset>");
|
||||
byte[] content = scontent.toString().getBytes(UTF_8);
|
||||
|
||||
URL url = new URL("http://www.example.com/sitemap.xinclude.xml");
|
||||
AbstractSiteMap asm = parser.parseSiteMap(contentType, content, url);
|
||||
assertEquals(SitemapType.XML, asm.getType());
|
||||
assertEquals(true, asm instanceof SiteMap);
|
||||
assertEquals(true, asm.isProcessed());
|
||||
SiteMap sm = (SiteMap) asm;
|
||||
|
||||
// Should only return a single valid URL and ignore the URL contained in
|
||||
// the external entity
|
||||
assertEquals(1, sm.getSiteMapUrls().size());
|
||||
assertEquals(new URL("http://www.example.com/visit-here"), sm.getSiteMapUrls().iterator().next().getUrl());
|
||||
}
|
||||
|
|
|
@ -18,7 +18,8 @@ public class LinkAttributesTest {
|
|||
{
|
||||
put("rel", "alternate");
|
||||
put("hreflang", "de");
|
||||
}});
|
||||
}
|
||||
});
|
||||
Map<String, String[]> map = attributes.asMap();
|
||||
|
||||
assertEquals(attributes.getHref().toString(), map.get(LinkAttributes.HREF)[0]);
|
||||
|
|
Loading…
Reference in New Issue