1
0
Fork 0
mirror of https://github.com/crawler-commons/crawler-commons synced 2024-05-22 02:56:03 +02:00

Fix XXE vulnerability in Sitemap parser

- add unit test to verify that the parser is not vulnerable
  to XInclude attacks
- apply code formatter
- add changelog entry
This commit is contained in:
Sebastian Nagel 2021-10-01 09:42:47 +02:00
parent 2b66ad2060
commit 4841242390
8 changed files with 68 additions and 28 deletions

View File

@ -1,6 +1,7 @@
Crawler-Commons Change Log
Current Development 1.2-SNAPSHOT (yyyy-mm-dd)
- [Sitemaps] Fix XXE vulnerability in Sitemap parser (kovyrin) #323
- [URLs] Sorting the Query Parameters (aecio) #246, #309
- [URLs] Allows to (optionally) remove common irrelevant query parameters (aecio) #309
- [Sitemaps] Allow to normalize URLs in sitemaps (murderinc, sebastian-nagel) #305

View File

@ -33,7 +33,7 @@ import java.util.Locale;
/** SiteMap or SiteMapIndex **/
@SuppressWarnings("serial")
public abstract class AbstractSiteMap implements Serializable {
public abstract class AbstractSiteMap implements Serializable {
/** Various Sitemap types */
public enum SitemapType {

View File

@ -581,8 +581,8 @@ public class SiteMapParser {
SAXParserFactory factory = SAXParserFactory.newInstance();
// disable validation and avoid that remote DTDs, schemas, etc. are
// fetched
// disable validation and avoid that DTDs, schemas, XML snippets, etc.
// are fetched from remote servers or the local file system
factory.setValidating(false);
factory.setXIncludeAware(false);

View File

@ -40,7 +40,8 @@ public class LinkAttributes extends ExtensionMetadata {
public static final String HREF = "href";
/**
* Specifies the prefix used when adding Link Attribute parameters to the Map returned by asMap
* Specifies the prefix used when adding Link Attribute parameters to the
* Map returned by asMap
*/
private static final String PARAMS_PREFIX = "params.%s";

View File

@ -22,10 +22,11 @@ import java.util.HashMap;
import java.util.Map;
import java.util.Objects;
/**
* Data model for Google's extension to the sitemap protocol regarding news
* indexing, as per <a href="http://www.google.com/schemas/sitemap-news/0.9">http://www.google.com/schemas/sitemap-news/0.9</a>.
* indexing, as per <a
* href="http://www.google.com/schemas/sitemap-news/0.9">http
* ://www.google.com/schemas/sitemap-news/0.9</a>.
*/
@SuppressWarnings("serial")
public class NewsAttributes extends ExtensionMetadata {
@ -69,8 +70,9 @@ public class NewsAttributes extends ExtensionMetadata {
private String title;
/**
* News keywords found under news/keywords (optional)
* See <a href="https://support.google.com/news/publisher/answer/116037">https://support.google.com/news/publisher/answer/116037</a> for examples
* News keywords found under news/keywords (optional) See <a
* href="https://support.google.com/news/publisher/answer/116037"
* >https://support.google.com/news/publisher/answer/116037</a> for examples
*/
private String[] keywords;

View File

@ -58,7 +58,7 @@ public class VideoAttributes extends ExtensionMetadata {
public static final String RESTRICTED_PLATFORMS = "restricted_platforms";
public static final String IS_LIVE = "is_live";
public static final String DURATION = "duration";
/**
* Video thumbnail URL found under video/thumbnail_loc (required)
*/

View File

@ -31,8 +31,6 @@ import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.time.ZonedDateTime;
import java.time.temporal.ChronoField;
import java.util.ArrayList;
@ -119,24 +117,24 @@ public class SiteMapParserTest {
// A file on disk that would be read if we were vulnerable to XXE
File doNotVisit = new File("src/test/resources/sitemaps/do-not-visit.txt");
// Create a sitemap with an external entity referring to the local temporary file
// Create a sitemap with an external entity referring to the local file
SiteMapParser parser = new SiteMapParser();
String contentType = "text/xml";
StringBuilder scontent = new StringBuilder(1024);
scontent.append("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n")
.append("<!DOCTYPE urlset [\n")
.append(" <!ENTITY test SYSTEM \"file://" + doNotVisit.toPath().toAbsolutePath() + "\">\n")
.append("]>\n")
.append("<urlset xmlns=\"http://www.sitemaps.org/schemas/sitemap/0.9\">\n")
.append(" <url>\n")
.append(" <loc>http://www.example.com/visit-here</loc>\n")
.append(" <lastmod>2019-06-19</lastmod>\n")
.append(" </url>\n")
.append(" <url>\n")
.append(" <loc>&test;</loc>\n")
.append(" <lastmod>2019-06-19</lastmod>\n")
.append(" </url>\n")
.append("</urlset>");
scontent.append("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n") //
.append("<!DOCTYPE urlset [\n") //
.append(" <!ENTITY test SYSTEM \"file://" + doNotVisit.getAbsolutePath() + "\">\n") //
.append("]>\n") //
.append("<urlset xmlns=\"http://www.sitemaps.org/schemas/sitemap/0.9\">\n") //
.append(" <url>\n") //
.append(" <loc>http://www.example.com/visit-here</loc>\n") //
.append(" <lastmod>2019-06-19</lastmod>\n") //
.append(" </url>\n") //
.append(" <url>\n") //
.append(" <loc>&test;</loc>\n") //
.append(" <lastmod>2019-06-19</lastmod>\n") //
.append(" </url>\n") //
.append("</urlset>");
byte[] content = scontent.toString().getBytes(UTF_8);
URL url = new URL("http://www.example.com/sitemap.xxe.xml");
@ -146,7 +144,44 @@ public class SiteMapParserTest {
assertEquals(true, asm.isProcessed());
SiteMap sm = (SiteMap) asm;
// Should only return a single valid URL and ignore the external entity one
// Should only return a single valid URL and ignore the external entity
assertEquals(1, sm.getSiteMapUrls().size());
assertEquals(new URL("http://www.example.com/visit-here"), sm.getSiteMapUrls().iterator().next().getUrl());
}
@Test
public void testSitemapXIncludeDisabled() throws UnknownFormatException, IOException {
// A file on disk that would be read if we were vulnerable to XInclude
// attacks
File doNotVisit = new File("src/test/resources/sitemaps/do-not-visit.txt");
// Create a sitemap containing a XInclude referring to the local file
SiteMapParser parser = new SiteMapParser();
String contentType = "text/xml";
StringBuilder scontent = new StringBuilder(1024);
scontent.append("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n") //
.append("<urlset xmlns=\"http://www.sitemaps.org/schemas/sitemap/0.9\"\n") //
.append(" xmlns:xi=\"http://www.w3.org/2001/XInclude\">") //
.append(" <url>\n") //
.append(" <loc>http://www.example.com/visit-here</loc>\n") //
.append(" <lastmod>2019-06-19</lastmod>\n") //
.append(" </url>\n") //
.append(" <url>\n") //
.append(" <loc><xi:include href=\"file://" + doNotVisit.getAbsolutePath() + "\" parse=\"text\"/></loc>\n") //
.append(" <lastmod>2019-06-19</lastmod>\n") //
.append(" </url>\n") //
.append("</urlset>");
byte[] content = scontent.toString().getBytes(UTF_8);
URL url = new URL("http://www.example.com/sitemap.xinclude.xml");
AbstractSiteMap asm = parser.parseSiteMap(contentType, content, url);
assertEquals(SitemapType.XML, asm.getType());
assertEquals(true, asm instanceof SiteMap);
assertEquals(true, asm.isProcessed());
SiteMap sm = (SiteMap) asm;
// Should only return a single valid URL and ignore the URL contained in
// the external entity
assertEquals(1, sm.getSiteMapUrls().size());
assertEquals(new URL("http://www.example.com/visit-here"), sm.getSiteMapUrls().iterator().next().getUrl());
}

View File

@ -18,7 +18,8 @@ public class LinkAttributesTest {
{
put("rel", "alternate");
put("hreflang", "de");
}});
}
});
Map<String, String[]> map = attributes.asMap();
assertEquals(attributes.getHref().toString(), map.get(LinkAttributes.HREF)[0]);