1
0
Fork 0
mirror of https://github.com/crawler-commons/crawler-commons synced 2024-05-05 06:46:02 +02:00

Sitemaps: avoid calling java.net.URL::equals in equals method of sitemaps and sitemap extensions (#326)

* Sitemaps: avoid calling java.net.URL::equals in equals method of sitemaps and sitemap extensions
(fixes #322)
- compare URL strings to avoid that java.net.URL::equals triggers unwanted and potentially slow
  DNS lookups to resolve the host part. Replace:
  - Objects::equals in equals methods of sitemap extensions
  - URL::equals and URL::hashCode in SiteMapIndex and SiteMapURL
- enable check for URL::equals and URL::hashCode in Forbidden API Checker

* Sitemaps: avoid calling java.net.URL::equals in equals method of sitemaps and sitemap extensions
- avoid NPEs in equals and hashCode methods

* Sitemaps: avoid calling java.net.URL::equals in equals method of sitemaps and sitemap extensions
- avoid NPE, return null as before if null is passed to SitemapIndex::getSitemap
This commit is contained in:
Sebastian Nagel 2021-10-06 11:07:02 +02:00 committed by GitHub
parent ec1f2e54ec
commit 0493878f80
Signed by: GitHub
GPG Key ID: 4AEE18F83AFDEB23
11 changed files with 63 additions and 12 deletions

View File

@ -1,6 +1,7 @@
Crawler-Commons Change Log
Current Development 1.2-SNAPSHOT (yyyy-mm-dd)
- [Sitemaps] Avoid calling java.net.URL::equals in equals method of sitemaps and extensions (sebastian-nagel) #322
- [URLs] Provide a builder class to configure the URL normalizer (aecio) #321, #324
- [URLs] Make normalization of IDNs configurable (to ASCII or Unicode) via builder (aecio, sebastian-nagel) #324
- [Sitemaps] Fix XXE vulnerability in Sitemap parser (kovyrin) #323

View File

@ -203,7 +203,7 @@
<plugin>
<groupId>de.thetaphi</groupId>
<artifactId>forbiddenapis</artifactId>
<version>2.3</version>
<version>3.2</version>
<configuration>
<!--
if the used Java version is too new,
@ -215,6 +215,9 @@
<bundledSignature>jdk-deprecated</bundledSignature>
<bundledSignature>jdk-non-portable</bundledSignature>
</bundledSignatures>
<signaturesFiles>
<signaturesFile>src/test/resources/forbidden-apis-signatures.txt</signaturesFile>
</signaturesFiles>
</configuration>
<executions>
<execution>

View File

@ -80,8 +80,12 @@ public class SiteMapIndex extends AbstractSiteMap {
* @return SiteMap corresponding to the URL or null
*/
public AbstractSiteMap getSitemap(URL url) {
if (url == null)
return null;
String u = url.toString();
for (AbstractSiteMap asm : sitemaps) {
if (asm.getUrl().equals(url)) {
URL su = asm.getUrl();
if (su != null && su.toString().equals(u)) {
return asm;
}
}

View File

@ -356,7 +356,10 @@ public class SiteMapURL implements Serializable {
SiteMapURL that = (SiteMapURL) o;
if (!url.equals(that.url))
if (url == that.url)
return true;
if (url == null || that.url == null || !url.toString().equals(that.url.toString()))
return false;
return true;
@ -364,7 +367,9 @@ public class SiteMapURL implements Serializable {
@Override
public int hashCode() {
return url.hashCode();
if (url == null)
return "".hashCode();
return url.toString().hashCode();
}
@Override

View File

@ -19,6 +19,7 @@ package crawlercommons.sitemaps.extension;
import crawlercommons.sitemaps.SiteMapURL;
import java.io.Serializable;
import java.net.URL;
import java.util.Map;
/**
@ -36,4 +37,13 @@ public abstract class ExtensionMetadata implements Serializable {
return true;
}
/**
* Compare URLs by their string representation because calling
* {@link URL#equals(Object)} may trigger an unwanted and potentially slow
* DNS lookup to resolve the host part
*/
protected static boolean urlEquals(URL a, URL b) {
return (a == b) || (a != null && b != null && a.toString().equals(b.toString()));
}
}

View File

@ -132,11 +132,11 @@ public class ImageAttributes extends ExtensionMetadata {
return false;
}
ImageAttributes that = (ImageAttributes) other;
return Objects.equals(loc, that.loc) //
return urlEquals(loc, that.loc) //
&& Objects.equals(caption, that.caption) //
&& Objects.equals(geoLocation, that.geoLocation) //
&& Objects.equals(title, that.title) //
&& Objects.equals(license, that.license);
&& urlEquals(license, that.license);
}
@Override

View File

@ -105,7 +105,7 @@ public class LinkAttributes extends ExtensionMetadata {
return false;
}
LinkAttributes that = (LinkAttributes) other;
return Objects.equals(href, that.href) //
return urlEquals(href, that.href) //
&& Objects.equals(params, that.params);
}

View File

@ -550,11 +550,11 @@ public class VideoAttributes extends ExtensionMetadata {
return false;
}
VideoAttributes that = (VideoAttributes) other;
return Objects.equals(thumbnailLoc, that.thumbnailLoc) //
return urlEquals(thumbnailLoc, that.thumbnailLoc) //
&& Objects.equals(title, that.title) //
&& Objects.equals(description, that.description) //
&& Objects.equals(contentLoc, that.contentLoc) //
&& Objects.equals(playerLoc, that.playerLoc) //
&& urlEquals(contentLoc, that.contentLoc) //
&& urlEquals(playerLoc, that.playerLoc) //
&& Objects.equals(duration, that.duration) //
&& Objects.equals(expirationDate, that.expirationDate) //
&& Objects.equals(rating, that.rating) //
@ -565,7 +565,7 @@ public class VideoAttributes extends ExtensionMetadata {
&& Objects.equals(category, that.category) //
&& Objects.deepEquals(restrictedCountries, that.restrictedCountries) //
&& Objects.deepEquals(allowedCountries, that.allowedCountries) //
&& Objects.equals(galleryLoc, that.galleryLoc) //
&& urlEquals(galleryLoc, that.galleryLoc) //
&& Objects.equals(galleryTitle, that.galleryTitle) //
&& Objects.deepEquals(prices, that.prices) //
&& Objects.equals(requiresSubscription, that.requiresSubscription) //

View File

@ -16,9 +16,14 @@
package crawlercommons.sitemaps;
import java.net.MalformedURLException;
import java.net.URL;
import org.junit.jupiter.api.Test;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertNotNull;
import static org.junit.jupiter.api.Assertions.assertNull;
public class SiteMapIndexTest {
@ -32,4 +37,17 @@ public class SiteMapIndexTest {
assertEquals(2, index.getSitemaps(true).size());
assertEquals(3, index.getSitemaps(false).size());
}
@Test
public void testNPE() {
SiteMapIndex index = new SiteMapIndex();
index.addSitemap(new SiteMap("INVALID", "2020-06-18"));
index.addSitemap(new SiteMap("https://example.com/sitemap1.xml", "2020-06-18"));
try {
assertNotNull(index.getSitemap(new URL("https://example.com/sitemap1.xml")));
} catch (MalformedURLException e) {
// URL is valid
}
assertNull(index.getSitemap(null));
}
}

View File

@ -18,9 +18,9 @@ package crawlercommons.sitemaps;
import org.junit.jupiter.api.Test;
import static org.junit.jupiter.api.Assertions.assertFalse;
import static org.junit.jupiter.api.Assertions.assertEquals;
/** Created on 13/10/2014. */
public class SiteMapURLTest {
private SiteMapURL siteMapURL = new SiteMapURL("http://example.com", true);
@ -57,4 +57,12 @@ public class SiteMapURLTest {
siteMapURL.setPriority(Double.NEGATIVE_INFINITY);
assertEquals(SiteMapURL.DEFAULT_PRIORITY, siteMapURL.getPriority(), 0);
}
@Test
public void testNPE() {
SiteMapURL invalid = new SiteMapURL("INVALID_URL", true);
assertFalse(siteMapURL.equals(invalid));
// should not throw an exception
invalid.hashCode();
}
}

View File

@ -0,0 +1,2 @@
java.net.URL#equals(java.lang.Object) @ may trigger a DNS lookup to resolve the host part
java.net.URL#hashCode() @ may trigger a DNS lookup to resolve the host part