mirror of
https://github.com/crawler-commons/crawler-commons
synced 2024-05-05 06:46:02 +02:00
Sitemaps: avoid calling java.net.URL::equals in equals method of sitemaps and sitemap extensions (#326)
* Sitemaps: avoid calling java.net.URL::equals in equals method of sitemaps and sitemap extensions (fixes #322) - compare URL strings to avoid that java.net.URL::equals triggers unwanted and potentially slow DNS lookups to resolve the host part. Replace: - Objects::equals in equals methods of sitemap extensions - URL::equals and URL::hashCode in SiteMapIndex and SiteMapURL - enable check for URL::equals and URL::hashCode in Forbidden API Checker * Sitemaps: avoid calling java.net.URL::equals in equals method of sitemaps and sitemap extensions - avoid NPEs in equals and hashCode methods * Sitemaps: avoid calling java.net.URL::equals in equals method of sitemaps and sitemap extensions - avoid NPE, return null as before if null is passed to SitemapIndex::getSitemap
This commit is contained in:
parent
ec1f2e54ec
commit
0493878f80
|
@ -1,6 +1,7 @@
|
|||
Crawler-Commons Change Log
|
||||
|
||||
Current Development 1.2-SNAPSHOT (yyyy-mm-dd)
|
||||
- [Sitemaps] Avoid calling java.net.URL::equals in equals method of sitemaps and extensions (sebastian-nagel) #322
|
||||
- [URLs] Provide a builder class to configure the URL normalizer (aecio) #321, #324
|
||||
- [URLs] Make normalization of IDNs configurable (to ASCII or Unicode) via builder (aecio, sebastian-nagel) #324
|
||||
- [Sitemaps] Fix XXE vulnerability in Sitemap parser (kovyrin) #323
|
||||
|
|
5
pom.xml
5
pom.xml
|
@ -203,7 +203,7 @@
|
|||
<plugin>
|
||||
<groupId>de.thetaphi</groupId>
|
||||
<artifactId>forbiddenapis</artifactId>
|
||||
<version>2.3</version>
|
||||
<version>3.2</version>
|
||||
<configuration>
|
||||
<!--
|
||||
if the used Java version is too new,
|
||||
|
@ -215,6 +215,9 @@
|
|||
<bundledSignature>jdk-deprecated</bundledSignature>
|
||||
<bundledSignature>jdk-non-portable</bundledSignature>
|
||||
</bundledSignatures>
|
||||
<signaturesFiles>
|
||||
<signaturesFile>src/test/resources/forbidden-apis-signatures.txt</signaturesFile>
|
||||
</signaturesFiles>
|
||||
</configuration>
|
||||
<executions>
|
||||
<execution>
|
||||
|
|
|
@ -80,8 +80,12 @@ public class SiteMapIndex extends AbstractSiteMap {
|
|||
* @return SiteMap corresponding to the URL or null
|
||||
*/
|
||||
public AbstractSiteMap getSitemap(URL url) {
|
||||
if (url == null)
|
||||
return null;
|
||||
String u = url.toString();
|
||||
for (AbstractSiteMap asm : sitemaps) {
|
||||
if (asm.getUrl().equals(url)) {
|
||||
URL su = asm.getUrl();
|
||||
if (su != null && su.toString().equals(u)) {
|
||||
return asm;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -356,7 +356,10 @@ public class SiteMapURL implements Serializable {
|
|||
|
||||
SiteMapURL that = (SiteMapURL) o;
|
||||
|
||||
if (!url.equals(that.url))
|
||||
if (url == that.url)
|
||||
return true;
|
||||
|
||||
if (url == null || that.url == null || !url.toString().equals(that.url.toString()))
|
||||
return false;
|
||||
|
||||
return true;
|
||||
|
@ -364,7 +367,9 @@ public class SiteMapURL implements Serializable {
|
|||
|
||||
@Override
|
||||
public int hashCode() {
|
||||
return url.hashCode();
|
||||
if (url == null)
|
||||
return "".hashCode();
|
||||
return url.toString().hashCode();
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
|
@ -19,6 +19,7 @@ package crawlercommons.sitemaps.extension;
|
|||
import crawlercommons.sitemaps.SiteMapURL;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.net.URL;
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
|
@ -36,4 +37,13 @@ public abstract class ExtensionMetadata implements Serializable {
|
|||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* Compare URLs by their string representation because calling
|
||||
* {@link URL#equals(Object)} may trigger an unwanted and potentially slow
|
||||
* DNS lookup to resolve the host part
|
||||
*/
|
||||
protected static boolean urlEquals(URL a, URL b) {
|
||||
return (a == b) || (a != null && b != null && a.toString().equals(b.toString()));
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -132,11 +132,11 @@ public class ImageAttributes extends ExtensionMetadata {
|
|||
return false;
|
||||
}
|
||||
ImageAttributes that = (ImageAttributes) other;
|
||||
return Objects.equals(loc, that.loc) //
|
||||
return urlEquals(loc, that.loc) //
|
||||
&& Objects.equals(caption, that.caption) //
|
||||
&& Objects.equals(geoLocation, that.geoLocation) //
|
||||
&& Objects.equals(title, that.title) //
|
||||
&& Objects.equals(license, that.license);
|
||||
&& urlEquals(license, that.license);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
|
|
@ -105,7 +105,7 @@ public class LinkAttributes extends ExtensionMetadata {
|
|||
return false;
|
||||
}
|
||||
LinkAttributes that = (LinkAttributes) other;
|
||||
return Objects.equals(href, that.href) //
|
||||
return urlEquals(href, that.href) //
|
||||
&& Objects.equals(params, that.params);
|
||||
}
|
||||
|
||||
|
|
|
@ -550,11 +550,11 @@ public class VideoAttributes extends ExtensionMetadata {
|
|||
return false;
|
||||
}
|
||||
VideoAttributes that = (VideoAttributes) other;
|
||||
return Objects.equals(thumbnailLoc, that.thumbnailLoc) //
|
||||
return urlEquals(thumbnailLoc, that.thumbnailLoc) //
|
||||
&& Objects.equals(title, that.title) //
|
||||
&& Objects.equals(description, that.description) //
|
||||
&& Objects.equals(contentLoc, that.contentLoc) //
|
||||
&& Objects.equals(playerLoc, that.playerLoc) //
|
||||
&& urlEquals(contentLoc, that.contentLoc) //
|
||||
&& urlEquals(playerLoc, that.playerLoc) //
|
||||
&& Objects.equals(duration, that.duration) //
|
||||
&& Objects.equals(expirationDate, that.expirationDate) //
|
||||
&& Objects.equals(rating, that.rating) //
|
||||
|
@ -565,7 +565,7 @@ public class VideoAttributes extends ExtensionMetadata {
|
|||
&& Objects.equals(category, that.category) //
|
||||
&& Objects.deepEquals(restrictedCountries, that.restrictedCountries) //
|
||||
&& Objects.deepEquals(allowedCountries, that.allowedCountries) //
|
||||
&& Objects.equals(galleryLoc, that.galleryLoc) //
|
||||
&& urlEquals(galleryLoc, that.galleryLoc) //
|
||||
&& Objects.equals(galleryTitle, that.galleryTitle) //
|
||||
&& Objects.deepEquals(prices, that.prices) //
|
||||
&& Objects.equals(requiresSubscription, that.requiresSubscription) //
|
||||
|
|
|
@ -16,9 +16,14 @@
|
|||
|
||||
package crawlercommons.sitemaps;
|
||||
|
||||
import java.net.MalformedURLException;
|
||||
import java.net.URL;
|
||||
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
import static org.junit.jupiter.api.Assertions.assertNotNull;
|
||||
import static org.junit.jupiter.api.Assertions.assertNull;
|
||||
|
||||
public class SiteMapIndexTest {
|
||||
|
||||
|
@ -32,4 +37,17 @@ public class SiteMapIndexTest {
|
|||
assertEquals(2, index.getSitemaps(true).size());
|
||||
assertEquals(3, index.getSitemaps(false).size());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testNPE() {
|
||||
SiteMapIndex index = new SiteMapIndex();
|
||||
index.addSitemap(new SiteMap("INVALID", "2020-06-18"));
|
||||
index.addSitemap(new SiteMap("https://example.com/sitemap1.xml", "2020-06-18"));
|
||||
try {
|
||||
assertNotNull(index.getSitemap(new URL("https://example.com/sitemap1.xml")));
|
||||
} catch (MalformedURLException e) {
|
||||
// URL is valid
|
||||
}
|
||||
assertNull(index.getSitemap(null));
|
||||
}
|
||||
}
|
||||
|
|
|
@ -18,9 +18,9 @@ package crawlercommons.sitemaps;
|
|||
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.assertFalse;
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
|
||||
/** Created on 13/10/2014. */
|
||||
public class SiteMapURLTest {
|
||||
private SiteMapURL siteMapURL = new SiteMapURL("http://example.com", true);
|
||||
|
||||
|
@ -57,4 +57,12 @@ public class SiteMapURLTest {
|
|||
siteMapURL.setPriority(Double.NEGATIVE_INFINITY);
|
||||
assertEquals(SiteMapURL.DEFAULT_PRIORITY, siteMapURL.getPriority(), 0);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testNPE() {
|
||||
SiteMapURL invalid = new SiteMapURL("INVALID_URL", true);
|
||||
assertFalse(siteMapURL.equals(invalid));
|
||||
// should not throw an exception
|
||||
invalid.hashCode();
|
||||
}
|
||||
}
|
||||
|
|
|
@ -0,0 +1,2 @@
|
|||
java.net.URL#equals(java.lang.Object) @ may trigger a DNS lookup to resolve the host part
|
||||
java.net.URL#hashCode() @ may trigger a DNS lookup to resolve the host part
|
Loading…
Reference in New Issue