mirror of
https://github.com/crawler-commons/crawler-commons
synced 2024-05-11 08:16:04 +02:00
- make all sitemap classes, including extensions, implement the Serializable interface - extend the sitemap parser unit tests to check object serialization on various types of sitemaps (index, Atom/RSS feeds, video sitemaps, etc.)
This commit is contained in:
parent
e9772be011
commit
78d7e7e85f
|
@ -1,6 +1,7 @@
|
|||
Crawler-Commons Change Log
|
||||
|
||||
Current Development 1.1-SNAPSHOT (yyyy-mm-dd)
|
||||
- [sitemaps] Sitemaps to implement Serializable (cdalexndr, sebastian-nagel) #244
|
||||
- [sitemaps] Allow to deduplicate sitemap links in sitemap indexes (sebastian-nagel) #262
|
||||
- [Robots] Upgrade the toString() method of the Base/Simple RobotRules (Avi Hayun) #264
|
||||
- Upgrade GitIgnore (Avi Hayun) #260
|
||||
|
|
|
@ -16,6 +16,7 @@
|
|||
|
||||
package crawlercommons.sitemaps;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.net.URL;
|
||||
import java.time.LocalDate;
|
||||
import java.time.Year;
|
||||
|
@ -31,7 +32,8 @@ import java.util.Date;
|
|||
import java.util.Locale;
|
||||
|
||||
/** SiteMap or SiteMapIndex **/
|
||||
public abstract class AbstractSiteMap {
|
||||
@SuppressWarnings("serial")
|
||||
public abstract class AbstractSiteMap implements Serializable {
|
||||
|
||||
/** Various Sitemap types */
|
||||
public enum SitemapType {
|
||||
|
|
|
@ -20,6 +20,7 @@ import java.net.MalformedURLException;
|
|||
import java.net.URL;
|
||||
import java.util.*;
|
||||
|
||||
@SuppressWarnings("serial")
|
||||
public class SiteMap extends AbstractSiteMap {
|
||||
|
||||
/**
|
||||
|
|
|
@ -24,6 +24,7 @@ import java.util.List;
|
|||
import java.util.Set;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
@SuppressWarnings("serial")
|
||||
public class SiteMapIndex extends AbstractSiteMap {
|
||||
|
||||
/** URLs found in this Sitemap Index */
|
||||
|
|
|
@ -19,6 +19,7 @@ package crawlercommons.sitemaps;
|
|||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.net.MalformedURLException;
|
||||
import java.net.URL;
|
||||
import java.time.ZonedDateTime;
|
||||
|
@ -36,7 +37,8 @@ import crawlercommons.sitemaps.extension.ExtensionMetadata;
|
|||
*
|
||||
* @author fmccown
|
||||
*/
|
||||
public class SiteMapURL {
|
||||
@SuppressWarnings("serial")
|
||||
public class SiteMapURL implements Serializable {
|
||||
private static final Logger LOG = LoggerFactory.getLogger(SiteMapURL.class);
|
||||
public static final double DEFAULT_PRIORITY = 0.5;
|
||||
|
||||
|
|
|
@ -18,13 +18,15 @@ package crawlercommons.sitemaps.extension;
|
|||
|
||||
import crawlercommons.sitemaps.SiteMapURL;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.util.Map;
|
||||
|
||||
/**
|
||||
* Container for attributes of a {@link SiteMapURL} defined by a sitemap
|
||||
* extension.
|
||||
*/
|
||||
public abstract class ExtensionMetadata {
|
||||
@SuppressWarnings("serial")
|
||||
public abstract class ExtensionMetadata implements Serializable {
|
||||
|
||||
public abstract boolean equals(Object other);
|
||||
|
||||
|
|
|
@ -26,6 +26,7 @@ import java.util.Objects;
|
|||
* Data model for Google extension to the sitemap protocol regarding images
|
||||
* indexing, as per http://www.google.com/schemas/sitemap-image/1.1
|
||||
*/
|
||||
@SuppressWarnings("serial")
|
||||
public class ImageAttributes extends ExtensionMetadata {
|
||||
|
||||
public static final String LOC = "loc";
|
||||
|
|
|
@ -34,6 +34,7 @@ import java.util.Objects;
|
|||
* though you might want to keep them in the same order to make them easier for
|
||||
* you to check for mistakes.</blockquote>
|
||||
*/
|
||||
@SuppressWarnings("serial")
|
||||
public class LinkAttributes extends ExtensionMetadata {
|
||||
|
||||
public static final String HREF = "href";
|
||||
|
|
|
@ -26,6 +26,7 @@ import java.util.Map;
|
|||
* <blockquote>Mobile sitemaps just contain an empty "mobile" tag to identify a
|
||||
* URL as having mobile content.</blockquote>
|
||||
*/
|
||||
@SuppressWarnings("serial")
|
||||
public class MobileAttributes extends ExtensionMetadata {
|
||||
|
||||
@Override
|
||||
|
|
|
@ -27,6 +27,7 @@ import java.util.Objects;
|
|||
* Data model for Google's extension to the sitemap protocol regarding news
|
||||
* indexing, as per http://www.google.com/schemas/sitemap-news/0.9
|
||||
*/
|
||||
@SuppressWarnings("serial")
|
||||
public class NewsAttributes extends ExtensionMetadata {
|
||||
|
||||
public static final String NAME = "name";
|
||||
|
|
|
@ -16,6 +16,7 @@
|
|||
|
||||
package crawlercommons.sitemaps.extension;
|
||||
|
||||
import java.io.Serializable;
|
||||
import java.net.URL;
|
||||
import java.time.ZonedDateTime;
|
||||
import java.util.Arrays;
|
||||
|
@ -30,6 +31,7 @@ import java.util.Objects;
|
|||
* Data model for Google extension to the sitemap protocol regarding video
|
||||
* indexing, as per http://www.google.com/schemas/sitemap-video/1.1
|
||||
*/
|
||||
@SuppressWarnings("serial")
|
||||
public class VideoAttributes extends ExtensionMetadata {
|
||||
|
||||
public static final String THUMBNAIL_LOC = "thumbnail_loc";
|
||||
|
@ -437,7 +439,7 @@ public class VideoAttributes extends ExtensionMetadata {
|
|||
SD, HD
|
||||
}
|
||||
|
||||
public static final class VideoPrice {
|
||||
public static final class VideoPrice implements Serializable {
|
||||
/**
|
||||
* Video price currency found under video/price[@currency] (required)
|
||||
*/
|
||||
|
|
|
@ -18,11 +18,18 @@ package crawlercommons.sitemaps;
|
|||
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import java.io.ByteArrayInputStream;
|
||||
import java.io.ByteArrayOutputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.ObjectInputStream;
|
||||
import java.io.ObjectOutputStream;
|
||||
import java.text.SimpleDateFormat;
|
||||
import java.time.Instant;
|
||||
import java.time.ZoneId;
|
||||
import java.time.ZonedDateTime;
|
||||
import java.time.format.DateTimeFormatter;
|
||||
import java.util.Collection;
|
||||
import java.util.Iterator;
|
||||
import java.util.Locale;
|
||||
import java.util.TimeZone;
|
||||
|
||||
|
@ -126,4 +133,44 @@ public class AbstractSiteMapTest {
|
|||
String datestr2 = SiteMap.W3C_FULLDATE_FORMATTER_UTC.format(date2);
|
||||
assertEquals(datestr1, datestr2, "Failed to format date");
|
||||
}
|
||||
|
||||
/**
|
||||
* Test whether a sitemap is serializable. To be called in sitemap parser
|
||||
* tests on all types of sitemaps (index, with extensions, etc.)
|
||||
*/
|
||||
public static void testSerializable(AbstractSiteMap sitemap) {
|
||||
try {
|
||||
ByteArrayOutputStream bos = new ByteArrayOutputStream();
|
||||
ObjectOutputStream oos = new ObjectOutputStream(bos);
|
||||
oos.writeObject(sitemap);
|
||||
oos.flush();
|
||||
oos.close();
|
||||
|
||||
ByteArrayInputStream bis = new ByteArrayInputStream(bos.toByteArray());
|
||||
ObjectInputStream ois = new ObjectInputStream(bis);
|
||||
AbstractSiteMap s = (AbstractSiteMap) ois.readObject();
|
||||
ois.close();
|
||||
|
||||
assertNotNull(s);
|
||||
assertEquals(sitemap.getClass(), s.getClass());
|
||||
assertEquals(sitemap.getType(), s.getType());
|
||||
assertEquals(sitemap.isIndex(), s.isIndex());
|
||||
assertEquals(sitemap.getLastModified(), s.getLastModified());
|
||||
assertEquals(sitemap.url.toString(), s.url.toString());
|
||||
|
||||
if (sitemap instanceof SiteMap) {
|
||||
assertEquals(((SiteMap) sitemap).getSiteMapUrls(), ((SiteMap) s).getSiteMapUrls());
|
||||
} else if (sitemap instanceof SiteMapIndex) {
|
||||
Collection<AbstractSiteMap> sitemaps1 = ((SiteMapIndex) sitemap).getSitemaps();
|
||||
Collection<AbstractSiteMap> sitemaps2 = ((SiteMapIndex) s).getSitemaps();
|
||||
assertEquals(sitemaps1.size(), sitemaps2.size());
|
||||
Iterator<AbstractSiteMap> i1 = sitemaps1.iterator(), i2 = sitemaps2.iterator();
|
||||
while (i1.hasNext() && i2.hasNext()) {
|
||||
assertEquals(i1.next().getUrl().toString(), i2.next().getUrl().toString());
|
||||
}
|
||||
}
|
||||
} catch (IOException | ClassNotFoundException e) {
|
||||
fail("Failed to serialize sitemap " + sitemap, e);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -30,16 +30,21 @@ import static org.junit.jupiter.api.Assertions.*;
|
|||
|
||||
public class SiteMapParserExtensionTest {
|
||||
|
||||
private AbstractSiteMap parse(SiteMapParser parser, String resourcePath, URL url) throws IOException, UnknownFormatException {
|
||||
byte[] content = SiteMapParserTest.getResourceAsBytes(resourcePath);
|
||||
AbstractSiteMap asm = parser.parseSiteMap("text/xml", content, url);
|
||||
AbstractSiteMapTest.testSerializable(asm);
|
||||
return asm;
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testVideosSitemap() throws UnknownFormatException, IOException {
|
||||
SiteMapParser parser = new SiteMapParser();
|
||||
parser.enableExtension(Extension.VIDEO);
|
||||
|
||||
String contentType = "text/xml";
|
||||
byte[] content = SiteMapParserTest.getResourceAsBytes("src/test/resources/sitemaps/extension/sitemap-videos.xml");
|
||||
|
||||
URL url = new URL("http://www.example.com/sitemap-video.xml");
|
||||
AbstractSiteMap asm = parser.parseSiteMap(contentType, content, url);
|
||||
AbstractSiteMap asm = parse(parser, "src/test/resources/sitemaps/extension/sitemap-videos.xml", url);
|
||||
|
||||
assertEquals(false, asm.isIndex());
|
||||
assertEquals(true, asm instanceof SiteMap);
|
||||
SiteMap sm = (SiteMap) asm;
|
||||
|
@ -95,11 +100,9 @@ public class SiteMapParserExtensionTest {
|
|||
SiteMapParser parser = new SiteMapParser();
|
||||
parser.enableExtension(Extension.IMAGE);
|
||||
|
||||
String contentType = "text/xml";
|
||||
byte[] content = SiteMapParserTest.getResourceAsBytes("src/test/resources/sitemaps/extension/sitemap-images.xml");
|
||||
|
||||
URL url = new URL("http://www.example.com/sitemap-images.xml");
|
||||
AbstractSiteMap asm = parser.parseSiteMap(contentType, content, url);
|
||||
AbstractSiteMap asm = parse(parser, "src/test/resources/sitemaps/extension/sitemap-images.xml", url);
|
||||
|
||||
assertEquals(false, asm.isIndex());
|
||||
assertEquals(true, asm instanceof SiteMap);
|
||||
SiteMap sm = (SiteMap) asm;
|
||||
|
@ -127,11 +130,9 @@ public class SiteMapParserExtensionTest {
|
|||
SiteMapParser parser = new SiteMapParser();
|
||||
parser.enableExtension(Extension.LINKS);
|
||||
|
||||
String contentType = "text/xml";
|
||||
byte[] content = SiteMapParserTest.getResourceAsBytes("src/test/resources/sitemaps/extension/sitemap-links.xml");
|
||||
|
||||
URL url = new URL("http://www.example.com/sitemap-links.xml");
|
||||
AbstractSiteMap asm = parser.parseSiteMap(contentType, content, url);
|
||||
AbstractSiteMap asm = parse(parser, "src/test/resources/sitemaps/extension/sitemap-links.xml", url);
|
||||
|
||||
assertEquals(false, asm.isIndex());
|
||||
assertEquals(true, asm instanceof SiteMap);
|
||||
SiteMap sm = (SiteMap) asm;
|
||||
|
@ -174,11 +175,9 @@ public class SiteMapParserExtensionTest {
|
|||
SiteMapParser parser = new SiteMapParser();
|
||||
parser.enableExtension(Extension.NEWS);
|
||||
|
||||
String contentType = "text/xml";
|
||||
byte[] content = SiteMapParserTest.getResourceAsBytes("src/test/resources/sitemaps/extension/sitemap-news.xml");
|
||||
|
||||
URL url = new URL("http://www.example.org/sitemap-news.xml");
|
||||
AbstractSiteMap asm = parser.parseSiteMap(contentType, content, url);
|
||||
AbstractSiteMap asm = parse(parser, "src/test/resources/sitemaps/extension/sitemap-news.xml", url);
|
||||
|
||||
assertEquals(false, asm.isIndex());
|
||||
assertEquals(true, asm instanceof SiteMap);
|
||||
SiteMap sm = (SiteMap) asm;
|
||||
|
@ -200,11 +199,9 @@ public class SiteMapParserExtensionTest {
|
|||
SiteMapParser parser = new SiteMapParser();
|
||||
parser.enableExtension(Extension.MOBILE);
|
||||
|
||||
String contentType = "text/xml";
|
||||
byte[] content = SiteMapParserTest.getResourceAsBytes("src/test/resources/sitemaps/extension/sitemap-mobile.xml");
|
||||
|
||||
URL url = new URL("http://www.example.org/sitemap-mobile.xml");
|
||||
AbstractSiteMap asm = parser.parseSiteMap(contentType, content, url);
|
||||
AbstractSiteMap asm = parse(parser, "src/test/resources/sitemaps/extension/sitemap-mobile.xml", url);
|
||||
|
||||
assertEquals(false, asm.isIndex());
|
||||
assertEquals(true, asm instanceof SiteMap);
|
||||
SiteMap sm = (SiteMap) asm;
|
||||
|
@ -227,11 +224,9 @@ public class SiteMapParserExtensionTest {
|
|||
parser.setStrictNamespace(true);
|
||||
parser.enableExtension(Extension.NEWS);
|
||||
|
||||
String contentType = "text/xml";
|
||||
byte[] content = SiteMapParserTest.getResourceAsBytes("src/test/resources/sitemaps/extension/shinpaideshou-news-sitemap.xml");
|
||||
|
||||
URL url = new URL("https://shinpaideshou.wordpress.com/news-sitemap.xml");
|
||||
AbstractSiteMap asm = parser.parseSiteMap(contentType, content, url);
|
||||
AbstractSiteMap asm = parse(parser, "src/test/resources/sitemaps/extension/shinpaideshou-news-sitemap.xml", url);
|
||||
|
||||
assertEquals(false, asm.isIndex());
|
||||
assertEquals(true, asm instanceof SiteMap);
|
||||
SiteMap sm = (SiteMap) asm;
|
||||
|
@ -254,11 +249,9 @@ public class SiteMapParserExtensionTest {
|
|||
parser.enableExtension(Extension.VIDEO);
|
||||
parser.enableExtension(Extension.MOBILE);
|
||||
|
||||
String contentType = "text/xml";
|
||||
byte[] content = SiteMapParserTest.getResourceAsBytes("src/test/resources/sitemaps/extension/hebdenbridgetimes-articles-sitemap.xml");
|
||||
|
||||
URL url = new URL("http://www.hebdenbridgetimes.co.uk/sitemap-article-2015-18.xml");
|
||||
AbstractSiteMap asm = parser.parseSiteMap(contentType, content, url);
|
||||
AbstractSiteMap asm = parse(parser, "src/test/resources/sitemaps/extension/hebdenbridgetimes-articles-sitemap.xml", url);
|
||||
|
||||
assertEquals(false, asm.isIndex());
|
||||
assertEquals(true, asm instanceof SiteMap);
|
||||
SiteMap sm = (SiteMap) asm;
|
||||
|
|
|
@ -76,6 +76,7 @@ public class SiteMapParserTest {
|
|||
URL url = new URL("http://www.example.com/sitemapindex.xml");
|
||||
|
||||
AbstractSiteMap asm = parser.parseSiteMap(contentType, content, url);
|
||||
AbstractSiteMapTest.testSerializable(asm);
|
||||
assertEquals(true, asm.isIndex());
|
||||
assertEquals(true, asm instanceof SiteMapIndex);
|
||||
|
||||
|
@ -160,6 +161,7 @@ public class SiteMapParserTest {
|
|||
URL url = new URL("http://www.example.com/sitemap.txt");
|
||||
|
||||
AbstractSiteMap asm = parser.parseSiteMap(contentType, content, url);
|
||||
AbstractSiteMapTest.testSerializable(asm);
|
||||
assertEquals(false, asm.isIndex());
|
||||
assertEquals(true, asm instanceof SiteMap);
|
||||
|
||||
|
@ -207,6 +209,7 @@ public class SiteMapParserTest {
|
|||
URL url = new URL("http://www.example.com/sitemap.xml");
|
||||
|
||||
AbstractSiteMap asm = parser.parseSiteMap(contentType, content, url);
|
||||
AbstractSiteMapTest.testSerializable(asm);
|
||||
assertEquals(false, asm.isIndex());
|
||||
assertEquals(true, asm instanceof SiteMap);
|
||||
|
||||
|
@ -256,7 +259,7 @@ public class SiteMapParserTest {
|
|||
|
||||
/** This Sitemap contains badly formatted XML and can't be read */
|
||||
@Test
|
||||
public void testSitemapParserBrokenXml() throws IOException, UnknownFormatException {
|
||||
public void testSitemapParserBrokenXml() {
|
||||
Assertions.assertThrows(UnknownFormatException.class, () -> {
|
||||
SiteMapParser parser = new SiteMapParser();
|
||||
String contentType = "text/xml";
|
||||
|
@ -407,7 +410,7 @@ public class SiteMapParserTest {
|
|||
}
|
||||
|
||||
@Test
|
||||
public void testSitemapWithInvalidContent() throws UnknownFormatException, IOException {
|
||||
public void testSitemapWithInvalidContent() {
|
||||
Assertions.assertThrows(UnknownFormatException.class, () -> {
|
||||
SiteMapParser parser = new SiteMapParser();
|
||||
String contentType = "application/octet-stream";
|
||||
|
@ -453,6 +456,8 @@ public class SiteMapParserTest {
|
|||
URL url = new URL("http://example.org/atom.xml");
|
||||
|
||||
SiteMap sm = (SiteMap) parser.parseSiteMap(content, url);
|
||||
AbstractSiteMapTest.testSerializable(sm);
|
||||
|
||||
assertEquals(1, sm.getSiteMapUrls().size());
|
||||
SiteMapURL smu = sm.getSiteMapUrls().iterator().next();
|
||||
assertEquals(new URL("http://example.org/2003/12/13/atom03"), smu.getUrl());
|
||||
|
@ -468,6 +473,8 @@ public class SiteMapParserTest {
|
|||
URL url = new URL("https://www.example.com/index.php?feed/rss");
|
||||
|
||||
SiteMap sm = (SiteMap) parser.parseSiteMap(content, url);
|
||||
AbstractSiteMapTest.testSerializable(sm);
|
||||
|
||||
assertEquals(4, sm.getSiteMapUrls().size());
|
||||
Iterator<SiteMapURL> it = sm.getSiteMapUrls().iterator();
|
||||
SiteMapURL smu = it.next();
|
||||
|
|
Loading…
Reference in New Issue