1
0
Fork 0
mirror of https://github.com/crawler-commons/crawler-commons synced 2024-05-11 08:16:04 +02:00

Sitemaps to implement Serializable, fixes #244 (#294)

- make all sitemap classes, including extensions, implement the
  Serializable interface
- extend sitemap parser unit tests to check object serialization
  on various types of sitemaps (index, Atom/RSS feeds, video sitemaps,
  etc.)
This commit is contained in:
Sebastian Nagel 2020-06-22 13:51:40 +02:00 committed by GitHub
parent e9772be011
commit 78d7e7e85f
Signed by: GitHub
GPG Key ID: 4AEE18F83AFDEB23
14 changed files with 96 additions and 34 deletions

View File

@ -1,6 +1,7 @@
Crawler-Commons Change Log
Current Development 1.1-SNAPSHOT (yyyy-mm-dd)
- [sitemaps] Sitemaps to implement Serializable (cdalexndr, sebastian-nagel) #244
- [sitemaps] Allow to deduplicate sitemap links in sitemap indexes (sebastian-nagel) #262
- [Robots] Upgrade the toString() method of the Base/Simple RobotRules (Avi Hayun) #264
- Upgrade GitIgnore (Avi Hayun) #260

View File

@ -16,6 +16,7 @@
package crawlercommons.sitemaps;
import java.io.Serializable;
import java.net.URL;
import java.time.LocalDate;
import java.time.Year;
@ -31,7 +32,8 @@ import java.util.Date;
import java.util.Locale;
/** SiteMap or SiteMapIndex **/
public abstract class AbstractSiteMap {
@SuppressWarnings("serial")
public abstract class AbstractSiteMap implements Serializable {
/** Various Sitemap types */
public enum SitemapType {

View File

@ -20,6 +20,7 @@ import java.net.MalformedURLException;
import java.net.URL;
import java.util.*;
@SuppressWarnings("serial")
public class SiteMap extends AbstractSiteMap {
/**

View File

@ -24,6 +24,7 @@ import java.util.List;
import java.util.Set;
import java.util.stream.Collectors;
@SuppressWarnings("serial")
public class SiteMapIndex extends AbstractSiteMap {
/** URLs found in this Sitemap Index */

View File

@ -19,6 +19,7 @@ package crawlercommons.sitemaps;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.Serializable;
import java.net.MalformedURLException;
import java.net.URL;
import java.time.ZonedDateTime;
@ -36,7 +37,8 @@ import crawlercommons.sitemaps.extension.ExtensionMetadata;
*
* @author fmccown
*/
public class SiteMapURL {
@SuppressWarnings("serial")
public class SiteMapURL implements Serializable {
private static final Logger LOG = LoggerFactory.getLogger(SiteMapURL.class);
public static final double DEFAULT_PRIORITY = 0.5;

View File

@ -18,13 +18,15 @@ package crawlercommons.sitemaps.extension;
import crawlercommons.sitemaps.SiteMapURL;
import java.io.Serializable;
import java.util.Map;
/**
* Container for attributes of a {@link SiteMapURL} defined by a sitemap
* extension.
*/
public abstract class ExtensionMetadata {
@SuppressWarnings("serial")
public abstract class ExtensionMetadata implements Serializable {
public abstract boolean equals(Object other);

View File

@ -26,6 +26,7 @@ import java.util.Objects;
* Data model for Google extension to the sitemap protocol regarding image
* indexing, as per http://www.google.com/schemas/sitemap-image/1.1
*/
@SuppressWarnings("serial")
public class ImageAttributes extends ExtensionMetadata {
public static final String LOC = "loc";

View File

@ -34,6 +34,7 @@ import java.util.Objects;
* though you might want to keep them in the same order to make them easier for
* you to check for mistakes.</blockquote>
*/
@SuppressWarnings("serial")
public class LinkAttributes extends ExtensionMetadata {
public static final String HREF = "href";

View File

@ -26,6 +26,7 @@ import java.util.Map;
* <blockquote>Mobile sitemaps just contain an empty "mobile" tag to identify a
* URL as having mobile content.</blockquote>
*/
@SuppressWarnings("serial")
public class MobileAttributes extends ExtensionMetadata {
@Override

View File

@ -27,6 +27,7 @@ import java.util.Objects;
* Data model for Google's extension to the sitemap protocol regarding news
* indexing, as per http://www.google.com/schemas/sitemap-news/0.9
*/
@SuppressWarnings("serial")
public class NewsAttributes extends ExtensionMetadata {
public static final String NAME = "name";

View File

@ -16,6 +16,7 @@
package crawlercommons.sitemaps.extension;
import java.io.Serializable;
import java.net.URL;
import java.time.ZonedDateTime;
import java.util.Arrays;
@ -30,6 +31,7 @@ import java.util.Objects;
* Data model for Google extension to the sitemap protocol regarding video
* indexing, as per http://www.google.com/schemas/sitemap-video/1.1
*/
@SuppressWarnings("serial")
public class VideoAttributes extends ExtensionMetadata {
public static final String THUMBNAIL_LOC = "thumbnail_loc";
@ -437,7 +439,7 @@ public class VideoAttributes extends ExtensionMetadata {
SD, HD
}
public static final class VideoPrice {
public static final class VideoPrice implements Serializable {
/**
* Video price currency found under video/price[@currency] (required)
*/

View File

@ -18,11 +18,18 @@ package crawlercommons.sitemaps;
import org.junit.jupiter.api.Test;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.text.SimpleDateFormat;
import java.time.Instant;
import java.time.ZoneId;
import java.time.ZonedDateTime;
import java.time.format.DateTimeFormatter;
import java.util.Collection;
import java.util.Iterator;
import java.util.Locale;
import java.util.TimeZone;
@ -126,4 +133,44 @@ public class AbstractSiteMapTest {
String datestr2 = SiteMap.W3C_FULLDATE_FORMATTER_UTC.format(date2);
assertEquals(datestr1, datestr2, "Failed to format date");
}
/**
 * Test whether a sitemap is serializable. To be called in sitemap parser
 * tests on all types of sitemaps (index, with extensions, etc.)
 *
 * <p>
 * The sitemap is written to an in-memory buffer with an
 * {@link ObjectOutputStream}, read back with an {@link ObjectInputStream},
 * and the copy is compared with the original: class, type, index flag,
 * last-modified value and URL string. For a {@link SiteMap} the collections
 * returned by {@code getSiteMapUrls()} are compared with
 * {@code assertEquals}; for a {@link SiteMapIndex} the number of child
 * sitemaps and their URL strings (pairwise, in iteration order) are
 * compared.
 * </p>
 *
 * <p>
 * An {@link IOException} or {@link ClassNotFoundException} raised during
 * the round trip is reported as a test failure via {@code fail}.
 * </p>
 *
 * @param sitemap
 *            the sitemap (or sitemap index) to serialize and deserialize
 */
public static void testSerializable(AbstractSiteMap sitemap) {
    try {
        ByteArrayOutputStream bos = new ByteArrayOutputStream();
        // try-with-resources closes (and thereby flushes) the object stream
        // even if writeObject throws; the buffer is read only afterwards.
        try (ObjectOutputStream oos = new ObjectOutputStream(bos)) {
            oos.writeObject(sitemap);
        }

        AbstractSiteMap s;
        try (ObjectInputStream ois = new ObjectInputStream(new ByteArrayInputStream(bos.toByteArray()))) {
            s = (AbstractSiteMap) ois.readObject();
        }

        assertNotNull(s);
        assertEquals(sitemap.getClass(), s.getClass());
        assertEquals(sitemap.getType(), s.getType());
        assertEquals(sitemap.isIndex(), s.isIndex());
        assertEquals(sitemap.getLastModified(), s.getLastModified());
        assertEquals(sitemap.url.toString(), s.url.toString());
        if (sitemap instanceof SiteMap) {
            assertEquals(((SiteMap) sitemap).getSiteMapUrls(), ((SiteMap) s).getSiteMapUrls());
        } else if (sitemap instanceof SiteMapIndex) {
            Collection<AbstractSiteMap> sitemaps1 = ((SiteMapIndex) sitemap).getSitemaps();
            Collection<AbstractSiteMap> sitemaps2 = ((SiteMapIndex) s).getSitemaps();
            // Sizes are asserted equal first, so the pairwise loop below
            // visits every child sitemap of both indexes.
            assertEquals(sitemaps1.size(), sitemaps2.size());
            Iterator<AbstractSiteMap> i1 = sitemaps1.iterator(), i2 = sitemaps2.iterator();
            while (i1.hasNext() && i2.hasNext()) {
                assertEquals(i1.next().getUrl().toString(), i2.next().getUrl().toString());
            }
        }
    } catch (IOException | ClassNotFoundException e) {
        fail("Failed to serialize sitemap " + sitemap, e);
    }
}
}

View File

@ -30,16 +30,21 @@ import static org.junit.jupiter.api.Assertions.*;
public class SiteMapParserExtensionTest {
/**
 * Parse a sitemap test resource and check that the result is serializable.
 *
 * <p>
 * The content obtained from
 * {@code SiteMapParserTest.getResourceAsBytes(resourcePath)} is parsed with
 * the content type {@code text/xml}, and the parsed sitemap is passed to
 * {@code AbstractSiteMapTest.testSerializable} before it is returned.
 * </p>
 *
 * @param parser
 *            the parser to use (extensions already enabled by the caller)
 * @param resourcePath
 *            path handed to {@code SiteMapParserTest.getResourceAsBytes}
 * @param url
 *            URL passed to the parser along with the content
 * @return the parsed sitemap
 */
private AbstractSiteMap parse(SiteMapParser parser, String resourcePath, URL url) throws IOException, UnknownFormatException {
    final AbstractSiteMap parsed = parser.parseSiteMap("text/xml", SiteMapParserTest.getResourceAsBytes(resourcePath), url);
    AbstractSiteMapTest.testSerializable(parsed);
    return parsed;
}
@Test
public void testVideosSitemap() throws UnknownFormatException, IOException {
SiteMapParser parser = new SiteMapParser();
parser.enableExtension(Extension.VIDEO);
String contentType = "text/xml";
byte[] content = SiteMapParserTest.getResourceAsBytes("src/test/resources/sitemaps/extension/sitemap-videos.xml");
URL url = new URL("http://www.example.com/sitemap-video.xml");
AbstractSiteMap asm = parser.parseSiteMap(contentType, content, url);
AbstractSiteMap asm = parse(parser, "src/test/resources/sitemaps/extension/sitemap-videos.xml", url);
assertEquals(false, asm.isIndex());
assertEquals(true, asm instanceof SiteMap);
SiteMap sm = (SiteMap) asm;
@ -95,11 +100,9 @@ public class SiteMapParserExtensionTest {
SiteMapParser parser = new SiteMapParser();
parser.enableExtension(Extension.IMAGE);
String contentType = "text/xml";
byte[] content = SiteMapParserTest.getResourceAsBytes("src/test/resources/sitemaps/extension/sitemap-images.xml");
URL url = new URL("http://www.example.com/sitemap-images.xml");
AbstractSiteMap asm = parser.parseSiteMap(contentType, content, url);
AbstractSiteMap asm = parse(parser, "src/test/resources/sitemaps/extension/sitemap-images.xml", url);
assertEquals(false, asm.isIndex());
assertEquals(true, asm instanceof SiteMap);
SiteMap sm = (SiteMap) asm;
@ -127,11 +130,9 @@ public class SiteMapParserExtensionTest {
SiteMapParser parser = new SiteMapParser();
parser.enableExtension(Extension.LINKS);
String contentType = "text/xml";
byte[] content = SiteMapParserTest.getResourceAsBytes("src/test/resources/sitemaps/extension/sitemap-links.xml");
URL url = new URL("http://www.example.com/sitemap-links.xml");
AbstractSiteMap asm = parser.parseSiteMap(contentType, content, url);
AbstractSiteMap asm = parse(parser, "src/test/resources/sitemaps/extension/sitemap-links.xml", url);
assertEquals(false, asm.isIndex());
assertEquals(true, asm instanceof SiteMap);
SiteMap sm = (SiteMap) asm;
@ -174,11 +175,9 @@ public class SiteMapParserExtensionTest {
SiteMapParser parser = new SiteMapParser();
parser.enableExtension(Extension.NEWS);
String contentType = "text/xml";
byte[] content = SiteMapParserTest.getResourceAsBytes("src/test/resources/sitemaps/extension/sitemap-news.xml");
URL url = new URL("http://www.example.org/sitemap-news.xml");
AbstractSiteMap asm = parser.parseSiteMap(contentType, content, url);
AbstractSiteMap asm = parse(parser, "src/test/resources/sitemaps/extension/sitemap-news.xml", url);
assertEquals(false, asm.isIndex());
assertEquals(true, asm instanceof SiteMap);
SiteMap sm = (SiteMap) asm;
@ -200,11 +199,9 @@ public class SiteMapParserExtensionTest {
SiteMapParser parser = new SiteMapParser();
parser.enableExtension(Extension.MOBILE);
String contentType = "text/xml";
byte[] content = SiteMapParserTest.getResourceAsBytes("src/test/resources/sitemaps/extension/sitemap-mobile.xml");
URL url = new URL("http://www.example.org/sitemap-mobile.xml");
AbstractSiteMap asm = parser.parseSiteMap(contentType, content, url);
AbstractSiteMap asm = parse(parser, "src/test/resources/sitemaps/extension/sitemap-mobile.xml", url);
assertEquals(false, asm.isIndex());
assertEquals(true, asm instanceof SiteMap);
SiteMap sm = (SiteMap) asm;
@ -227,11 +224,9 @@ public class SiteMapParserExtensionTest {
parser.setStrictNamespace(true);
parser.enableExtension(Extension.NEWS);
String contentType = "text/xml";
byte[] content = SiteMapParserTest.getResourceAsBytes("src/test/resources/sitemaps/extension/shinpaideshou-news-sitemap.xml");
URL url = new URL("https://shinpaideshou.wordpress.com/news-sitemap.xml");
AbstractSiteMap asm = parser.parseSiteMap(contentType, content, url);
AbstractSiteMap asm = parse(parser, "src/test/resources/sitemaps/extension/shinpaideshou-news-sitemap.xml", url);
assertEquals(false, asm.isIndex());
assertEquals(true, asm instanceof SiteMap);
SiteMap sm = (SiteMap) asm;
@ -254,11 +249,9 @@ public class SiteMapParserExtensionTest {
parser.enableExtension(Extension.VIDEO);
parser.enableExtension(Extension.MOBILE);
String contentType = "text/xml";
byte[] content = SiteMapParserTest.getResourceAsBytes("src/test/resources/sitemaps/extension/hebdenbridgetimes-articles-sitemap.xml");
URL url = new URL("http://www.hebdenbridgetimes.co.uk/sitemap-article-2015-18.xml");
AbstractSiteMap asm = parser.parseSiteMap(contentType, content, url);
AbstractSiteMap asm = parse(parser, "src/test/resources/sitemaps/extension/hebdenbridgetimes-articles-sitemap.xml", url);
assertEquals(false, asm.isIndex());
assertEquals(true, asm instanceof SiteMap);
SiteMap sm = (SiteMap) asm;

View File

@ -76,6 +76,7 @@ public class SiteMapParserTest {
URL url = new URL("http://www.example.com/sitemapindex.xml");
AbstractSiteMap asm = parser.parseSiteMap(contentType, content, url);
AbstractSiteMapTest.testSerializable(asm);
assertEquals(true, asm.isIndex());
assertEquals(true, asm instanceof SiteMapIndex);
@ -160,6 +161,7 @@ public class SiteMapParserTest {
URL url = new URL("http://www.example.com/sitemap.txt");
AbstractSiteMap asm = parser.parseSiteMap(contentType, content, url);
AbstractSiteMapTest.testSerializable(asm);
assertEquals(false, asm.isIndex());
assertEquals(true, asm instanceof SiteMap);
@ -207,6 +209,7 @@ public class SiteMapParserTest {
URL url = new URL("http://www.example.com/sitemap.xml");
AbstractSiteMap asm = parser.parseSiteMap(contentType, content, url);
AbstractSiteMapTest.testSerializable(asm);
assertEquals(false, asm.isIndex());
assertEquals(true, asm instanceof SiteMap);
@ -256,7 +259,7 @@ public class SiteMapParserTest {
/** This Sitemap contains badly formatted XML and can't be read */
@Test
public void testSitemapParserBrokenXml() throws IOException, UnknownFormatException {
public void testSitemapParserBrokenXml() {
Assertions.assertThrows(UnknownFormatException.class, () -> {
SiteMapParser parser = new SiteMapParser();
String contentType = "text/xml";
@ -407,7 +410,7 @@ public class SiteMapParserTest {
}
@Test
public void testSitemapWithInvalidContent() throws UnknownFormatException, IOException {
public void testSitemapWithInvalidContent() {
Assertions.assertThrows(UnknownFormatException.class, () -> {
SiteMapParser parser = new SiteMapParser();
String contentType = "application/octet-stream";
@ -453,6 +456,8 @@ public class SiteMapParserTest {
URL url = new URL("http://example.org/atom.xml");
SiteMap sm = (SiteMap) parser.parseSiteMap(content, url);
AbstractSiteMapTest.testSerializable(sm);
assertEquals(1, sm.getSiteMapUrls().size());
SiteMapURL smu = sm.getSiteMapUrls().iterator().next();
assertEquals(new URL("http://example.org/2003/12/13/atom03"), smu.getUrl());
@ -468,6 +473,8 @@ public class SiteMapParserTest {
URL url = new URL("https://www.example.com/index.php?feed/rss");
SiteMap sm = (SiteMap) parser.parseSiteMap(content, url);
AbstractSiteMapTest.testSerializable(sm);
assertEquals(4, sm.getSiteMapUrls().size());
Iterator<SiteMapURL> it = sm.getSiteMapUrls().iterator();
SiteMapURL smu = it.next();