1
0
Fork 0
mirror of https://github.com/crawler-commons/crawler-commons synced 2024-05-09 23:56:04 +02:00
crawler-commons/src/test/java/crawlercommons/sitemaps/SiteMapParserExtensionTest....

328 lines
16 KiB
Java

/**
* Copyright 2018 Crawler-Commons
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package crawlercommons.sitemaps;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertNotNull;
import static org.junit.jupiter.api.Assertions.assertNull;
import static org.junit.jupiter.api.Assertions.assertTrue;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.time.ZonedDateTime;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map.Entry;
import org.junit.jupiter.api.Test;
import crawlercommons.sitemaps.extension.Extension;
import crawlercommons.sitemaps.extension.ExtensionMetadata;
import crawlercommons.sitemaps.extension.ImageAttributes;
import crawlercommons.sitemaps.extension.LinkAttributes;
import crawlercommons.sitemaps.extension.MobileAttributes;
import crawlercommons.sitemaps.extension.NewsAttributes;
import crawlercommons.sitemaps.extension.PageMap;
import crawlercommons.sitemaps.extension.PageMapDataObject;
import crawlercommons.sitemaps.extension.VideoAttributes;
/**
 * Tests parsing of sitemap extension metadata (video, image, XHTML links,
 * news, mobile and PageMap attributes) with {@link SiteMapParser}.
 *
 * <p>
 * Every test parses an XML fixture below {@link #RESOURCE_DIR} and compares
 * the attributes attached to the resulting {@link SiteMapURL}s against
 * hand-built expectations.
 */
public class SiteMapParserExtensionTest {

    /** Directory (relative to the project root) holding the XML fixtures. */
    private static final String RESOURCE_DIR = "src/test/resources/sitemaps/extension/";

    /**
     * Parses the given resource as a {@code text/xml} sitemap and runs the
     * shared serialization check on the result.
     *
     * @param parser
     *            parser configured with the extensions under test
     * @param resourcePath
     *            path of the sitemap fixture
     * @param url
     *            URL the sitemap is assumed to have been fetched from
     * @return the parsed sitemap
     */
    private AbstractSiteMap parse(SiteMapParser parser, String resourcePath, URL url) throws IOException, UnknownFormatException {
        byte[] content = SiteMapParserTest.getResourceAsBytes(resourcePath);
        AbstractSiteMap asm = parser.parseSiteMap("text/xml", content, url);
        AbstractSiteMapTest.testSerializable(asm);
        return asm;
    }

    /**
     * Asserts that the parsed result is a plain sitemap (not a sitemap index)
     * and returns it cast to {@link SiteMap}.
     */
    private static SiteMap asSiteMap(AbstractSiteMap asm) {
        assertTrue(!asm.isIndex(), "expected a plain sitemap, not a sitemap index");
        assertTrue(asm instanceof SiteMap, "expected an instance of SiteMap");
        return (SiteMap) asm;
    }

    /**
     * Returns the first attribute object registered for the extension on the
     * given URL, failing with an assertion error (instead of a
     * NullPointerException or ArrayIndexOutOfBoundsException) if there is
     * none.
     */
    private static ExtensionMetadata firstAttribute(SiteMapURL su, Extension extension) {
        ExtensionMetadata[] attrs = su.getAttributesForExtension(extension);
        assertNotNull(attrs, "no " + extension + " attributes for " + su.getUrl());
        assertTrue(attrs.length > 0, "empty " + extension + " attributes for " + su.getUrl());
        assertNotNull(attrs[0]);
        return attrs[0];
    }

    /**
     * Builds the expected link attributes of a {@code rel="alternate"} link
     * with the given target and {@code hreflang} value.
     */
    private static LinkAttributes alternateLink(String href, String hreflang) throws MalformedURLException {
        // a plain HashMap instead of double-brace initialization: no anonymous
        // subclass holding a reference to the enclosing test instance
        HashMap<String, String> params = new HashMap<>();
        params.put("rel", "alternate");
        params.put("hreflang", hreflang);
        LinkAttributes link = new LinkAttributes(new URL(href));
        link.setParams(params);
        return link;
    }

    /** Video extension: three URLs covering full, locale-specific and empty prices. */
    @Test
    public void testVideosSitemap() throws UnknownFormatException, IOException {
        SiteMapParser parser = new SiteMapParser();
        parser.enableExtension(Extension.VIDEO);
        URL url = new URL("http://www.example.com/sitemap-video.xml");
        SiteMap sm = asSiteMap(parse(parser, RESOURCE_DIR + "sitemap-videos.xml", url));
        assertEquals(3, sm.getSiteMapUrls().size());
        Iterator<SiteMapURL> siter = sm.getSiteMapUrls().iterator();

        // first <loc> element: nearly all video attributes
        VideoAttributes expectedVideoAttributes = new VideoAttributes(new URL("http://www.example.com/thumbs/123.jpg"), "Grilling steaks for summer",
                        "Alkis shows you how to get perfectly done steaks every time", new URL("http://www.example.com/video123.flv"), new URL("http://www.example.com/videoplayer.swf?video=123"));
        expectedVideoAttributes.setDuration(600);
        ZonedDateTime dt = ZonedDateTime.parse("2009-11-05T19:20:30+08:00");
        expectedVideoAttributes.setExpirationDate(dt);
        dt = ZonedDateTime.parse("2007-11-05T19:20:30+08:00");
        expectedVideoAttributes.setPublicationDate(dt);
        expectedVideoAttributes.setRating(4.2f);
        expectedVideoAttributes.setViewCount(12345);
        expectedVideoAttributes.setFamilyFriendly(true);
        expectedVideoAttributes.setTags(new String[] { "sample_tag1", "sample_tag2" });
        expectedVideoAttributes.setAllowedCountries(new String[] { "IE", "GB", "US", "CA" });
        expectedVideoAttributes.setGalleryLoc(new URL("http://cooking.example.com"));
        expectedVideoAttributes.setGalleryTitle("Cooking Videos");
        expectedVideoAttributes.setPrices(new VideoAttributes.VideoPrice[] { new VideoAttributes.VideoPrice("EUR", 1.99f, VideoAttributes.VideoPriceType.own) });
        expectedVideoAttributes.setRequiresSubscription(true);
        expectedVideoAttributes.setUploader("GrillyMcGrillerson");
        expectedVideoAttributes.setUploaderInfo(new URL("http://www.example.com/users/grillymcgrillerson"));
        expectedVideoAttributes.setLive(false);
        VideoAttributes attr = (VideoAttributes) firstAttribute(siter.next(), Extension.VIDEO);
        assertEquals(expectedVideoAttributes, attr);

        // locale-specific number format in <video:price>, test #220
        // The current expected behavior is to not handle non-US locale price
        // values and set the price value to null if parsing as float value
        // fails.
        expectedVideoAttributes = new VideoAttributes(new URL("http://www.example.com/thumbs/123-2.jpg"), "Grilling steaks for summer, episode 2",
                        "Alkis shows you how to get perfectly done steaks every time", new URL("http://www.example.com/video123-2.flv"), null);
        expectedVideoAttributes.setPrices(new VideoAttributes.VideoPrice[] { new VideoAttributes.VideoPrice("EUR", null, VideoAttributes.VideoPriceType.own) });
        attr = (VideoAttributes) firstAttribute(siter.next(), Extension.VIDEO);
        assertEquals(expectedVideoAttributes, attr);

        // empty price, only type (purchase or rent) is indicated, see #221
        expectedVideoAttributes = new VideoAttributes(new URL("http://www.example.com/thumbs/123-3.jpg"), "Grilling steaks for summer, episode 3",
                        "Alkis shows you how to get perfectly done steaks every time", new URL("http://www.example.com/video123-3.flv"), null);
        expectedVideoAttributes.setPrices(new VideoAttributes.VideoPrice[] { new VideoAttributes.VideoPrice(null, null, VideoAttributes.VideoPriceType.rent) });
        attr = (VideoAttributes) firstAttribute(siter.next(), Extension.VIDEO);
        assertEquals(expectedVideoAttributes, attr);
    }

    /** Image extension: a single URL carrying two images. */
    @Test
    public void testImageSitemap() throws UnknownFormatException, IOException {
        SiteMapParser parser = new SiteMapParser();
        parser.enableExtension(Extension.IMAGE);
        URL url = new URL("http://www.example.com/sitemap-images.xml");
        SiteMap sm = asSiteMap(parse(parser, RESOURCE_DIR + "sitemap-images.xml", url));
        assertEquals(1, sm.getSiteMapUrls().size());

        ImageAttributes imageAttributes1 = new ImageAttributes(new URL("http://example.com/image.jpg"));
        ImageAttributes imageAttributes2 = new ImageAttributes(new URL("http://example.com/photo.jpg"));
        imageAttributes2.setCaption("This is the caption.");
        imageAttributes2.setGeoLocation("Limerick, Ireland");
        imageAttributes2.setTitle("Example photo shot in Limerick, Ireland");
        imageAttributes2.setLicense(new URL("https://creativecommons.org/licenses/by/4.0/legalcode"));

        for (SiteMapURL su : sm.getSiteMapUrls()) {
            ExtensionMetadata[] attrs = su.getAttributesForExtension(Extension.IMAGE);
            assertNotNull(attrs);
            // both images are checked below: fail with a readable message
            // rather than an ArrayIndexOutOfBoundsException
            assertTrue(attrs.length >= 2, "expected at least two image attributes, got: " + Arrays.toString(attrs));
            assertEquals(imageAttributes1, (ImageAttributes) attrs[0]);
            assertEquals(imageAttributes2, (ImageAttributes) attrs[1]);
        }
    }

    /** XHTML links extension: three URLs sharing the same alternate links. */
    @Test
    public void testXHTMLLinksSitemap() throws UnknownFormatException, IOException, MalformedURLException {
        SiteMapParser parser = new SiteMapParser();
        parser.enableExtension(Extension.LINKS);
        URL url = new URL("http://www.example.com/sitemap-links.xml");
        SiteMap sm = asSiteMap(parse(parser, RESOURCE_DIR + "sitemap-links.xml", url));
        assertEquals(3, sm.getSiteMapUrls().size());

        // all three pages share the same links attributes
        LinkAttributes[] linkAttributes = new LinkAttributes[] { //
                        alternateLink("http://www.example.com/deutsch/", "de"), //
                        alternateLink("http://www.example.com/schweiz-deutsch/", "de-ch"), //
                        alternateLink("http://www.example.com/english/", "en") };

        for (SiteMapURL su : sm.getSiteMapUrls()) {
            ExtensionMetadata[] attrs = su.getAttributesForExtension(Extension.LINKS);
            assertNotNull(attrs);
            assertEquals(linkAttributes.length, attrs.length);
            for (int i = 0; i < linkAttributes.length; i++) {
                assertEquals(linkAttributes[i], (LinkAttributes) attrs[i]);
            }
        }
    }

    /** News extension: a single URL with keywords, genres and stock tickers. */
    @Test
    public void testNewsSitemap() throws UnknownFormatException, IOException {
        SiteMapParser parser = new SiteMapParser();
        parser.enableExtension(Extension.NEWS);
        URL url = new URL("http://www.example.org/sitemap-news.xml");
        SiteMap sm = asSiteMap(parse(parser, RESOURCE_DIR + "sitemap-news.xml", url));
        assertEquals(1, sm.getSiteMapUrls().size());

        ZonedDateTime dt = ZonedDateTime.parse("2008-12-23T00:00:00+00:00");
        NewsAttributes expectedNewsAttributes = new NewsAttributes("The Example Times", "en", dt, "Companies A, B in Merger Talks");
        expectedNewsAttributes.setKeywords(new String[] { "business", "merger", "acquisition", "A", "B" });
        expectedNewsAttributes.setGenres(new NewsAttributes.NewsGenre[] { NewsAttributes.NewsGenre.PressRelease, NewsAttributes.NewsGenre.Blog });
        expectedNewsAttributes.setStockTickers(new String[] { "NASDAQ:A", "NASDAQ:B" });

        for (SiteMapURL su : sm.getSiteMapUrls()) {
            NewsAttributes attr = (NewsAttributes) firstAttribute(su, Extension.NEWS);
            assertEquals(expectedNewsAttributes, attr);
        }
    }

    /**
     * Mobile extension: only URLs whose path contains "mobile-friendly" are
     * expected to carry mobile attributes.
     */
    @Test
    public void testMobileSitemap() throws UnknownFormatException, IOException {
        SiteMapParser parser = new SiteMapParser();
        parser.enableExtension(Extension.MOBILE);
        URL url = new URL("http://www.example.org/sitemap-mobile.xml");
        SiteMap sm = asSiteMap(parse(parser, RESOURCE_DIR + "sitemap-mobile.xml", url));

        for (SiteMapURL su : sm.getSiteMapUrls()) {
            if (su.getUrl().getPath().contains("mobile-friendly")) {
                assertTrue(firstAttribute(su, Extension.MOBILE) instanceof MobileAttributes);
            } else {
                ExtensionMetadata[] attrs = su.getAttributesForExtension(Extension.MOBILE);
                assertTrue(attrs == null || attrs.length == 0);
            }
        }
    }

    /** News sitemap parsed with strict namespace checking enabled. */
    @Test
    public void testShinpaideshuNewsSitemap() throws UnknownFormatException, IOException {
        SiteMapParser parser = new SiteMapParser();
        parser.setStrictNamespace(true);
        parser.enableExtension(Extension.NEWS);
        URL url = new URL("https://shinpaideshou.wordpress.com/news-sitemap.xml");
        SiteMap sm = asSiteMap(parse(parser, RESOURCE_DIR + "shinpaideshou-news-sitemap.xml", url));
        assertEquals(3, sm.getSiteMapUrls().size());

        for (SiteMapURL su : sm.getSiteMapUrls()) {
            NewsAttributes attr = (NewsAttributes) firstAttribute(su, Extension.NEWS);
            assertNotNull(attr.getName());
            assertNotNull(attr.getPublicationDateTime());
            assertEquals(2017, attr.getPublicationDateTime().getYear());
        }
    }

    /**
     * Sitemap parsed with strict namespace checking and the news, image,
     * video and mobile extensions enabled; only the URL count is verified.
     */
    @Test
    public void testHebdenbridgetimesArticlesSitemap() throws UnknownFormatException, IOException {
        SiteMapParser parser = new SiteMapParser();
        parser.setStrictNamespace(true);
        parser.enableExtension(Extension.NEWS);
        parser.enableExtension(Extension.IMAGE);
        parser.enableExtension(Extension.VIDEO);
        parser.enableExtension(Extension.MOBILE);
        URL url = new URL("http://www.hebdenbridgetimes.co.uk/sitemap-article-2015-18.xml");
        SiteMap sm = asSiteMap(parse(parser, RESOURCE_DIR + "hebdenbridgetimes-articles-sitemap.xml", url));
        assertEquals(74, sm.getSiteMapUrls().size());
    }

    /** PageMap extension: verifies the data objects attached to each URL. */
    @Test
    public void testPageMapSitemap() throws UnknownFormatException, IOException {
        SiteMapParser parser = new SiteMapParser();
        parser.setStrictNamespace(true);
        parser.enableExtension(Extension.PAGEMAPS);
        String urlStr = "http://www.example.com/pagemaps-sitemap.xml";
        URL url = new URL(urlStr);
        SiteMap sm = asSiteMap(parse(parser, RESOURCE_DIR + "pagemaps-sitemap.xml", url));
        assertEquals(2, sm.getSiteMapUrls().size());
        assertEquals(urlStr, sm.getUrl().toString());

        for (SiteMapURL u : sm.getSiteMapUrls()) {
            for (Entry<Extension, ExtensionMetadata[]> x : u.getAttributes().entrySet()) {
                assertEquals(Extension.PAGEMAPS, x.getKey());
                ExtensionMetadata[] metadata = x.getValue();
                assertTrue(metadata.length > 0, "no PageMap metadata for " + u.getUrl());
                PageMap pageMap = (PageMap) metadata[0];
                List<PageMapDataObject> dataObjects = pageMap.getPageMapDataObjects();
                PageMapDataObject dataObject;
                switch (u.getUrl().toString()) {
                case "http://www.example.com/foo":
                    assertEquals(2, dataObjects.size());
                    dataObject = dataObjects.get(0);
                    assertEquals("document", dataObject.getType());
                    assertEquals("one", dataObject.getId());
                    assertEquals("Doc One", dataObject.getAttribute("name"));
                    assertEquals("3.5", dataObject.getAttribute("review"));
                    dataObject = dataObjects.get(1);
                    assertEquals("image", dataObject.getType());
                    assertNull(dataObject.getId());
                    assertEquals("http://www.example.com/foo.gif", dataObject.getAttribute("image_src"));
                    break;
                case "http://www.example.com/bar":
                    assertEquals(1, dataObjects.size());
                    dataObject = dataObjects.get(0);
                    assertEquals("document", dataObject.getType());
                    assertEquals("two", dataObject.getId());
                    assertEquals("Doc Two", dataObject.getAttribute("name"));
                    assertEquals("4.0", dataObject.getAttribute("review"));
                    break;
                default:
                    // without this branch an unexpected URL would be accepted
                    // without checking any of its data objects
                    throw new AssertionError("Unexpected URL with PageMap metadata: " + u.getUrl());
                }
            }
        }
    }
}