1
0
Fork 0
mirror of https://github.com/crawler-commons/crawler-commons synced 2024-06-10 15:46:04 +02:00
crawler-commons/src/test/java/crawlercommons/sitemaps/SiteMapParserTest.java

267 lines
13 KiB
Java

/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package crawlercommons.sitemaps;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.net.URL;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.Locale;
import org.apache.commons.io.IOUtils;
import org.junit.After;
import org.junit.Before;
import org.junit.Test;
import org.junit.runner.RunWith;
import org.junit.runners.JUnit4;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import static org.junit.Assert.*;
/**
 * Unit tests for {@link SiteMapParser}: sitemap indexes, plain-text, XML and
 * gzipped sitemaps, handling of the declared media type, and the difference
 * between the default parser and one constructed with
 * {@code new SiteMapParser(false)}.
 */
@RunWith(JUnit4.class)
public class SiteMapParserTest {

    private static final Logger LOG = LoggerFactory.getLogger(SiteMapParserTest.class);

    /** Location of the gzipped XML sitemap fixture, relative to the working directory. */
    private static final String GZ_SITEMAP_PATH = "src/test/resources/xmlSitemap.gz";

    /** Media types that {@link #testSitemapXMLMediaTypes()} expects to be parsed as XML. */
    private static final String[] XML_CONTENT_TYPES = { "text/xml", "application/x-xml", "application/xml", "application/atom+xml", "application/rss+xml" };

    /** Media types that {@link #testSitemapGZMediaTypes()} expects to be parsed as gzip. */
    private static final String[] GZ_CONTENT_TYPES = { "application/gzip", "application/x-gzip", "application/x-gunzip", "application/gzipped", "application/gzip-compressed", "gzip/document" };

    /** No shared fixture is needed; every test builds its own parser. */
    @Before
    public void setUp() throws Exception {
    }

    /** Nothing to release; see {@link #setUp()}. */
    @After
    public void tearDown() throws Exception {
    }

    /**
     * Parses a sitemap index with two entries and checks that both entries
     * are exposed with the URL and last-modified date given in the XML.
     */
    @Test
    public void testSitemapIndex() throws UnknownFormatException, IOException {
        SiteMapParser parser = new SiteMapParser();
        String contentType = "text/xml";
        StringBuilder scontent = new StringBuilder(1024);
        scontent.append("<?xml version=\"1.0\" encoding=\"UTF-8\"?>")
                .append("<sitemapindex xmlns=\"http://www.sitemaps.org/schemas/sitemap/0.9\">")
                .append("<sitemap>")
                .append(" <loc>http://www.example.com/sitemap1.xml.gz</loc>")
                .append(" <lastmod>2004-10-01T18:23:17+00:00</lastmod>")
                .append("</sitemap>")
                .append("<sitemap>")
                .append(" <loc>http://www.example.com/sitemap2.xml.gz</loc>")
                .append(" <lastmod>2005-01-01</lastmod>")
                .append("</sitemap>")
                .append("</sitemapindex>");
        byte[] content = scontent.toString().getBytes("UTF-8");
        URL url = new URL("http://www.example.com/sitemapindex.xml");

        AbstractSiteMap asm = parser.parseSiteMap(contentType, content, url);
        assertTrue(asm.isIndex());
        assertTrue(asm instanceof SiteMapIndex);

        SiteMapIndex smi = (SiteMapIndex) asm;
        assertEquals(2, smi.getSitemaps().size());

        // First entry carries a full timestamp.
        AbstractSiteMap currentSiteMap = smi.getSitemap(new URL("http://www.example.com/sitemap1.xml.gz"));
        assertNotNull(currentSiteMap);
        assertEquals("http://www.example.com/sitemap1.xml.gz", currentSiteMap.getUrl().toString());
        assertEquals(SiteMap.convertToDate("2004-10-01T18:23:17+00:00"), currentSiteMap.getLastModified());
        assertTrue(currentSiteMap.toString().contains("T18:23"));

        // Second entry carries a date only.
        currentSiteMap = smi.getSitemap(new URL("http://www.example.com/sitemap2.xml.gz"));
        assertNotNull(currentSiteMap);
        assertEquals("http://www.example.com/sitemap2.xml.gz", currentSiteMap.getUrl().toString());
        assertEquals(SiteMap.convertToDate("2005-01-01"), currentSiteMap.getLastModified());
    }

    /**
     * Logs the current time formatted by a local pattern and by
     * {@link SiteMap#getFullDateFormat()}. This is a smoke test only: it makes
     * no assertions and fails solely if formatting throws.
     */
    @Test
    public void testFullDateFormat() {
        // NOTE(review): "+hh" below is the SimpleDateFormat 12-hour clock field,
        // so it prints the hour again rather than a zone offset - confirm intent.
        SimpleDateFormat format = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm+hh:00", Locale.getDefault());
        Date date = new Date();
        LOG.info(format.format(date));
        LOG.info(SiteMap.getFullDateFormat().format(date));
    }

    /** A two-line plain-text sitemap served as {@code text/plain} yields two URLs. */
    @Test
    public void testSitemapTXT() throws UnknownFormatException, IOException {
        SiteMapParser parser = new SiteMapParser();
        String contentType = "text/plain";
        String scontent = "http://www.example.com/catalog?item=1\nhttp://www.example.com/catalog?item=11";
        byte[] content = scontent.getBytes("UTF-8");
        URL url = new URL("http://www.example.com/sitemap.txt");

        AbstractSiteMap asm = parser.parseSiteMap(contentType, content, url);
        assertSiteMapWithUrlCount("contentType=" + contentType, asm, 2);
    }

    /**
     * Same plain-text content as {@link #testSitemapTXT()}, but fetched from a
     * URL ending in {@code .xml}; the {@code text/plain} media type is passed
     * and two URLs are still expected.
     */
    @Test
    public void testSitemapTXTWithXMLExt() throws UnknownFormatException, IOException {
        SiteMapParser parser = new SiteMapParser();
        String scontent = "http://www.example.com/catalog?item=1\nhttp://www.example.com/catalog?item=11";
        byte[] content = scontent.getBytes("UTF-8");
        URL url = new URL("http://www.example.com/sitemap.xml");
        String contentType = "text/plain";

        AbstractSiteMap asm = parser.parseSiteMap(contentType, content, url);
        assertSiteMapWithUrlCount("contentType=" + contentType, asm, 2);
    }

    /** The default XML fixture served as {@code text/xml} yields five URLs. */
    @Test
    public void testSitemapXML() throws UnknownFormatException, IOException {
        SiteMapParser parser = new SiteMapParser();
        String contentType = "text/xml";
        byte[] content = getXMLSitemapAsBytes();
        URL url = new URL("http://www.example.com/sitemap.xml");

        AbstractSiteMap asm = parser.parseSiteMap(contentType, content, url);
        assertSiteMapWithUrlCount("contentType=" + contentType, asm, 5);
    }

    /**
     * Every media type in {@link #XML_CONTENT_TYPES} must parse the XML
     * fixture, even though the URL has a non-XML extension.
     */
    @Test
    public void testSitemapXMLMediaTypes() throws UnknownFormatException, IOException {
        SiteMapParser parser = new SiteMapParser();
        byte[] content = getXMLSitemapAsBytes();
        URL url = new URL("http://www.example.com/sitemap.nonXmlExt");

        for (String contentType : XML_CONTENT_TYPES) {
            AbstractSiteMap asm = parser.parseSiteMap(contentType, content, url);
            // The message identifies which media type failed.
            assertSiteMapWithUrlCount("contentType=" + contentType, asm, 5);
        }
    }

    /**
     * This Sitemap contains badly formatted XML and can't be read, so the
     * parser is expected to throw {@link UnknownFormatException}.
     */
    @Test(expected = UnknownFormatException.class)
    public void testSitemapParserBrokenXml() throws IOException, UnknownFormatException {
        SiteMapParser parser = new SiteMapParser();
        String contentType = "text/xml";
        StringBuilder scontent = new StringBuilder(1024);
        scontent.append("<?xml version=\"1.0\" encoding=\"UTF-8\"?>")
                .append("<urlset xmlns=\"http://www.sitemaps.org/schemas/sitemap/0.9\">")
                .append("<url><!-- This file is not a valid XML file --></url>")
                .append("<url><loc> http://cs.harding.edu/fmccown/sitemaps/something.html</loc>")
                .append("</url><!-- missing opening url tag --></url></urlset>");
        byte[] content = scontent.toString().getBytes("UTF-8");
        URL url = new URL("http://www.example.com/sitemapindex.xml");

        parser.parseSiteMap(contentType, content, url);
    }

    /** The gzipped fixture served as {@code application/gzip} yields five URLs. */
    @Test
    public void testSitemapGZ() throws UnknownFormatException, IOException {
        SiteMapParser parser = new SiteMapParser();
        String contentType = "application/gzip";
        byte[] content = readGzSitemapFixture();
        URL url = new URL("http://www.example.com/sitemap.xml.gz");

        AbstractSiteMap asm = parser.parseSiteMap(contentType, content, url);
        assertSiteMapWithUrlCount("contentType=" + contentType, asm, 5);
    }

    /**
     * Every media type in {@link #GZ_CONTENT_TYPES} must parse the gzipped
     * fixture, even though the URL carries no file extension.
     */
    @Test
    public void testSitemapGZMediaTypes() throws UnknownFormatException, IOException {
        SiteMapParser parser = new SiteMapParser();
        byte[] content = readGzSitemapFixture();
        URL url = new URL("http://www.example.com/sitemap");

        for (String contentType : GZ_CONTENT_TYPES) {
            AbstractSiteMap asm = parser.parseSiteMap(contentType, content, url);
            // The message identifies which media type failed.
            assertSiteMapWithUrlCount("contentType=" + contentType, asm, 5);
        }
    }

    /**
     * Valid XML declared as {@code application/octet-stream} at an
     * extension-less URL is expected to be rejected with
     * {@link UnknownFormatException}.
     */
    @Test(expected = UnknownFormatException.class)
    public void testSitemapWithOctetMediaType() throws UnknownFormatException, IOException {
        SiteMapParser parser = new SiteMapParser();
        String contentType = "application/octet-stream";
        byte[] content = getXMLSitemapAsBytes();
        URL url = new URL("http://www.example.com/sitemap");

        // The call must throw; nothing after it can contribute to the result.
        parser.parseSiteMap(contentType, content, url);
    }

    /**
     * A sitemap located under {@code /subsection/} lists a URL at the site
     * root. The default parser returns no URLs for it, while a parser built
     * with {@code new SiteMapParser(false)} returns the URL flagged as invalid.
     */
    @Test
    public void testLenientParser() throws UnknownFormatException, IOException {
        SiteMapParser parser = new SiteMapParser();
        String contentType = "text/xml";
        StringBuilder scontent = new StringBuilder(1024);
        scontent.append("<?xml version=\"1.0\" encoding=\"UTF-8\"?>")
                .append("<urlset xmlns=\"http://www.sitemaps.org/schemas/sitemap/0.9\">")
                .append("<url>")
                .append("<loc>http://www.example.com/</loc>")
                .append("</url>")
                .append("</urlset>");
        byte[] content = scontent.toString().getBytes("UTF-8");
        URL url = new URL("http://www.example.com/subsection/sitemap.xml");

        AbstractSiteMap asm = parser.parseSiteMap(contentType, content, url);
        assertSiteMapWithUrlCount("default parser", asm, 0);

        // Now try again with lenient parsing. We should get one invalid URL
        parser = new SiteMapParser(false);
        asm = parser.parseSiteMap(contentType, content, url);
        SiteMap sm = assertSiteMapWithUrlCount("lenient parser", asm, 1);
        assertFalse(sm.getSiteMapUrls().iterator().next().isValid());
    }

    /**
     * Asserts that a parse result is a plain (non-index) {@link SiteMap}
     * holding the expected number of URLs.
     *
     * @param context
     *            text prepended to any failure message to identify the case
     * @param asm
     *            parse result to check
     * @param expectedUrlCount
     *            number of URLs the sitemap must contain
     * @return the parse result cast to {@link SiteMap}, for further checks
     */
    private static SiteMap assertSiteMapWithUrlCount(String context, AbstractSiteMap asm, int expectedUrlCount) {
        assertFalse(context, asm.isIndex());
        assertTrue(context, asm instanceof SiteMap);
        SiteMap sm = (SiteMap) asm;
        assertEquals(context, expectedUrlCount, sm.getSiteMapUrls().size());
        return sm;
    }

    /**
     * Reads the gzipped sitemap fixture at {@link #GZ_SITEMAP_PATH} fully into
     * memory and closes the underlying stream.
     *
     * @return the raw (still compressed) bytes of the fixture file
     * @throws IOException
     *             if the file cannot be opened or read
     */
    private static byte[] readGzSitemapFixture() throws IOException {
        InputStream is = new FileInputStream(new File(GZ_SITEMAP_PATH));
        try {
            return IOUtils.toByteArray(is);
        } finally {
            is.close();
        }
    }

    /**
     * Returns a good simple default XML sitemap as a byte array. The document
     * is a {@code urlset} with five {@code url} entries, encoded as UTF-8.
     *
     * @throws UnsupportedEncodingException
     *             if the UTF-8 charset is not available
     */
    private byte[] getXMLSitemapAsBytes() throws UnsupportedEncodingException {
        StringBuilder scontent = new StringBuilder(1024);
        scontent.append("<?xml version=\"1.0\" encoding=\"UTF-8\"?>")
                .append("<urlset xmlns=\"http://www.sitemaps.org/schemas/sitemap/0.9\">")
                .append("<url>")
                .append(" <loc>http://www.example.com/</loc>")
                .append(" <lastmod>2005-01-01</lastmod>")
                .append(" <changefreq>monthly</changefreq>")
                .append(" <priority>0.8</priority>")
                .append("</url>")
                .append("<url>")
                .append(" <loc>http://www.example.com/catalog?item=12&amp;desc=vacation_hawaii</loc>")
                .append(" <changefreq>weekly</changefreq>")
                .append("</url>")
                .append("<url>")
                .append(" <loc>http://www.example.com/catalog?item=73&amp;desc=vacation_new_zealand</loc>")
                .append(" <lastmod>2004-12-23</lastmod>")
                .append(" <changefreq>weekly</changefreq>")
                .append("</url>")
                .append("<url>")
                .append(" <loc>http://www.example.com/catalog?item=74&amp;desc=vacation_newfoundland</loc>")
                .append(" <lastmod>2004-12-23T18:00:15+00:00</lastmod>")
                .append(" <priority>0.3</priority>")
                .append("</url>")
                .append("<url>")
                .append(" <loc><url><![CDATA[http://www.example.com/catalog?item=83&amp;desc=vacation_usa]]></url></loc>")
                .append(" <lastmod>2004-11-23</lastmod>")
                .append("</url>")
                .append("</urlset>");
        return scontent.toString().getBytes("UTF-8");
    }
}