1
0
Fork 0
mirror of https://github.com/crawler-commons/crawler-commons synced 2024-06-19 09:58:58 +02:00

Upgrade to JUnit v4 conventions in SiteMapParser

This commit is contained in:
lewis.mcgibbney@gmail.com 2014-05-29 21:02:16 +00:00
parent cb71c5502a
commit 94c3ed4068

View File

@ -20,12 +20,16 @@ package crawlercommons.sitemaps;
import java.io.IOException;
import java.net.URL;
import junit.framework.TestCase;
import org.junit.After;
import org.junit.Before;
import org.junit.Test;
import org.junit.runner.RunWith;
import org.junit.runners.JUnit4;
public class SiteMapParserTest extends TestCase {
import static org.junit.Assert.*;
@RunWith(JUnit4.class)
public class SiteMapParserTest {
@Before
public void setUp() throws Exception {
@ -35,21 +39,32 @@ public class SiteMapParserTest extends TestCase {
public void tearDown() throws Exception {
}
@Test
public void testSitemapIndex() throws UnknownFormatException, IOException {
SiteMapParser parser = new SiteMapParser();
String contentType = "text/xml";
String scontent = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>" + "<sitemapindex xmlns=\"http://www.sitemaps.org/schemas/sitemap/0.9\">" + "<sitemap>"
+ " <loc>http://www.example.com/sitemap1.xml.gz</loc>" + " <lastmod>2004-10-01T18:23:17+00:00</lastmod>" + " </sitemap>" + "<sitemap>"
+ " <loc>http://www.example.com/sitemap2.xml.gz</loc>" + " <lastmod>2005-01-01</lastmod>" + " </sitemap>" + " </sitemapindex>";
byte[] content = scontent.getBytes();
StringBuilder scontent = new StringBuilder(1024);
scontent.append("<?xml version=\"1.0\" encoding=\"UTF-8\"?>")
.append("<sitemapindex xmlns=\"http://www.sitemaps.org/schemas/sitemap/0.9\">")
.append("<sitemap>")
.append(" <loc>http://www.example.com/sitemap1.xml.gz</loc>")
.append(" <lastmod>2004-10-01T18:23:17+00:00</lastmod>")
.append("</sitemap>")
.append("<sitemap>")
.append(" <loc>http://www.example.com/sitemap2.xml.gz</loc>")
.append(" <lastmod>2005-01-01</lastmod>")
.append("</sitemap>")
.append("</sitemapindex>");
byte[] content = scontent.toString().getBytes();
URL url = new URL("http://www.example.com/sitemapindex.xml");
AbstractSiteMap asm = parser.parseSiteMap(contentType, content, url);
assertEquals(true, asm.isIndex());
assertEquals(true, asm instanceof SiteMapIndex);
SiteMapIndex smi = (SiteMapIndex) asm;
assertEquals(2, smi.getSitemaps().size());
AbstractSiteMap currentSiteMap = smi.getSitemap(new URL("http://www.example.com/sitemap1.xml.gz"));
assertNotNull(currentSiteMap);
assertEquals("http://www.example.com/sitemap1.xml.gz", currentSiteMap.getUrl().toString());
@ -61,87 +76,91 @@ public class SiteMapParserTest extends TestCase {
assertEquals(SiteMap.convertToDate("2005-01-01"), currentSiteMap.getLastModified());
}
@Test
public void testSitemapXML() throws UnknownFormatException, IOException {
SiteMapParser parser = new SiteMapParser();
String contentType = "text/xml";
String scontent = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>" + "<urlset xmlns=\"http://www.sitemaps.org/schemas/sitemap/0.9\">" + " <url>" + "<loc>http://www.example.com/</loc>"
+ "<lastmod>2005-01-01</lastmod>" + "<changefreq>monthly</changefreq>" + "<priority>0.8</priority>" + "</url>" + "<url>"
+ "<loc>http://www.example.com/catalog?item=12&amp;desc=vacation_hawaii</loc>" + "<changefreq>weekly</changefreq>" + "</url>" + "<url>"
+ "<loc>http://www.example.com/catalog?item=73&amp;desc=vacation_new_zealand</loc>" + "<lastmod>2004-12-23</lastmod>" + "<changefreq>weekly</changefreq>" + "</url>" + "<url>"
+ "<loc>http://www.example.com/catalog?item=74&amp;desc=vacation_newfoundland</loc>" + "<lastmod>2004-12-23T18:00:15+00:00</lastmod>" + "<priority>0.3</priority>" + "</url>"
+ "<url>" + "<loc>http://www.example.com/catalog?item=83&amp;desc=vacation_usa</loc>" + "<lastmod>2004-11-23</lastmod>" + "</url>" + "</urlset>";
byte[] content = scontent.getBytes();
URL url = new URL("http://www.example.com/sitemap.xml");
AbstractSiteMap asm = parser.parseSiteMap(contentType, content, url);
assertEquals(false, asm.isIndex());
assertEquals(true, asm instanceof SiteMap);
SiteMap sm = (SiteMap) asm;
assertEquals(5, sm.getSiteMapUrls().size());
}
@Test
public void testSitemapTXT() throws UnknownFormatException, IOException {
SiteMapParser parser = new SiteMapParser();
String contentType = "text/plain";
String scontent = "http://www.example.com/catalog?item=1\nhttp://www.example.com/catalog?item=11";
byte[] content = scontent.getBytes();
URL url = new URL("http://www.example.com/sitemap.txt");
AbstractSiteMap asm = parser.parseSiteMap(contentType, content, url);
assertEquals(false, asm.isIndex());
assertEquals(true, asm instanceof SiteMap);
SiteMap sm = (SiteMap) asm;
assertEquals(2, sm.getSiteMapUrls().size());
}
public void testSitemapParserBrokenXml() throws IOException {
// This Sitemap contains badly formatted XML and can't be read
/**
* This Sitemap contains badly formatted XML and can't be read
* */
@Test (expected = UnknownFormatException.class)
public void testSitemapParserBrokenXml() throws IOException, UnknownFormatException {
SiteMapParser parser = new SiteMapParser();
String contentType = "text/xml";
String scontent = "<?xml version=\"1.0\" encoding=\"UTF-8\"?><urlset " + "xmlns=\"http://www.sitemaps.org/schemas/sitemap/0.9\"><url>"
+ "<!-- This file is not a valid XML file --></url><url><loc>" + "http://cs.harding.edu/fmccown/sitemaps/something.html</loc>"
+ "</url><!-- missing opening url tag --></url></urlset>";
byte[] content = scontent.getBytes();
StringBuilder scontent = new StringBuilder(1024);
scontent.append("<?xml version=\"1.0\" encoding=\"UTF-8\"?>")
.append("<urlset xmlns=\"http://www.sitemaps.org/schemas/sitemap/0.9\">")
.append("<url><!-- This file is not a valid XML file --></url>")
.append("<url><loc> http://cs.harding.edu/fmccown/sitemaps/something.html</loc>")
.append("</url><!-- missing opening url tag --></url></urlset>");
byte[] content = scontent.toString().getBytes();
URL url = new URL("http://www.example.com/sitemapindex.xml");
try {
parser.parseSiteMap(contentType, content, url);
fail("Test failed: UnknownFormatException not thrown");
} catch (UnknownFormatException e) {
// If Exception is thrown Test was successful
}
parser.parseSiteMap(contentType, content, url); // This Sitemap contains badly formatted XML and can't be read
}
@Test
public void testLenientParser() throws UnknownFormatException, IOException {
SiteMapParser parser = new SiteMapParser();
String contentType = "text/xml";
StringBuilder scontent = new StringBuilder(1024);
scontent.append("<?xml version=\"1.0\" encoding=\"UTF-8\"?>")
.append("<urlset xmlns=\"http://www.sitemaps.org/schemas/sitemap/0.9\">")
.append("<url>")
.append("<loc>http://www.example.com/</loc>")
.append("</url>")
.append("</urlset>");
byte[] content = scontent.toString().getBytes();
String scontent = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>"
+ "<urlset xmlns=\"http://www.sitemaps.org/schemas/sitemap/0.9\">"
+ " <url>" + "<loc>http://www.example.com/</loc>"
+ " </url>"
+ "</urlset>";
byte[] content = scontent.getBytes();
URL url = new URL("http://www.example.com/subsection/sitemap.xml");
AbstractSiteMap asm = parser.parseSiteMap(contentType, content, url);
assertEquals(false, asm.isIndex());
assertEquals(true, asm instanceof SiteMap);
SiteMap sm = (SiteMap) asm;
assertEquals(0, sm.getSiteMapUrls().size());
// Now try again with lenient parsing. We should get one invalid URL
parser = new SiteMapParser(false);
asm = parser.parseSiteMap(contentType, content, url);
assertEquals(false, asm.isIndex());
assertEquals(true, asm instanceof SiteMap);
sm = (SiteMap) asm;
assertEquals(1, sm.getSiteMapUrls().size());
assertFalse(sm.getSiteMapUrls().iterator().next().isValid());
}
}
}