1
0
Fork 0
mirror of https://github.com/crawler-commons/crawler-commons synced 2024-05-25 11:46:03 +02:00

Use the Java 8 date and time API (java.time.*) to parse dates in sitemaps (#217)

* Use the Java 8 date and time API (java.time.*) to parse dates in sitemaps
- use thread-safe DateTimeFormatter instead of ThreadLocal<DateFormat>
- simplify parsing of RSS publication dates
- remove obsolete regex pattern to catch dates with time zone
  but without seconds (covered by DateTimeFormatter.ISO_OFFSET_DATE_TIME)
- extend unit tests

* Fix Javadoc error and warnings, update change log

* Remove obsolete dependency to jaxb-api
- import of javax.xml.bind.DatatypeConverter has been removed
  by updating to Java 8 date and time API
This commit is contained in:
Sebastian Nagel 2018-09-24 11:09:58 +02:00 committed by Julien Nioche
parent bef1b8437e
commit 9318de951f
9 changed files with 221 additions and 123 deletions

View File

@ -1,6 +1,7 @@
Crawler-Commons Change Log
Current Development 0.11-SNAPSHOT (yyyy-mm-dd)
- [Sitemaps] Use the Java 8 date and time API (java.time.*) to parse dates in sitemaps (sebastian-nagel) #217
- [Robots] Fix for handling URLs with query parameters but no path (kkrugler) #215
Release 0.10 (2018-06-05)

View File

@ -322,7 +322,6 @@
<mockito-core.version>1.8.0</mockito-core.version>
<jetty.version>5.1.10</jetty.version>
<servlet-api.version>2.5</servlet-api.version>
<jaxb-api.version>2.2.11</jaxb-api.version>
<!-- Maven Plugin Dependencies -->
<maven-compiler-plugin.version>2.3.2</maven-compiler-plugin.version>
@ -364,13 +363,6 @@
<version>${slf4j-api.version}</version>
</dependency>
<!-- see https://github.com/crawler-commons/crawler-commons/issues/196 -->
<dependency>
<groupId>javax.xml.bind</groupId>
<artifactId>jaxb-api</artifactId>
<version>${jaxb-api.version}</version>
</dependency>
<!-- Test dependencies -->
<dependency>

View File

@ -17,16 +17,18 @@
package crawlercommons.sitemaps;
import java.net.URL;
import java.text.DateFormat;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.time.LocalDate;
import java.time.Year;
import java.time.YearMonth;
import java.time.ZoneId;
import java.time.ZoneOffset;
import java.time.ZonedDateTime;
import java.time.format.DateTimeFormatter;
import java.time.format.DateTimeParseException;
import java.time.temporal.ChronoField;
import java.time.temporal.TemporalAccessor;
import java.util.Date;
import java.util.Locale;
import java.util.TimeZone;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.xml.bind.DatatypeConverter;
/** SiteMap or SiteMapIndex **/
public abstract class AbstractSiteMap {
@ -36,41 +38,23 @@ public abstract class AbstractSiteMap {
INDEX, XML, ATOM, RSS, TEXT
};
// 1997-07-16T19:20+01:00
private static final Pattern W3C_NO_SECONDS_PATTERN = Pattern.compile("(\\d\\d\\d\\d\\-\\d\\d\\-\\d\\dT\\d\\d:\\d\\d)(\\-|\\+)(\\d\\d):(\\d\\d)");
private static final ThreadLocal<DateFormat> W3C_NO_SECONDS_FORMAT = new ThreadLocal<DateFormat>() {
protected DateFormat initialValue() {
return new SimpleDateFormat("yyyy-MM-dd'T'HH:mmZ", Locale.ROOT);
}
};
private static final ThreadLocal<DateFormat> W3C_FULLDATE_FORMAT = new ThreadLocal<DateFormat>() {
protected DateFormat initialValue() {
SimpleDateFormat result = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssX", Locale.ROOT);
result.setTimeZone(TimeZone.getTimeZone("UTC"));
return result;
}
};
private static final ThreadLocal<DateFormat> W3C_FULLDATE_FORMAT_WITH_OFFSET = new ThreadLocal<DateFormat>() {
protected DateFormat initialValue() {
SimpleDateFormat result = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssZ", Locale.ROOT);
result.setTimeZone(TimeZone.getTimeZone("UTC"));
return result;
}
};
protected static final ZoneId TIME_ZONE_UTC = ZoneId.of(ZoneOffset.UTC.toString());
/**
* The set of date-time formats which could be used as pubDate in RSS.
* DateTimeFormatter for parsing dates in ISO-8601 format
*/
private static final ThreadLocal<DateFormat[]> RSS_DATE_FORMATS = new ThreadLocal<DateFormat[]>() {
@Override
protected DateFormat[] initialValue() {
return new DateFormat[] { new SimpleDateFormat("EEE, dd MMM yy HH:mm:ss Z", Locale.ROOT), new SimpleDateFormat("dd MMM yy HH:mm:ss Z", Locale.ROOT),
new SimpleDateFormat("EEE, dd MMM yyyy HH:mm:ss Z", Locale.ROOT), new SimpleDateFormat("dd MMM yyyy HH:mm:ss Z", Locale.ROOT) };
}
};
public static final DateTimeFormatter W3C_FULLDATE_FORMATTER = DateTimeFormatter.ISO_OFFSET_DATE_TIME;
/**
* DateTimeFormatter to format dates in ISO-8601 format (UTC time zone 'Z')
*/
public static final DateTimeFormatter W3C_FULLDATE_FORMATTER_UTC = DateTimeFormatter.ISO_INSTANT;
/**
* DateTimeFormatter for parsing short dates ('1997', '1997-07',
* '1997-07-16') without daytime and time zone
*/
public static final DateTimeFormatter W3C_SHORTDATE_FORMATTER = DateTimeFormatter.ofPattern("yyyy[-MM[-dd]]", Locale.ROOT).withZone(TIME_ZONE_UTC);
/** W3C date the Sitemap was last modified */
private Date lastModified;
@ -87,10 +71,6 @@ public abstract class AbstractSiteMap {
lastModified = null;
}
public static DateFormat getFullDateFormat() {
return W3C_FULLDATE_FORMAT.get();
}
/**
 * Tells whether this sitemap is a sitemap index.
 *
 * @return true if the type of this sitemap is {@link SitemapType#INDEX},
 *         false otherwise
 */
public boolean isIndex() {
    final boolean indexType = SitemapType.INDEX == type;
    return indexType;
}
@ -135,7 +115,7 @@ public abstract class AbstractSiteMap {
/**
* @param lastModified
* - the lastModified to set
* the last-modified date
*/
public void setLastModified(Date lastModified) {
this.lastModified = lastModified;
@ -143,7 +123,16 @@ public abstract class AbstractSiteMap {
/**
* @param lastModified
* - the lastModified to set
* the last-modified date and time
*/
public void setLastModified(ZonedDateTime lastModified) {
    // A null argument clears the last-modified date (the Date-based setter
    // stores null as well) instead of failing with a NullPointerException.
    this.lastModified = (lastModified == null) ? null : Date.from(lastModified.toInstant());
}
/**
* @param lastModified
* the last-modified date time. If parsing of the given date time
* fails, the last-modified field is set to null.
*/
public void setLastModified(String lastModified) {
this.lastModified = SiteMap.convertToDate(lastModified);
@ -160,49 +149,84 @@ public abstract class AbstractSiteMap {
* Convert the given date (given in an acceptable DateFormat), null if the
* date is not in the correct format.
*
* <p>
* Dates must follow the <a href="https://www.w3.org/TR/NOTE-datetime">W3C
* Datetime format</a> which is similar to <a
* href="https://en.wikipedia.org/wiki/ISO_8601">ISO-8601</a> but allows
* dates with different precisions:</p>
*
* <pre>
* Year:
* YYYY (eg 1997)
* Year and month:
* YYYY-MM (eg 1997-07)
* Complete date:
* YYYY-MM-DD (eg 1997-07-16)
* Complete date plus hours and minutes:
* YYYY-MM-DDThh:mmTZD (eg 1997-07-16T19:20+01:00)
* Complete date plus hours, minutes and seconds:
* YYYY-MM-DDThh:mm:ssTZD (eg 1997-07-16T19:20:30+01:00)
* Complete date plus hours, minutes, seconds and a decimal fraction of a second
* YYYY-MM-DDThh:mm:ss.sTZD (eg 1997-07-16T19:20:30.45+01:00)
* </pre>
*
* @param date
* - the date to be parsed
* @return the Date equivalent or NULL when encountering an unparsable date
* string argument
* @return the zoned date time equivalent to the date string or NULL if
* parsing failed
*/
public static Date convertToDate(String date) {
public static ZonedDateTime convertToZonedDateTime(String date) {
if (date == null) {
return null;
}
// full date including daytime and optional time zone
try {
return getFullDateFormat().parse(date);
} catch (ParseException e1) {
return W3C_FULLDATE_FORMATTER.parse(date, ZonedDateTime::from);
} catch (DateTimeParseException e) {
// fall-through and try date without daytime
}
// dates without daytime
try {
return DatatypeConverter.parseDateTime(date).getTime();
} catch (IllegalArgumentException e) {
// See if it's the one W3C case that the javax.xml.bind
// implementation (incorrectly) doesn't handle.
Matcher m = W3C_NO_SECONDS_PATTERN.matcher(date);
if (m.matches()) {
try {
// Convert to a format that Java can parse, which means
// time zone has to be "-/+HHMM", not "+/-HH:MM"
StringBuffer mungedDate = new StringBuffer(m.group(1));
mungedDate.append(m.group(2));
mungedDate.append(m.group(3));
mungedDate.append(m.group(4));
return W3C_NO_SECONDS_FORMAT.get().parse(mungedDate.toString());
} catch (ParseException e2) {
return null;
}
} else {
return null;
TemporalAccessor ta = W3C_SHORTDATE_FORMATTER.parse(date);
LocalDate ldt = null;
if (ta.isSupported(ChronoField.DAY_OF_MONTH)) {
ldt = LocalDate.from(ta);
} else if (ta.isSupported(ChronoField.MONTH_OF_YEAR)) {
ldt = YearMonth.from(ta).atDay(1);
} else if (ta.isSupported(ChronoField.YEAR)) {
ldt = Year.from(ta).atDay(1);
}
if (ldt != null) {
return ldt.atStartOfDay(TIME_ZONE_UTC);
}
} catch (DateTimeParseException e) {
}
return null;
}
/**
* Converts pubDate of RSS to the string representation which could be
* parsed in {@link #convertToDate(String)} method.
* See {@link #convertToZonedDateTime(String)}.
*
* @param date
* the date string to convert
* @return returns the date or null if parsing of the date string fails
*/
public static Date convertToDate(String date) {
    // delegate the parsing, then map the result (or the parse failure,
    // signalled by null) to the legacy java.util.Date type
    final ZonedDateTime parsed = convertToZonedDateTime(date);
    return (parsed == null) ? null : Date.from(parsed.toInstant());
}
/**
* Converts pubDate of RSS to the ISO-8601 instant format, e.g.,
* '2017-01-05T12:34:54Z' in UTC / GMT time zone, see
* {@link DateTimeFormatter#ISO_INSTANT}.
*
* @param pubDate
* - date time of pubDate in RFC822
@ -213,19 +237,37 @@ public abstract class AbstractSiteMap {
if (pubDate == null) {
return null;
}
Date date = null;
for (DateFormat format : RSS_DATE_FORMATS.get()) {
try {
date = format.parse(pubDate);
break;
} catch (ParseException ex) {
// try next one
}
}
if (date == null) {
ZonedDateTime zdt = parseRSSTimestamp(pubDate);
if (zdt == null) {
return pubDate;
}
return W3C_FULLDATE_FORMAT_WITH_OFFSET.get().format(date);
return W3C_FULLDATE_FORMATTER_UTC.format(zdt);
}
/**
 * Parse pubDate of RSS feeds.
 *
 * <p>
 * The date is parsed using {@link DateTimeFormatter#RFC_1123_DATE_TIME}.
 * Years in the range 0 to 99 are interpreted as two-digit years: 80-99 are
 * mapped to 1980-1999 and 0-79 to 2000-2079.
 * </p>
 *
 * @param pubDate
 *            - date time of pubDate in RFC822
 * @return date time or null if pubDate is null or parsing failed
 */
public static ZonedDateTime parseRSSTimestamp(String pubDate) {
    if (pubDate == null) {
        // the formatter rejects null input with a NullPointerException,
        // which is not a DateTimeParseException and would escape
        return null;
    }
    ZonedDateTime zdt;
    try {
        zdt = DateTimeFormatter.RFC_1123_DATE_TIME.parse(pubDate, ZonedDateTime::from);
    } catch (DateTimeParseException ex) {
        return null;
    }
    // NOTE(review): the formatter presumably validates a given weekday
    // against the still unadjusted year -- confirm that RFC 822 dates from
    // 19yy which include a weekday are parsed.
    int year = zdt.getYear();
    if (year >= 0 && year <= 99) {
        // adjust two-digit years: RFC 1123 requires a fully-specified year,
        // while RFC 822 allows two digits.
        // 80-99: assume 19yy - RFC 822 was published in 1982
        zdt = zdt.plusYears(year >= 80 ? 1900 : 2000);
    }
    return zdt;
}
}

View File

@ -95,8 +95,8 @@ public class SiteMap extends AbstractSiteMap {
public String toString() {
StringBuilder sb = new StringBuilder();
sb.append("url = \"").append(url).append("\", lastMod = ").append((getLastModified() == null) ? "null" : SiteMap.getFullDateFormat().format(getLastModified())).append(", type = ")
.append(getType()).append(", processed = ").append(isProcessed()).append(", urlListSize = ").append(urlList.size());
sb.append("url = \"").append(url).append("\", lastMod = ").append((getLastModified() == null) ? "null" : SiteMap.W3C_FULLDATE_FORMATTER_UTC.format(getLastModified().toInstant()))
.append(", type = ").append(getType()).append(", processed = ").append(isProcessed()).append(", urlListSize = ").append(urlList.size());
return sb.toString();
}

View File

@ -119,12 +119,15 @@ public class SiteMapParser {
}
/**
* Sets the parser to allow any namespace or just the one from the
* Sets the parser to allow any XML namespace or just the one from the
* specification, or any accepted namespace (see
* {@link #addAcceptedNamespace(String)}). Note enabling strict namespace
* checking always adds the namespace defined by the current sitemap
* specification ({@link Namespace#SITEMAP}) to the list of accepted
* namespaces.
*
* @param s
* if true enable strict namespace-checking, disable if false
*/
public void setStrictNamespace(boolean s) {
strictNamespace = s;
@ -137,6 +140,7 @@ public class SiteMapParser {
* Add namespace URI to set of accepted namespaces.
*
* @param namespaceUri
* URI of the accepted XML namespace
*/
public void addAcceptedNamespace(String namespaceUri) {
acceptedNamespaces.add(namespaceUri);
@ -146,6 +150,7 @@ public class SiteMapParser {
* Add namespace URIs to set of accepted namespaces.
*
* @param namespaceUris
* array of accepted XML namespace URIs
*/
public void addAcceptedNamespace(String[] namespaceUris) {
for (String namespaceUri : namespaceUris) {
@ -553,7 +558,9 @@ public class SiteMapParser {
* are valid.
*
* @param sitemapBaseUrl
* the base URL of the sitemap
* @param testUrl
* the URL to be tested
* @return true if testUrl is under sitemapBaseUrl, false otherwise
*/
public static boolean urlIsValid(String sitemapBaseUrl, String testUrl) {

View File

@ -21,6 +21,7 @@ import org.slf4j.LoggerFactory;
import java.net.MalformedURLException;
import java.net.URL;
import java.time.ZonedDateTime;
import java.util.Date;
import java.util.Locale;
@ -92,6 +93,10 @@ public class SiteMapURL {
setPriority(priority);
}
/**
 * Create a sitemap URL entry whose last-modified time is given as
 * {@link ZonedDateTime}. The value is converted and passed on to the
 * {@link Date}-based constructor.
 *
 * @param url
 *            the URL
 * @param lastModified
 *            lastmod specified for the URL, may be null if there is none
 * @param changeFreq
 *            the change frequency of the URL
 * @param priority
 *            the priority of the URL
 * @param valid
 *            whether the URL is considered valid
 */
public SiteMapURL(URL url, ZonedDateTime lastModified, ChangeFrequency changeFreq, double priority, boolean valid) {
    // The explicit constructor call has to be the first statement, so the
    // null check is inlined: a missing date is forwarded as null instead of
    // failing with a NullPointerException.
    this(url, (lastModified == null) ? (Date) null : Date.from(lastModified.toInstant()), changeFreq, priority, valid);
}
/**
* Return the URL.
*
@ -140,7 +145,7 @@ public class SiteMapURL {
* Set when this URL was last modified.
*
* @param lastModified
* the last time the sitemap was modified
* lastmod specified for the URL
*/
public void setLastModified(String lastModified) {
this.lastModified = SiteMap.convertToDate(lastModified);
@ -150,12 +155,24 @@ public class SiteMapURL {
* Set when this URL was last modified.
*
* @param lastModified
* the last time the sitemap was modified
* lastmod specified for the URL
*/
public void setLastModified(Date lastModified) {
// the given date is stored as is, including null
this.lastModified = lastModified;
}
/**
 * Set when this URL was last modified, using a zoned date time.
 *
 * @param lastModified
 *            lastmod specified for the URL; a null value is ignored and
 *            the current last-modified date is kept
 */
public void setLastModified(ZonedDateTime lastModified) {
    if (lastModified == null) {
        // nothing to convert, keep whatever is currently stored
        return;
    }
    this.lastModified = Date.from(lastModified.toInstant());
}
/**
* Return this URL's priority (a value between [0.0 - 1.0]).
*
@ -302,7 +319,7 @@ public class SiteMapURL {
public String toString() {
StringBuilder sb = new StringBuilder();
sb.append("url = \"").append(url).append("\"");
sb.append(", lastMod = ").append((lastModified == null) ? "null" : SiteMap.getFullDateFormat().format(lastModified));
sb.append(", lastMod = ").append((lastModified == null) ? "null" : SiteMap.W3C_FULLDATE_FORMATTER_UTC.format(lastModified.toInstant()));
sb.append(", changeFreq = ").append(changeFreq);
sb.append(", priority = ").append(priority);

View File

@ -21,6 +21,7 @@ import static crawlercommons.sitemaps.SiteMapParser.urlIsValid;
import java.net.MalformedURLException;
import java.net.URL;
import java.time.ZonedDateTime;
import java.util.LinkedList;
import org.xml.sax.Attributes;
@ -80,7 +81,7 @@ class RSSHandler extends DelegatorHandler {
private SiteMap sitemap;
private StringBuilder loc;
private URL locURL;
private String lastMod;
private ZonedDateTime lastMod;
boolean valid;
RSSHandler(URL url, LinkedList<String> elementStack, boolean strict) {
@ -128,8 +129,8 @@ class RSSHandler extends DelegatorHandler {
String localName = super.currentElement();
String value = String.valueOf(ch, start, length);
if ("pubDate".equals(localName)) {
lastMod = AbstractSiteMap.normalizeRSSTimestamp(value);
if ("channel".equals(super.currentElementParent())) {
lastMod = AbstractSiteMap.parseRSSTimestamp(value);
if (lastMod != null && "channel".equals(super.currentElementParent())) {
sitemap.setLastModified(lastMod);
}
} else if ("link".equals(localName)) {

View File

@ -19,6 +19,11 @@ package crawlercommons.sitemaps;
import static org.junit.Assert.*;
import java.text.SimpleDateFormat;
import java.time.Instant;
import java.time.ZoneId;
import java.time.ZonedDateTime;
import java.time.format.DateTimeFormatter;
import java.util.Date;
import java.util.Locale;
import java.util.TimeZone;
@ -34,8 +39,8 @@ public class AbstractSiteMapTest {
SimpleDateFormat isoFormatNoTimezone = new SimpleDateFormat("yyyyMMdd", Locale.ROOT);
// For formats where there's no time zone information, the time zone is
// undefined, so we can
// only check on the year/month/day portion of the result.
// undefined, so we can only check on the year/month/day portion of the
// result.
assertEquals("20140101", isoFormatNoTimezone.format(AbstractSiteMap.convertToDate("2014")));
assertEquals("20140601", isoFormatNoTimezone.format(AbstractSiteMap.convertToDate("2014-06")));
assertEquals("20140603", isoFormatNoTimezone.format(AbstractSiteMap.convertToDate("2014-06-03")));
@ -59,6 +64,41 @@ public class AbstractSiteMapTest {
isoFormatWithFractionSeconds.setTimeZone(TimeZone.getTimeZone("UTC"));
assertEquals("20140603T103045.820", isoFormatWithFractionSeconds.format(AbstractSiteMap.convertToDate("2014-06-03T10:30:45.82+00:00")));
// Date examples given in https://www.w3.org/TR/NOTE-datetime
ZonedDateTime zdt = ZonedDateTime.ofInstant(Instant.ofEpochMilli(0), AbstractSiteMap.TIME_ZONE_UTC);
// YYYY (eg 1997) -- no time zone, see comment above
zdt = zdt.withYear(1997);
parseCompareDate(zdt, "1997", "yyyyMMdd");
// YYYY-MM (eg 1997-07) -- no time zone, see comment above
zdt = zdt.withMonth(7);
parseCompareDate(zdt, "1997-07", "yyyyMMdd");
// YYYY-MM-DD (eg 1997-07-16) -- no time zone, see comment above
zdt = zdt.withDayOfMonth(16);
parseCompareDate(zdt, "1997-07-16", "yyyyMMdd");
// YYYY-MM-DDThh:mmTZD (eg 1997-07-16T19:20+01:00)
// one hour less in UTC because of time zone +01:00
zdt = zdt.withHour(19).withMinute(20).minusHours(1);
parseCompareDate(zdt, "1997-07-16T19:20+01:00");
// YYYY-MM-DDThh:mm:ssTZD (eg 1997-07-16T19:20:30+01:00)
zdt = zdt.withSecond(30);
parseCompareDate(zdt, "1997-07-16T19:20:30+01:00");
// YYYY-MM-DDThh:mm:ss.sTZD (eg 1997-07-16T19:20:30.45+01:00)
zdt = zdt.withNano(450000000);
parseCompareDate(zdt, "1997-07-16T19:20:30.45+01:00");
}
private void parseCompareDate(ZonedDateTime expected, String date) {
    // no output pattern: the three-argument variant then compares the
    // parsed and the expected date time as instants
    final String noDateFormat = null;
    parseCompareDate(expected, date, noDateFormat);
}
/**
 * Parse the given W3C date string and compare the result with the expected
 * date time.
 *
 * @param expected
 *            the expected date time
 * @param date
 *            the date string passed to
 *            AbstractSiteMap.convertToZonedDateTime
 * @param dateFormat
 *            if not null, both date times are formatted with this pattern
 *            (in the system default time zone) and the resulting strings
 *            are compared; if null, the instants are compared
 */
private void parseCompareDate(ZonedDateTime expected, String date, String dateFormat) {
    ZonedDateTime parsed = AbstractSiteMap.convertToZonedDateTime(date);
    // convertToZonedDateTime returns null for unparsable input: report that
    // as a test failure naming the input, not as a NullPointerException
    assertNotNull("Failed to parse W3C date format: " + date, parsed);
    if (dateFormat != null) {
        DateTimeFormatter fmt = DateTimeFormatter.ofPattern(dateFormat, Locale.ROOT).withZone(ZoneId.systemDefault());
        assertEquals("Failed to parse W3C date format:", fmt.format(expected), fmt.format(parsed));
    } else {
        assertTrue("Failed to parse W3C date format: " + expected + " <> " + parsed, expected.isEqual(parsed));
    }
}
@Test
@ -66,11 +106,24 @@ public class AbstractSiteMapTest {
assertNull(AbstractSiteMap.normalizeRSSTimestamp(null));
assertEquals("incorrect", AbstractSiteMap.normalizeRSSTimestamp("incorrect"));
assertEquals("Full date-time with named timezone", "2017-01-05T12:34:50+0000", AbstractSiteMap.normalizeRSSTimestamp("Thu, 05 Jan 2017 12:34:50 GMT"));
assertEquals("Full date-time with local differental", "2017-01-05T12:34:51+0000", AbstractSiteMap.normalizeRSSTimestamp("Thu, 05 Jan 2017 13:34:51 +0100"));
assertEquals("Date-time without week day", "2017-01-05T12:34:52+0000", AbstractSiteMap.normalizeRSSTimestamp("05 Jan 2017 11:34:52 -0100"));
assertEquals("Date-time without week day and two-digit year", "2017-01-05T12:34:53+0000", AbstractSiteMap.normalizeRSSTimestamp("05 Jan 17 12:34:53 GMT"));
assertEquals("Date-time with two-digit year", "2017-01-05T12:34:54+0000", AbstractSiteMap.normalizeRSSTimestamp("Thu, 05 Jan 17 12:34:54 GMT"));
assertEquals("Full date-time with named timezone", "2017-01-05T12:34:50Z", AbstractSiteMap.normalizeRSSTimestamp("Thu, 05 Jan 2017 12:34:50 GMT"));
assertEquals("Full date-time with time zone offset", "2017-01-05T12:34:51Z", AbstractSiteMap.normalizeRSSTimestamp("Thu, 05 Jan 2017 13:34:51 +0100"));
assertEquals("Date-time without week day", "2017-01-05T12:34:52Z", AbstractSiteMap.normalizeRSSTimestamp("05 Jan 2017 11:34:52 -0100"));
assertEquals("Date-time without week day and two-digit year", "2017-01-05T12:34:53Z", AbstractSiteMap.normalizeRSSTimestamp("05 Jan 17 12:34:53 GMT"));
assertEquals("Date-time with two-digit year", "2017-01-05T12:34:54Z", AbstractSiteMap.normalizeRSSTimestamp("Thu, 05 Jan 17 12:34:54 GMT"));
}
@Test
public void testFullDateFormat() {
    // Example date with time zone offset from
    // https://www.w3.org/TR/NOTE-datetime, written once in UTC and once
    // with an offset: both must parse to the same point in time and must
    // be formatted identically.
    ZonedDateTime utcDate = SiteMap.convertToZonedDateTime("1994-11-05T13:15:30Z");
    ZonedDateTime offsetDate = SiteMap.convertToZonedDateTime("1994-11-05T08:15:30-05:00");
    assertTrue("Failed to parse date with time zone", utcDate.isEqual(offsetDate));
    DateTimeFormatter utcFormatter = SiteMap.W3C_FULLDATE_FORMATTER_UTC;
    String formattedUtcDate = utcFormatter.format(utcDate);
    String formattedOffsetDate = utcFormatter.format(offsetDate);
    assertEquals("Failed to format date", formattedUtcDate, formattedOffsetDate);
}
}

View File

@ -31,12 +31,9 @@ import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import org.apache.commons.io.IOUtils;
import org.junit.After;
@ -44,16 +41,12 @@ import org.junit.Before;
import org.junit.Test;
import org.junit.runner.RunWith;
import org.junit.runners.JUnit4;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import crawlercommons.sitemaps.AbstractSiteMap.SitemapType;
@RunWith(JUnit4.class)
public class SiteMapParserTest {
private static final Logger LOG = LoggerFactory.getLogger(SiteMapParserTest.class);
@Before
public void setUp() throws Exception {
// intentionally empty: no fixture is prepared here
}
@ -159,14 +152,6 @@ public class SiteMapParserTest {
assertEquals(2, sm.getSiteMapUrls().size());
}
@Test
public void testFullDateFormat() {
SimpleDateFormat format = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm+hh:00", Locale.ROOT);
Date date = new Date();
LOG.info(format.format(date));
LOG.info(SiteMap.getFullDateFormat().format(date));
}
@Test
public void testSitemapTXT() throws UnknownFormatException, IOException {
SiteMapParser parser = new SiteMapParser();
@ -462,7 +447,7 @@ public class SiteMapParserTest {
SiteMap rss = (SiteMap) asm;
assertEquals("Incorrect items count", 7, rss.getSiteMapUrls().size());
Iterator<SiteMapURL> it = rss.getSiteMapUrls().iterator();
assertPubDate("Local differental offset", "article_1", pubDate + 1000, it);
assertPubDate("Local differential offset", "article_1", pubDate + 1000, it);
assertPubDate("Short year", "article_2", pubDate + 2000, it);
assertPubDate("No weekday", "article_3", pubDate + 3000, it);
assertPubDate("No weekday and short year", "article_4", pubDate + 4000, it);