mirror of
https://github.com/crawler-commons/crawler-commons
synced 2024-05-25 11:46:03 +02:00
Use the Java 8 date and time API (java.time.*) to parse dates in sitemaps (#217)
* Use the Java 8 date and time API (java.time.*) to parse dates in sitemaps - use thread-safe DateTimeFormatter instead of ThreadLocal<DateFormat> - simplify parsing of RSS publication dates - remove obsolete regex pattern to catch dates with time zone but without seconds (covered by DateTimeFormatter.ISO_OFFSET_DATE_TIME) - extend unit tests * Fix Javadoc error and warnings, update change log * Remove obsolete dependency to jaxb-api - import of javax.xml.bind.DatatypeConverter has been removed by updating to Java 8 date and time API
This commit is contained in:
parent
bef1b8437e
commit
9318de951f
|
@ -1,6 +1,7 @@
|
|||
Crawler-Commons Change Log
|
||||
|
||||
Current Development 0.11-SNAPSHOT (yyyy-mm-dd)
|
||||
- [Sitemaps] Use the Java 8 date and time API (java.time.*) to parse dates in sitemaps (sebastian-nagel) #217
|
||||
- [Robots] Fix for handling URLs with query parameters but no path (kkrugler) #215
|
||||
|
||||
Release 0.10 (2018-06-05)
|
||||
|
|
8
pom.xml
8
pom.xml
|
@ -322,7 +322,6 @@
|
|||
<mockito-core.version>1.8.0</mockito-core.version>
|
||||
<jetty.version>5.1.10</jetty.version>
|
||||
<servlet-api.version>2.5</servlet-api.version>
|
||||
<jaxb-api.version>2.2.11</jaxb-api.version>
|
||||
|
||||
<!-- Maven Plugin Dependencies -->
|
||||
<maven-compiler-plugin.version>2.3.2</maven-compiler-plugin.version>
|
||||
|
@ -364,13 +363,6 @@
|
|||
<version>${slf4j-api.version}</version>
|
||||
</dependency>
|
||||
|
||||
<!-- see https://github.com/crawler-commons/crawler-commons/issues/196 -->
|
||||
<dependency>
|
||||
<groupId>javax.xml.bind</groupId>
|
||||
<artifactId>jaxb-api</artifactId>
|
||||
<version>${jaxb-api.version}</version>
|
||||
</dependency>
|
||||
|
||||
<!-- Test dependencies -->
|
||||
|
||||
<dependency>
|
||||
|
|
|
@ -17,16 +17,18 @@
|
|||
package crawlercommons.sitemaps;
|
||||
|
||||
import java.net.URL;
|
||||
import java.text.DateFormat;
|
||||
import java.text.ParseException;
|
||||
import java.text.SimpleDateFormat;
|
||||
import java.time.LocalDate;
|
||||
import java.time.Year;
|
||||
import java.time.YearMonth;
|
||||
import java.time.ZoneId;
|
||||
import java.time.ZoneOffset;
|
||||
import java.time.ZonedDateTime;
|
||||
import java.time.format.DateTimeFormatter;
|
||||
import java.time.format.DateTimeParseException;
|
||||
import java.time.temporal.ChronoField;
|
||||
import java.time.temporal.TemporalAccessor;
|
||||
import java.util.Date;
|
||||
import java.util.Locale;
|
||||
import java.util.TimeZone;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import javax.xml.bind.DatatypeConverter;
|
||||
|
||||
/** SiteMap or SiteMapIndex **/
|
||||
public abstract class AbstractSiteMap {
|
||||
|
@ -36,41 +38,23 @@ public abstract class AbstractSiteMap {
|
|||
INDEX, XML, ATOM, RSS, TEXT
|
||||
};
|
||||
|
||||
// 1997-07-16T19:20+01:00
|
||||
private static final Pattern W3C_NO_SECONDS_PATTERN = Pattern.compile("(\\d\\d\\d\\d\\-\\d\\d\\-\\d\\dT\\d\\d:\\d\\d)(\\-|\\+)(\\d\\d):(\\d\\d)");
|
||||
private static final ThreadLocal<DateFormat> W3C_NO_SECONDS_FORMAT = new ThreadLocal<DateFormat>() {
|
||||
|
||||
protected DateFormat initialValue() {
|
||||
return new SimpleDateFormat("yyyy-MM-dd'T'HH:mmZ", Locale.ROOT);
|
||||
}
|
||||
};
|
||||
|
||||
private static final ThreadLocal<DateFormat> W3C_FULLDATE_FORMAT = new ThreadLocal<DateFormat>() {
|
||||
protected DateFormat initialValue() {
|
||||
SimpleDateFormat result = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssX", Locale.ROOT);
|
||||
result.setTimeZone(TimeZone.getTimeZone("UTC"));
|
||||
return result;
|
||||
}
|
||||
};
|
||||
|
||||
private static final ThreadLocal<DateFormat> W3C_FULLDATE_FORMAT_WITH_OFFSET = new ThreadLocal<DateFormat>() {
|
||||
protected DateFormat initialValue() {
|
||||
SimpleDateFormat result = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssZ", Locale.ROOT);
|
||||
result.setTimeZone(TimeZone.getTimeZone("UTC"));
|
||||
return result;
|
||||
}
|
||||
};
|
||||
protected static final ZoneId TIME_ZONE_UTC = ZoneId.of(ZoneOffset.UTC.toString());
|
||||
|
||||
/**
|
||||
* The set of date-time formats which could be used as pubDate in RSS.
|
||||
* DateTimeFormatter for parsing dates in ISO-8601 format
|
||||
*/
|
||||
private static final ThreadLocal<DateFormat[]> RSS_DATE_FORMATS = new ThreadLocal<DateFormat[]>() {
|
||||
@Override
|
||||
protected DateFormat[] initialValue() {
|
||||
return new DateFormat[] { new SimpleDateFormat("EEE, dd MMM yy HH:mm:ss Z", Locale.ROOT), new SimpleDateFormat("dd MMM yy HH:mm:ss Z", Locale.ROOT),
|
||||
new SimpleDateFormat("EEE, dd MMM yyyy HH:mm:ss Z", Locale.ROOT), new SimpleDateFormat("dd MMM yyyy HH:mm:ss Z", Locale.ROOT) };
|
||||
}
|
||||
};
|
||||
public static final DateTimeFormatter W3C_FULLDATE_FORMATTER = DateTimeFormatter.ISO_OFFSET_DATE_TIME;
|
||||
|
||||
/**
|
||||
* DateTimeFormatter to format dates in ISO-8601 format (UTC time zone 'Z')
|
||||
*/
|
||||
public static final DateTimeFormatter W3C_FULLDATE_FORMATTER_UTC = DateTimeFormatter.ISO_INSTANT;
|
||||
|
||||
/**
|
||||
* DateTimeFormatter for parsing short dates ('1997', '1997-07',
|
||||
* '1997-07-16') without daytime and time zone
|
||||
*/
|
||||
public static final DateTimeFormatter W3C_SHORTDATE_FORMATTER = DateTimeFormatter.ofPattern("yyyy[-MM[-dd]]", Locale.ROOT).withZone(TIME_ZONE_UTC);
|
||||
|
||||
/** W3C date the Sitemap was last modified */
|
||||
private Date lastModified;
|
||||
|
@ -87,10 +71,6 @@ public abstract class AbstractSiteMap {
|
|||
lastModified = null;
|
||||
}
|
||||
|
||||
public static DateFormat getFullDateFormat() {
|
||||
return W3C_FULLDATE_FORMAT.get();
|
||||
}
|
||||
|
||||
public boolean isIndex() {
|
||||
return (type == SitemapType.INDEX);
|
||||
};
|
||||
|
@ -135,7 +115,7 @@ public abstract class AbstractSiteMap {
|
|||
|
||||
/**
|
||||
* @param lastModified
|
||||
* - the lastModified to set
|
||||
* the last-modified date
|
||||
*/
|
||||
public void setLastModified(Date lastModified) {
|
||||
this.lastModified = lastModified;
|
||||
|
@ -143,7 +123,16 @@ public abstract class AbstractSiteMap {
|
|||
|
||||
/**
|
||||
* @param lastModified
|
||||
* - the lastModified to set
|
||||
* the last-modified date and time
|
||||
*/
|
||||
public void setLastModified(ZonedDateTime lastModified) {
|
||||
this.lastModified = Date.from(lastModified.toInstant());
|
||||
}
|
||||
|
||||
/**
|
||||
* @param lastModified
|
||||
* the last-modified date time. If parsing of the given date time
|
||||
* fails, the last-modified field is set to null.
|
||||
*/
|
||||
public void setLastModified(String lastModified) {
|
||||
this.lastModified = SiteMap.convertToDate(lastModified);
|
||||
|
@ -160,49 +149,84 @@ public abstract class AbstractSiteMap {
|
|||
* Convert the given date (given in an acceptable DateFormat), null if the
|
||||
* date is not in the correct format.
|
||||
*
|
||||
* <p>
|
||||
* Dates must follow the <a href="https://www.w3.org/TR/NOTE-datetime">W3C
|
||||
* Datetime format</a> which is similar to <a
|
||||
* href="https://en.wikipedia.org/wiki/ISO_8601">ISO-8601</a> but allows
|
||||
* dates with different precisions:</p>
|
||||
*
|
||||
* <pre>
|
||||
* Year:
|
||||
* YYYY (eg 1997)
|
||||
* Year and month:
|
||||
* YYYY-MM (eg 1997-07)
|
||||
* Complete date:
|
||||
* YYYY-MM-DD (eg 1997-07-16)
|
||||
* Complete date plus hours and minutes:
|
||||
* YYYY-MM-DDThh:mmTZD (eg 1997-07-16T19:20+01:00)
|
||||
* Complete date plus hours, minutes and seconds:
|
||||
* YYYY-MM-DDThh:mm:ssTZD (eg 1997-07-16T19:20:30+01:00)
|
||||
* Complete date plus hours, minutes, seconds and a decimal fraction of a second
|
||||
* YYYY-MM-DDThh:mm:ss.sTZD (eg 1997-07-16T19:20:30.45+01:00)
|
||||
* </pre>
|
||||
*
|
||||
* @param date
|
||||
* - the date to be parsed
|
||||
* @return the Date equivalent or NULL when encountering an unparsable date
|
||||
* string argument
|
||||
* @return the zoned date time equivalent to the date string or NULL if parsing
|
||||
* failed
|
||||
*/
|
||||
public static Date convertToDate(String date) {
|
||||
public static ZonedDateTime convertToZonedDateTime(String date) {
|
||||
|
||||
if (date == null) {
|
||||
return null;
|
||||
}
|
||||
|
||||
// full date including daytime and optional time zone
|
||||
try {
|
||||
return getFullDateFormat().parse(date);
|
||||
} catch (ParseException e1) {
|
||||
return W3C_FULLDATE_FORMATTER.parse(date, ZonedDateTime::from);
|
||||
} catch (DateTimeParseException e) {
|
||||
// fall-through and try date without daytime
|
||||
}
|
||||
|
||||
// dates without daytime
|
||||
try {
|
||||
return DatatypeConverter.parseDateTime(date).getTime();
|
||||
} catch (IllegalArgumentException e) {
|
||||
// See if it's the one W3C case that the javax.xml.bind
|
||||
// implementation (incorrectly) doesn't handle.
|
||||
Matcher m = W3C_NO_SECONDS_PATTERN.matcher(date);
|
||||
if (m.matches()) {
|
||||
try {
|
||||
// Convert to a format that Java can parse, which means
|
||||
// time zone has to be "-/+HHMM", not "+/-HH:MM"
|
||||
StringBuffer mungedDate = new StringBuffer(m.group(1));
|
||||
mungedDate.append(m.group(2));
|
||||
mungedDate.append(m.group(3));
|
||||
mungedDate.append(m.group(4));
|
||||
return W3C_NO_SECONDS_FORMAT.get().parse(mungedDate.toString());
|
||||
} catch (ParseException e2) {
|
||||
return null;
|
||||
}
|
||||
} else {
|
||||
return null;
|
||||
TemporalAccessor ta = W3C_SHORTDATE_FORMATTER.parse(date);
|
||||
LocalDate ldt = null;
|
||||
if (ta.isSupported(ChronoField.DAY_OF_MONTH)) {
|
||||
ldt = LocalDate.from(ta);
|
||||
} else if (ta.isSupported(ChronoField.MONTH_OF_YEAR)) {
|
||||
ldt = YearMonth.from(ta).atDay(1);
|
||||
} else if (ta.isSupported(ChronoField.YEAR)) {
|
||||
ldt = Year.from(ta).atDay(1);
|
||||
}
|
||||
if (ldt != null) {
|
||||
return ldt.atStartOfDay(TIME_ZONE_UTC);
|
||||
}
|
||||
} catch (DateTimeParseException e) {
|
||||
}
|
||||
|
||||
return null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Converts pubDate of RSS to the string representation which could be
|
||||
* parsed in {@link #convertToDate(String)} method.
|
||||
* See {@link #convertToZonedDateTime(String)}.
|
||||
*
|
||||
* @param date
|
||||
* the date string to convert
|
||||
* @return returns the date or null if parsing of the date string fails
|
||||
*/
|
||||
public static Date convertToDate(String date) {
|
||||
ZonedDateTime zdt = convertToZonedDateTime(date);
|
||||
if (zdt == null) {
|
||||
return null;
|
||||
}
|
||||
return Date.from(zdt.toInstant());
|
||||
}
|
||||
|
||||
/**
|
||||
* Converts pubDate of RSS to the ISO-8601 instant format, e.g.,
|
||||
* '2017-01-05T12:34:54Z' in UTC / GMT time zone, see
|
||||
* {@link DateTimeFormatter#ISO_INSTANT}.
|
||||
*
|
||||
* @param pubDate
|
||||
* - date time of pubDate in RFC822
|
||||
|
@ -213,19 +237,37 @@ public abstract class AbstractSiteMap {
|
|||
if (pubDate == null) {
|
||||
return null;
|
||||
}
|
||||
Date date = null;
|
||||
for (DateFormat format : RSS_DATE_FORMATS.get()) {
|
||||
try {
|
||||
date = format.parse(pubDate);
|
||||
break;
|
||||
} catch (ParseException ex) {
|
||||
// try next one
|
||||
}
|
||||
}
|
||||
if (date == null) {
|
||||
ZonedDateTime zdt = parseRSSTimestamp(pubDate);
|
||||
if (zdt == null) {
|
||||
return pubDate;
|
||||
}
|
||||
return W3C_FULLDATE_FORMAT_WITH_OFFSET.get().format(date);
|
||||
return W3C_FULLDATE_FORMATTER_UTC.format(zdt);
|
||||
}
|
||||
|
||||
/**
|
||||
* Parse pubDate of RSS feeds.
|
||||
*
|
||||
* @param pubDate
|
||||
* - date time of pubDate in RFC822
|
||||
* @return date time or null if parsing failed
|
||||
*/
|
||||
public static ZonedDateTime parseRSSTimestamp(String pubDate) {
|
||||
ZonedDateTime zdt = null;
|
||||
try {
|
||||
zdt = DateTimeFormatter.RFC_1123_DATE_TIME.parse(pubDate, ZonedDateTime::from);
|
||||
} catch (DateTimeParseException ex) {
|
||||
return null;
|
||||
}
|
||||
if (zdt.getYear() <= 99 && zdt.getYear() >= 0) {
|
||||
// adjust two-digit years: RFC 1123 requires a fully-specified year,
|
||||
// while RFC 822 allows two digits
|
||||
if (zdt.getYear() >= 80) {
|
||||
// assume 19yy - RFC 822 was published in 1982
|
||||
zdt = zdt.plusYears(1900);
|
||||
} else {
|
||||
zdt = zdt.plusYears(2000);
|
||||
}
|
||||
}
|
||||
return zdt;
|
||||
}
|
||||
}
|
|
@ -95,8 +95,8 @@ public class SiteMap extends AbstractSiteMap {
|
|||
public String toString() {
|
||||
StringBuilder sb = new StringBuilder();
|
||||
|
||||
sb.append("url = \"").append(url).append("\", lastMod = ").append((getLastModified() == null) ? "null" : SiteMap.getFullDateFormat().format(getLastModified())).append(", type = ")
|
||||
.append(getType()).append(", processed = ").append(isProcessed()).append(", urlListSize = ").append(urlList.size());
|
||||
sb.append("url = \"").append(url).append("\", lastMod = ").append((getLastModified() == null) ? "null" : SiteMap.W3C_FULLDATE_FORMATTER_UTC.format(getLastModified().toInstant()))
|
||||
.append(", type = ").append(getType()).append(", processed = ").append(isProcessed()).append(", urlListSize = ").append(urlList.size());
|
||||
|
||||
return sb.toString();
|
||||
}
|
||||
|
|
|
@ -119,12 +119,15 @@ public class SiteMapParser {
|
|||
}
|
||||
|
||||
/**
|
||||
* Sets the parser to allow any namespace or just the one from the
|
||||
* Sets the parser to allow any XML namespace or just the one from the
|
||||
* specification, or any accepted namespace (see
|
||||
* {@link #addAcceptedNamespace(String)}). Note enabling strict namespace
|
||||
* checking always adds the namespace defined by the current sitemap
|
||||
* specification ({@link Namespace#SITEMAP}) to the list of accepted
|
||||
* namespaces.
|
||||
*
|
||||
* @param s
|
||||
* if true enable strict namespace-checking, disable if false
|
||||
*/
|
||||
public void setStrictNamespace(boolean s) {
|
||||
strictNamespace = s;
|
||||
|
@ -137,6 +140,7 @@ public class SiteMapParser {
|
|||
* Add namespace URI to set of accepted namespaces.
|
||||
*
|
||||
* @param namespaceUri
|
||||
* URI of the accepted XML namespace
|
||||
*/
|
||||
public void addAcceptedNamespace(String namespaceUri) {
|
||||
acceptedNamespaces.add(namespaceUri);
|
||||
|
@ -146,6 +150,7 @@ public class SiteMapParser {
|
|||
* Add namespace URIs to set of accepted namespaces.
|
||||
*
|
||||
* @param namespaceUris
|
||||
* array of accepted XML namespace URIs
|
||||
*/
|
||||
public void addAcceptedNamespace(String[] namespaceUris) {
|
||||
for (String namespaceUri : namespaceUris) {
|
||||
|
@ -553,7 +558,9 @@ public class SiteMapParser {
|
|||
* are valid.
|
||||
*
|
||||
* @param sitemapBaseUrl
|
||||
* the base URL of the sitemap
|
||||
* @param testUrl
|
||||
* the URL to be tested
|
||||
* @return true if testUrl is under sitemapBaseUrl, false otherwise
|
||||
*/
|
||||
public static boolean urlIsValid(String sitemapBaseUrl, String testUrl) {
|
||||
|
|
|
@ -21,6 +21,7 @@ import org.slf4j.LoggerFactory;
|
|||
|
||||
import java.net.MalformedURLException;
|
||||
import java.net.URL;
|
||||
import java.time.ZonedDateTime;
|
||||
import java.util.Date;
|
||||
import java.util.Locale;
|
||||
|
||||
|
@ -92,6 +93,10 @@ public class SiteMapURL {
|
|||
setPriority(priority);
|
||||
}
|
||||
|
||||
public SiteMapURL(URL url, ZonedDateTime lastModified, ChangeFrequency changeFreq, double priority, boolean valid) {
|
||||
this(url, Date.from(lastModified.toInstant()), changeFreq, priority, valid);
|
||||
}
|
||||
|
||||
/**
|
||||
* Return the URL.
|
||||
*
|
||||
|
@ -140,7 +145,7 @@ public class SiteMapURL {
|
|||
* Set when this URL was last modified.
|
||||
*
|
||||
* @param lastModified
|
||||
* the last time the sitemap was modified
|
||||
* lastmod specified for the URL
|
||||
*/
|
||||
public void setLastModified(String lastModified) {
|
||||
this.lastModified = SiteMap.convertToDate(lastModified);
|
||||
|
@ -150,12 +155,24 @@ public class SiteMapURL {
|
|||
* Set when this URL was last modified.
|
||||
*
|
||||
* @param lastModified
|
||||
* the last time the sitemap was modified
|
||||
* lastmod specified for the URL
|
||||
*/
|
||||
public void setLastModified(Date lastModified) {
|
||||
this.lastModified = lastModified;
|
||||
}
|
||||
|
||||
/**
|
||||
* Set when this URL was last modified.
|
||||
*
|
||||
* @param lastModified
|
||||
* lastmod specified for the URL
|
||||
*/
|
||||
public void setLastModified(ZonedDateTime lastModified) {
|
||||
if (lastModified != null) {
|
||||
this.lastModified = Date.from(lastModified.toInstant());
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Return this URL's priority (a value between [0.0 - 1.0]).
|
||||
*
|
||||
|
@ -302,7 +319,7 @@ public class SiteMapURL {
|
|||
public String toString() {
|
||||
StringBuilder sb = new StringBuilder();
|
||||
sb.append("url = \"").append(url).append("\"");
|
||||
sb.append(", lastMod = ").append((lastModified == null) ? "null" : SiteMap.getFullDateFormat().format(lastModified));
|
||||
sb.append(", lastMod = ").append((lastModified == null) ? "null" : SiteMap.W3C_FULLDATE_FORMATTER_UTC.format(lastModified.toInstant()));
|
||||
sb.append(", changeFreq = ").append(changeFreq);
|
||||
sb.append(", priority = ").append(priority);
|
||||
|
||||
|
|
|
@ -21,6 +21,7 @@ import static crawlercommons.sitemaps.SiteMapParser.urlIsValid;
|
|||
|
||||
import java.net.MalformedURLException;
|
||||
import java.net.URL;
|
||||
import java.time.ZonedDateTime;
|
||||
import java.util.LinkedList;
|
||||
|
||||
import org.xml.sax.Attributes;
|
||||
|
@ -80,7 +81,7 @@ class RSSHandler extends DelegatorHandler {
|
|||
private SiteMap sitemap;
|
||||
private StringBuilder loc;
|
||||
private URL locURL;
|
||||
private String lastMod;
|
||||
private ZonedDateTime lastMod;
|
||||
boolean valid;
|
||||
|
||||
RSSHandler(URL url, LinkedList<String> elementStack, boolean strict) {
|
||||
|
@ -128,8 +129,8 @@ class RSSHandler extends DelegatorHandler {
|
|||
String localName = super.currentElement();
|
||||
String value = String.valueOf(ch, start, length);
|
||||
if ("pubDate".equals(localName)) {
|
||||
lastMod = AbstractSiteMap.normalizeRSSTimestamp(value);
|
||||
if ("channel".equals(super.currentElementParent())) {
|
||||
lastMod = AbstractSiteMap.parseRSSTimestamp(value);
|
||||
if (lastMod != null && "channel".equals(super.currentElementParent())) {
|
||||
sitemap.setLastModified(lastMod);
|
||||
}
|
||||
} else if ("link".equals(localName)) {
|
||||
|
|
|
@ -19,6 +19,11 @@ package crawlercommons.sitemaps;
|
|||
import static org.junit.Assert.*;
|
||||
|
||||
import java.text.SimpleDateFormat;
|
||||
import java.time.Instant;
|
||||
import java.time.ZoneId;
|
||||
import java.time.ZonedDateTime;
|
||||
import java.time.format.DateTimeFormatter;
|
||||
import java.util.Date;
|
||||
import java.util.Locale;
|
||||
import java.util.TimeZone;
|
||||
|
||||
|
@ -34,8 +39,8 @@ public class AbstractSiteMapTest {
|
|||
SimpleDateFormat isoFormatNoTimezone = new SimpleDateFormat("yyyyMMdd", Locale.ROOT);
|
||||
|
||||
// For formats where there's no time zone information, the time zone is
|
||||
// undefined, so we can
|
||||
// only check on the year/month/day portion of the result.
|
||||
// undefined, so we can only check on the year/month/day portion of the
|
||||
// result.
|
||||
assertEquals("20140101", isoFormatNoTimezone.format(AbstractSiteMap.convertToDate("2014")));
|
||||
assertEquals("20140601", isoFormatNoTimezone.format(AbstractSiteMap.convertToDate("2014-06")));
|
||||
assertEquals("20140603", isoFormatNoTimezone.format(AbstractSiteMap.convertToDate("2014-06-03")));
|
||||
|
@ -59,6 +64,41 @@ public class AbstractSiteMapTest {
|
|||
isoFormatWithFractionSeconds.setTimeZone(TimeZone.getTimeZone("UTC"));
|
||||
assertEquals("20140603T103045.820", isoFormatWithFractionSeconds.format(AbstractSiteMap.convertToDate("2014-06-03T10:30:45.82+00:00")));
|
||||
|
||||
// Date examples given in https://www.w3.org/TR/NOTE-datetime
|
||||
ZonedDateTime zdt = ZonedDateTime.ofInstant(Instant.ofEpochMilli(0), AbstractSiteMap.TIME_ZONE_UTC);
|
||||
// YYYY (eg 1997) -- no time zone, see comment above
|
||||
zdt = zdt.withYear(1997);
|
||||
parseCompareDate(zdt, "1997", "yyyyMMdd");
|
||||
// YYYY-MM (eg 1997-07) -- no time zone, see comment above
|
||||
zdt = zdt.withMonth(7);
|
||||
parseCompareDate(zdt, "1997-07", "yyyyMMdd");
|
||||
// YYYY-MM-DD (eg 1997-07-16) -- no time zone, see comment above
|
||||
zdt = zdt.withDayOfMonth(16);
|
||||
parseCompareDate(zdt, "1997-07-16", "yyyyMMdd");
|
||||
// YYYY-MM-DDThh:mmTZD (eg 1997-07-16T19:20+01:00)
|
||||
// one hour less in UTC because of time zone +01:00
|
||||
zdt = zdt.withHour(19).withMinute(20).minusHours(1);
|
||||
parseCompareDate(zdt, "1997-07-16T19:20+01:00");
|
||||
// YYYY-MM-DDThh:mm:ssTZD (eg 1997-07-16T19:20:30+01:00)
|
||||
zdt = zdt.withSecond(30);
|
||||
parseCompareDate(zdt, "1997-07-16T19:20:30+01:00");
|
||||
// YYYY-MM-DDThh:mm:ss.sTZD (eg 1997-07-16T19:20:30.45+01:00)
|
||||
zdt = zdt.withNano(450000000);
|
||||
parseCompareDate(zdt, "1997-07-16T19:20:30.45+01:00");
|
||||
}
|
||||
|
||||
private void parseCompareDate(ZonedDateTime expected, String date) {
|
||||
parseCompareDate(expected, date, null);
|
||||
}
|
||||
|
||||
private void parseCompareDate(ZonedDateTime expected, String date, String dateFormat) {
|
||||
ZonedDateTime parsed = AbstractSiteMap.convertToZonedDateTime(date);
|
||||
if (dateFormat != null) {
|
||||
DateTimeFormatter fmt = DateTimeFormatter.ofPattern(dateFormat, Locale.ROOT).withZone(ZoneId.systemDefault());
|
||||
assertEquals("Failed to parse W3C date format:", fmt.format(expected), fmt.format(parsed));
|
||||
} else {
|
||||
assertTrue("Failed to parse W3C date format: " + expected + " <> " + parsed, expected.isEqual(parsed));
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
|
@ -66,11 +106,24 @@ public class AbstractSiteMapTest {
|
|||
assertNull(AbstractSiteMap.normalizeRSSTimestamp(null));
|
||||
assertEquals("incorrect", AbstractSiteMap.normalizeRSSTimestamp("incorrect"));
|
||||
|
||||
assertEquals("Full date-time with named timezone", "2017-01-05T12:34:50+0000", AbstractSiteMap.normalizeRSSTimestamp("Thu, 05 Jan 2017 12:34:50 GMT"));
|
||||
assertEquals("Full date-time with local differental", "2017-01-05T12:34:51+0000", AbstractSiteMap.normalizeRSSTimestamp("Thu, 05 Jan 2017 13:34:51 +0100"));
|
||||
assertEquals("Date-time without week day", "2017-01-05T12:34:52+0000", AbstractSiteMap.normalizeRSSTimestamp("05 Jan 2017 11:34:52 -0100"));
|
||||
assertEquals("Date-time without week day and two-digit year", "2017-01-05T12:34:53+0000", AbstractSiteMap.normalizeRSSTimestamp("05 Jan 17 12:34:53 GMT"));
|
||||
assertEquals("Date-time with two-digit year", "2017-01-05T12:34:54+0000", AbstractSiteMap.normalizeRSSTimestamp("Thu, 05 Jan 17 12:34:54 GMT"));
|
||||
assertEquals("Full date-time with named timezone", "2017-01-05T12:34:50Z", AbstractSiteMap.normalizeRSSTimestamp("Thu, 05 Jan 2017 12:34:50 GMT"));
|
||||
assertEquals("Full date-time with time zone offset", "2017-01-05T12:34:51Z", AbstractSiteMap.normalizeRSSTimestamp("Thu, 05 Jan 2017 13:34:51 +0100"));
|
||||
assertEquals("Date-time without week day", "2017-01-05T12:34:52Z", AbstractSiteMap.normalizeRSSTimestamp("05 Jan 2017 11:34:52 -0100"));
|
||||
assertEquals("Date-time without week day and two-digit year", "2017-01-05T12:34:53Z", AbstractSiteMap.normalizeRSSTimestamp("05 Jan 17 12:34:53 GMT"));
|
||||
assertEquals("Date-time with two-digit year", "2017-01-05T12:34:54Z", AbstractSiteMap.normalizeRSSTimestamp("Thu, 05 Jan 17 12:34:54 GMT"));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testFullDateFormat() {
|
||||
// test example date with time zone offset
|
||||
// from https://www.w3.org/TR/NOTE-datetime
|
||||
// the (re)formatted date should be identical
|
||||
ZonedDateTime date1 = SiteMap.convertToZonedDateTime("1994-11-05T13:15:30Z");
|
||||
ZonedDateTime date2 = SiteMap.convertToZonedDateTime("1994-11-05T08:15:30-05:00");
|
||||
assertTrue("Failed to parse date with time zone", date1.isEqual(date2));
|
||||
String datestr1 = SiteMap.W3C_FULLDATE_FORMATTER_UTC.format(date1);
|
||||
String datestr2 = SiteMap.W3C_FULLDATE_FORMATTER_UTC.format(date2);
|
||||
assertEquals("Failed to format date", datestr1, datestr2);
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -31,12 +31,9 @@ import java.io.IOException;
|
|||
import java.io.InputStream;
|
||||
import java.net.URL;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.text.SimpleDateFormat;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Date;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
import java.util.Locale;
|
||||
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.junit.After;
|
||||
|
@ -44,16 +41,12 @@ import org.junit.Before;
|
|||
import org.junit.Test;
|
||||
import org.junit.runner.RunWith;
|
||||
import org.junit.runners.JUnit4;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import crawlercommons.sitemaps.AbstractSiteMap.SitemapType;
|
||||
|
||||
@RunWith(JUnit4.class)
|
||||
public class SiteMapParserTest {
|
||||
|
||||
private static final Logger LOG = LoggerFactory.getLogger(SiteMapParserTest.class);
|
||||
|
||||
@Before
|
||||
public void setUp() throws Exception {
|
||||
}
|
||||
|
@ -159,14 +152,6 @@ public class SiteMapParserTest {
|
|||
assertEquals(2, sm.getSiteMapUrls().size());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testFullDateFormat() {
|
||||
SimpleDateFormat format = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm+hh:00", Locale.ROOT);
|
||||
Date date = new Date();
|
||||
LOG.info(format.format(date));
|
||||
LOG.info(SiteMap.getFullDateFormat().format(date));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testSitemapTXT() throws UnknownFormatException, IOException {
|
||||
SiteMapParser parser = new SiteMapParser();
|
||||
|
@ -462,7 +447,7 @@ public class SiteMapParserTest {
|
|||
SiteMap rss = (SiteMap) asm;
|
||||
assertEquals("Incorrect items count", 7, rss.getSiteMapUrls().size());
|
||||
Iterator<SiteMapURL> it = rss.getSiteMapUrls().iterator();
|
||||
assertPubDate("Local differental offset", "article_1", pubDate + 1000, it);
|
||||
assertPubDate("Local differential offset", "article_1", pubDate + 1000, it);
|
||||
assertPubDate("Short year", "article_2", pubDate + 2000, it);
|
||||
assertPubDate("No weekday", "article_3", pubDate + 3000, it);
|
||||
assertPubDate("No weekday and short year", "article_4", pubDate + 4000, it);
|
||||
|
|
Loading…
Reference in New Issue