1
0
Fork 0
mirror of https://github.com/crawler-commons/crawler-commons synced 2024-06-04 14:16:05 +02:00

Merge pull request #218 from sebastian-nagel/support-sitemap-extensions

Support sitemap extensions
This commit is contained in:
Sebastian Nagel 2018-10-12 21:53:35 +02:00 committed by GitHub
commit 0519d14023
Signed by: GitHub
GPG Key ID: 4AEE18F83AFDEB23
30 changed files with 2520 additions and 7 deletions

View File

@ -1,6 +1,7 @@
Crawler-Commons Change Log
Current Development 0.11-SNAPSHOT (yyyy-mm-dd)
- [Sitemaps] Add support for sitemap extensions (tuxnco, sebastian-nagel) #35, #36, #149, #162
- [Sitemaps] Use the Java 8 date and time API (java.time.*) to parse dates in sitemaps (sebastian-nagel) #217
- [Robots] Fix for handling URLs with query parameters but no path (kkrugler) #215

View File

@ -153,7 +153,8 @@ public abstract class AbstractSiteMap {
* Dates must follow the <a href="https://www.w3.org/TR/NOTE-datetime">W3C
* Datetime format</a> which is similar to <a
* href="https://en.wikipedia.org/wiki/ISO_8601">ISO-8601</a> but allows
* dates with different precisions:</p>
* dates with different precisions:
* </p>
*
* <pre>
* Year:

View File

@ -18,7 +18,12 @@ package crawlercommons.sitemaps;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import crawlercommons.sitemaps.extension.Extension;
/**
* supported sitemap formats:
@ -60,6 +65,11 @@ public class Namespace {
"http://www.google.com/schemas/sitemap-news/0.84" //
};
public static final String[] MOBILE = { //
"http://www.google.com/schemas/sitemap-mobile/1.0", //
"https://www.google.com/schemas/sitemap-mobile/1.0" //
};
public static final String LINKS = "http://www.w3.org/1999/xhtml";
/**
@ -103,4 +113,12 @@ public class Namespace {
return SITEMAP_SUPPORTED_NAMESPACES.contains(uri);
}
/**
 * Namespace URIs registered for every sitemap extension, keyed by
 * {@link Extension}. The map is a {@link TreeMap}, i.e. its keys are kept
 * in the natural order of the {@link Extension} enum. It is filled once by
 * the static initializer below.
 */
public static final Map<Extension, List<String>> SITEMAP_EXTENSION_NAMESPACES = new TreeMap<>();
static {
// one entry per Extension constant; the values wrap the namespace
// constants of this class (NOTE(review): NEWS, IMAGE and VIDEO are
// presumably String arrays like MOBILE - their declarations are not
// fully visible here)
SITEMAP_EXTENSION_NAMESPACES.put(Extension.NEWS, Arrays.asList(NEWS));
SITEMAP_EXTENSION_NAMESPACES.put(Extension.IMAGE, Arrays.asList(IMAGE));
SITEMAP_EXTENSION_NAMESPACES.put(Extension.VIDEO, Arrays.asList(VIDEO));
SITEMAP_EXTENSION_NAMESPACES.put(Extension.MOBILE, Arrays.asList(MOBILE));
// LINKS is a single String, so this produces a one-element list
SITEMAP_EXTENSION_NAMESPACES.put(Extension.LINKS, Arrays.asList(LINKS));
}
}

View File

@ -28,8 +28,10 @@ import java.io.StringReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.function.Consumer;
import java.util.zip.GZIPInputStream;
@ -48,6 +50,7 @@ import org.xml.sax.SAXException;
import crawlercommons.mimetypes.MimeTypeDetector;
import crawlercommons.sitemaps.AbstractSiteMap.SitemapType;
import crawlercommons.sitemaps.extension.Extension;
import crawlercommons.sitemaps.sax.DelegatorHandler;
public class SiteMapParser {
@ -84,6 +87,12 @@ public class SiteMapParser {
/** Set of namespaces (if {@link #strictNamespace}) accepted by the parser. URLs from other namespaces are ignored. */
protected Set<String> acceptedNamespaces = new HashSet<>();
/**
* Map of sitemap extension namespaces required to find the right extension
* handler.
*/
protected Map<String, Extension> extensionNamespaces = new HashMap<>();
private MimeTypeDetector mimeTypeDetector;
public SiteMapParser() {
@ -158,6 +167,29 @@ public class SiteMapParser {
}
}
/**
 * Turn on parsing of one sitemap extension: every namespace URI known for
 * the extension is registered in {@link #extensionNamespaces}.
 *
 * @param extension
 *            sitemap extension (news, images, videos, etc.)
 */
public void enableExtension(Extension extension) {
    Namespace.SITEMAP_EXTENSION_NAMESPACES.get(extension) //
                    .forEach(uri -> extensionNamespaces.put(uri, extension));
}
/**
 * Enable all supported sitemap extensions in the parser.
 *
 * <p>
 * Delegates to {@link #enableExtension(Extension)} for every constant of
 * {@link Extension} so that the registration logic lives in one place
 * only.
 * </p>
 */
public void enableExtensions() {
    for (Extension extension : Extension.values()) {
        enableExtension(extension);
    }
}
/**
* Returns a SiteMap or SiteMapIndex given an online sitemap URL
*
@ -511,6 +543,7 @@ public class SiteMapParser {
if (isStrictNamespace()) {
handler.setAcceptedNamespaces(acceptedNamespaces);
}
handler.setExtensionNamespaces(extensionNamespaces);
try {
SAXParser saxParser = factory.newSAXParser();

View File

@ -44,6 +44,8 @@ public class SiteMapTester {
LOG.error("Java properties:");
LOG.error(" sitemap.strictNamespace");
LOG.error(" if true sitemaps are required to use the standard namespace URI");
LOG.error(" sitemap.extensions");
LOG.error(" if true enable sitemap extension parsing");
} else {
URL url = new URL(args[0]);
String mt = (args.length > 1) ? args[1] : null;
@ -64,6 +66,11 @@ public class SiteMapTester {
boolean strictNamespace = new Boolean(System.getProperty("sitemap.strictNamespace"));
saxParser.setStrictNamespace(strictNamespace);
boolean enableExtensions = new Boolean(System.getProperty("sitemap.extensions"));
if (enableExtensions) {
saxParser.enableExtensions();
}
AbstractSiteMap sm = null;
// guesses the mimetype
if (mt == null || mt.equals("")) {
@ -80,7 +87,11 @@ public class SiteMapTester {
} else {
Collection<SiteMapURL> links = ((SiteMap) sm).getSiteMapUrls();
for (SiteMapURL smu : links) {
LOG.info(smu.getUrl().toString());
if (enableExtensions) {
LOG.info(smu.toString());
} else {
LOG.info(smu.getUrl().toString());
}
}
}
}

View File

@ -24,6 +24,12 @@ import java.net.URL;
import java.time.ZonedDateTime;
import java.util.Date;
import java.util.Locale;
import java.util.Map;
import java.util.Map.Entry;
import java.util.TreeMap;
import crawlercommons.sitemaps.extension.Extension;
import crawlercommons.sitemaps.extension.ExtensionMetadata;
/**
* The SitemapUrl class represents a URL found in a Sitemap.
@ -69,6 +75,11 @@ public class SiteMapURL {
*/
private boolean valid;
/**
* attributes from sitemap extensions (news, image, video sitemaps, etc.)
*/
private Map<Extension, ExtensionMetadata[]> attributes;
public SiteMapURL(String url, boolean valid) {
setUrl(url);
setValid(valid);
@ -295,6 +306,45 @@ public class SiteMapURL {
return valid;
}
/**
 * Store the attributes of one sitemap extension for this URL. The
 * attribute map is created on first use; attributes stored earlier for
 * the same extension are replaced.
 *
 * @param extension
 *            sitemap extension (news, images, videos, etc.)
 * @param attributes
 *            array of attributes
 */
public void addAttributesForExtension(Extension extension, ExtensionMetadata[] attributes) {
    Map<Extension, ExtensionMetadata[]> byExtension = this.attributes;
    if (byExtension == null) {
        // lazily allocate the map: most URLs carry no extension data
        byExtension = new TreeMap<>();
        this.attributes = byExtension;
    }
    byExtension.put(extension, attributes);
}
/**
 * Access the attributes of all sitemap extensions (news, images, videos,
 * etc.) attached to this URL.
 *
 * @return the internal attribute map (not a copy), or null if no
 *         extension attributes have been added
 */
public Map<Extension, ExtensionMetadata[]> getAttributes() {
    return this.attributes;
}
/**
 * Look up the attributes stored for one sitemap extension.
 *
 * @param extension
 *            sitemap extension (news, images, videos, etc.)
 * @return array of attributes, or null if no attributes at all have been
 *         added or none for the given extension
 */
public ExtensionMetadata[] getAttributesForExtension(Extension extension) {
    return (attributes == null) ? null : attributes.get(extension);
}
@Override
public boolean equals(Object o) {
if (this == o)
@ -322,6 +372,13 @@ public class SiteMapURL {
sb.append(", lastMod = ").append((lastModified == null) ? "null" : SiteMap.W3C_FULLDATE_FORMATTER_UTC.format(lastModified.toInstant()));
sb.append(", changeFreq = ").append(changeFreq);
sb.append(", priority = ").append(priority);
if (attributes != null) {
for (Entry<Extension, ExtensionMetadata[]> e : attributes.entrySet()) {
for (ExtensionMetadata m : e.getValue()) {
sb.append(", ").append(m.toString());
}
}
}
return sb.toString();
}

View File

@ -0,0 +1,48 @@
/**
* Copyright 2018 Crawler-Commons
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package crawlercommons.sitemaps.extension;
/**
 * Sitemap extensions supported by the parser.
 *
 * <p>
 * NOTE: the constants are used as keys of sorted maps elsewhere in the
 * code base, so their declaration order determines the iteration order of
 * those maps.
 * </p>
 */
public enum Extension {
/**
 * Google News sitemaps, see
 * https://support.google.com/news/publisher-center/answer/74288
 */
NEWS,
/**
 * Google Image sitemaps, see
 * https://support.google.com/webmasters/answer/178636
 */
IMAGE,
/**
 * Google Video sitemaps, see
 * https://support.google.com/webmasters/answer/80471
 */
VIDEO,
/**
 * Usage of <code>&lt;xhtml:link&gt;</code> elements in sitemaps to include
 * localized page versions/variants, see
 * https://support.google.com/webmasters/answer/189077
 */
LINKS,
/**
 * <cite>Mobile sitemaps just contain an empty "mobile" tag to identify a
 * URL as having mobile content</cite>, cf.
 * http://www.google.com/schemas/sitemap-mobile/1.0
 */
MOBILE
}

View File

@ -0,0 +1,33 @@
/**
* Copyright 2018 Crawler-Commons
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package crawlercommons.sitemaps.extension;
import crawlercommons.sitemaps.SiteMapURL;
/**
 * Base class of all attribute containers which a sitemap extension can
 * attach to a {@link SiteMapURL}.
 */
public abstract class ExtensionMetadata {

    /**
     * Concrete attribute containers are forced to define equality
     * themselves. Implementations should override {@code hashCode()}
     * accordingly.
     *
     * @param other
     *            object to compare with
     * @return true if both objects are considered equal
     */
    @Override
    public abstract boolean equals(Object other);

    /**
     * Tell whether the attributes are complete and well-formed.
     *
     * @return always true in this base implementation, subclasses may
     *         apply stricter checks
     */
    public boolean isValid() {
        return true;
    }
}

View File

@ -0,0 +1,131 @@
/**
* Copyright 2018 Crawler-Commons
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package crawlercommons.sitemaps.extension;
import java.net.URL;
import java.util.Objects;
/**
 * Data model for Google extension to the sitemap protocol regarding images
 * indexing, as per http://www.google.com/schemas/sitemap-image/1.1
 *
 * <p>
 * Two instances are equal if all five attributes are equal;
 * {@link #hashCode()} is consistent with {@link #equals(Object)}.
 * </p>
 */
public class ImageAttributes extends ExtensionMetadata {

    /**
     * Image location attribute found under image/loc (required)
     */
    private URL loc;

    /**
     * Image caption attribute found under image/caption (optional)
     */
    private String caption;

    /**
     * Image geo location attribute found under image/geo_location (optional)
     */
    private String geoLocation;

    /**
     * Image title attribute found under image/title (optional)
     */
    private String title;

    /**
     * Image license attribute found under image/license (optional)
     */
    private URL license;

    public ImageAttributes() {
    }

    /**
     * @param loc
     *            image location (the only required attribute)
     */
    public ImageAttributes(URL loc) {
        this.loc = loc;
    }

    public URL getLoc() {
        return loc;
    }

    public void setLoc(URL loc) {
        this.loc = loc;
    }

    public String getCaption() {
        return caption;
    }

    public void setCaption(String caption) {
        this.caption = caption;
    }

    public String getGeoLocation() {
        return geoLocation;
    }

    public void setGeoLocation(String geoLocation) {
        this.geoLocation = geoLocation;
    }

    public String getTitle() {
        return title;
    }

    public void setTitle(String title) {
        this.title = title;
    }

    public URL getLicense() {
        return license;
    }

    public void setLicense(URL license) {
        this.license = license;
    }

    @Override
    public String toString() {
        StringBuilder sb = new StringBuilder();
        sb.append("Image loc: ").append(loc);
        sb.append(", caption: ").append(caption);
        sb.append(", title: ").append(title);
        sb.append(", geoLocation: ").append(geoLocation);
        sb.append(", license: ").append(license);
        return sb.toString();
    }

    /**
     * @return true if the required image location is set
     */
    @Override
    public boolean isValid() {
        return loc != null;
    }

    @Override
    public boolean equals(Object other) {
        if (this == other) {
            return true;
        }
        // instanceof is false for null, no separate null check required
        if (!(other instanceof ImageAttributes)) {
            return false;
        }
        ImageAttributes that = (ImageAttributes) other;
        return Objects.equals(loc, that.loc) //
                        && Objects.equals(caption, that.caption) //
                        && Objects.equals(geoLocation, that.geoLocation) //
                        && Objects.equals(title, that.title) //
                        && Objects.equals(license, that.license);
    }

    /**
     * Hash code over the same attributes as compared by
     * {@link #equals(Object)}, so that instances can be used in hash-based
     * collections.
     */
    @Override
    public int hashCode() {
        return Objects.hash(loc, caption, geoLocation, title, license);
    }
}

View File

@ -0,0 +1,99 @@
/**
* Copyright 2018 Crawler-Commons
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package crawlercommons.sitemaps.extension;
import java.net.URL;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Objects;
/**
 * Data model for Google extension to the sitemap protocol regarding alternate
 * links indexing. Cf. https://support.google.com/webmasters/answer/189077:
 * <blockquote>Each <code>&lt;url&gt;</code> element must have a child element:
 * <code>&lt;xhtml:link rel="alternate" hreflang="supported_language-code"&gt;</code>
 * that lists every alternate version of the page, including itself. The order
 * of these child <code>&lt;xhtml:link&gt;</code> elements doesn't matter,
 * though you might want to keep them in the same order to make them easier for
 * you to check for mistakes.</blockquote>
 *
 * <p>
 * Two instances are equal if href and params are equal;
 * {@link #hashCode()} is consistent with {@link #equals(Object)}.
 * </p>
 */
public class LinkAttributes extends ExtensionMetadata {

    /**
     * Link's href attribute
     */
    private URL href;

    /**
     * Link's other attributes key and values
     */
    private Map<String, String> params;

    public LinkAttributes() {
    }

    /**
     * @param href
     *            target of the link
     */
    public LinkAttributes(URL href) {
        this.href = href;
    }

    public URL getHref() {
        return href;
    }

    public void setHref(URL href) {
        this.href = href;
    }

    public Map<String, String> getParams() {
        return params;
    }

    public void setParams(Map<String, String> params) {
        this.params = params;
    }

    /**
     * @return the href, followed by the comma-separated <code>key:value</code>
     *         pairs of the params if there are any
     */
    @Override
    public String toString() {
        StringBuilder sb = new StringBuilder();
        sb.append("Links href: ").append(href);
        if (params != null && !params.isEmpty()) {
            sb.append(", params: ");
            boolean first = true;
            for (Entry<String, String> e : params.entrySet()) {
                if (!first) {
                    sb.append(',');
                }
                sb.append(e.getKey()).append(':').append(e.getValue());
                first = false;
            }
        }
        return sb.toString();
    }

    @Override
    public boolean equals(Object other) {
        if (this == other) {
            return true;
        }
        // instanceof is false for null, no separate null check required
        if (!(other instanceof LinkAttributes)) {
            return false;
        }
        LinkAttributes that = (LinkAttributes) other;
        return Objects.equals(href, that.href) //
                        && Objects.equals(params, that.params);
    }

    /**
     * Hash code over the same attributes as compared by
     * {@link #equals(Object)}.
     */
    @Override
    public int hashCode() {
        return Objects.hash(href, params);
    }
}

View File

@ -0,0 +1,44 @@
/**
* Copyright 2018 Crawler-Commons
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package crawlercommons.sitemaps.extension;
/**
 * Google mobile sitemap attributes, see
 * http://www.google.de/schemas/sitemap-mobile/1.0/ and
 * https://www.google.com/schemas/sitemap-mobile/1.0/sitemap-mobile.xsd:
 * <blockquote>Mobile sitemaps just contain an empty "mobile" tag to identify a
 * URL as having mobile content.</blockquote>
 *
 * <p>
 * The class holds no state: all instances are equal to each other.
 * </p>
 */
public class MobileAttributes extends ExtensionMetadata {

    @Override
    public String toString() {
        return "Mobile content available: yes";
    }

    /**
     * @return true for any other {@link MobileAttributes} instance, there is
     *         no state which could differ
     */
    @Override
    public boolean equals(Object other) {
        // instanceof is false for null
        return other instanceof MobileAttributes;
    }

    /**
     * All instances are equal and consequently share the same hash code.
     */
    @Override
    public int hashCode() {
        return MobileAttributes.class.getName().hashCode();
    }
}

View File

@ -0,0 +1,185 @@
/**
* Copyright 2018 Crawler-Commons
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package crawlercommons.sitemaps.extension;
import java.time.ZonedDateTime;
import java.util.Date;
import java.util.Objects;
/**
 * Data model for Google's extension to the sitemap protocol regarding news
 * indexing, as per http://www.google.com/schemas/sitemap-news/0.9
 *
 * <p>
 * Two instances are equal if all attributes (including the array-valued
 * ones, compared element-wise) are equal; {@link #hashCode()} is consistent
 * with {@link #equals(Object)}.
 * </p>
 */
public class NewsAttributes extends ExtensionMetadata {

    public enum NewsGenre {
        Blog, OpEd, Opinion, PressRelease, Satire, UserGenerated
    }

    /**
     * News publication name found under news/publication/name (required)
     */
    private String name;

    /**
     * News publication language found under news/publication/language
     * (required)
     */
    private String language;

    /**
     * News genres found under news/genres (required if applicable)
     */
    private NewsGenre[] genres;

    /**
     * News publication date found under news/publication_date (required)
     */
    private ZonedDateTime publicationDate;

    /**
     * News title found under news/title (required)
     */
    private String title;

    /**
     * News keywords found under news/keywords (optional)
     *
     * @see <a href=
     *      "https://support.google.com/news/publisher/answer/116037">https://support.google.com/news/publisher/answer/116037</a>
     *      for examples
     */
    private String[] keywords;

    /**
     * News stock tickers found under news/stock_tickers (optional)
     */
    private String[] stockTickers;

    public NewsAttributes() {
    }

    /**
     * Create news attributes with all required values.
     *
     * @param name
     *            publication name
     * @param language
     *            publication language
     * @param publicationDate
     *            publication date
     * @param title
     *            news title
     */
    public NewsAttributes(final String name, final String language, final ZonedDateTime publicationDate, final String title) {
        this.name = name;
        this.language = language;
        this.publicationDate = publicationDate;
        this.title = title;
    }

    public String getName() {
        return name;
    }

    public void setName(String name) {
        this.name = name;
    }

    public String getLanguage() {
        return language;
    }

    public void setLanguage(String language) {
        this.language = language;
    }

    public NewsGenre[] getGenres() {
        return genres;
    }

    public void setGenres(NewsGenre[] genres) {
        this.genres = genres;
    }

    /**
     * @return the publication date converted to {@link Date}, or null if no
     *         publication date is set
     */
    public Date getPublicationDate() {
        // the date is unset after the default constructor: avoid a NPE
        if (publicationDate == null) {
            return null;
        }
        return Date.from(publicationDate.toInstant());
    }

    /**
     * @return the publication date including the time zone, or null if not
     *         set
     */
    public ZonedDateTime getPublicationDateTime() {
        return publicationDate;
    }

    public void setPublicationDate(ZonedDateTime publicationDate) {
        this.publicationDate = publicationDate;
    }

    public String getTitle() {
        return title;
    }

    public void setTitle(String title) {
        this.title = title;
    }

    public String[] getKeywords() {
        return keywords;
    }

    public void setKeywords(String[] keywords) {
        this.keywords = keywords;
    }

    public String[] getStockTickers() {
        return stockTickers;
    }

    public void setStockTickers(String[] stockTickers) {
        this.stockTickers = stockTickers;
    }

    @Override
    public boolean equals(Object other) {
        if (this == other) {
            return true;
        }
        // instanceof is false for null, no separate null check required
        if (!(other instanceof NewsAttributes)) {
            return false;
        }
        NewsAttributes that = (NewsAttributes) other;
        return Objects.equals(name, that.name) //
                        && Objects.equals(language, that.language) //
                        && Objects.equals(title, that.title) //
                        && Objects.equals(publicationDate, that.publicationDate) //
                        && Objects.deepEquals(keywords, that.keywords) //
                        && Objects.deepEquals(genres, that.genres) //
                        && Objects.deepEquals(stockTickers, that.stockTickers);
    }

    /**
     * Hash code over the same attributes as compared by
     * {@link #equals(Object)}; arrays are hashed by content.
     */
    @Override
    public int hashCode() {
        int result = Objects.hash(name, language, title, publicationDate);
        // fully qualified: java.util.Arrays is not imported by this file
        result = 31 * result + java.util.Arrays.hashCode(keywords);
        result = 31 * result + java.util.Arrays.hashCode(genres);
        result = 31 * result + java.util.Arrays.hashCode(stockTickers);
        return result;
    }

    /**
     * @return true if all required attributes (name, language, publication
     *         date and title) are set
     */
    @Override
    public boolean isValid() {
        return name != null && language != null && publicationDate != null && title != null;
    }

    @Override
    public String toString() {
        StringBuilder sb = new StringBuilder();
        sb.append("News name: ").append(name);
        sb.append(", title: ").append(title);
        sb.append(", language: ").append(language);
        sb.append(", publication-date: ").append(publicationDate);
        if (keywords != null) {
            sb.append(", keywords: ").append(String.join(", ", keywords));
        }
        if (genres != null) {
            sb.append(", genres: ");
            for (int i = 0; i < genres.length; i++) {
                if (i > 0) {
                    sb.append(',');
                }
                // append(Object) prints "null" for a null element instead
                // of failing
                sb.append(genres[i]);
            }
        }
        if (stockTickers != null) {
            sb.append(", stock_tickers: ").append(String.join(", ", stockTickers));
        }
        return sb.toString();
    }
}

View File

@ -0,0 +1,548 @@
/**
* Copyright 2018 Crawler-Commons
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package crawlercommons.sitemaps.extension;
import java.net.URL;
import java.time.ZonedDateTime;
import java.util.Arrays;
import java.util.Date;
import java.util.Locale;
import java.util.Objects;
/**
* Data model for Google extension to the sitemap protocol regarding images
* indexing, as per http://www.google.com/schemas/sitemap-video/1.1
*/
public class VideoAttributes extends ExtensionMetadata {
/**
* Video thumbnail URL found under video/thumbnail_loc (required)
*/
private URL thumbnailLoc;
/**
* Video title found under video/title (required)
*/
private String title;
/**
* Video description found under video/description (required)
*/
private String description;
/**
* Video content location found under video/content_loc (depends) if not
* specified, player location must be specified
*/
private URL contentLoc;
/**
* Video player location found under video/player_loc (depends) if not
* specified, content location must be specified
*/
private URL playerLoc;
/**
* Video duration in seconds found under video/duration (recommended) Must
* be integer between 0 and 28800 (8 hours)
*/
private Integer duration;
/**
* Video expiration date found under video/expiration_date (recommended if
* applicable)
*/
private ZonedDateTime expirationDate;
/**
* Video rating found under video/rating (optional) Must be float value
* between 0.0 and 5.0
*/
private Float rating;
/**
* Video view count found under video/view_count (optional)
*/
private Integer viewCount;
/**
* Video publication date found under video/publication_date (optional)
*/
private ZonedDateTime publicationDate;
/**
* Video family friendly attribute found under video/family_friendly
* (optional)
*/
private Boolean familyFriendly;
/**
* Video tags found under video/tag (optional) Up to 32 tags can be
* specified
*/
private String[] tags;
/**
* Video category found under video/category (optional)
*/
private String category;
/**
* Video restricted countries found under video/restriction (optional)
* blacklist of countries filled if video/restriction node has an attribute
* named relationship with a value of deny.
*/
private String[] restrictedCountries;
/**
* Video allowed countries found under video/restriction (optional)
* whitelist of countries filled if video/restriction node has an attribute
* named relationship with a value of allow.
*/
private String[] allowedCountries;
/**
* Video gallery location found under video/gallery_loc (optional)
*/
private URL galleryLoc;
/**
* Video gallery title found under video/gallery_loc[@title] (optional)
*/
private String galleryTitle;
/**
* Video prices found under video/price (optional)
*/
private VideoPrice[] prices;
/**
* Video requires subscription (free or paid) found under
* video/requires_subscription (optional)
*/
private Boolean requiresSubscription;
/**
* Video uploader found under video/uploader (optional)
*/
private String uploader;
/**
* Video uploader location (optional) Must be on the same domain as the
* &lt;loc&gt; this property refers to
*/
private URL uploaderInfo;
/**
* Video restricted platforms found under video/platform (optional)
* blacklist of platform filled if video/platform node has an attribute
* named relationship with a value of deny.
*/
private String[] restrictedPlatforms;
/**
* Video allowed platforms found under video/platform (optional) whitelist
* of platforms filled if video/platform node has an attribute named
* relationship with a value of allow.
*/
private String[] allowedPlatforms;
/**
* Video is a live stream found under video/live (optional)
*/
private Boolean isLive;
public URL getThumbnailLoc() {
return thumbnailLoc;
}
public void setThumbnailLoc(URL thumbnailLoc) {
this.thumbnailLoc = thumbnailLoc;
}
public String getTitle() {
return title;
}
public void setTitle(String title) {
this.title = title;
}
public String getDescription() {
return description;
}
public void setDescription(String description) {
this.description = description;
}
public URL getContentLoc() {
return contentLoc;
}
public void setContentLoc(URL contentLoc) {
this.contentLoc = contentLoc;
}
public URL getPlayerLoc() {
return playerLoc;
}
public void setPlayerLoc(URL playerLoc) {
this.playerLoc = playerLoc;
}
public Integer getDuration() {
return duration;
}
public void setDuration(Integer duration) {
this.duration = duration;
}
public Date getExpirationDate() {
return Date.from(expirationDate.toInstant());
}
public ZonedDateTime getExpirationDateTime() {
return expirationDate;
}
public void setExpirationDate(ZonedDateTime expirationDate) {
this.expirationDate = expirationDate;
}
public Float getRating() {
return rating;
}
public void setRating(Float rating) {
this.rating = rating;
}
public Integer getViewCount() {
return viewCount;
}
public void setViewCount(Integer viewCount) {
this.viewCount = viewCount;
}
public Date getPublicationDate() {
return Date.from(publicationDate.toInstant());
}
public ZonedDateTime getPublicationDateTime() {
return publicationDate;
}
public void setPublicationDate(ZonedDateTime publicationDate) {
this.publicationDate = publicationDate;
}
public Boolean getFamilyFriendly() {
return familyFriendly;
}
public void setFamilyFriendly(Boolean familyFriendly) {
this.familyFriendly = familyFriendly;
}
public String[] getTags() {
return tags;
}
public void setTags(String[] tags) {
this.tags = tags;
}
public void addTag(String tag) {
if (tag == null) {
return;
}
String[] arr;
if (tags == null) {
arr = new String[1];
arr[0] = tag;
} else {
arr = Arrays.copyOf(tags, tags.length + 1);
arr[tags.length] = tag;
}
tags = arr;
}
public String getCategory() {
return category;
}
public void setCategory(String category) {
this.category = category;
}
public String[] getRestrictedCountries() {
return restrictedCountries;
}
public void setRestrictedCountries(String[] restrictedCountries) {
this.restrictedCountries = restrictedCountries;
}
public String[] getAllowedCountries() {
return allowedCountries;
}
public void setAllowedCountries(String[] allowedCountries) {
this.allowedCountries = allowedCountries;
}
public URL getGalleryLoc() {
return galleryLoc;
}
public void setGalleryLoc(URL galleryLoc) {
this.galleryLoc = galleryLoc;
}
public String getGalleryTitle() {
return galleryTitle;
}
public void setGalleryTitle(String galleryTitle) {
this.galleryTitle = galleryTitle;
}
public VideoPrice[] getPrices() {
return prices;
}
public void setPrices(VideoPrice[] prices) {
this.prices = prices;
}
public void addPrice(VideoPrice price) {
if (price == null) {
return;
}
VideoPrice[] arr;
if (prices == null) {
arr = new VideoPrice[1];
arr[0] = price;
} else {
arr = Arrays.copyOf(prices, prices.length + 1);
arr[prices.length] = price;
}
prices = arr;
}
public Boolean getRequiresSubscription() {
return requiresSubscription;
}
public void setRequiresSubscription(Boolean requiresSubscription) {
this.requiresSubscription = requiresSubscription;
}
public String getUploader() {
return uploader;
}
public void setUploader(String uploader) {
this.uploader = uploader;
}
public URL getUploaderInfo() {
return uploaderInfo;
}
public void setUploaderInfo(URL uploaderInfo) {
this.uploaderInfo = uploaderInfo;
}
public String[] getRestrictedPlatforms() {
return restrictedPlatforms;
}
public void setRestrictedPlatforms(String[] restrictedPlatforms) {
this.restrictedPlatforms = restrictedPlatforms;
}
public String[] getAllowedPlatforms() {
return allowedPlatforms;
}
public void setAllowedPlatforms(String[] allowedPlatforms) {
this.allowedPlatforms = allowedPlatforms;
}
public Boolean getLive() {
return isLive;
}
public void setLive(Boolean live) {
isLive = live;
}
public enum VideoPriceType {
own, rent
}
public enum VideoPriceResolution {
SD, HD
}
public static final class VideoPrice {
/**
* Video price currency found under video/price[@currency] (required)
*/
private final String currency;
/**
* Video price type (rent vs own) found under video/price[@type]
* (optional, defaults to own)
*/
private final VideoPriceType type;
/**
* Video price resolution found under video/price[@resolution]
*/
private final VideoPriceResolution resolution;
/**
* Video price
*/
private float price;
public VideoPrice(final String currency, final float price) {
this(currency, price, VideoPriceType.own);
}
public VideoPrice(final String currency, final float price, final VideoPriceType type) {
this(currency, price, type, null);
}
public VideoPrice(final String currency, final float price, final VideoPriceType type, final VideoPriceResolution resolution) {
this.currency = currency;
this.price = price;
this.type = type;
this.resolution = resolution;
}
@Override
public String toString() {
return String.format(Locale.ENGLISH, "value: %.2f, currency: %s, type: %s, resolution: %s", price, currency, type, resolution);
}
@Override
public boolean equals(Object other) {
if (other == null) {
return false;
}
if (!(other instanceof VideoPrice)) {
return false;
}
VideoPrice that = (VideoPrice) other;
return Objects.equals(currency, that.currency) //
&& price == that.price //
&& type == that.type //
&& Objects.equals(resolution, that.resolution);
}
public String getCurrency() {
return currency;
}
public VideoPriceType getType() {
return type;
}
public VideoPriceResolution getResolution() {
return resolution;
}
public float getPrice() {
return price;
}
public void setPrice(float price) {
this.price = price;
}
}
public VideoAttributes() {
}
public VideoAttributes(final URL thumbnailLoc, final String title, final String description, final URL contentLoc, final URL playerLoc) {
this.thumbnailLoc = thumbnailLoc;
this.title = title;
this.description = description;
this.contentLoc = contentLoc;
this.playerLoc = playerLoc;
}
@Override
public String toString() {
return new StringBuilder("Video title: ").append(title) //
.append(", description: ").append(description) //
.append(", thumbnail: ").append(thumbnailLoc) //
.append(", contentLoc: ").append(contentLoc) //
.append(", playerLoc: ").append(playerLoc) //
.append(", prices: ").append(Arrays.toString(prices)) //
.toString();
}
/**
 * Two instances are equal if all video attributes are equal. Array-valued
 * attributes are compared element by element.
 *
 * @param other
 *            object to compare with, may be {@code null}
 * @return true if {@code other} is a {@code VideoAttributes} instance
 *         holding equal values
 */
@Override
public boolean equals(Object other) {
    if (this == other) {
        return true;
    }
    if (!(other instanceof VideoAttributes)) {
        // instanceof is false for null, no separate null check required
        return false;
    }
    VideoAttributes that = (VideoAttributes) other;
    return Objects.equals(thumbnailLoc, that.thumbnailLoc) //
                    && Objects.equals(title, that.title) //
                    && Objects.equals(description, that.description) //
                    && Objects.equals(contentLoc, that.contentLoc) //
                    && Objects.equals(playerLoc, that.playerLoc) //
                    && Objects.equals(duration, that.duration) //
                    && Objects.equals(expirationDate, that.expirationDate) //
                    && Objects.equals(rating, that.rating) //
                    && Objects.equals(viewCount, that.viewCount) //
                    && Objects.equals(publicationDate, that.publicationDate) //
                    && Objects.equals(familyFriendly, that.familyFriendly) //
                    && Objects.deepEquals(tags, that.tags) //
                    && Objects.equals(category, that.category) //
                    && Objects.deepEquals(restrictedCountries, that.restrictedCountries) //
                    && Objects.deepEquals(allowedCountries, that.allowedCountries) //
                    && Objects.equals(galleryLoc, that.galleryLoc) //
                    && Objects.equals(galleryTitle, that.galleryTitle) //
                    && Objects.deepEquals(prices, that.prices) //
                    && Objects.equals(requiresSubscription, that.requiresSubscription) //
                    && Objects.equals(uploader, that.uploader) //
                    && Objects.equals(uploaderInfo, that.uploaderInfo) //
                    && Objects.deepEquals(allowedPlatforms, that.allowedPlatforms) //
                    && Objects.deepEquals(restrictedPlatforms, that.restrictedPlatforms) //
                    && Objects.equals(isLive, that.isLive);
}

/**
 * Hash code consistent with {@link #equals(Object)}: it is computed from a
 * subset of the compared attributes only, which is permitted by the
 * contract of {@link Object#hashCode()}.
 *
 * The URL-valued attributes are left out because
 * {@link java.net.URL#hashCode()} may have to resolve host names. The
 * prices are left out so that the result does not depend on how the price
 * elements implement {@code hashCode()}.
 */
@Override
public int hashCode() {
    int result = Objects.hash(title, description, duration, expirationDate, rating, viewCount, publicationDate, familyFriendly, category, galleryTitle, requiresSubscription, uploader, isLive);
    // deepHashCode matches the element-wise comparison done by
    // Objects.deepEquals in equals(...)
    result = 31 * result + Arrays.deepHashCode(new Object[] { tags, restrictedCountries, allowedCountries, allowedPlatforms, restrictedPlatforms });
    return result;
}
/**
 * Checks the mandatory attributes: a thumbnail location, a title of at most
 * 100 characters, a description of at most 2048 characters, and at least
 * one of content location and player location.
 *
 * @return true if all mandatory attributes are present and within limits
 */
@Override
public boolean isValid() {
    if (thumbnailLoc == null) {
        return false;
    }
    if (title == null || title.length() > 100) {
        return false;
    }
    if (description == null || description.length() > 2048) {
        return false;
    }
    return contentLoc != null || playerLoc != null;
}
}

View File

@ -21,6 +21,7 @@ import static crawlercommons.sitemaps.SiteMapParser.LOG;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.LinkedList;
import java.util.Map;
import java.util.Set;
import org.xml.sax.Attributes;
@ -31,6 +32,7 @@ import org.xml.sax.helpers.DefaultHandler;
import crawlercommons.sitemaps.AbstractSiteMap;
import crawlercommons.sitemaps.Namespace;
import crawlercommons.sitemaps.UnknownFormatException;
import crawlercommons.sitemaps.extension.Extension;
/**
* Provides a base SAX handler for parsing of XML documents representing
@ -45,6 +47,7 @@ public class DelegatorHandler extends DefaultHandler {
private boolean strictNamespace;
private UnknownFormatException exception;
private Set<String> acceptedNamespaces;
protected Map<String, Extension> extensionNamespaces;
protected DelegatorHandler(LinkedList<String> elementStack, boolean strict) {
this.elementStack = elementStack;
@ -81,6 +84,17 @@ public class DelegatorHandler extends DefaultHandler {
return acceptedNamespaces.contains(uri);
}
/**
 * Sets the map used to recognize the namespaces of sitemap extensions. The
 * map is stored by reference, it is not copied.
 *
 * @param extensionMap
 *            sitemap extensions keyed by XML namespace URI, may be
 *            {@code null} (no namespace is then treated as extension
 *            namespace)
 */
public void setExtensionNamespaces(Map<String, Extension> extensionMap) {
extensionNamespaces = extensionMap;
}
/**
 * Tests whether a namespace belongs to one of the configured sitemap
 * extensions.
 *
 * @param uri
 *            XML namespace URI
 * @return true if extension namespaces have been set and contain the URI,
 *         false otherwise
 */
protected boolean isExtensionNamespace(String uri) {
    return extensionNamespaces != null && extensionNamespaces.containsKey(uri);
}
/**
 * Stores the given exception in this handler, replacing any exception
 * stored before.
 *
 * @param exception
 *            exception to keep
 */
protected void setException(UnknownFormatException exception) {
this.exception = exception;
}
@ -153,6 +167,7 @@ public class DelegatorHandler extends DefaultHandler {
return;
}
}
delegate.setExtensionNamespaces(extensionNamespaces);
}
public void endElement(String uri, String localName, String qName) throws SAXException {

View File

@ -21,7 +21,12 @@ import static crawlercommons.sitemaps.SiteMapParser.urlIsValid;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Collection;
import java.util.LinkedList;
import java.util.Map;
import java.util.Map.Entry;
import java.util.TreeMap;
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
@ -31,6 +36,8 @@ import crawlercommons.sitemaps.AbstractSiteMap;
import crawlercommons.sitemaps.AbstractSiteMap.SitemapType;
import crawlercommons.sitemaps.SiteMap;
import crawlercommons.sitemaps.SiteMapURL;
import crawlercommons.sitemaps.extension.Extension;
import crawlercommons.sitemaps.sax.extension.ExtensionHandler;
/**
* Parse XML that contains a valid Sitemap. Example of a Sitemap:
@ -62,6 +69,8 @@ class XMLHandler extends DelegatorHandler {
private String priority;
private int i = 0;
private boolean currentElementNamespaceIsValid;
private String currentElementNamespace;
protected Map<Extension, ExtensionHandler> extensionHandlers;
XMLHandler(URL url, LinkedList<String> elementStack, boolean strict) {
super(elementStack, strict);
@ -71,7 +80,12 @@ class XMLHandler extends DelegatorHandler {
}
public void startElement(String uri, String localName, String qName, Attributes attributes) throws SAXException {
if (isStrictNamespace() && !isAcceptedNamespace(uri)) {
currentElementNamespace = uri;
if (isExtensionNamespace(uri)) {
ExtensionHandler eh = getExtensionHandler(uri);
eh.startElement(uri, localName, qName, attributes);
return;
} else if (isStrictNamespace() && !isAcceptedNamespace(uri)) {
LOG.debug("Skip element <{}>, namespace <{}> not accepted", localName, uri);
currentElementNamespaceIsValid = false;
return;
@ -98,7 +112,11 @@ class XMLHandler extends DelegatorHandler {
}
public void endElement(String uri, String localName, String qName) throws SAXException {
if (isStrictNamespace() && !isAcceptedNamespace(uri)) {
if (isExtensionNamespace(uri)) {
ExtensionHandler eh = getExtensionHandler(uri);
eh.endElement(uri, localName, qName);
return;
} else if (isStrictNamespace() && !isAcceptedNamespace(uri)) {
return;
}
if ("url".equals(localName) && "urlset".equals(currentElementParent())) {
@ -109,7 +127,11 @@ class XMLHandler extends DelegatorHandler {
}
public void characters(char[] ch, int start, int length) throws SAXException {
if (isStrictNamespace() && !currentElementNamespaceIsValid) {
if (isExtensionNamespace(currentElementNamespace)) {
ExtensionHandler eh = getExtensionHandler(currentElementNamespace);
eh.characters(ch, start, length);
return;
} else if (isStrictNamespace() && !currentElementNamespaceIsValid) {
return;
}
String localName = super.currentElement();
@ -142,6 +164,11 @@ class XMLHandler extends DelegatorHandler {
sUrl.setPriority(priority);
sitemap.addSiteMapUrl(sUrl);
LOG.debug(" {}. {}", (++i), sUrl);
if (extensionHandlers != null) {
for (Entry<Extension, ExtensionHandler> e : extensionHandlers.entrySet()) {
sUrl.addAttributesForExtension(e.getKey(), e.getValue().getAttributes());
}
}
}
} catch (MalformedURLException e) {
LOG.debug("Bad url: [{}]", value);
@ -151,6 +178,48 @@ class XMLHandler extends DelegatorHandler {
lastMod = null;
changeFreq = null;
priority = null;
resetExtensionHandlers();
}
}
/**
 * Returns the ExtensionHandler instance bound to this handler for the given
 * namespace. The handler is created and registered when the namespace is
 * seen for the first time.
 *
 * @param uri
 *            URI of sitemap extension namespace
 * @return handler for the sitemap extension defined by XML namespace, or
 *         {@code null} if no extension namespaces are set, the URI is not
 *         mapped to an extension, or no handler could be created for the
 *         extension
 */
protected ExtensionHandler getExtensionHandler(String uri) {
    if (extensionNamespaces == null) {
        return null;
    }
    Extension ext = extensionNamespaces.get(uri);
    if (ext == null) {
        return null;
    }
    if (extensionHandlers == null) {
        extensionHandlers = new TreeMap<>();
    }
    // computeIfAbsent does not record a mapping if create(...) returns
    // null, so the map never holds a null handler which would break the
    // loops over all registered handlers
    return extensionHandlers.computeIfAbsent(ext, ExtensionHandler::create);
}
/**
 * Gives access to the extension handlers registered so far.
 *
 * @return the registered handlers, or a new empty list if no handler has
 *         been registered yet
 */
protected Collection<ExtensionHandler> getExtensionHandlers() {
    return extensionHandlers == null ? new ArrayList<ExtensionHandler>() : extensionHandlers.values();
}
/**
 * Resets every registered extension handler. The attributes collected by a
 * handler belong to a single {@link SiteMapURL}, for this reason the
 * handlers are reset whenever a sitemap URL is closed.
 */
public void resetExtensionHandlers() {
    if (extensionHandlers == null) {
        // no handler registered so far
        return;
    }
    for (ExtensionHandler handler : extensionHandlers.values()) {
        handler.reset();
    }
}

View File

@ -0,0 +1,119 @@
/**
* Copyright 2018 Crawler-Commons
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package crawlercommons.sitemaps.sax.extension;
import static crawlercommons.sitemaps.SiteMapParser.LOG;
import java.net.MalformedURLException;
import java.net.URL;
import java.time.ZonedDateTime;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Pattern;
import org.xml.sax.helpers.DefaultHandler;
import crawlercommons.sitemaps.SiteMap;
import crawlercommons.sitemaps.SiteMapURL;
import crawlercommons.sitemaps.extension.Extension;
import crawlercommons.sitemaps.extension.ExtensionMetadata;
/**
 * Base class of the SAX handlers which are called for elements in the
 * namespace of a sitemap extension. A handler parses the extension elements
 * and collects extension-specific attributes to be assigned to a
 * {@link SiteMapURL}.
 */
public abstract class ExtensionHandler extends DefaultHandler {

    /** Splits comma-separated lists, white space around a comma is dropped. */
    protected static Pattern commaSeparated = Pattern.compile("\\s*,\\s*");

    /** Attributes collected since the handler was created or last reset. */
    protected List<ExtensionMetadata> attributes = new ArrayList<>();

    /**
     * Creates the handler which processes the elements of a sitemap
     * extension.
     *
     * @param extension
     *            sitemap extension
     * @return a new handler, or {@code null} if there is no handler for the
     *         extension
     */
    public static ExtensionHandler create(Extension extension) {
        final ExtensionHandler handler;
        switch (extension) {
        case NEWS:
            handler = new NewsHandler();
            break;
        case VIDEO:
            handler = new VideoHandler();
            break;
        case IMAGE:
            handler = new ImageHandler();
            break;
        case LINKS:
            handler = new LinksHandler();
            break;
        case MOBILE:
            handler = new MobileHandler();
            break;
        default:
            handler = null;
            break;
        }
        return handler;
    }

    /**
     * @return the collected attributes, copied into a new array
     */
    public ExtensionMetadata[] getAttributes() {
        ExtensionMetadata[] copy = new ExtensionMetadata[attributes.size()];
        return attributes.toArray(copy);
    }

    /** Drops all attributes collected so far. */
    public void reset() {
        attributes.clear();
    }

    /**
     * Converts a date string using
     * {@link SiteMap#convertToZonedDateTime(String)}.
     */
    protected static ZonedDateTime getDateValue(String value) {
        return SiteMap.convertToZonedDateTime(value);
    }

    /**
     * @return the value parsed as URL, or {@code null} if the value is
     *         {@code null} or not a well-formed URL
     */
    protected static URL getURLValue(final String value) {
        if (value == null) {
            return null;
        }
        try {
            return new URL(value);
        } catch (MalformedURLException e) {
            LOG.debug("Invalid URL value: {}", value);
            return null;
        }
    }

    /**
     * @return the value parsed as integer, or {@code null} if the value is
     *         {@code null} or not a valid integer
     */
    protected static Integer getIntegerValue(String value) {
        if (value == null) {
            return null;
        }
        try {
            return Integer.valueOf(value);
        } catch (NumberFormatException e) {
            LOG.debug("Invalid integer value: {}", value);
            return null;
        }
    }

    /**
     * @return the value parsed as float, or {@code null} if the value is
     *         {@code null} or not a valid floating point number
     */
    protected static Float getFloatValue(String value) {
        if (value == null) {
            return null;
        }
        try {
            return Float.valueOf(value);
        } catch (NumberFormatException e) {
            LOG.debug("Invalid float value: {}", value);
            return null;
        }
    }

    /**
     * Maps "yes" and "no" (ignoring case) to a boolean value.
     *
     * @param value
     *            element content
     * @param elemName
     *            name of the element, used for logging only
     * @return true for "yes", false for "no", {@code null} for any other
     *         value
     */
    protected static Boolean getYesNoBooleanValue(String value, String elemName) {
        if ("yes".equalsIgnoreCase(value)) {
            return Boolean.TRUE;
        }
        if ("no".equalsIgnoreCase(value)) {
            return Boolean.FALSE;
        }
        LOG.debug("Unexpected value for {} node: {}", elemName, value);
        return null;
    }
}

View File

@ -0,0 +1,105 @@
/**
* Copyright 2018 Crawler-Commons
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package crawlercommons.sitemaps.sax.extension;
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import crawlercommons.sitemaps.extension.ExtensionMetadata;
import crawlercommons.sitemaps.extension.ImageAttributes;
/** Handle SAX events in the Google Image sitemap extension namespace. */
public class ImageHandler extends ExtensionHandler {

    /** Image being assembled, {@code null} if no image value was seen yet */
    private ImageAttributes currAttr;

    /** Character content of the element currently parsed */
    private StringBuilder currVal;

    public ImageHandler() {
        reset();
    }

    @Override
    public void reset() {
        super.reset();
        resetCurrent();
    }

    private void resetCurrent() {
        currAttr = null;
        currVal = new StringBuilder();
    }

    /**
     * Moves the image being assembled to the attribute list if it is valid
     * and starts over with a clean state.
     */
    private void closeCurrent() {
        if (currAttr != null && currAttr.isValid()) {
            attributes.add(currAttr);
        }
        resetCurrent();
    }

    @Override
    public void startElement(String uri, String localName, String qName, Attributes attributes) throws SAXException {
        if ("image".equals(localName)) {
            // add last attribute and reset in case of unclosed
            // elements
            closeCurrent();
        }
    }

    @Override
    public void endElement(String uri, String localName, String qName) throws SAXException {
        String value = currVal.toString().trim();
        if ("image".equals(localName)) {
            closeCurrent();
            return;
        }
        if (currAttr == null) {
            currAttr = new ImageAttributes();
        }
        if (value.isEmpty()) {
            // skip value but reset StringBuilder
        } else if ("loc".equals(localName)) {
            currAttr.setLoc(getURLValue(value));
        } else if ("caption".equals(localName)) {
            currAttr.setCaption(value);
        } else if ("title".equals(localName)) {
            currAttr.setTitle(value);
        } else if ("geo_location".equals(localName)) {
            currAttr.setGeoLocation(value);
        } else if ("license".equals(localName)) {
            currAttr.setLicense(getURLValue(value));
        }
        // reset StringBuilder
        currVal = new StringBuilder();
    }

    @Override
    public void characters(char[] ch, int start, int length) throws SAXException {
        currVal.append(ch, start, length);
    }

    /**
     * Returns the images collected so far, including a valid image which has
     * not been closed yet. The handler state is not modified: calling this
     * method repeatedly, or calling it before the image element is closed,
     * does not add the pending image to the internal list a second time.
     */
    @Override
    public ExtensionMetadata[] getAttributes() {
        ExtensionMetadata[] closed = super.getAttributes();
        if (currAttr == null || !currAttr.isValid()) {
            return closed;
        }
        /*
         * append current element to the result only, do not reset in case
         * getAttributes is called during parsing of a sitemap <url> element
         */
        ExtensionMetadata[] all = new ExtensionMetadata[closed.length + 1];
        System.arraycopy(closed, 0, all, 0, closed.length);
        all[closed.length] = currAttr;
        return all;
    }
}

View File

@ -0,0 +1,65 @@
/**
* Copyright 2018 Crawler-Commons
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package crawlercommons.sitemaps.sax.extension;
import java.net.URL;
import java.util.Map;
import java.util.TreeMap;
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import crawlercommons.sitemaps.extension.LinkAttributes;
/**
 * Handle SAX events in the XHTML namespace: every &lt;link&gt; element with
 * a valid {@code href} attribute is recorded as {@link LinkAttributes}.
 */
public class LinksHandler extends ExtensionHandler {

    public LinksHandler() {
        reset();
    }

    /**
     * Records a link for a &lt;link&gt; element. The {@code href} attribute
     * becomes the link target, all other attributes are kept as parameters
     * (keyed by local name, values trimmed). Elements without {@code href}
     * or with an empty or malformed {@code href} are ignored.
     */
    @Override
    public void startElement(String uri, String localName, String qName, Attributes attributes) throws SAXException {
        if (!"link".equals(localName)) {
            return;
        }
        String href = attributes.getValue("href");
        if (href == null) {
            return;
        }
        String trimmedHref = href.trim();
        if (trimmedHref.isEmpty()) {
            return;
        }
        URL target = getURLValue(trimmedHref);
        if (target == null) {
            return;
        }
        LinkAttributes link = new LinkAttributes(target);
        this.attributes.add(link);
        Map<String, String> params = new TreeMap<>();
        for (int idx = 0; idx < attributes.getLength(); idx++) {
            String key = attributes.getLocalName(idx);
            if (key.equals("href")) {
                // the link target is not a parameter
                continue;
            }
            params.put(key, attributes.getValue(idx).trim());
        }
        link.setParams(params);
    }

    /** Nothing to do, links are entirely defined by element attributes. */
    @Override
    public void endElement(String uri, String localName, String qName) throws SAXException {
    }

    /** Character content is ignored. */
    @Override
    public void characters(char[] ch, int start, int length) throws SAXException {
    }
}

View File

@ -0,0 +1,66 @@
/**
* Copyright 2018 Crawler-Commons
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package crawlercommons.sitemaps.sax.extension;
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import crawlercommons.sitemaps.extension.ExtensionMetadata;
import crawlercommons.sitemaps.extension.MobileAttributes;
/** Handle SAX events in the Google Mobile sitemap extension namespace. */
public class MobileHandler extends ExtensionHandler {

    /** Result if no &lt;mobile&gt; element was found (empty, hence safe to share) */
    private static final MobileAttributes[] NO_MOBILE_ATTRIBUTES = new MobileAttributes[0];

    /** Marker object shared by all results which report a mobile element */
    private static final MobileAttributes MOBILE_ATTRIBUTES = new MobileAttributes();

    /** Whether a &lt;mobile&gt; element was seen since the last reset */
    private boolean mobileElementFound = false;

    public MobileHandler() {
    }

    @Override
    public void startElement(String uri, String localName, String qName, Attributes attributes) throws SAXException {
        if ("mobile".equals(localName)) {
            mobileElementFound = true;
        }
    }

    @Override
    public void endElement(String uri, String localName, String qName) throws SAXException {
    }

    @Override
    public void characters(char[] ch, int start, int length) throws SAXException {
    }

    /**
     * @return an array holding a single {@link MobileAttributes} object if a
     *         &lt;mobile&gt; element was found, an empty array otherwise
     */
    @Override
    public ExtensionMetadata[] getAttributes() {
        if (mobileElementFound) {
            // a new array per call: callers cannot modify a shared array
            return new MobileAttributes[] { MOBILE_ATTRIBUTES };
        }
        return NO_MOBILE_ATTRIBUTES;
    }

    @Override
    public void reset() {
        super.reset();
        mobileElementFound = false;
    }
}

View File

@ -0,0 +1,128 @@
/**
* Copyright 2018 Crawler-Commons
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package crawlercommons.sitemaps.sax.extension;
import static crawlercommons.sitemaps.SiteMapParser.LOG;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import crawlercommons.sitemaps.extension.ExtensionMetadata;
import crawlercommons.sitemaps.extension.NewsAttributes;
/** Handle SAX events in the Google News sitemap extension namespace. */
public class NewsHandler extends ExtensionHandler {

    /** News item being assembled, {@code null} if no value was seen yet */
    private NewsAttributes currAttr;

    /** Character content of the element currently parsed */
    private StringBuilder currVal;

    public NewsHandler() {
        reset();
    }

    @Override
    public void reset() {
        super.reset();
        resetCurrent();
    }

    private void resetCurrent() {
        currAttr = null;
        currVal = new StringBuilder();
    }

    /**
     * Moves the news item being assembled to the attribute list if it is
     * valid and starts over with a clean state.
     */
    private void closeCurrent() {
        if (currAttr != null && currAttr.isValid()) {
            attributes.add(currAttr);
        }
        resetCurrent();
    }

    @Override
    public void startElement(String uri, String localName, String qName, Attributes attributes) throws SAXException {
        if ("news".equals(localName)) {
            // add last attribute and reset in case of unclosed
            // elements
            closeCurrent();
        }
    }

    @Override
    public void endElement(String uri, String localName, String qName) throws SAXException {
        String value = currVal.toString().trim();
        if ("news".equals(localName)) {
            closeCurrent();
            return;
        }
        if (currAttr == null) {
            currAttr = new NewsAttributes();
        }
        if (value.isEmpty()) {
            // skip value but reset StringBuilder
        } else if ("name".equals(localName)) {
            currAttr.setName(value);
        } else if ("title".equals(localName)) {
            currAttr.setTitle(value);
        } else if ("language".equals(localName)) {
            currAttr.setLanguage(value);
        } else if ("publication_date".equals(localName)) {
            currAttr.setPublicationDate(getDateValue(value));
        } else if ("genres".equals(localName)) {
            List<NewsAttributes.NewsGenre> genres = new ArrayList<>();
            for (String genre : commaSeparated.split(value)) {
                try {
                    genres.add(NewsAttributes.NewsGenre.valueOf(genre.trim()));
                } catch (IllegalArgumentException e) {
                    // unknown genres are skipped, the known ones are kept
                    LOG.debug("Unsupported news sitemap genre: {}", genre);
                }
            }
            currAttr.setGenres(genres.toArray(new NewsAttributes.NewsGenre[genres.size()]));
        } else if ("keywords".equals(localName)) {
            currAttr.setKeywords(commaSeparated.split(value));
        } else if ("stock_tickers".equals(localName)) {
            String[] stockTickers = commaSeparated.split(value);
            if (stockTickers.length > 5) {
                // keep the first five tickers only
                stockTickers = Arrays.copyOfRange(stockTickers, 0, 5);
            }
            currAttr.setStockTickers(stockTickers);
        }
        // reset StringBuilder
        currVal = new StringBuilder();
    }

    @Override
    public void characters(char[] ch, int start, int length) throws SAXException {
        currVal.append(ch, start, length);
    }

    /**
     * Returns the news items collected so far, including a valid item which
     * has not been closed yet. The handler state is not modified: calling
     * this method repeatedly, or calling it before the news element is
     * closed, does not add the pending item to the internal list a second
     * time.
     */
    @Override
    public ExtensionMetadata[] getAttributes() {
        ExtensionMetadata[] closed = super.getAttributes();
        if (currAttr == null || !currAttr.isValid()) {
            return closed;
        }
        /*
         * append current element to the result only, do not reset in case
         * getAttributes is called during parsing of a sitemap <url> element
         */
        ExtensionMetadata[] all = Arrays.copyOf(closed, closed.length + 1);
        all[closed.length] = currAttr;
        return all;
    }
}

View File

@ -0,0 +1,215 @@
/**
* Copyright 2018 Crawler-Commons
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package crawlercommons.sitemaps.sax.extension;
import static crawlercommons.sitemaps.SiteMapParser.LOG;
import java.time.ZonedDateTime;
import java.util.Map;
import java.util.TreeMap;
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import crawlercommons.sitemaps.SiteMap;
import crawlercommons.sitemaps.extension.ExtensionMetadata;
import crawlercommons.sitemaps.extension.VideoAttributes;
import crawlercommons.sitemaps.extension.VideoAttributes.VideoPrice;
import crawlercommons.sitemaps.extension.VideoAttributes.VideoPriceResolution;
import crawlercommons.sitemaps.extension.VideoAttributes.VideoPriceType;
/** Handle SAX events in the Google Video sitemap extension namespace. */
public class VideoHandler extends ExtensionHandler {

    /** Attributes of the price element which are evaluated */
    private static final String[] PRICE_ATTRIBUTES = { "currency", "type", "resolution" };

    /** Video being assembled, {@code null} if no video value was seen yet */
    private VideoAttributes currAttr;

    /** Character content of the element currently parsed */
    private StringBuilder currVal;

    /** Value of the "relationship" attribute of the open restriction or platform element */
    private String relationAttr;

    /** Attributes of the open price element, see {@link #PRICE_ATTRIBUTES} */
    private Map<String, String> priceAttr;

    public VideoHandler() {
        reset();
    }

    @Override
    public void reset() {
        super.reset();
        resetCurrent();
    }

    private void resetCurrent() {
        currAttr = null;
        currVal = new StringBuilder();
    }

    /**
     * Moves the video being assembled to the attribute list if it is valid
     * and starts over with a clean state.
     */
    private void closeCurrent() {
        if (currAttr != null && currAttr.isValid()) {
            attributes.add(currAttr);
        }
        resetCurrent();
    }

    @Override
    public void startElement(String uri, String localName, String qName, Attributes attributes) throws SAXException {
        if ("video".equals(localName)) {
            // add last attribute and reset in case of unclosed
            // elements
            closeCurrent();
            return;
        }
        if (currAttr == null) {
            currAttr = new VideoAttributes();
        }
        if ("restriction".equals(localName) || "platform".equals(localName)) {
            relationAttr = attributes.getValue("relationship");
        } else if ("gallery_loc".equals(localName)) {
            currAttr.setGalleryTitle(attributes.getValue("title"));
        } else if ("uploader".equals(localName)) {
            currAttr.setUploaderInfo(getURLValue(attributes.getValue("info")));
        } else if ("price".equals(localName)) {
            // remember the attributes until the price value is known
            priceAttr = new TreeMap<>();
            for (String a : PRICE_ATTRIBUTES) {
                String v = attributes.getValue(a);
                if (v != null) {
                    priceAttr.put(a, v);
                }
            }
        }
    }

    @Override
    public void endElement(String uri, String localName, String qName) throws SAXException {
        // trim and collapse white space
        String value = currVal.toString().trim().replaceAll("\\s+", " ");
        if ("video".equals(localName)) {
            // add current attribute to attribute list
            closeCurrent();
            return;
        }
        if (currAttr == null) {
            currAttr = new VideoAttributes();
        }
        if (value.isEmpty()) {
            // skip value but reset StringBuilder
        } else if ("thumbnail_loc".equals(localName)) {
            currAttr.setThumbnailLoc(getURLValue(value));
        } else if ("title".equals(localName)) {
            currAttr.setTitle(value);
        } else if ("description".equals(localName)) {
            currAttr.setDescription(value);
        } else if ("content_loc".equals(localName)) {
            currAttr.setContentLoc(getURLValue(value));
        } else if ("player_loc".equals(localName)) {
            currAttr.setPlayerLoc(getURLValue(value));
        } else if ("duration".equals(localName)) {
            Integer duration = getIntegerValue(value);
            if (duration != null && (duration < 0 || duration > 28800)) {
                LOG.debug("Invalid value for specified duration: {}", duration);
                duration = null;
            }
            currAttr.setDuration(duration);
        } else if ("expiration_date".equals(localName)) {
            ZonedDateTime dateTime = SiteMap.convertToZonedDateTime(value);
            currAttr.setExpirationDate(dateTime);
        } else if ("rating".equals(localName)) {
            currAttr.setRating(getFloatValue(value));
        } else if ("view_count".equals(localName)) {
            currAttr.setViewCount(getIntegerValue(value));
        } else if ("publication_date".equals(localName)) {
            currAttr.setPublicationDate(getDateValue(value));
        } else if ("family_friendly".equals(localName)) {
            // every value except "no" counts as family friendly
            currAttr.setFamilyFriendly(!"No".equalsIgnoreCase(value));
        } else if ("tag".equals(localName)) {
            currAttr.addTag(value);
        } else if ("category".equals(localName)) {
            currAttr.setCategory(value);
        } else if ("restriction".equals(localName)) {
            if (relationAttr != null) {
                String[] vals = value.split("\\s+");
                if ("allow".equalsIgnoreCase(relationAttr)) {
                    currAttr.setAllowedCountries(vals);
                } else if ("deny".equalsIgnoreCase(relationAttr)) {
                    currAttr.setRestrictedCountries(vals);
                }
            }
            relationAttr = null;
        } else if ("gallery_loc".equals(localName)) {
            currAttr.setGalleryLoc(getURLValue(value));
        } else if ("price".equals(localName)) {
            addPrice(value);
            priceAttr = null;
        } else if ("requires_subscription".equals(localName)) {
            currAttr.setRequiresSubscription(getYesNoBooleanValue(value, localName));
        } else if ("uploader".equals(localName)) {
            currAttr.setUploader(value);
        } else if ("platform".equals(localName)) {
            if (relationAttr != null) {
                String[] vals = value.split("\\s+");
                if ("allow".equalsIgnoreCase(relationAttr)) {
                    currAttr.setAllowedPlatforms(vals);
                } else if ("deny".equalsIgnoreCase(relationAttr)) {
                    currAttr.setRestrictedPlatforms(vals);
                }
            }
            relationAttr = null;
        } else if ("live".equals(localName)) {
            currAttr.setLive(getYesNoBooleanValue(value, localName));
        }
        // reset StringBuilder
        currVal = new StringBuilder();
    }

    /**
     * Adds a price to the current video, combining the element value with
     * the attributes remembered when the price element was opened. A price
     * whose value is not a valid number is skipped. A missing currency
     * attribute results in a price without currency. An unknown type or
     * resolution is logged and the default ({@link VideoPriceType#own} resp.
     * no resolution) is kept.
     */
    private void addPrice(String value) {
        Float amount = getFloatValue(value);
        if (amount == null) {
            // invalid number (already logged by getFloatValue): there is
            // no price to add
            return;
        }
        String currency = null;
        VideoPriceType type = VideoPriceType.own;
        VideoPriceResolution resolution = null;
        if (priceAttr != null) {
            String c = priceAttr.get("currency");
            if (c != null) {
                currency = c.trim();
            }
            String t = priceAttr.get("type");
            if (t != null && !t.trim().isEmpty()) {
                try {
                    type = VideoPriceType.valueOf(t.trim());
                } catch (IllegalArgumentException e) {
                    LOG.debug("Illegal value for price type: {}", t);
                }
            }
            String r = priceAttr.get("resolution");
            if (r != null && !r.trim().isEmpty()) {
                try {
                    resolution = VideoPriceResolution.valueOf(r.trim());
                } catch (IllegalArgumentException e) {
                    LOG.debug("Illegal value for price resolution: {}", r);
                }
            }
        }
        currAttr.addPrice(new VideoPrice(currency, amount, type, resolution));
    }

    @Override
    public void characters(char[] ch, int start, int length) throws SAXException {
        currVal.append(ch, start, length);
    }

    /**
     * Returns the videos collected so far, including a valid video which has
     * not been closed yet. The handler state is not modified: calling this
     * method repeatedly, or calling it before the video element is closed,
     * does not add the pending video to the internal list a second time.
     */
    @Override
    public ExtensionMetadata[] getAttributes() {
        ExtensionMetadata[] closed = super.getAttributes();
        if (currAttr == null || !currAttr.isValid()) {
            return closed;
        }
        /*
         * append current element to the result only, do not reset in case
         * getAttributes is called during parsing of a sitemap <url> element
         */
        ExtensionMetadata[] all = new ExtensionMetadata[closed.length + 1];
        System.arraycopy(closed, 0, all, 0, closed.length);
        all[closed.length] = currAttr;
        return all;
    }
}

View File

@ -0,0 +1,260 @@
/**
* Copyright 2018 Crawler-Commons
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package crawlercommons.sitemaps;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertTrue;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.time.ZonedDateTime;
import java.util.HashMap;
import org.junit.Test;
import org.junit.runner.RunWith;
import org.junit.runners.JUnit4;
import crawlercommons.sitemaps.extension.Extension;
import crawlercommons.sitemaps.extension.ExtensionMetadata;
import crawlercommons.sitemaps.extension.ImageAttributes;
import crawlercommons.sitemaps.extension.LinkAttributes;
import crawlercommons.sitemaps.extension.MobileAttributes;
import crawlercommons.sitemaps.extension.NewsAttributes;
import crawlercommons.sitemaps.extension.VideoAttributes;
@RunWith(JUnit4.class)
public class SiteMapParserExtensionTest {
/**
 * Parses the video sitemap fixture with the video extension enabled and
 * compares the video attributes of every URL with the expected values.
 */
@Test
public void testVideosSitemap() throws UnknownFormatException, IOException {
    SiteMapParser videoParser = new SiteMapParser();
    videoParser.enableExtension(Extension.VIDEO);

    byte[] xml = SiteMapParserTest.getResourceAsBytes("src/test/resources/sitemaps/extension/sitemap-videos.xml");
    URL sitemapUrl = new URL("http://www.example.com/sitemap-video.xml");
    AbstractSiteMap parsed = videoParser.parseSiteMap("text/xml", xml, sitemapUrl);
    assertEquals(false, parsed.isIndex());
    assertTrue(parsed instanceof SiteMap);
    SiteMap sitemap = (SiteMap) parsed;
    assertEquals(1, sitemap.getSiteMapUrls().size());

    // mandatory attributes
    URL thumbnail = new URL("http://www.example.com/thumbs/123.jpg");
    URL contentLocation = new URL("http://www.example.com/video123.flv");
    URL playerLocation = new URL("http://www.example.com/videoplayer.swf?video=123");
    VideoAttributes expected = new VideoAttributes(thumbnail, "Grilling steaks for summer", "Alkis shows you how to get perfectly done steaks every time", contentLocation, playerLocation);
    // optional attributes
    expected.setDuration(600);
    expected.setExpirationDate(ZonedDateTime.parse("2009-11-05T19:20:30+08:00"));
    expected.setPublicationDate(ZonedDateTime.parse("2007-11-05T19:20:30+08:00"));
    expected.setRating(4.2f);
    expected.setViewCount(12345);
    expected.setFamilyFriendly(true);
    expected.setTags(new String[] { "sample_tag1", "sample_tag2" });
    expected.setAllowedCountries(new String[] { "IE", "GB", "US", "CA" });
    expected.setGalleryLoc(new URL("http://cooking.example.com"));
    expected.setGalleryTitle("Cooking Videos");
    VideoAttributes.VideoPrice expectedPrice = new VideoAttributes.VideoPrice("EUR", 1.99f, VideoAttributes.VideoPriceType.own);
    expected.setPrices(new VideoAttributes.VideoPrice[] { expectedPrice });
    expected.setRequiresSubscription(true);
    expected.setUploader("GrillyMcGrillerson");
    expected.setUploaderInfo(new URL("http://www.example.com/users/grillymcgrillerson"));
    expected.setLive(false);

    for (SiteMapURL siteMapUrl : sitemap.getSiteMapUrls()) {
        ExtensionMetadata[] found = siteMapUrl.getAttributesForExtension(Extension.VIDEO);
        assertNotNull(found);
        assertEquals(expected, (VideoAttributes) found[0]);
    }
}
/**
 * Parses the image sitemap fixture with the image extension enabled and
 * checks that both images of the URL are returned with the expected
 * attributes, in document order.
 */
@Test
public void testImageSitemap() throws UnknownFormatException, IOException {
    SiteMapParser imageParser = new SiteMapParser();
    imageParser.enableExtension(Extension.IMAGE);

    byte[] xml = SiteMapParserTest.getResourceAsBytes("src/test/resources/sitemaps/extension/sitemap-images.xml");
    URL sitemapUrl = new URL("http://www.example.com/sitemap-images.xml");
    AbstractSiteMap parsed = imageParser.parseSiteMap("text/xml", xml, sitemapUrl);
    assertEquals(false, parsed.isIndex());
    assertTrue(parsed instanceof SiteMap);
    SiteMap sitemap = (SiteMap) parsed;
    assertEquals(1, sitemap.getSiteMapUrls().size());

    // first image: location only
    ImageAttributes expectedFirst = new ImageAttributes(new URL("http://example.com/image.jpg"));
    // second image: all attributes set
    ImageAttributes expectedSecond = new ImageAttributes(new URL("http://example.com/photo.jpg"));
    expectedSecond.setCaption("This is the caption.");
    expectedSecond.setGeoLocation("Limerick, Ireland");
    expectedSecond.setTitle("Example photo shot in Limerick, Ireland");
    expectedSecond.setLicense(new URL("https://creativecommons.org/licenses/by/4.0/legalcode"));

    for (SiteMapURL siteMapUrl : sitemap.getSiteMapUrls()) {
        ExtensionMetadata[] found = siteMapUrl.getAttributesForExtension(Extension.IMAGE);
        assertNotNull(found);
        assertEquals(expectedFirst, (ImageAttributes) found[0]);
        assertEquals(expectedSecond, (ImageAttributes) found[1]);
    }
}
/**
 * Parses the xhtml:link fixture and verifies that each of its three URLs
 * carries the same three alternate-language links (de, de-ch, en), in that
 * order, including their {@code rel} and {@code hreflang} parameters.
 */
@Test
public void testXHTMLLinksSitemap() throws UnknownFormatException, IOException, MalformedURLException {
    SiteMapParser parser = new SiteMapParser();
    parser.enableExtension(Extension.LINKS);
    String contentType = "text/xml";
    byte[] content = SiteMapParserTest.getResourceAsBytes("src/test/resources/sitemaps/extension/sitemap-links.xml");
    URL url = new URL("http://www.example.com/sitemap-links.xml");
    AbstractSiteMap asm = parser.parseSiteMap(contentType, content, url);
    assertEquals(false, asm.isIndex());
    assertEquals(true, asm instanceof SiteMap);
    SiteMap sm = (SiteMap) asm;
    assertEquals(3, sm.getSiteMapUrls().size());

    // all three pages share the same links attributes
    String[] hrefs = { "http://www.example.com/deutsch/", "http://www.example.com/schweiz-deutsch/", "http://www.example.com/english/" };
    String[] hrefLangs = { "de", "de-ch", "en" };
    LinkAttributes[] linkAttributes = new LinkAttributes[hrefs.length];
    for (int i = 0; i < hrefs.length; i++) {
        // plain maps instead of double-brace initialization: no anonymous
        // HashMap subclasses, hence no "serial" warning to suppress
        HashMap<String, String> params = new HashMap<>();
        params.put("rel", "alternate");
        params.put("hreflang", hrefLangs[i]);
        linkAttributes[i] = new LinkAttributes(new URL(hrefs[i]));
        linkAttributes[i].setParams(params);
    }

    for (SiteMapURL su : sm.getSiteMapUrls()) {
        assertNotNull(su.getAttributesForExtension(Extension.LINKS));
        ExtensionMetadata[] attrs = su.getAttributesForExtension(Extension.LINKS);
        assertEquals(linkAttributes.length, attrs.length);
        for (int i = 0; i < linkAttributes.length; i++) {
            LinkAttributes attr = (LinkAttributes) attrs[i];
            assertEquals(linkAttributes[i], attr);
        }
    }
}
/**
 * Parses the news-extension fixture and verifies that its single URL
 * carries the expected publication name, language, publication date, title,
 * keywords, genres and stock tickers.
 */
@Test
public void testNewsSitemap() throws UnknownFormatException, IOException {
    SiteMapParser parser = new SiteMapParser();
    parser.enableExtension(Extension.NEWS);
    String contentType = "text/xml";
    byte[] content = SiteMapParserTest.getResourceAsBytes("src/test/resources/sitemaps/extension/sitemap-news.xml");
    URL url = new URL("http://www.example.org/sitemap-news.xml");
    AbstractSiteMap asm = parser.parseSiteMap(contentType, content, url);
    assertEquals(false, asm.isIndex());
    assertEquals(true, asm instanceof SiteMap);
    SiteMap sm = (SiteMap) asm;
    assertEquals(1, sm.getSiteMapUrls().size());
    // The fixture declares <news:publication_date>2008-12-23</news:publication_date>;
    // the date-only value is expected to be read as midnight UTC.
    ZonedDateTime dt = ZonedDateTime.parse("2008-12-23T00:00:00+00:00");
    NewsAttributes expectedNewsAttributes = new NewsAttributes("The Example Times", "en", dt, "Companies A, B in Merger Talks");
    expectedNewsAttributes.setKeywords(new String[] { "business", "merger", "acquisition", "A", "B" });
    expectedNewsAttributes.setGenres(new NewsAttributes.NewsGenre[] { NewsAttributes.NewsGenre.PressRelease, NewsAttributes.NewsGenre.Blog });
    expectedNewsAttributes.setStockTickers(new String[] { "NASDAQ:A", "NASDAQ:B" });
    for (SiteMapURL su : sm.getSiteMapUrls()) {
        assertNotNull(su.getAttributesForExtension(Extension.NEWS));
        NewsAttributes attr = (NewsAttributes) su.getAttributesForExtension(Extension.NEWS)[0];
        assertEquals(expectedNewsAttributes, attr);
    }
}
/**
 * Parses the mobile-extension fixture and verifies that the two pages
 * flagged with {@code <mobile:mobile>} expose a mobile attribute while the
 * unflagged page exposes none.
 */
@Test
public void testMobileSitemap() throws UnknownFormatException, IOException {
    SiteMapParser parser = new SiteMapParser();
    parser.enableExtension(Extension.MOBILE);
    String contentType = "text/xml";
    byte[] content = SiteMapParserTest.getResourceAsBytes("src/test/resources/sitemaps/extension/sitemap-mobile.xml");
    // Same host as the <loc> entries in the fixture, like the other
    // extension tests. NOTE(review): a parser that validates locations
    // against the sitemap URL would presumably drop cross-host entries.
    URL url = new URL("http://www.example.com/sitemap-mobile.xml");
    AbstractSiteMap asm = parser.parseSiteMap(contentType, content, url);
    assertEquals(false, asm.isIndex());
    assertEquals(true, asm instanceof SiteMap);
    SiteMap sm = (SiteMap) asm;
    // the fixture lists three pages; without this check an empty result
    // would let the loop below pass without asserting anything
    assertEquals(3, sm.getSiteMapUrls().size());
    for (SiteMapURL su : sm.getSiteMapUrls()) {
        URL u = su.getUrl();
        ExtensionMetadata[] attrs = su.getAttributesForExtension(Extension.MOBILE);
        if (u.getPath().contains("mobile-friendly")) {
            assertNotNull(attrs);
            MobileAttributes attr = (MobileAttributes) attrs[0];
            assertNotNull(attr);
        } else {
            assertTrue(attrs == null || attrs.length == 0);
        }
    }
}
/**
 * Parses a news sitemap captured from shinpaideshou.wordpress.com with
 * strict namespace checking enabled and verifies that all three URLs carry
 * news attributes with a publication name and a publication date in 2017.
 */
@Test
public void testShinpaideshuNewsSitemap() throws UnknownFormatException, IOException {
    SiteMapParser newsParser = new SiteMapParser();
    newsParser.setStrictNamespace(true);
    newsParser.enableExtension(Extension.NEWS);

    byte[] rawSitemap = SiteMapParserTest.getResourceAsBytes("src/test/resources/sitemaps/extension/shinpaideshou-news-sitemap.xml");
    URL sitemapLocation = new URL("https://shinpaideshou.wordpress.com/news-sitemap.xml");
    AbstractSiteMap parsed = newsParser.parseSiteMap("text/xml", rawSitemap, sitemapLocation);

    assertEquals(false, parsed.isIndex());
    assertEquals(true, parsed instanceof SiteMap);
    SiteMap siteMap = (SiteMap) parsed;
    assertEquals(3, siteMap.getSiteMapUrls().size());

    for (SiteMapURL entry : siteMap.getSiteMapUrls()) {
        ExtensionMetadata[] newsEntries = entry.getAttributesForExtension(Extension.NEWS);
        assertNotNull(newsEntries);
        NewsAttributes news = (NewsAttributes) newsEntries[0];
        assertNotNull(news.getName());
        assertNotNull(news.getPublicationDateTime());
        assertEquals(2017, news.getPublicationDateTime().getYear());
    }
}
/**
 * Parses an article sitemap captured from hebdenbridgetimes.co.uk with
 * strict namespace checking and the news, image, video and mobile
 * extensions enabled, and verifies that all 74 URLs are returned.
 */
@Test
public void testHebdenbridgetimesArticlesSitemap() throws UnknownFormatException, IOException {
    SiteMapParser multiExtensionParser = new SiteMapParser();
    multiExtensionParser.setStrictNamespace(true);
    // enable the extensions in the same order as before
    Extension[] enabled = { Extension.NEWS, Extension.IMAGE, Extension.VIDEO, Extension.MOBILE };
    for (Extension extension : enabled) {
        multiExtensionParser.enableExtension(extension);
    }

    byte[] rawSitemap = SiteMapParserTest.getResourceAsBytes("src/test/resources/sitemaps/extension/hebdenbridgetimes-articles-sitemap.xml");
    URL sitemapLocation = new URL("http://www.hebdenbridgetimes.co.uk/sitemap-article-2015-18.xml");
    AbstractSiteMap parsed = multiExtensionParser.parseSiteMap("text/xml", rawSitemap, sitemapLocation);

    assertEquals(false, parsed.isIndex());
    assertEquals(true, parsed instanceof SiteMap);
    SiteMap siteMap = (SiteMap) parsed;
    assertEquals(74, siteMap.getSiteMapUrls().size());
}
}

View File

@ -292,7 +292,7 @@ public class SiteMapParserTest {
public void testSitemapTextGZ() throws UnknownFormatException, IOException {
SiteMapParser parser = new SiteMapParser();
String contentType = "application/gzip";
byte[] content = this.getResourceAsBytes("src/test/resources/sitemaps/sitemap.txt.gz");
byte[] content = getResourceAsBytes("src/test/resources/sitemaps/sitemap.txt.gz");
URL url = new URL("http://www.example.com/sitemap.txt.gz");
AbstractSiteMap asm = parser.parseSiteMap(contentType, content, url);
@ -590,7 +590,7 @@ public class SiteMapParserTest {
* @return byte content of the file
* @throws IOException
*/
private byte[] getResourceAsBytes(String resourceName) throws IOException {
protected static byte[] getResourceAsBytes(String resourceName) throws IOException {
File file = new File(resourceName);
InputStream is = new FileInputStream(file);
return IOUtils.toByteArray(is);

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1,17 @@
<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"
xmlns:image="http://www.google.com/schemas/sitemap-image/1.1">
<!-- Image sitemap extension fixture: one page listing two images, the
first with only a location, the second with caption, geo location, title
and license. -->
<url>
<loc>http://www.example.com/images/some_image_landing_page.html</loc>
<image:image>
<image:loc>http://example.com/image.jpg</image:loc>
</image:image>
<image:image>
<image:loc>http://example.com/photo.jpg</image:loc>
<image:caption>This is the caption.</image:caption>
<image:geo_location>Limerick, Ireland</image:geo_location>
<image:title>Example photo shot in Limerick, Ireland</image:title>
<image:license>https://creativecommons.org/licenses/by/4.0/legalcode</image:license>
</image:image>
</url>
</urlset>

View File

@ -0,0 +1,21 @@
<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" xmlns:xhtml="http://www.w3.org/1999/xhtml">
<!-- xhtml:link fixture: three pages (english, deutsch, schweiz-deutsch),
each listing the same three alternate-language links in the order
de, de-ch, en. -->
<url>
<loc>http://www.example.com/english/</loc>
<xhtml:link rel="alternate" hreflang="de" href="http://www.example.com/deutsch/" />
<xhtml:link rel="alternate" hreflang="de-ch" href="http://www.example.com/schweiz-deutsch/" />
<xhtml:link rel="alternate" hreflang="en" href="http://www.example.com/english/" />
</url>
<url>
<loc>http://www.example.com/deutsch/</loc>
<xhtml:link rel="alternate" hreflang="de" href="http://www.example.com/deutsch/" />
<xhtml:link rel="alternate" hreflang="de-ch" href="http://www.example.com/schweiz-deutsch/" />
<xhtml:link rel="alternate" hreflang="en" href="http://www.example.com/english/" />
</url>
<url>
<loc>http://www.example.com/schweiz-deutsch/</loc>
<xhtml:link rel="alternate" hreflang="de" href="http://www.example.com/deutsch/" />
<xhtml:link rel="alternate" hreflang="de-ch" href="http://www.example.com/schweiz-deutsch/" />
<xhtml:link rel="alternate" hreflang="en" href="http://www.example.com/english/" />
</url>
</urlset>

View File

@ -0,0 +1,15 @@
<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"
xmlns:mobile="http://www.google.com/schemas/sitemap-mobile/1.0">
<!-- Mobile sitemap extension fixture: two pages flagged with
mobile:mobile (once self-closing, once as an empty start/end tag pair)
and one page without the flag. -->
<url>
<loc>http://www.example.com/mobile-friendly-1/</loc>
<mobile:mobile/>
</url>
<url>
<loc>http://www.example.com/mobile-friendly-2/</loc>
<mobile:mobile></mobile:mobile>
</url>
<url>
<loc>http://www.example.com/no-mobile/</loc>
</url>
</urlset>

View File

@ -0,0 +1,19 @@
<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"
xmlns:news="http://www.google.com/schemas/sitemap-news/0.9">
<!-- example from https://support.google.com/news/publisher-center/answer/74288 -->
<!-- News sitemap extension fixture: a single article with publication
name and language, two genres, a date-only publication date, a title,
five keywords and two stock tickers. -->
<url>
<loc>http://www.example.org/business/article55.html</loc>
<news:news>
<news:publication>
<news:name>The Example Times</news:name>
<news:language>en</news:language>
</news:publication>
<news:genres>PressRelease, Blog</news:genres>
<news:publication_date>2008-12-23</news:publication_date>
<news:title>Companies A, B in Merger Talks</news:title>
<news:keywords>business, merger, acquisition, A, B</news:keywords>
<news:stock_tickers>NASDAQ:A, NASDAQ:B</news:stock_tickers>
</news:news>
</url>
</urlset>

View File

@ -0,0 +1,33 @@
<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"
xmlns:video="http://www.google.com/schemas/sitemap-video/1.1" >
<!-- example from https://support.google.com/webmasters/answer/80471 -->
<!-- Video sitemap extension fixture: a single page with one video entry.
The description, player_loc and uploader values contain line breaks or
surrounding whitespace; presumably this exercises whitespace handling in
the parser (to be confirmed). -->
<url>
<loc>http://www.example.com/videos/some_video_landing_page.html</loc>
<video:video>
<video:thumbnail_loc>http://www.example.com/thumbs/123.jpg</video:thumbnail_loc>
<video:title>Grilling steaks for summer</video:title>
<video:description>Alkis shows you how to get perfectly done steaks every
time</video:description>
<video:content_loc>http://www.example.com/video123.flv</video:content_loc>
<video:player_loc allow_embed="yes" autoplay="ap=1">
http://www.example.com/videoplayer.swf?video=123</video:player_loc>
<video:duration>600</video:duration>
<video:expiration_date>2009-11-05T19:20:30+08:00</video:expiration_date>
<video:rating>4.2</video:rating>
<video:view_count>12345</video:view_count>
<video:publication_date>2007-11-05T19:20:30+08:00</video:publication_date>
<video:family_friendly>yes</video:family_friendly>
<video:tag>sample_tag1</video:tag>
<video:tag>sample_tag2</video:tag>
<video:restriction relationship="allow">IE GB US CA</video:restriction>
<video:gallery_loc title="Cooking Videos">http://cooking.example.com</video:gallery_loc>
<video:price currency="EUR">1.99</video:price>
<video:requires_subscription>yes</video:requires_subscription>
<video:uploader
info="http://www.example.com/users/grillymcgrillerson">GrillyMcGrillerson
</video:uploader>
<video:live>no</video:live>
</video:video>
</url>
</urlset>