1
0
Fork 0
mirror of https://github.com/crawler-commons/crawler-commons synced 2024-05-09 23:56:04 +02:00
crawler-commons/src/main/java/crawlercommons/sitemaps/Namespace.java

129 lines
5.3 KiB
Java

/**
* Copyright 2016 Crawler-Commons
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package crawlercommons.sitemaps;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import crawlercommons.sitemaps.extension.Extension;
/**
* supported sitemap formats:
* https://www.sitemaps.org/protocol.html#otherformats
*/
public class Namespace {
public static final String SITEMAP = "http://www.sitemaps.org/schemas/sitemap/0.9";
/**
* Legacy schema URIs from prior sitemap protocol versions and frequent
* variants.
*/
public static final String[] SITEMAP_LEGACY = { //
"https://www.sitemaps.org/schemas/sitemap/0.9", //
"http://www.sitemaps.org/schemas/sitemap/0.9/", //
"https://www.sitemaps.org/schemas/sitemap/0.9/", //
"http://www.google.com/schemas/sitemap/0.9", //
"https://www.google.com/schemas/sitemap/0.9", //
"http://www.google.com/schemas/sitemap/0.84", //
"https://www.google.com/schemas/sitemap/0.84", //
"http://www.google.com/schemas/sitemap/0.90", //
"https://sitemaps.org/schemas/sitemap/0.9",
};
public static final String[] IMAGE = { //
"http://www.google.com/schemas/sitemap-image/1.1", //
"https://www.google.com/schemas/sitemap-image/1.1" //
};
public static final String[] VIDEO = { //
"http://www.google.com/schemas/sitemap-video/1.1", //
"https://www.google.com/schemas/sitemap-video/1.1" //
};
public static final String[] NEWS = { //
"http://www.google.com/schemas/sitemap-news/0.9", //
"https://www.google.com/schemas/sitemap-news/0.9", //
"http://www.google.com/schemas/sitemap-news/0.84" //
};
public static final String[] MOBILE = { //
"http://www.google.com/schemas/sitemap-mobile/1.0", //
"https://www.google.com/schemas/sitemap-mobile/1.0" //
};
public static final String LINKS = "http://www.w3.org/1999/xhtml";
public static final String PAGEMAPS = "http://www.google.com/schemas/sitemap-pagemap/1.0";
/**
* In contradiction to the protocol specification ("The Sitemap must ...
* [s]pecify the namespace (protocol standard) within the <urlset>
* tag."), some sitemaps do not define a (default) namespace.
*
* By accepting the "empty" namespace, you'll get URLs even from those
* sitemaps.
*/
public static final String EMPTY = "";
/**
* RSS and Atom sitemap formats do not have strict definition. But if we do
* not parse as namespace aware, then RSS/Atom files that choose to use
* namespaces will break. The relaxed compromise for RSS/Atom is to always
* parse as "namespace aware", but we will only match elements by the
* localName, accepting any element namespace.
*/
public static final String RSS_2_0 = "http://www.w3.org/1999/02/22-rdf-syntax-ns#";
public static final String ATOM_0_3 = "http://purl.org/atom/ns#";
public static final String ATOM_1_0 = "http://www.w3.org/2005/Atom";
public static final Set<String> SITEMAP_SUPPORTED_NAMESPACES = new HashSet<>();
static {
SITEMAP_SUPPORTED_NAMESPACES.add(SITEMAP);
SITEMAP_SUPPORTED_NAMESPACES.addAll(Arrays.asList(SITEMAP_LEGACY));
SITEMAP_SUPPORTED_NAMESPACES.addAll(Arrays.asList(IMAGE));
SITEMAP_SUPPORTED_NAMESPACES.addAll(Arrays.asList(VIDEO));
SITEMAP_SUPPORTED_NAMESPACES.addAll(Arrays.asList(NEWS));
SITEMAP_SUPPORTED_NAMESPACES.add(LINKS);
SITEMAP_SUPPORTED_NAMESPACES.add(PAGEMAPS);
}
/**
* @param uri
* URI string identifying the namespace
* @return true if namespace (identified by URI) is supported, false if the
* namespace is not supported or unknown
*/
public static boolean isSupported(String uri) {
return SITEMAP_SUPPORTED_NAMESPACES.contains(uri);
}
public static final Map<Extension, List<String>> SITEMAP_EXTENSION_NAMESPACES = new TreeMap<>();
static {
SITEMAP_EXTENSION_NAMESPACES.put(Extension.NEWS, Arrays.asList(NEWS));
SITEMAP_EXTENSION_NAMESPACES.put(Extension.IMAGE, Arrays.asList(IMAGE));
SITEMAP_EXTENSION_NAMESPACES.put(Extension.VIDEO, Arrays.asList(VIDEO));
SITEMAP_EXTENSION_NAMESPACES.put(Extension.MOBILE, Arrays.asList(MOBILE));
SITEMAP_EXTENSION_NAMESPACES.put(Extension.LINKS, Arrays.asList(LINKS));
SITEMAP_EXTENSION_NAMESPACES.put(Extension.PAGEMAPS, Arrays.asList(PAGEMAPS));
}
}