1
0
Fork 0
mirror of https://github.com/crawler-commons/crawler-commons synced 2024-05-07 07:16:02 +02:00
crawler-commons/src/main/java/crawlercommons/sitemaps/SiteMapTester.java
Sebastian Nagel 23ee0634dc [Sitemaps] Disable support for DTDs in sitemaps by default
- update change log
- apply code formatting
- add support for parsing sitemaps with DTD in SiteMapTester
2022-03-02 16:03:13 +01:00

113 lines
4.3 KiB
Java

/**
* Copyright 2016 Crawler-Commons
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package crawlercommons.sitemaps;
import java.io.IOException;
import java.net.URL;
import java.util.Collection;
import org.apache.commons.io.IOUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import crawlercommons.filters.basic.BasicURLNormalizer;
/**
 * Command-line sitemap tool which fetches a sitemap and logs all URLs found in
 * it. If the sitemap is a sitemap index, all sitemaps referenced by the index
 * are fetched and processed recursively.
 *
 * <p>
 * The parser is configured through the Java system properties
 * {@code sitemap.strictNamespace}, {@code sitemap.allow.dtd},
 * {@code sitemap.extensions} and {@code sitemap.filter.urls}. Running the tool
 * without arguments logs a description of all arguments and properties.
 * </p>
 */
public class SiteMapTester {

    private static final Logger LOG = LoggerFactory.getLogger(SiteMapTester.class);

    /**
     * Parser instance shared by all recursive calls. It is configured once per
     * run by {@link #configureParser()}.
     */
    private static final SiteMapParser saxParser = new SiteMapParser(false, true);

    /**
     * Entry point: logs the usage message if no argument is given, otherwise
     * fetches and processes the sitemap at the given URL.
     *
     * @param args
     *            {@code args[0]} is the URL of the sitemap, the optional
     *            {@code args[1]} is a MIME type which is passed to the parser
     *            instead of relying on automatic MIME type detection
     * @throws IOException
     *             if the URL is malformed or a sitemap cannot be fetched
     * @throws UnknownFormatException
     *             if thrown by the parser while processing a sitemap
     */
    public static void main(String[] args) throws IOException, UnknownFormatException {
        if (args.length < 1) {
            LOG.error("Fetch and process a sitemap (recursively if a sitemap index)");
            LOG.error("Usage: SiteMapTester <URL_TO_TEST> [MIME_TYPE]");
            LOG.error("Options:");
            LOG.error(" URL_TO_TEST URL of sitemap");
            LOG.error(" MIME_TYPE force processing sitemap as MIME type,");
            LOG.error(" bypass automatic MIME type detection");
            LOG.error("Java properties:");
            LOG.error(" sitemap.strictNamespace");
            LOG.error(" if true sitemaps are required to use the standard namespace URI");
            LOG.error(" sitemap.allow.dtd");
            LOG.error(" if true sitemaps are allowed to include a DTD");
            LOG.error(" sitemap.extensions");
            LOG.error(" if true enable sitemap extension parsing");
            LOG.error(" sitemap.filter.urls");
            LOG.error(" if true filter and normalize all URLs found in the sitemap");
            LOG.error(" using crawlercommons.filters.basic.BasicURLNormalizer");
            return;
        }
        URL url = new URL(args[0]);
        String mt = (args.length > 1) ? args[1] : null;
        parse(url, mt);
    }

    /**
     * Parses a sitemap recursively, meaning that if the sitemap is a sitemap
     * index, all sitemaps referenced by the index are parsed as well.
     *
     * <p>
     * The shared parser is configured from the system properties exactly once
     * here, before the recursion starts, instead of once per visited sitemap.
     * </p>
     *
     * @param url
     *            URL of the sitemap to fetch and parse
     * @param mt
     *            MIME type to pass to the parser; if {@code null} or empty the
     *            parser is called without a MIME type
     * @throws IOException
     *             if fetching the content of a sitemap fails
     * @throws UnknownFormatException
     *             if thrown by the parser while processing a sitemap
     */
    private static void parse(URL url, String mt) throws IOException, UnknownFormatException {
        boolean extensionsEnabled = configureParser();
        parseRecursively(url, mt, extensionsEnabled);
    }

    /**
     * Applies the {@code sitemap.*} system properties to the shared parser.
     *
     * @return {@code true} if the property {@code sitemap.extensions} is set
     *         to true, i.e. extension parsing has been enabled
     */
    private static boolean configureParser() {
        saxParser.setStrictNamespace(Boolean.getBoolean("sitemap.strictNamespace"));
        saxParser.setAllowDocTypeDefinitions(Boolean.getBoolean("sitemap.allow.dtd"));
        boolean extensionsEnabled = Boolean.getBoolean("sitemap.extensions");
        if (extensionsEnabled) {
            saxParser.enableExtensions();
        }
        if (Boolean.getBoolean("sitemap.filter.urls")) {
            saxParser.setURLFilter(new BasicURLNormalizer());
        }
        return extensionsEnabled;
    }

    /**
     * Fetches and parses a single sitemap using the already configured parser.
     * Descends into every sitemap of a sitemap index, otherwise logs every URL
     * of the sitemap.
     *
     * @param url
     *            URL of the sitemap to fetch and parse
     * @param mt
     *            MIME type to pass to the parser, may be {@code null} or empty
     * @param extensionsEnabled
     *            if true log the string representation of each sitemap entry
     *            instead of only its URL
     */
    private static void parseRecursively(URL url, String mt, boolean extensionsEnabled) throws IOException, UnknownFormatException {
        byte[] content = IOUtils.toByteArray(url);

        // single definition of "a MIME type was given", used for both the
        // log message and the choice of the parser method
        boolean mimeTypeGiven = (mt != null && !mt.isEmpty());
        LOG.info("Parsing {} {}", url, (mimeTypeGiven ? "as MIME type " + mt : ""));

        AbstractSiteMap sm;
        if (mimeTypeGiven) {
            sm = saxParser.parseSiteMap(mt, content, url);
        } else {
            // no MIME type given: call the parser without one
            sm = saxParser.parseSiteMap(content, url);
        }

        if (sm.isIndex()) {
            Collection<AbstractSiteMap> links = ((SiteMapIndex) sm).getSitemaps();
            for (AbstractSiteMap asm : links) {
                parseRecursively(asm.getUrl(), mt, extensionsEnabled); // Recursive call
            }
        } else {
            Collection<SiteMapURL> links = ((SiteMap) sm).getSiteMapUrls();
            for (SiteMapURL smu : links) {
                if (extensionsEnabled) {
                    LOG.info(smu.toString());
                } else {
                    LOG.info(smu.getUrl().toString());
                }
            }
        }
    }
}