1
0
Fork 0
mirror of https://github.com/crawler-commons/crawler-commons synced 2024-05-18 18:06:05 +02:00
crawler-commons/1.4/crawlercommons/sitemaps/SiteMapParser.html
Sebastian Nagel 80f287ecfd Javadoc 1.4
2023-07-18 13:04:29 +02:00

1112 lines
62 KiB
HTML

<!DOCTYPE HTML>
<!-- NewPage -->
<html lang="en">
<head>
<!-- Generated by javadoc (11.0.19) on Thu Jul 13 10:31:24 CEST 2023 -->
<!-- Document metadata for the SiteMapParser class page of the Crawler-commons 1.4 API docs. -->
<title>SiteMapParser (Crawler-commons 1.4 API)</title>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
<meta name="dc.created" content="2023-07-13">
<!-- All stylesheet and script paths below are relative: "../../" points two directories above this page. -->
<link rel="stylesheet" type="text/css" href="../../stylesheet.css" title="Style">
<link rel="stylesheet" type="text/css" href="../../jquery/jquery-ui.min.css" title="Style">
<link rel="stylesheet" type="text/css" href="../../jquery-ui.overrides.css" title="Style">
<script type="text/javascript" src="../../script.js"></script>
<script type="text/javascript" src="../../jquery/jszip/dist/jszip.min.js"></script>
<script type="text/javascript" src="../../jquery/jszip-utils/dist/jszip-utils.min.js"></script>
<!-- The conditional comment below loads an extra jszip-utils script for Internet Explorer only; other browsers treat it as a plain comment. -->
<!--[if IE]>
<script type="text/javascript" src="../../jquery/jszip-utils/dist/jszip-utils-ie.min.js"></script>
<![endif]-->
<!-- jQuery core is loaded before jQuery UI; keep this order. -->
<script type="text/javascript" src="../../jquery/jquery-3.6.1.min.js"></script>
<script type="text/javascript" src="../../jquery/jquery-ui.min.js"></script>
</head>
<body>
<!--
  Page bootstrap script.
  1. If this page's URL does not contain 'is-external=true', it tries to copy the page title
     to parent.document.title. Any error thrown by that access is swallowed by the empty catch.
  2. It declares globals describing the Method Summary table:
     data           maps each method row id (i0 to i21) to a numeric method type code.
     tabs           maps a type code to [tab element id, tab label].
                    The codes look like bit flags (1 static, 2 instance, 8 concrete, so 10 = 2 + 8
                    and 9 = 1 + 8 for row i19, the static urlIsValid). NOTE(review): confirm against script.js.
     altColor, rowColor, tableTab, activeTableTab
                    CSS class names that also appear on the summary table rows and tab spans.
     pathtoroot     the same "../../" prefix used by the resource links in the head.
     useModuleDirectories is set to false.
  3. It calls loadScripts(document, 'script'). loadScripts is not defined in this page;
     presumably it comes from ../../script.js. TODO confirm.
-->
<script type="text/javascript"><!--
try {
if (location.href.indexOf('is-external=true') == -1) {
parent.document.title="SiteMapParser (Crawler-commons 1.4 API)";
}
}
catch(err) {
}
//-->
var data = {"i0":10,"i1":10,"i2":10,"i3":10,"i4":10,"i5":10,"i6":10,"i7":10,"i8":10,"i9":10,"i10":10,"i11":10,"i12":10,"i13":10,"i14":10,"i15":10,"i16":10,"i17":10,"i18":10,"i19":9,"i20":10,"i21":10};
var tabs = {65535:["t0","All Methods"],1:["t1","Static Methods"],2:["t2","Instance Methods"],8:["t4","Concrete Methods"]};
var altColor = "altColor";
var rowColor = "rowColor";
var tableTab = "tableTab";
var activeTableTab = "activeTableTab";
var pathtoroot = "../../";
var useModuleDirectories = false;
loadScripts(document, 'script');</script>
<noscript>
<div>JavaScript is disabled on your browser.</div>
</noscript>
<header role="banner">
<nav role="navigation">
<div class="fixedNav">
<!-- ========= START OF TOP NAVBAR ======= -->
<div class="topNav"><a id="navbar.top">
<!-- -->
</a>
<div class="skipNav"><a href="#skip.navbar.top" title="Skip navigation links">Skip navigation links</a></div>
<a id="navbar.top.firstrow">
<!-- -->
</a>
<ul class="navList" title="Navigation">
<li><a href="../../index.html">Overview</a></li>
<li><a href="package-summary.html">Package</a></li>
<li class="navBarCell1Rev">Class</li>
<li><a href="class-use/SiteMapParser.html">Use</a></li>
<li><a href="package-tree.html">Tree</a></li>
<li><a href="../../deprecated-list.html">Deprecated</a></li>
<li><a href="../../index-all.html">Index</a></li>
<li><a href="../../help-doc.html">Help</a></li>
</ul>
</div>
<div class="subNav">
<ul class="navList" id="allclasses_navbar_top">
<li><a href="../../allclasses.html">All&nbsp;Classes</a></li>
</ul>
<ul class="navListSearch">
<li><label for="search">SEARCH:</label>
<input type="text" id="search" value="search" disabled="disabled">
<input type="reset" id="reset" value="reset" disabled="disabled">
</li>
</ul>
<div>
<script type="text/javascript"><!--
// Show the "All Classes" link only when this page is the top-level window,
// and hide it when the page is displayed inside another window (e.g. a frame).
// allClassesLink is intentionally left as an implicit global, as in the generated code.
allClassesLink = document.getElementById("allclasses_navbar_top");
allClassesLink.style.display = (window == top) ? "block" : "none";
//-->
</script>
<noscript>
<div>JavaScript is disabled on your browser.</div>
</noscript>
</div>
<div>
<ul class="subNavList">
<li>Summary:&nbsp;</li>
<li>Nested&nbsp;|&nbsp;</li>
<li><a href="#field.summary">Field</a>&nbsp;|&nbsp;</li>
<li><a href="#constructor.summary">Constr</a>&nbsp;|&nbsp;</li>
<li><a href="#method.summary">Method</a></li>
</ul>
<ul class="subNavList">
<li>Detail:&nbsp;</li>
<li><a href="#field.detail">Field</a>&nbsp;|&nbsp;</li>
<li><a href="#constructor.detail">Constr</a>&nbsp;|&nbsp;</li>
<li><a href="#method.detail">Method</a></li>
</ul>
</div>
<a id="skip.navbar.top">
<!-- -->
</a></div>
<!-- ========= END OF TOP NAVBAR ========= -->
</div>
<div class="navPadding">&nbsp;</div>
<script type="text/javascript"><!--
// Give the .navPadding spacer a top padding equal to the height jQuery reports
// for the .fixedNav container.
(function () {
    var navHeight = $('.fixedNav').css("height");
    $('.navPadding').css('padding-top', navHeight);
})();
//-->
</script>
</nav>
</header>
<!-- ======== START OF CLASS DATA ======== -->
<main role="main">
<div class="header">
<div class="subTitle"><span class="packageLabelInType">Package</span>&nbsp;<a href="package-summary.html">crawlercommons.sitemaps</a></div>
<h2 title="Class SiteMapParser" class="title">Class SiteMapParser</h2>
</div>
<div class="contentContainer">
<ul class="inheritance">
<li><a href="https://docs.oracle.com/en/java/javase/11/docs/api/java/lang/Object.html?is-external=true" title="class or interface in java.lang" class="externalLink">java.lang.Object</a></li>
<li>
<ul class="inheritance">
<li>crawlercommons.sitemaps.SiteMapParser</li>
</ul>
</li>
</ul>
<div class="description">
<ul class="blockList">
<li class="blockList">
<hr>
<pre>public class <span class="typeNameLabel">SiteMapParser</span>
extends <a href="https://docs.oracle.com/en/java/javase/11/docs/api/java/lang/Object.html?is-external=true" title="class or interface in java.lang" class="externalLink">Object</a></pre>
</li>
</ul>
</div>
<div class="summary">
<ul class="blockList">
<li class="blockList">
<!-- =========== FIELD SUMMARY =========== -->
<section>
<ul class="blockList">
<li class="blockList"><a id="field.summary">
<!-- -->
</a>
<h3>Field Summary</h3>
<table class="memberSummary">
<caption><span>Fields</span><span class="tabEnd">&nbsp;</span></caption>
<tr>
<th class="colFirst" scope="col">Modifier and Type</th>
<th class="colSecond" scope="col">Field</th>
<th class="colLast" scope="col">Description</th>
</tr>
<tr class="altColor">
<td class="colFirst"><code>protected <a href="https://docs.oracle.com/en/java/javase/11/docs/api/java/util/Set.html?is-external=true" title="class or interface in java.util" class="externalLink">Set</a>&lt;<a href="https://docs.oracle.com/en/java/javase/11/docs/api/java/lang/String.html?is-external=true" title="class or interface in java.lang" class="externalLink">String</a>&gt;</code></td>
<th class="colSecond" scope="row"><code><span class="memberNameLink"><a href="#acceptedNamespaces">acceptedNamespaces</a></span></code></th>
<td class="colLast">
<div class="block">Set of namespaces (if <a href="#strictNamespace"><code>strictNamespace</code></a>) accepted by the parser.</div>
</td>
</tr>
<tr class="rowColor">
<td class="colFirst"><code>protected <a href="https://docs.oracle.com/en/java/javase/11/docs/api/java/util/Map.html?is-external=true" title="class or interface in java.util" class="externalLink">Map</a>&lt;<a href="https://docs.oracle.com/en/java/javase/11/docs/api/java/lang/String.html?is-external=true" title="class or interface in java.lang" class="externalLink">String</a>,&#8203;<a href="extension/Extension.html" title="enum in crawlercommons.sitemaps.extension">Extension</a>&gt;</code></td>
<th class="colSecond" scope="row"><code><span class="memberNameLink"><a href="#extensionNamespaces">extensionNamespaces</a></span></code></th>
<td class="colLast">
<div class="block">Map of sitemap extension namespaces required to find the right extension
handler.</div>
</td>
</tr>
<tr class="altColor">
<td class="colFirst"><code>static org.slf4j.Logger</code></td>
<th class="colSecond" scope="row"><code><span class="memberNameLink"><a href="#LOG">LOG</a></span></code></th>
<td class="colLast">&nbsp;</td>
</tr>
<tr class="rowColor">
<td class="colFirst"><code>static int</code></td>
<th class="colSecond" scope="row"><code><span class="memberNameLink"><a href="#MAX_BYTES_ALLOWED">MAX_BYTES_ALLOWED</a></span></code></th>
<td class="colLast">
<div class="block">Sitemaps (including sitemap index files) &quot;must be no larger than
50MB (52,428,800 bytes)&quot; as specified in the
<a href="https://www.sitemaps.org/protocol.html#index">Sitemaps XML
format</a> (before Nov. 2016 the limit was 10MB).</div>
</td>
</tr>
<tr class="altColor">
<td class="colFirst"><code>protected boolean</code></td>
<th class="colSecond" scope="row"><code><span class="memberNameLink"><a href="#strict">strict</a></span></code></th>
<td class="colLast">
<div class="block">True (by default), meaning that invalid URLs are rejected: the official
documentation only allows the URLs listed in a sitemap to be located under the base URL, see
<a href="https://www.sitemaps.org/protocol.html#location">https://www.sitemaps.org/protocol.html#location</a>.</div>
</td>
</tr>
<tr class="rowColor">
<td class="colFirst"><code>protected boolean</code></td>
<th class="colSecond" scope="row"><code><span class="memberNameLink"><a href="#strictNamespace">strictNamespace</a></span></code></th>
<td class="colLast">
<div class="block">Indicates whether the parser should work with the namespace from the
specifications or any namespace.</div>
</td>
</tr>
</table>
</li>
</ul>
</section>
<!-- ======== CONSTRUCTOR SUMMARY ======== -->
<section>
<ul class="blockList">
<li class="blockList"><a id="constructor.summary">
<!-- -->
</a>
<h3>Constructor Summary</h3>
<table class="memberSummary">
<caption><span>Constructors</span><span class="tabEnd">&nbsp;</span></caption>
<tr>
<th class="colFirst" scope="col">Constructor</th>
<th class="colLast" scope="col">Description</th>
</tr>
<tr class="altColor">
<th class="colConstructorName" scope="row"><code><span class="memberNameLink"><a href="#%3Cinit%3E()">SiteMapParser</a></span>()</code></th>
<td class="colLast">
<div class="block">SiteMapParser with strict location validation (<a href="#isStrict()"><code>isStrict()</code></a>) and not
allowing partially parsed content.</div>
</td>
</tr>
<tr class="rowColor">
<th class="colConstructorName" scope="row"><code><span class="memberNameLink"><a href="#%3Cinit%3E(boolean)">SiteMapParser</a></span>&#8203;(boolean&nbsp;strict)</code></th>
<td class="colLast">
<div class="block">SiteMapParser with configurable location validation, not allowing
partially parsed content.</div>
</td>
</tr>
<tr class="altColor">
<th class="colConstructorName" scope="row"><code><span class="memberNameLink"><a href="#%3Cinit%3E(boolean,boolean)">SiteMapParser</a></span>&#8203;(boolean&nbsp;strict,
boolean&nbsp;allowPartial)</code></th>
<td class="colLast">&nbsp;</td>
</tr>
</table>
</li>
</ul>
</section>
<!-- ========== METHOD SUMMARY =========== -->
<section>
<ul class="blockList">
<li class="blockList"><a id="method.summary">
<!-- -->
</a>
<h3>Method Summary</h3>
<table class="memberSummary">
<caption><span id="t0" class="activeTableTab"><span>All Methods</span><span class="tabEnd">&nbsp;</span></span><span id="t1" class="tableTab"><span><a href="javascript:show(1);">Static Methods</a></span><span class="tabEnd">&nbsp;</span></span><span id="t2" class="tableTab"><span><a href="javascript:show(2);">Instance Methods</a></span><span class="tabEnd">&nbsp;</span></span><span id="t4" class="tableTab"><span><a href="javascript:show(8);">Concrete Methods</a></span><span class="tabEnd">&nbsp;</span></span></caption>
<tr>
<th class="colFirst" scope="col">Modifier and Type</th>
<th class="colSecond" scope="col">Method</th>
<th class="colLast" scope="col">Description</th>
</tr>
<tr id="i0" class="altColor">
<td class="colFirst"><code>void</code></td>
<th class="colSecond" scope="row"><code><span class="memberNameLink"><a href="#addAcceptedNamespace(java.lang.String)">addAcceptedNamespace</a></span>&#8203;(<a href="https://docs.oracle.com/en/java/javase/11/docs/api/java/lang/String.html?is-external=true" title="class or interface in java.lang" class="externalLink">String</a>&nbsp;namespaceUri)</code></th>
<td class="colLast">
<div class="block">Add namespace URI to set of accepted namespaces.</div>
</td>
</tr>
<tr id="i1" class="rowColor">
<td class="colFirst"><code>void</code></td>
<th class="colSecond" scope="row"><code><span class="memberNameLink"><a href="#addAcceptedNamespace(java.lang.String%5B%5D)">addAcceptedNamespace</a></span>&#8203;(<a href="https://docs.oracle.com/en/java/javase/11/docs/api/java/lang/String.html?is-external=true" title="class or interface in java.lang" class="externalLink">String</a>[]&nbsp;namespaceUris)</code></th>
<td class="colLast">
<div class="block">Add namespace URIs to set of accepted namespaces.</div>
</td>
</tr>
<tr id="i2" class="altColor">
<td class="colFirst"><code>void</code></td>
<th class="colSecond" scope="row"><code><span class="memberNameLink"><a href="#enableExtension(crawlercommons.sitemaps.extension.Extension)">enableExtension</a></span>&#8203;(<a href="extension/Extension.html" title="enum in crawlercommons.sitemaps.extension">Extension</a>&nbsp;extension)</code></th>
<td class="colLast">
<div class="block">Enable support for a sitemap extension in the parser.</div>
</td>
</tr>
<tr id="i3" class="rowColor">
<td class="colFirst"><code>void</code></td>
<th class="colSecond" scope="row"><code><span class="memberNameLink"><a href="#enableExtensions()">enableExtensions</a></span>()</code></th>
<td class="colLast">
<div class="block">Enable all supported sitemap extensions in the parser.</div>
</td>
</tr>
<tr id="i4" class="altColor">
<td class="colFirst"><code>boolean</code></td>
<th class="colSecond" scope="row"><code><span class="memberNameLink"><a href="#isStrict()">isStrict</a></span>()</code></th>
<td class="colLast">&nbsp;</td>
</tr>
<tr id="i5" class="rowColor">
<td class="colFirst"><code>boolean</code></td>
<th class="colSecond" scope="row"><code><span class="memberNameLink"><a href="#isStrictNamespace()">isStrictNamespace</a></span>()</code></th>
<td class="colLast">&nbsp;</td>
</tr>
<tr id="i6" class="altColor">
<td class="colFirst"><code><a href="AbstractSiteMap.html" title="class in crawlercommons.sitemaps">AbstractSiteMap</a></code></td>
<th class="colSecond" scope="row"><code><span class="memberNameLink"><a href="#parseSiteMap(byte%5B%5D,java.net.URL)">parseSiteMap</a></span>&#8203;(byte[]&nbsp;content,
<a href="https://docs.oracle.com/en/java/javase/11/docs/api/java/net/URL.html?is-external=true" title="class or interface in java.net" class="externalLink">URL</a>&nbsp;url)</code></th>
<td class="colLast">
<div class="block">Parse a sitemap, given the content bytes and the URL.</div>
</td>
</tr>
<tr id="i7" class="rowColor">
<td class="colFirst"><code><a href="AbstractSiteMap.html" title="class in crawlercommons.sitemaps">AbstractSiteMap</a></code></td>
<th class="colSecond" scope="row"><code><span class="memberNameLink"><a href="#parseSiteMap(java.lang.String,byte%5B%5D,crawlercommons.sitemaps.AbstractSiteMap)">parseSiteMap</a></span>&#8203;(<a href="https://docs.oracle.com/en/java/javase/11/docs/api/java/lang/String.html?is-external=true" title="class or interface in java.lang" class="externalLink">String</a>&nbsp;contentType,
byte[]&nbsp;content,
<a href="AbstractSiteMap.html" title="class in crawlercommons.sitemaps">AbstractSiteMap</a>&nbsp;sitemap)</code></th>
<td class="colLast">
<div class="block">Returns a processed copy of an unprocessed sitemap object.</div>
</td>
</tr>
<tr id="i8" class="altColor">
<td class="colFirst"><code><a href="AbstractSiteMap.html" title="class in crawlercommons.sitemaps">AbstractSiteMap</a></code></td>
<th class="colSecond" scope="row"><code><span class="memberNameLink"><a href="#parseSiteMap(java.lang.String,byte%5B%5D,java.net.URL)">parseSiteMap</a></span>&#8203;(<a href="https://docs.oracle.com/en/java/javase/11/docs/api/java/lang/String.html?is-external=true" title="class or interface in java.lang" class="externalLink">String</a>&nbsp;contentType,
byte[]&nbsp;content,
<a href="https://docs.oracle.com/en/java/javase/11/docs/api/java/net/URL.html?is-external=true" title="class or interface in java.net" class="externalLink">URL</a>&nbsp;url)</code></th>
<td class="colLast">
<div class="block">Parse a sitemap, given the MIME type, the content bytes, and the URL.</div>
</td>
</tr>
<tr id="i9" class="rowColor">
<td class="colFirst"><code><a href="AbstractSiteMap.html" title="class in crawlercommons.sitemaps">AbstractSiteMap</a></code></td>
<th class="colSecond" scope="row"><code><span class="memberNameLink"><a href="#parseSiteMap(java.net.URL)">parseSiteMap</a></span>&#8203;(<a href="https://docs.oracle.com/en/java/javase/11/docs/api/java/net/URL.html?is-external=true" title="class or interface in java.net" class="externalLink">URL</a>&nbsp;onlineSitemapUrl)</code></th>
<td class="colLast">
<div class="block">Returns a SiteMap or SiteMapIndex given an online sitemap URL.
Please note that this method goes online, fetches the sitemap, and then
parses it.
It is a convenience method for a user who has a sitemap URL and
wants a "Keep it simple" way to parse it.</div>
</td>
</tr>
<tr id="i10" class="altColor">
<td class="colFirst"><code>protected <a href="AbstractSiteMap.html" title="class in crawlercommons.sitemaps">AbstractSiteMap</a></code></td>
<th class="colSecond" scope="row"><code><span class="memberNameLink"><a href="#processGzippedXML(java.net.URL,byte%5B%5D)">processGzippedXML</a></span>&#8203;(<a href="https://docs.oracle.com/en/java/javase/11/docs/api/java/net/URL.html?is-external=true" title="class or interface in java.net" class="externalLink">URL</a>&nbsp;url,
byte[]&nbsp;response)</code></th>
<td class="colLast">
<div class="block">Decompress the gzipped content and process the resulting XML Sitemap.</div>
</td>
</tr>
<tr id="i11" class="rowColor">
<td class="colFirst"><code>protected <a href="SiteMap.html" title="class in crawlercommons.sitemaps">SiteMap</a></code></td>
<th class="colSecond" scope="row"><code><span class="memberNameLink"><a href="#processText(java.net.URL,byte%5B%5D)">processText</a></span>&#8203;(<a href="https://docs.oracle.com/en/java/javase/11/docs/api/java/net/URL.html?is-external=true" title="class or interface in java.net" class="externalLink">URL</a>&nbsp;sitemapUrl,
byte[]&nbsp;content)</code></th>
<td class="colLast">
<div class="block">Process a text-based Sitemap.</div>
</td>
</tr>
<tr id="i12" class="altColor">
<td class="colFirst"><code>protected <a href="SiteMap.html" title="class in crawlercommons.sitemaps">SiteMap</a></code></td>
<th class="colSecond" scope="row"><code><span class="memberNameLink"><a href="#processText(java.net.URL,java.io.InputStream)">processText</a></span>&#8203;(<a href="https://docs.oracle.com/en/java/javase/11/docs/api/java/net/URL.html?is-external=true" title="class or interface in java.net" class="externalLink">URL</a>&nbsp;sitemapUrl,
<a href="https://docs.oracle.com/en/java/javase/11/docs/api/java/io/InputStream.html?is-external=true" title="class or interface in java.io" class="externalLink">InputStream</a>&nbsp;stream)</code></th>
<td class="colLast">
<div class="block">Process a text-based Sitemap.</div>
</td>
</tr>
<tr id="i13" class="rowColor">
<td class="colFirst"><code>protected <a href="AbstractSiteMap.html" title="class in crawlercommons.sitemaps">AbstractSiteMap</a></code></td>
<th class="colSecond" scope="row"><code><span class="memberNameLink"><a href="#processXml(java.net.URL,byte%5B%5D)">processXml</a></span>&#8203;(<a href="https://docs.oracle.com/en/java/javase/11/docs/api/java/net/URL.html?is-external=true" title="class or interface in java.net" class="externalLink">URL</a>&nbsp;sitemapUrl,
byte[]&nbsp;xmlContent)</code></th>
<td class="colLast">
<div class="block">Parse the given XML content.</div>
</td>
</tr>
<tr id="i14" class="altColor">
<td class="colFirst"><code>protected <a href="AbstractSiteMap.html" title="class in crawlercommons.sitemaps">AbstractSiteMap</a></code></td>
<th class="colSecond" scope="row"><code><span class="memberNameLink"><a href="#processXml(java.net.URL,org.xml.sax.InputSource)">processXml</a></span>&#8203;(<a href="https://docs.oracle.com/en/java/javase/11/docs/api/java/net/URL.html?is-external=true" title="class or interface in java.net" class="externalLink">URL</a>&nbsp;sitemapUrl,
<a href="https://docs.oracle.com/en/java/javase/11/docs/api/org/xml/sax/InputSource.html?is-external=true" title="class or interface in org.xml.sax" class="externalLink">InputSource</a>&nbsp;is)</code></th>
<td class="colLast">
<div class="block">Parse the given XML content.</div>
</td>
</tr>
<tr id="i15" class="rowColor">
<td class="colFirst"><code>void</code></td>
<th class="colSecond" scope="row"><code><span class="memberNameLink"><a href="#setAllowDocTypeDefinitions(boolean)">setAllowDocTypeDefinitions</a></span>&#8203;(boolean&nbsp;allowDocTypeDefinitions)</code></th>
<td class="colLast">
<div class="block">Sets if the parser allows a DTD in sitemaps or feeds.</div>
</td>
</tr>
<tr id="i16" class="altColor">
<td class="colFirst"><code>void</code></td>
<th class="colSecond" scope="row"><code><span class="memberNameLink"><a href="#setStrictNamespace(boolean)">setStrictNamespace</a></span>&#8203;(boolean&nbsp;s)</code></th>
<td class="colLast">
<div class="block">Sets the parser to allow any XML namespace or just the one from the
specification, or any accepted namespace (see
<a href="#addAcceptedNamespace(java.lang.String)"><code>addAcceptedNamespace(String)</code></a>).</div>
</td>
</tr>
<tr id="i17" class="rowColor">
<td class="colFirst"><code>void</code></td>
<th class="colSecond" scope="row"><code><span class="memberNameLink"><a href="#setURLFilter(crawlercommons.filters.URLFilter)">setURLFilter</a></span>&#8203;(<a href="../filters/URLFilter.html" title="class in crawlercommons.filters">URLFilter</a>&nbsp;filter)</code></th>
<td class="colLast">
<div class="block">Use <a href="../filters/URLFilter.html" title="class in crawlercommons.filters"><code>URLFilter</code></a> to filter URLs.</div>
</td>
</tr>
<tr id="i18" class="altColor">
<td class="colFirst"><code>void</code></td>
<th class="colSecond" scope="row"><code><span class="memberNameLink"><a href="#setURLFilter(java.util.function.Function)">setURLFilter</a></span>&#8203;(<a href="https://docs.oracle.com/en/java/javase/11/docs/api/java/util/function/Function.html?is-external=true" title="class or interface in java.util.function" class="externalLink">Function</a>&lt;<a href="https://docs.oracle.com/en/java/javase/11/docs/api/java/lang/String.html?is-external=true" title="class or interface in java.lang" class="externalLink">String</a>,&#8203;<a href="https://docs.oracle.com/en/java/javase/11/docs/api/java/lang/String.html?is-external=true" title="class or interface in java.lang" class="externalLink">String</a>&gt;&nbsp;filter)</code></th>
<td class="colLast">
<div class="block">Set URL filter function to normalize URLs found in sitemaps or filter
URLs away if the function returns null.</div>
</td>
</tr>
<tr id="i19" class="rowColor">
<td class="colFirst"><code>static boolean</code></td>
<th class="colSecond" scope="row"><code><span class="memberNameLink"><a href="#urlIsValid(java.lang.String,java.lang.String)">urlIsValid</a></span>&#8203;(<a href="https://docs.oracle.com/en/java/javase/11/docs/api/java/lang/String.html?is-external=true" title="class or interface in java.lang" class="externalLink">String</a>&nbsp;sitemapBaseUrl,
<a href="https://docs.oracle.com/en/java/javase/11/docs/api/java/lang/String.html?is-external=true" title="class or interface in java.lang" class="externalLink">String</a>&nbsp;testUrl)</code></th>
<td class="colLast">
<div class="block">See if testUrl is under sitemapBaseUrl.</div>
</td>
</tr>
<tr id="i20" class="altColor">
<td class="colFirst"><code>void</code></td>
<th class="colSecond" scope="row"><code><span class="memberNameLink"><a href="#walkSiteMap(crawlercommons.sitemaps.AbstractSiteMap,java.util.function.Consumer)">walkSiteMap</a></span>&#8203;(<a href="AbstractSiteMap.html" title="class in crawlercommons.sitemaps">AbstractSiteMap</a>&nbsp;sitemap,
<a href="https://docs.oracle.com/en/java/javase/11/docs/api/java/util/function/Consumer.html?is-external=true" title="class or interface in java.util.function" class="externalLink">Consumer</a>&lt;<a href="SiteMapURL.html" title="class in crawlercommons.sitemaps">SiteMapURL</a>&gt;&nbsp;action)</code></th>
<td class="colLast">
<div class="block">Traverse a sitemap, recursively fetching and traversing the content of
any enclosed sitemap index, and performing the specified action for each
sitemap URL until all URLs have been processed or the action throws an
exception.</div>
</td>
</tr>
<tr id="i21" class="rowColor">
<td class="colFirst"><code>void</code></td>
<th class="colSecond" scope="row"><code><span class="memberNameLink"><a href="#walkSiteMap(java.net.URL,java.util.function.Consumer)">walkSiteMap</a></span>&#8203;(<a href="https://docs.oracle.com/en/java/javase/11/docs/api/java/net/URL.html?is-external=true" title="class or interface in java.net" class="externalLink">URL</a>&nbsp;onlineSitemapUrl,
<a href="https://docs.oracle.com/en/java/javase/11/docs/api/java/util/function/Consumer.html?is-external=true" title="class or interface in java.util.function" class="externalLink">Consumer</a>&lt;<a href="SiteMapURL.html" title="class in crawlercommons.sitemaps">SiteMapURL</a>&gt;&nbsp;action)</code></th>
<td class="colLast">
<div class="block">Fetch a sitemap from the specified URL, recursively fetching and
traversing the content of any enclosed sitemap index, and performing the
specified action for each sitemap URL until all URLs have been processed
or the action throws an exception.</div>
</td>
</tr>
</table>
<ul class="blockList">
<li class="blockList"><a id="methods.inherited.from.class.java.lang.Object">
<!-- -->
</a>
<h3>Methods inherited from class&nbsp;java.lang.<a href="https://docs.oracle.com/en/java/javase/11/docs/api/java/lang/Object.html?is-external=true" title="class or interface in java.lang" class="externalLink">Object</a></h3>
<code><a href="https://docs.oracle.com/en/java/javase/11/docs/api/java/lang/Object.html?is-external=true#clone()" title="class or interface in java.lang" class="externalLink">clone</a>,
<a href="https://docs.oracle.com/en/java/javase/11/docs/api/java/lang/Object.html?is-external=true#equals(java.lang.Object)" title="class or interface in java.lang" class="externalLink">equals</a>,
<a href="https://docs.oracle.com/en/java/javase/11/docs/api/java/lang/Object.html?is-external=true#finalize()" title="class or interface in java.lang" class="externalLink">finalize</a>,
<a href="https://docs.oracle.com/en/java/javase/11/docs/api/java/lang/Object.html?is-external=true#getClass()" title="class or interface in java.lang" class="externalLink">getClass</a>,
<a href="https://docs.oracle.com/en/java/javase/11/docs/api/java/lang/Object.html?is-external=true#hashCode()" title="class or interface in java.lang" class="externalLink">hashCode</a>,
<a href="https://docs.oracle.com/en/java/javase/11/docs/api/java/lang/Object.html?is-external=true#notify()" title="class or interface in java.lang" class="externalLink">notify</a>,
<a href="https://docs.oracle.com/en/java/javase/11/docs/api/java/lang/Object.html?is-external=true#notifyAll()" title="class or interface in java.lang" class="externalLink">notifyAll</a>,
<a href="https://docs.oracle.com/en/java/javase/11/docs/api/java/lang/Object.html?is-external=true#toString()" title="class or interface in java.lang" class="externalLink">toString</a>,
<a href="https://docs.oracle.com/en/java/javase/11/docs/api/java/lang/Object.html?is-external=true#wait()" title="class or interface in java.lang" class="externalLink">wait</a>,
<a href="https://docs.oracle.com/en/java/javase/11/docs/api/java/lang/Object.html?is-external=true#wait(long)" title="class or interface in java.lang" class="externalLink">wait</a>,
<a href="https://docs.oracle.com/en/java/javase/11/docs/api/java/lang/Object.html?is-external=true#wait(long,int)" title="class or interface in java.lang" class="externalLink">wait</a></code></li>
</ul>
</li>
</ul>
</section>
</li>
</ul>
</div>
<div class="details">
<ul class="blockList">
<li class="blockList">
<!-- ============ FIELD DETAIL =========== -->
<section>
<ul class="blockList">
<li class="blockList"><a id="field.detail">
<!-- -->
</a>
<h3>Field Detail</h3>
<a id="LOG">
<!-- -->
</a>
<ul class="blockList">
<li class="blockList">
<h4>LOG</h4>
<pre>public static final&nbsp;org.slf4j.Logger LOG</pre>
</li>
</ul>
<a id="MAX_BYTES_ALLOWED">
<!-- -->
</a>
<ul class="blockList">
<li class="blockList">
<h4>MAX_BYTES_ALLOWED</h4>
<pre>public static final&nbsp;int MAX_BYTES_ALLOWED</pre>
<div class="block">Sitemaps (including sitemap index files) &quot;must be no larger than
50MB (52,428,800 bytes)&quot; as specified in the
<a href="https://www.sitemaps.org/protocol.html#index">Sitemaps XML
format</a> (before Nov. 2016 the limit was 10MB).</div>
<dl>
<dt><span class="seeLabel">See Also:</span></dt>
<dd><a href="../../constant-values.html#crawlercommons.sitemaps.SiteMapParser.MAX_BYTES_ALLOWED">Constant Field Values</a></dd>
</dl>
</li>
</ul>
<a id="strict">
<!-- -->
</a>
<ul class="blockList">
<li class="blockList">
<h4>strict</h4>
<pre>protected&nbsp;boolean strict</pre>
<div class="block">True (by default), meaning that invalid URLs are rejected: the official
documentation only allows the URLs listed in a sitemap to be located under the base URL, see
<a href="https://www.sitemaps.org/protocol.html#location">https://www.sitemaps.org/protocol.html#location</a>.</div>
</li>
</ul>
<a id="strictNamespace">
<!-- -->
</a>
<ul class="blockList">
<li class="blockList">
<h4>strictNamespace</h4>
<pre>protected&nbsp;boolean strictNamespace</pre>
<div class="block">Indicates whether the parser should work with the namespace from the
specifications or any namespace. Defaults to false.</div>
</li>
</ul>
<a id="acceptedNamespaces">
<!-- -->
</a>
<ul class="blockList">
<li class="blockList">
<h4>acceptedNamespaces</h4>
<pre>protected&nbsp;<a href="https://docs.oracle.com/en/java/javase/11/docs/api/java/util/Set.html?is-external=true" title="class or interface in java.util" class="externalLink">Set</a>&lt;<a href="https://docs.oracle.com/en/java/javase/11/docs/api/java/lang/String.html?is-external=true" title="class or interface in java.lang" class="externalLink">String</a>&gt; acceptedNamespaces</pre>
<div class="block">Set of namespaces (if <a href="#strictNamespace"><code>strictNamespace</code></a>) accepted by the parser. URLs from other namespaces are ignored.</div>
</li>
</ul>
<a id="extensionNamespaces">
<!-- -->
</a>
<ul class="blockListLast">
<li class="blockList">
<h4>extensionNamespaces</h4>
<pre>protected&nbsp;<a href="https://docs.oracle.com/en/java/javase/11/docs/api/java/util/Map.html?is-external=true" title="class or interface in java.util" class="externalLink">Map</a>&lt;<a href="https://docs.oracle.com/en/java/javase/11/docs/api/java/lang/String.html?is-external=true" title="class or interface in java.lang" class="externalLink">String</a>,&#8203;<a href="extension/Extension.html" title="enum in crawlercommons.sitemaps.extension">Extension</a>&gt; extensionNamespaces</pre>
<div class="block">Map of sitemap extension namespaces required to find the right extension
handler.</div>
</li>
</ul>
</li>
</ul>
</section>
<!-- ========= CONSTRUCTOR DETAIL ======== -->
<section>
<ul class="blockList">
<li class="blockList"><a id="constructor.detail">
<!-- -->
</a>
<h3>Constructor Detail</h3>
<a id="&lt;init&gt;()">
<!-- -->
</a>
<ul class="blockList">
<li class="blockList">
<h4>SiteMapParser</h4>
<pre>public&nbsp;SiteMapParser()</pre>
<div class="block">SiteMapParser with strict location validation (<a href="#isStrict()"><code>isStrict()</code></a>) and not
allowing partially parsed content.</div>
</li>
</ul>
<a id="&lt;init&gt;(boolean)">
<!-- -->
</a>
<ul class="blockList">
<li class="blockList">
<h4>SiteMapParser</h4>
<pre>public&nbsp;SiteMapParser&#8203;(boolean&nbsp;strict)</pre>
<div class="block">SiteMapParser with configurable location validation, not allowing
partially parsed content.</div>
<dl>
<dt><span class="paramLabel">Parameters:</span></dt>
<dd><code>strict</code> - see <a href="#isStrict()"><code>isStrict()</code></a></dd>
</dl>
</li>
</ul>
<a id="&lt;init&gt;(boolean,boolean)">
<!-- -->
</a>
<ul class="blockListLast">
<li class="blockList">
<h4>SiteMapParser</h4>
<pre>public&nbsp;SiteMapParser&#8203;(boolean&nbsp;strict,
boolean&nbsp;allowPartial)</pre>
<dl>
<dt><span class="paramLabel">Parameters:</span></dt>
<dd><code>strict</code> - see <a href="#isStrict()"><code>isStrict()</code></a></dd>
<dd><code>allowPartial</code> - if true: allow URLs from sitemaps only partially parsed
because of format errors or truncated (incompletely fetched)
content. If false any parser error will cause an
<a href="UnknownFormatException.html" title="class in crawlercommons.sitemaps"><code>UnknownFormatException</code></a>.</dd>
</dl>
</li>
</ul>
</li>
</ul>
</section>
<!-- ============ METHOD DETAIL ========== -->
<section>
<ul class="blockList">
<li class="blockList"><a id="method.detail">
<!-- -->
</a>
<h3>Method Detail</h3>
<a id="setAllowDocTypeDefinitions(boolean)">
<!-- -->
</a>
<ul class="blockList">
<li class="blockList">
<h4>setAllowDocTypeDefinitions</h4>
<pre class="methodSignature">public&nbsp;void&nbsp;setAllowDocTypeDefinitions&#8203;(boolean&nbsp;allowDocTypeDefinitions)</pre>
<div class="block">Sets whether the parser allows a DTD in sitemaps or feeds.</div>
<dl>
<dt><span class="paramLabel">Parameters:</span></dt>
<dd><code>allowDocTypeDefinitions</code> - true if allowed. Default is false.</dd>
</dl>
</li>
</ul>
<a id="isStrict()">
<!-- -->
</a>
<ul class="blockList">
<li class="blockList">
<h4>isStrict</h4>
<pre class="methodSignature">public&nbsp;boolean&nbsp;isStrict()</pre>
<dl>
<dt><span class="returnLabel">Returns:</span></dt>
<dd>whether invalid URLs will be rejected (where invalid means that
the URL is not under the base URL, see <a href="https://www.sitemaps.org/protocol.html#location">sitemap file
location</a>)</dd>
</dl>
</li>
</ul>
<a id="isStrictNamespace()">
<!-- -->
</a>
<ul class="blockList">
<li class="blockList">
<h4>isStrictNamespace</h4>
<pre class="methodSignature">public&nbsp;boolean&nbsp;isStrictNamespace()</pre>
<dl>
<dt><span class="returnLabel">Returns:</span></dt>
<dd>whether the parser allows any namespace or just the one from the
specification (or any accepted namespace, see
<a href="#addAcceptedNamespace(java.lang.String)"><code>addAcceptedNamespace(String)</code></a>)</dd>
</dl>
</li>
</ul>
<a id="setStrictNamespace(boolean)">
<!-- -->
</a>
<ul class="blockList">
<li class="blockList">
<h4>setStrictNamespace</h4>
<pre class="methodSignature">public&nbsp;void&nbsp;setStrictNamespace&#8203;(boolean&nbsp;s)</pre>
<div class="block">Sets the parser to allow any XML namespace or just the one from the
specification, or any accepted namespace (see
<a href="#addAcceptedNamespace(java.lang.String)"><code>addAcceptedNamespace(String)</code></a>). Note enabling strict namespace
checking always adds the namespace defined by the current sitemap
specification (<a href="Namespace.html#SITEMAP"><code>Namespace.SITEMAP</code></a>) to the list of accepted
namespaces.</div>
<dl>
<dt><span class="paramLabel">Parameters:</span></dt>
<dd><code>s</code> - if true enable strict namespace-checking, disable if false</dd>
</dl>
</li>
</ul>
<a id="addAcceptedNamespace(java.lang.String)">
<!-- -->
</a>
<ul class="blockList">
<li class="blockList">
<h4>addAcceptedNamespace</h4>
<pre class="methodSignature">public&nbsp;void&nbsp;addAcceptedNamespace&#8203;(<a href="https://docs.oracle.com/en/java/javase/11/docs/api/java/lang/String.html?is-external=true" title="class or interface in java.lang" class="externalLink">String</a>&nbsp;namespaceUri)</pre>
<div class="block">Add namespace URI to set of accepted namespaces.</div>
<dl>
<dt><span class="paramLabel">Parameters:</span></dt>
<dd><code>namespaceUri</code> - URI of the accepted XML namespace</dd>
</dl>
</li>
</ul>
<a id="addAcceptedNamespace(java.lang.String[])">
<!-- -->
</a>
<ul class="blockList">
<li class="blockList">
<h4>addAcceptedNamespace</h4>
<pre class="methodSignature">public&nbsp;void&nbsp;addAcceptedNamespace&#8203;(<a href="https://docs.oracle.com/en/java/javase/11/docs/api/java/lang/String.html?is-external=true" title="class or interface in java.lang" class="externalLink">String</a>[]&nbsp;namespaceUris)</pre>
<div class="block">Add namespace URIs to set of accepted namespaces.</div>
<dl>
<dt><span class="paramLabel">Parameters:</span></dt>
<dd><code>namespaceUris</code> - array of accepted XML namespace URIs</dd>
</dl>
</li>
</ul>
<a id="enableExtension(crawlercommons.sitemaps.extension.Extension)">
<!-- -->
</a>
<ul class="blockList">
<li class="blockList">
<h4>enableExtension</h4>
<pre class="methodSignature">public&nbsp;void&nbsp;enableExtension&#8203;(<a href="extension/Extension.html" title="enum in crawlercommons.sitemaps.extension">Extension</a>&nbsp;extension)</pre>
<div class="block">Enable support for a sitemap extension in the parser.</div>
<dl>
<dt><span class="paramLabel">Parameters:</span></dt>
<dd><code>extension</code> - sitemap extension (news, images, videos, etc.)</dd>
</dl>
</li>
</ul>
<a id="enableExtensions()">
<!-- -->
</a>
<ul class="blockList">
<li class="blockList">
<h4>enableExtensions</h4>
<pre class="methodSignature">public&nbsp;void&nbsp;enableExtensions()</pre>
<div class="block">Enable all supported sitemap extensions in the parser.</div>
</li>
</ul>
<a id="setURLFilter(java.util.function.Function)">
<!-- -->
</a>
<ul class="blockList">
<li class="blockList">
<h4>setURLFilter</h4>
<pre class="methodSignature">public&nbsp;void&nbsp;setURLFilter&#8203;(<a href="https://docs.oracle.com/en/java/javase/11/docs/api/java/util/function/Function.html?is-external=true" title="class or interface in java.util.function" class="externalLink">Function</a>&lt;<a href="https://docs.oracle.com/en/java/javase/11/docs/api/java/lang/String.html?is-external=true" title="class or interface in java.lang" class="externalLink">String</a>,&#8203;<a href="https://docs.oracle.com/en/java/javase/11/docs/api/java/lang/String.html?is-external=true" title="class or interface in java.lang" class="externalLink">String</a>&gt;&nbsp;filter)</pre>
<div class="block">Set URL filter function to normalize URLs found in sitemaps or filter
URLs away if the function returns null.</div>
</li>
</ul>
<a id="setURLFilter(crawlercommons.filters.URLFilter)">
<!-- -->
</a>
<ul class="blockList">
<li class="blockList">
<h4>setURLFilter</h4>
<pre class="methodSignature">public&nbsp;void&nbsp;setURLFilter&#8203;(<a href="../filters/URLFilter.html" title="class in crawlercommons.filters">URLFilter</a>&nbsp;filter)</pre>
<div class="block">Use <a href="../filters/URLFilter.html" title="class in crawlercommons.filters"><code>URLFilter</code></a> to filter URLs, e.g. to configure that URLs found in
sitemaps are normalized by
<a href="../filters/basic/BasicURLNormalizer.html" title="class in crawlercommons.filters.basic"><code>BasicURLNormalizer</code></a>:
<pre>
sitemapParser.setURLFilter(new BasicURLNormalizer());
</pre></div>
</li>
</ul>
<a id="parseSiteMap(java.net.URL)">
<!-- -->
</a>
<ul class="blockList">
<li class="blockList">
<h4>parseSiteMap</h4>
<pre class="methodSignature">public&nbsp;<a href="AbstractSiteMap.html" title="class in crawlercommons.sitemaps">AbstractSiteMap</a>&nbsp;parseSiteMap&#8203;(<a href="https://docs.oracle.com/en/java/javase/11/docs/api/java/net/URL.html?is-external=true" title="class or interface in java.net" class="externalLink">URL</a>&nbsp;onlineSitemapUrl)
throws <a href="UnknownFormatException.html" title="class in crawlercommons.sitemaps">UnknownFormatException</a>,
<a href="https://docs.oracle.com/en/java/javase/11/docs/api/java/io/IOException.html?is-external=true" title="class or interface in java.io" class="externalLink">IOException</a></pre>
<div class="block">Returns a SiteMap or SiteMapIndex given an online sitemap URL.
Please note that this method goes online and fetches the sitemap, then
parses it.
This method is a convenience method for a user who has a sitemap URL and
wants a "Keep it simple" way to parse it.</div>
<dl>
<dt><span class="paramLabel">Parameters:</span></dt>
<dd><code>onlineSitemapUrl</code> - URL of the online sitemap</dd>
<dt><span class="returnLabel">Returns:</span></dt>
<dd>Extracted SiteMap/SiteMapIndex or null if the onlineSitemapUrl is
null</dd>
<dt><span class="throwsLabel">Throws:</span></dt>
<dd><code><a href="UnknownFormatException.html" title="class in crawlercommons.sitemaps">UnknownFormatException</a></code> - if there is an error parsing the sitemap</dd>
<dd><code><a href="https://docs.oracle.com/en/java/javase/11/docs/api/java/io/IOException.html?is-external=true" title="class or interface in java.io" class="externalLink">IOException</a></code> - if there is an error reading in the site map
<a href="https://docs.oracle.com/en/java/javase/11/docs/api/java/net/URL.html?is-external=true" title="class or interface in java.net" class="externalLink"><code>URL</code></a></dd>
</dl>
</li>
</ul>
<a id="parseSiteMap(java.lang.String,byte[],crawlercommons.sitemaps.AbstractSiteMap)">
<!-- -->
</a>
<ul class="blockList">
<li class="blockList">
<h4>parseSiteMap</h4>
<pre class="methodSignature">public&nbsp;<a href="AbstractSiteMap.html" title="class in crawlercommons.sitemaps">AbstractSiteMap</a>&nbsp;parseSiteMap&#8203;(<a href="https://docs.oracle.com/en/java/javase/11/docs/api/java/lang/String.html?is-external=true" title="class or interface in java.lang" class="externalLink">String</a>&nbsp;contentType,
byte[]&nbsp;content,
<a href="AbstractSiteMap.html" title="class in crawlercommons.sitemaps">AbstractSiteMap</a>&nbsp;sitemap)
throws <a href="UnknownFormatException.html" title="class in crawlercommons.sitemaps">UnknownFormatException</a>,
<a href="https://docs.oracle.com/en/java/javase/11/docs/api/java/io/IOException.html?is-external=true" title="class or interface in java.io" class="externalLink">IOException</a></pre>
<div class="block">Returns a processed copy of an unprocessed sitemap object, i.e. transfer
the value of getLastModified(). Please note that the sitemap input stays
unchanged. Note that contentType is assumed to be correct; in general it
is more robust to use the method that doesn't take a contentType, but
instead detects this using Tika.</div>
<dl>
<dt><span class="paramLabel">Parameters:</span></dt>
<dd><code>contentType</code> - MIME type of content</dd>
<dd><code>content</code> - raw bytes of sitemap file</dd>
<dd><code>sitemap</code> - an <a href="AbstractSiteMap.html" title="class in crawlercommons.sitemaps"><code>AbstractSiteMap</code></a>
implementation</dd>
<dt><span class="returnLabel">Returns:</span></dt>
<dd>Extracted SiteMap/SiteMapIndex</dd>
<dt><span class="throwsLabel">Throws:</span></dt>
<dd><code><a href="UnknownFormatException.html" title="class in crawlercommons.sitemaps">UnknownFormatException</a></code> - if there is an error parsing the sitemap</dd>
<dd><code><a href="https://docs.oracle.com/en/java/javase/11/docs/api/java/io/IOException.html?is-external=true" title="class or interface in java.io" class="externalLink">IOException</a></code> - if there is an error reading in the site map
<a href="https://docs.oracle.com/en/java/javase/11/docs/api/java/net/URL.html?is-external=true" title="class or interface in java.net" class="externalLink"><code>URL</code></a></dd>
</dl>
</li>
</ul>
<a id="parseSiteMap(byte[],java.net.URL)">
<!-- -->
</a>
<ul class="blockList">
<li class="blockList">
<h4>parseSiteMap</h4>
<pre class="methodSignature">public&nbsp;<a href="AbstractSiteMap.html" title="class in crawlercommons.sitemaps">AbstractSiteMap</a>&nbsp;parseSiteMap&#8203;(byte[]&nbsp;content,
<a href="https://docs.oracle.com/en/java/javase/11/docs/api/java/net/URL.html?is-external=true" title="class or interface in java.net" class="externalLink">URL</a>&nbsp;url)
throws <a href="UnknownFormatException.html" title="class in crawlercommons.sitemaps">UnknownFormatException</a>,
<a href="https://docs.oracle.com/en/java/javase/11/docs/api/java/io/IOException.html?is-external=true" title="class or interface in java.io" class="externalLink">IOException</a></pre>
<div class="block">Parse a sitemap, given the content bytes and the URL.</div>
<dl>
<dt><span class="paramLabel">Parameters:</span></dt>
<dd><code>content</code> - raw bytes of sitemap file</dd>
<dd><code>url</code> - URL to sitemap file</dd>
<dt><span class="returnLabel">Returns:</span></dt>
<dd>Extracted SiteMap/SiteMapIndex</dd>
<dt><span class="throwsLabel">Throws:</span></dt>
<dd><code><a href="UnknownFormatException.html" title="class in crawlercommons.sitemaps">UnknownFormatException</a></code> - if there is an error parsing the sitemap</dd>
<dd><code><a href="https://docs.oracle.com/en/java/javase/11/docs/api/java/io/IOException.html?is-external=true" title="class or interface in java.io" class="externalLink">IOException</a></code> - if there is an error reading in the site map
<a href="https://docs.oracle.com/en/java/javase/11/docs/api/java/net/URL.html?is-external=true" title="class or interface in java.net" class="externalLink"><code>URL</code></a></dd>
</dl>
</li>
</ul>
<a id="parseSiteMap(java.lang.String,byte[],java.net.URL)">
<!-- -->
</a>
<ul class="blockList">
<li class="blockList">
<h4>parseSiteMap</h4>
<pre class="methodSignature">public&nbsp;<a href="AbstractSiteMap.html" title="class in crawlercommons.sitemaps">AbstractSiteMap</a>&nbsp;parseSiteMap&#8203;(<a href="https://docs.oracle.com/en/java/javase/11/docs/api/java/lang/String.html?is-external=true" title="class or interface in java.lang" class="externalLink">String</a>&nbsp;contentType,
byte[]&nbsp;content,
<a href="https://docs.oracle.com/en/java/javase/11/docs/api/java/net/URL.html?is-external=true" title="class or interface in java.net" class="externalLink">URL</a>&nbsp;url)
throws <a href="UnknownFormatException.html" title="class in crawlercommons.sitemaps">UnknownFormatException</a>,
<a href="https://docs.oracle.com/en/java/javase/11/docs/api/java/io/IOException.html?is-external=true" title="class or interface in java.io" class="externalLink">IOException</a></pre>
<div class="block">Parse a sitemap, given the MIME type, the content bytes, and the URL.
Note that contentType is assumed to be correct; in general it is more
robust to use the method that doesn't take a contentType, but instead
detects this using Tika.</div>
<dl>
<dt><span class="paramLabel">Parameters:</span></dt>
<dd><code>contentType</code> - MIME type of content</dd>
<dd><code>content</code> - raw bytes of sitemap file</dd>
<dd><code>url</code> - URL to sitemap file</dd>
<dt><span class="returnLabel">Returns:</span></dt>
<dd>Extracted SiteMap/SiteMapIndex</dd>
<dt><span class="throwsLabel">Throws:</span></dt>
<dd><code><a href="UnknownFormatException.html" title="class in crawlercommons.sitemaps">UnknownFormatException</a></code> - if there is an error parsing the sitemap</dd>
<dd><code><a href="https://docs.oracle.com/en/java/javase/11/docs/api/java/io/IOException.html?is-external=true" title="class or interface in java.io" class="externalLink">IOException</a></code> - if there is an error reading in the site map
<a href="https://docs.oracle.com/en/java/javase/11/docs/api/java/net/URL.html?is-external=true" title="class or interface in java.net" class="externalLink"><code>URL</code></a></dd>
</dl>
</li>
</ul>
<a id="walkSiteMap(java.net.URL,java.util.function.Consumer)">
<!-- -->
</a>
<ul class="blockList">
<li class="blockList">
<h4>walkSiteMap</h4>
<pre class="methodSignature">public&nbsp;void&nbsp;walkSiteMap&#8203;(<a href="https://docs.oracle.com/en/java/javase/11/docs/api/java/net/URL.html?is-external=true" title="class or interface in java.net" class="externalLink">URL</a>&nbsp;onlineSitemapUrl,
<a href="https://docs.oracle.com/en/java/javase/11/docs/api/java/util/function/Consumer.html?is-external=true" title="class or interface in java.util.function" class="externalLink">Consumer</a>&lt;<a href="SiteMapURL.html" title="class in crawlercommons.sitemaps">SiteMapURL</a>&gt;&nbsp;action)
throws <a href="UnknownFormatException.html" title="class in crawlercommons.sitemaps">UnknownFormatException</a>,
<a href="https://docs.oracle.com/en/java/javase/11/docs/api/java/io/IOException.html?is-external=true" title="class or interface in java.io" class="externalLink">IOException</a></pre>
<div class="block">Fetch a sitemap from the specified URL, recursively fetching and
traversing the content of any enclosed sitemap index, and performing the
specified action for each sitemap URL until all URLs have been processed
or the action throws an exception.
<p>
This method is a convenience method for a user who has a sitemap URL and
wants a simple way to traverse it.
<p>
Exceptions thrown by the action are relayed to the caller.</div>
<dl>
<dt><span class="paramLabel">Parameters:</span></dt>
<dd><code>onlineSitemapUrl</code> - URL of the online sitemap</dd>
<dd><code>action</code> - The action to be performed for each element</dd>
<dt><span class="throwsLabel">Throws:</span></dt>
<dd><code><a href="UnknownFormatException.html" title="class in crawlercommons.sitemaps">UnknownFormatException</a></code> - if there is an error parsing the sitemap</dd>
<dd><code><a href="https://docs.oracle.com/en/java/javase/11/docs/api/java/io/IOException.html?is-external=true" title="class or interface in java.io" class="externalLink">IOException</a></code> - if there is an error fetching the content of any
<a href="https://docs.oracle.com/en/java/javase/11/docs/api/java/net/URL.html?is-external=true" title="class or interface in java.net" class="externalLink"><code>URL</code></a></dd>
</dl>
</li>
</ul>
<a id="walkSiteMap(crawlercommons.sitemaps.AbstractSiteMap,java.util.function.Consumer)">
<!-- -->
</a>
<ul class="blockList">
<li class="blockList">
<h4>walkSiteMap</h4>
<pre class="methodSignature">public&nbsp;void&nbsp;walkSiteMap&#8203;(<a href="AbstractSiteMap.html" title="class in crawlercommons.sitemaps">AbstractSiteMap</a>&nbsp;sitemap,
<a href="https://docs.oracle.com/en/java/javase/11/docs/api/java/util/function/Consumer.html?is-external=true" title="class or interface in java.util.function" class="externalLink">Consumer</a>&lt;<a href="SiteMapURL.html" title="class in crawlercommons.sitemaps">SiteMapURL</a>&gt;&nbsp;action)
throws <a href="UnknownFormatException.html" title="class in crawlercommons.sitemaps">UnknownFormatException</a>,
<a href="https://docs.oracle.com/en/java/javase/11/docs/api/java/io/IOException.html?is-external=true" title="class or interface in java.io" class="externalLink">IOException</a></pre>
<div class="block">Traverse a sitemap, recursively fetching and traversing the content of
any enclosed sitemap index, and performing the specified action for each
sitemap URL until all URLs have been processed or the action throws an
exception.
<p>
This method is a convenience method for a user who has a sitemap and
wants a simple way to traverse it.
<p>
Exceptions thrown by the action are relayed to the caller.</div>
<dl>
<dt><span class="paramLabel">Parameters:</span></dt>
<dd><code>sitemap</code> - The sitemap to traverse</dd>
<dd><code>action</code> - The action to be performed for each element</dd>
<dt><span class="throwsLabel">Throws:</span></dt>
<dd><code><a href="UnknownFormatException.html" title="class in crawlercommons.sitemaps">UnknownFormatException</a></code> - if there is an error parsing the sitemap</dd>
<dd><code><a href="https://docs.oracle.com/en/java/javase/11/docs/api/java/io/IOException.html?is-external=true" title="class or interface in java.io" class="externalLink">IOException</a></code> - if there is an error fetching the content of any
<a href="https://docs.oracle.com/en/java/javase/11/docs/api/java/net/URL.html?is-external=true" title="class or interface in java.net" class="externalLink"><code>URL</code></a></dd>
</dl>
</li>
</ul>
<a id="processXml(java.net.URL,byte[])">
<!-- -->
</a>
<ul class="blockList">
<li class="blockList">
<h4>processXml</h4>
<pre class="methodSignature">protected&nbsp;<a href="AbstractSiteMap.html" title="class in crawlercommons.sitemaps">AbstractSiteMap</a>&nbsp;processXml&#8203;(<a href="https://docs.oracle.com/en/java/javase/11/docs/api/java/net/URL.html?is-external=true" title="class or interface in java.net" class="externalLink">URL</a>&nbsp;sitemapUrl,
byte[]&nbsp;xmlContent)
throws <a href="UnknownFormatException.html" title="class in crawlercommons.sitemaps">UnknownFormatException</a></pre>
<div class="block">Parse the given XML content.</div>
<dl>
<dt><span class="paramLabel">Parameters:</span></dt>
<dd><code>sitemapUrl</code> - URL to sitemap file</dd>
<dd><code>xmlContent</code> - the byte[] backing the sitemapUrl</dd>
<dt><span class="returnLabel">Returns:</span></dt>
<dd>The site map</dd>
<dt><span class="throwsLabel">Throws:</span></dt>
<dd><code><a href="UnknownFormatException.html" title="class in crawlercommons.sitemaps">UnknownFormatException</a></code> - if there is an error parsing the sitemap</dd>
</dl>
</li>
</ul>
<a id="processText(java.net.URL,byte[])">
<!-- -->
</a>
<ul class="blockList">
<li class="blockList">
<h4>processText</h4>
<pre class="methodSignature">protected&nbsp;<a href="SiteMap.html" title="class in crawlercommons.sitemaps">SiteMap</a>&nbsp;processText&#8203;(<a href="https://docs.oracle.com/en/java/javase/11/docs/api/java/net/URL.html?is-external=true" title="class or interface in java.net" class="externalLink">URL</a>&nbsp;sitemapUrl,
byte[]&nbsp;content)
throws <a href="https://docs.oracle.com/en/java/javase/11/docs/api/java/io/IOException.html?is-external=true" title="class or interface in java.io" class="externalLink">IOException</a></pre>
<div class="block">Process a text-based Sitemap. Text sitemaps only list URLs but no
priorities, last mods, etc.</div>
<dl>
<dt><span class="paramLabel">Parameters:</span></dt>
<dd><code>sitemapUrl</code> - URL to sitemap file</dd>
<dd><code>content</code> - the byte[] backing the sitemapUrl</dd>
<dt><span class="returnLabel">Returns:</span></dt>
<dd>The site map</dd>
<dt><span class="throwsLabel">Throws:</span></dt>
<dd><code><a href="https://docs.oracle.com/en/java/javase/11/docs/api/java/io/IOException.html?is-external=true" title="class or interface in java.io" class="externalLink">IOException</a></code> - if there is an error reading in the site map content</dd>
</dl>
</li>
</ul>
<a id="processText(java.net.URL,java.io.InputStream)">
<!-- -->
</a>
<ul class="blockList">
<li class="blockList">
<h4>processText</h4>
<pre class="methodSignature">protected&nbsp;<a href="SiteMap.html" title="class in crawlercommons.sitemaps">SiteMap</a>&nbsp;processText&#8203;(<a href="https://docs.oracle.com/en/java/javase/11/docs/api/java/net/URL.html?is-external=true" title="class or interface in java.net" class="externalLink">URL</a>&nbsp;sitemapUrl,
<a href="https://docs.oracle.com/en/java/javase/11/docs/api/java/io/InputStream.html?is-external=true" title="class or interface in java.io" class="externalLink">InputStream</a>&nbsp;stream)
throws <a href="https://docs.oracle.com/en/java/javase/11/docs/api/java/io/IOException.html?is-external=true" title="class or interface in java.io" class="externalLink">IOException</a></pre>
<div class="block">Process a text-based Sitemap. Text sitemaps only list URLs but no
priorities, last mods, etc.</div>
<dl>
<dt><span class="paramLabel">Parameters:</span></dt>
<dd><code>sitemapUrl</code> - URL to sitemap file</dd>
<dd><code>stream</code> - content stream</dd>
<dt><span class="returnLabel">Returns:</span></dt>
<dd>The site map</dd>
<dt><span class="throwsLabel">Throws:</span></dt>
<dd><code><a href="https://docs.oracle.com/en/java/javase/11/docs/api/java/io/IOException.html?is-external=true" title="class or interface in java.io" class="externalLink">IOException</a></code> - if there is an error reading in the site map content</dd>
</dl>
</li>
</ul>
<a id="processGzippedXML(java.net.URL,byte[])">
<!-- -->
</a>
<ul class="blockList">
<li class="blockList">
<h4>processGzippedXML</h4>
<pre class="methodSignature">protected&nbsp;<a href="AbstractSiteMap.html" title="class in crawlercommons.sitemaps">AbstractSiteMap</a>&nbsp;processGzippedXML&#8203;(<a href="https://docs.oracle.com/en/java/javase/11/docs/api/java/net/URL.html?is-external=true" title="class or interface in java.net" class="externalLink">URL</a>&nbsp;url,
byte[]&nbsp;response)
throws <a href="https://docs.oracle.com/en/java/javase/11/docs/api/java/io/IOException.html?is-external=true" title="class or interface in java.io" class="externalLink">IOException</a>,
<a href="UnknownFormatException.html" title="class in crawlercommons.sitemaps">UnknownFormatException</a></pre>
<div class="block">Decompress the gzipped content and process the resulting XML Sitemap.</div>
<dl>
<dt><span class="paramLabel">Parameters:</span></dt>
<dd><code>url</code> - URL of the gzipped content</dd>
<dd><code>response</code> - Gzipped content</dd>
<dt><span class="returnLabel">Returns:</span></dt>
<dd>the site map</dd>
<dt><span class="throwsLabel">Throws:</span></dt>
<dd><code><a href="UnknownFormatException.html" title="class in crawlercommons.sitemaps">UnknownFormatException</a></code> - if there is an error parsing the gzip</dd>
<dd><code><a href="https://docs.oracle.com/en/java/javase/11/docs/api/java/io/IOException.html?is-external=true" title="class or interface in java.io" class="externalLink">IOException</a></code> - if there is an error reading in the gzip <a href="https://docs.oracle.com/en/java/javase/11/docs/api/java/net/URL.html?is-external=true" title="class or interface in java.net" class="externalLink"><code>URL</code></a></dd>
</dl>
</li>
</ul>
<a id="processXml(java.net.URL,org.xml.sax.InputSource)">
<!-- -->
</a>
<ul class="blockList">
<li class="blockList">
<h4>processXml</h4>
<pre class="methodSignature">protected&nbsp;<a href="AbstractSiteMap.html" title="class in crawlercommons.sitemaps">AbstractSiteMap</a>&nbsp;processXml&#8203;(<a href="https://docs.oracle.com/en/java/javase/11/docs/api/java/net/URL.html?is-external=true" title="class or interface in java.net" class="externalLink">URL</a>&nbsp;sitemapUrl,
<a href="https://docs.oracle.com/en/java/javase/11/docs/api/org/xml/sax/InputSource.html?is-external=true" title="class or interface in org.xml.sax" class="externalLink">InputSource</a>&nbsp;is)
throws <a href="UnknownFormatException.html" title="class in crawlercommons.sitemaps">UnknownFormatException</a></pre>
<div class="block">Parse the given XML content.</div>
<dl>
<dt><span class="paramLabel">Parameters:</span></dt>
<dd><code>sitemapUrl</code> - a sitemap <a href="https://docs.oracle.com/en/java/javase/11/docs/api/java/net/URL.html?is-external=true" title="class or interface in java.net" class="externalLink"><code>URL</code></a></dd>
<dd><code>is</code> - an <a href="https://docs.oracle.com/en/java/javase/11/docs/api/org/xml/sax/InputSource.html?is-external=true" title="class or interface in org.xml.sax" class="externalLink"><code>InputSource</code></a> backing the sitemap</dd>
<dt><span class="returnLabel">Returns:</span></dt>
<dd>the site map</dd>
<dt><span class="throwsLabel">Throws:</span></dt>
<dd><code><a href="UnknownFormatException.html" title="class in crawlercommons.sitemaps">UnknownFormatException</a></code> - if there is an error parsing the
<a href="https://docs.oracle.com/en/java/javase/11/docs/api/org/xml/sax/InputSource.html?is-external=true" title="class or interface in org.xml.sax" class="externalLink"><code>InputSource</code></a></dd>
</dl>
</li>
</ul>
<a id="urlIsValid(java.lang.String,java.lang.String)">
<!-- -->
</a>
<ul class="blockListLast">
<li class="blockList">
<h4>urlIsValid</h4>
<pre class="methodSignature">public static&nbsp;boolean&nbsp;urlIsValid&#8203;(<a href="https://docs.oracle.com/en/java/javase/11/docs/api/java/lang/String.html?is-external=true" title="class or interface in java.lang" class="externalLink">String</a>&nbsp;sitemapBaseUrl,
<a href="https://docs.oracle.com/en/java/javase/11/docs/api/java/lang/String.html?is-external=true" title="class or interface in java.lang" class="externalLink">String</a>&nbsp;testUrl)</pre>
<div class="block">See if testUrl is under sitemapBaseUrl. Only URLs under sitemapBaseUrl
are valid.</div>
<dl>
<dt><span class="paramLabel">Parameters:</span></dt>
<dd><code>sitemapBaseUrl</code> - the base URL of the sitemap</dd>
<dd><code>testUrl</code> - the URL to be tested</dd>
<dt><span class="returnLabel">Returns:</span></dt>
<dd>true if testUrl is under sitemapBaseUrl, false otherwise</dd>
</dl>
</li>
</ul>
</li>
</ul>
</section>
</li>
</ul>
</div>
</div>
</main>
<!-- ========= END OF CLASS DATA ========= -->
<footer role="contentinfo">
<nav role="navigation">
<!-- ======= START OF BOTTOM NAVBAR ====== -->
<div class="bottomNav"><a id="navbar.bottom">
<!-- -->
</a>
<div class="skipNav"><a href="#skip.navbar.bottom" title="Skip navigation links">Skip navigation links</a></div>
<a id="navbar.bottom.firstrow">
<!-- -->
</a>
<ul class="navList" title="Navigation">
<li><a href="../../index.html">Overview</a></li>
<li><a href="package-summary.html">Package</a></li>
<li class="navBarCell1Rev">Class</li>
<li><a href="class-use/SiteMapParser.html">Use</a></li>
<li><a href="package-tree.html">Tree</a></li>
<li><a href="../../deprecated-list.html">Deprecated</a></li>
<li><a href="../../index-all.html">Index</a></li>
<li><a href="../../help-doc.html">Help</a></li>
</ul>
</div>
<div class="subNav">
<ul class="navList" id="allclasses_navbar_bottom">
<li><a href="../../allclasses.html">All&nbsp;Classes</a></li>
</ul>
<div>
<script type="text/javascript"><!--
// Show the "All Classes" link only when this page is the top-level window;
// hide it when the page is displayed inside a frame.
allClassesLink = document.getElementById("allclasses_navbar_bottom");
allClassesLink.style.display = (window == top) ? "block" : "none";
//-->
</script>
<noscript>
<div>JavaScript is disabled on your browser.</div>
</noscript>
</div>
<div>
<ul class="subNavList">
<li>Summary:&nbsp;</li>
<li>Nested&nbsp;|&nbsp;</li>
<li><a href="#field.summary">Field</a>&nbsp;|&nbsp;</li>
<li><a href="#constructor.summary">Constr</a>&nbsp;|&nbsp;</li>
<li><a href="#method.summary">Method</a></li>
</ul>
<ul class="subNavList">
<li>Detail:&nbsp;</li>
<li><a href="#field.detail">Field</a>&nbsp;|&nbsp;</li>
<li><a href="#constructor.detail">Constr</a>&nbsp;|&nbsp;</li>
<li><a href="#method.detail">Method</a></li>
</ul>
</div>
<a id="skip.navbar.bottom">
<!-- -->
</a></div>
<!-- ======== END OF BOTTOM NAVBAR ======= -->
</nav>
<p class="legalCopy"><small>Copyright &#169; 2009&#x2013;2023 <a href="https://github.com/crawler-commons">Crawler-Commons</a>. All rights reserved.</small></p>
</footer>
</body>
</html>