diff --git a/src/main/java/crawlercommons/sitemaps/Namespace.java b/src/main/java/crawlercommons/sitemaps/Namespace.java
index be554c3..93a1d9a 100644
--- a/src/main/java/crawlercommons/sitemaps/Namespace.java
+++ b/src/main/java/crawlercommons/sitemaps/Namespace.java
@@ -72,6 +72,8 @@ public class Namespace {
public static final String LINKS = "http://www.w3.org/1999/xhtml";
+ public static final String PAGEMAPS = "http://www.google.com/schemas/sitemap-pagemap/1.0";
+
/**
* In contradiction to the protocol specification ("The Sitemap must ...
* [s]pecify the namespace (protocol standard) within the <urlset>
@@ -101,6 +103,7 @@ public class Namespace {
SITEMAP_SUPPORTED_NAMESPACES.addAll(Arrays.asList(VIDEO));
SITEMAP_SUPPORTED_NAMESPACES.addAll(Arrays.asList(NEWS));
SITEMAP_SUPPORTED_NAMESPACES.add(LINKS);
+ SITEMAP_SUPPORTED_NAMESPACES.add(PAGEMAPS);
}
/**
@@ -120,5 +123,6 @@ public class Namespace {
SITEMAP_EXTENSION_NAMESPACES.put(Extension.VIDEO, Arrays.asList(VIDEO));
SITEMAP_EXTENSION_NAMESPACES.put(Extension.MOBILE, Arrays.asList(MOBILE));
SITEMAP_EXTENSION_NAMESPACES.put(Extension.LINKS, Arrays.asList(LINKS));
+ SITEMAP_EXTENSION_NAMESPACES.put(Extension.PAGEMAPS, Arrays.asList(PAGEMAPS));
}
}
diff --git a/src/main/java/crawlercommons/sitemaps/extension/Extension.java b/src/main/java/crawlercommons/sitemaps/extension/Extension.java
index 2e8a49a..0d90e7a 100644
--- a/src/main/java/crawlercommons/sitemaps/extension/Extension.java
+++ b/src/main/java/crawlercommons/sitemaps/extension/Extension.java
@@ -44,5 +44,11 @@ public enum Extension {
* URL as having mobile content, cf.
* http://www.google.com/schemas/sitemap-mobile/1.0
*/
- MOBILE
+ MOBILE,
+ /**
+ * PageMaps is a structured data format that Google created to enable
+ * website creators to embed data and notes in their webpages, cf.
+ * https://support.google.com/programmable-search/answer/1628213
+ */
+ PAGEMAPS
}
diff --git a/src/main/java/crawlercommons/sitemaps/extension/PageMap.java b/src/main/java/crawlercommons/sitemaps/extension/PageMap.java
new file mode 100644
index 0000000..036f887
--- /dev/null
+++ b/src/main/java/crawlercommons/sitemaps/extension/PageMap.java
@@ -0,0 +1,114 @@
+/**
+ * Copyright 2023 Crawler-Commons
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package crawlercommons.sitemaps.extension;
+
+import java.util.ArrayList;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Map.Entry;
+import java.util.Objects;
+
+/**
+ * Data model for the <a href=
+ * "https://support.google.com/programmable-search/answer/1628213">PageMaps</a>
+ * extension to the sitemap protocol used for Google's Programmable Search
+ * Engine.
+ *
+ * A PageMap holds a list of {@link PageMapDataObject}s, each PageMapDataObject
+ * a map of attributes (pairs of name and value).
+ */
+@SuppressWarnings("serial")
+public class PageMap extends ExtensionMetadata {
+
+    /** Data objects of this PageMap in the order they were added. */
+    private final List<PageMapDataObject> dataObjects = new ArrayList<>();
+
+    /**
+     * Returns the data objects of this PageMap.
+     *
+     * @return the internal (not copied) list of data objects, in the order they
+     *         were added
+     */
+    public List<PageMapDataObject> getPageMapDataObjects() {
+        return dataObjects;
+    }
+
+    /**
+     * Appends a data object to this PageMap.
+     *
+     * @param d
+     *            the data object to add
+     */
+    public void addDataObject(PageMapDataObject d) {
+        dataObjects.add(d);
+    }
+
+    /**
+     * Merges the map representations of all data objects into one map,
+     * iterating the data objects in the order they were added. If several data
+     * objects produce the same key, the value of the last one wins.
+     *
+     * @return combined entries of {@link PageMapDataObject#asMap()} of all data
+     *         objects
+     */
+    @Override
+    public Map<String, String[]> asMap() {
+        Map<String, String[]> map = new LinkedHashMap<>();
+        for (PageMapDataObject dobj : dataObjects) {
+            for (Entry<String, String[]> e : dobj.asMap().entrySet()) {
+                map.put(e.getKey(), e.getValue());
+            }
+        }
+        return map;
+    }
+
+    /**
+     * @return the text "PageMap: [", then every data object on a line of its
+     *         own (each followed by a comma), then the closing "]"
+     */
+    @Override
+    public String toString() {
+        StringBuilder sb = new StringBuilder();
+        sb.append("PageMap: [");
+        if (!dataObjects.isEmpty()) {
+            sb.append('\n');
+        }
+        for (PageMapDataObject dobj : dataObjects) {
+            sb.append(dobj.toString()).append(",\n");
+        }
+        sb.append(']');
+        return sb.toString();
+    }
+
+    @Override
+    public int hashCode() {
+        return Objects.hash(dataObjects);
+    }
+
+    @Override
+    public boolean equals(Object obj) {
+        if (this == obj)
+            return true;
+        if (obj == null)
+            return false;
+        if (getClass() != obj.getClass())
+            return false;
+        PageMap other = (PageMap) obj;
+        return Objects.equals(dataObjects, other.dataObjects);
+    }
+
+}
diff --git a/src/main/java/crawlercommons/sitemaps/extension/PageMapDataObject.java b/src/main/java/crawlercommons/sitemaps/extension/PageMapDataObject.java
new file mode 100644
index 0000000..9cb38f2
--- /dev/null
+++ b/src/main/java/crawlercommons/sitemaps/extension/PageMapDataObject.java
@@ -0,0 +1,127 @@
+/**
+ * Copyright 2023 Crawler-Commons
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package crawlercommons.sitemaps.extension;
+
+import java.util.LinkedHashMap;
+import java.util.Locale;
+import java.util.Map;
+import java.util.Objects;
+import java.util.Map.Entry;
+
+/**
+ * A single data object of a {@link PageMap}: a type and an id (both may be
+ * null) plus a map of attributes (pairs of name and value) kept in the order
+ * in which they were added.
+ */
+@SuppressWarnings("serial")
+public class PageMapDataObject extends ExtensionMetadata {
+    private final String type;
+    private final String id;
+    private final Map<String, String> attributes;
+
+    /**
+     * @param type
+     *            type of the data object, may be null
+     * @param id
+     *            identifier of the data object, may be null
+     */
+    public PageMapDataObject(String type, String id) {
+        this.type = type;
+        this.id = id;
+        attributes = new LinkedHashMap<>();
+    }
+
+    public String getType() {
+        return type;
+    }
+
+    public String getId() {
+        return id;
+    }
+
+    /**
+     * @return the internal (not copied) attribute map in insertion order
+     */
+    public Map<String, String> getAttributes() {
+        return attributes;
+    }
+
+    public boolean hasAttribute(String name) {
+        return attributes.containsKey(name);
+    }
+
+    /**
+     * @return the value stored for the attribute name, or null if there is none
+     */
+    public String getAttribute(String name) {
+        return attributes.get(name);
+    }
+
+    /**
+     * Stores an attribute, replacing any value already stored under the same
+     * name.
+     *
+     * @return the value previously stored under the name, or null
+     */
+    public String addAttribute(String name, String value) {
+        return attributes.put(name, value);
+    }
+
+    /**
+     * Hashes the same fields {@link #equals(Object)} compares, so that equal
+     * data objects (and the PageMaps holding them) have equal hash codes.
+     */
+    @Override
+    public int hashCode() {
+        return Objects.hash(attributes, id, type);
+    }
+
+    @Override
+    public boolean equals(Object obj) {
+        if (this == obj)
+            return true;
+        if (obj == null)
+            return false;
+        if (getClass() != obj.getClass())
+            return false;
+        PageMapDataObject other = (PageMapDataObject) obj;
+        return Objects.equals(attributes, other.attributes) && Objects.equals(id, other.id) && Objects.equals(type, other.type);
+    }
+
+    @Override
+    public String toString() {
+        StringBuilder sb = new StringBuilder();
+        sb.append("{type = ").append(type);
+        sb.append(", id = ").append(id);
+        sb.append(", attributes = ").append(attributes);
+        sb.append("}");
+        return sb.toString();
+    }
+
+    /**
+     * Represents this data object as a map with a single entry: the key is
+     * "type::id" (a null type or id is rendered as empty string), the value an
+     * array holding one "name: value" string per attribute, in insertion order.
+     */
+    @Override
+    public Map<String, String[]> asMap() {
+        String keyFormat = "%s::%s";
+        String valueFormat = "%s: %s";
+        String key = String.format(Locale.ROOT, keyFormat, (getType() == null ? "" : getType()), (getId() == null ? "" : getId()));
+        String[] values = getAttributes().entrySet().stream().map((Entry<String, String> e) -> String.format(Locale.ROOT, valueFormat, e.getKey(), e.getValue())).toArray(String[]::new);
+        return Map.of(key, values);
+    }
+}
diff --git a/src/main/java/crawlercommons/sitemaps/sax/extension/ExtensionHandler.java b/src/main/java/crawlercommons/sitemaps/sax/extension/ExtensionHandler.java
index 4f4b454..14e4d3a 100644
--- a/src/main/java/crawlercommons/sitemaps/sax/extension/ExtensionHandler.java
+++ b/src/main/java/crawlercommons/sitemaps/sax/extension/ExtensionHandler.java
@@ -55,6 +55,8 @@ public abstract class ExtensionHandler extends DefaultHandler {
return new LinksHandler();
case MOBILE:
return new MobileHandler();
+ case PAGEMAPS:
+ return new PageMapsHandler();
default:
return null;
}
diff --git a/src/main/java/crawlercommons/sitemaps/sax/extension/PageMapsHandler.java b/src/main/java/crawlercommons/sitemaps/sax/extension/PageMapsHandler.java
new file mode 100644
index 0000000..bd3cbc1
--- /dev/null
+++ b/src/main/java/crawlercommons/sitemaps/sax/extension/PageMapsHandler.java
@@ -0,0 +1,143 @@
+/**
+ * Copyright 2023 Crawler-Commons
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package crawlercommons.sitemaps.sax.extension;
+
+import org.xml.sax.Attributes;
+import org.xml.sax.SAXException;
+
+import crawlercommons.sitemaps.extension.PageMap;
+import crawlercommons.sitemaps.extension.PageMapDataObject;
+
+/**
+ * Handle SAX events in the PageMaps extension namespace of Google's
+ * Programmable Search Engine.
+ */
+public class PageMapsHandler extends ExtensionHandler {
+
+    /** PageMap currently being parsed, null outside of a PageMap element. */
+    private PageMap currPageMap;
+    /** DataObject currently being parsed, null outside of a DataObject element. */
+    private PageMapDataObject currDataObj;
+    /** Content of the "name" attribute of the current Attribute element. */
+    private String currAttrName;
+    /** Character data collected since the start of the current Attribute element. */
+    private StringBuilder currAttrVal = new StringBuilder();
+    /** Content of the "value" attribute of the current Attribute element, if any. */
+    private String currAttrValFromAttr;
+
+    public PageMapsHandler() {
+        reset();
+    }
+
+    @Override
+    public void reset() {
+        super.reset();
+        resetCurrent();
+    }
+
+    /** Forget all state of a partially parsed PageMap. */
+    private void resetCurrent() {
+        currPageMap = null;
+        currDataObj = null;
+        currAttrName = null;
+        currAttrVal.setLength(0);
+        currAttrValFromAttr = null;
+    }
+
+    @Override
+    public void startElement(String uri, String localName, String qName, Attributes attributes) throws SAXException {
+        switch (localName) {
+        case "PageMap":
+            currPageMap = new PageMap();
+            currDataObj = null;
+            break;
+        case "DataObject":
+            if (currPageMap == null) {
+                // ignore lonesome DataObject elements and, by leaving no
+                // current data object, also the Attribute elements they hold
+                currDataObj = null;
+            } else {
+                currDataObj = new PageMapDataObject(attributes.getValue("type"), attributes.getValue("id"));
+                currPageMap.addDataObject(currDataObj);
+            }
+            break;
+        case "Attribute":
+            currAttrVal.setLength(0);
+            currAttrName = attributes.getValue("name");
+            /*
+             * The PageMaps specification
+             * (https://support.google.com/programmable-search/answer/1628213)
+             * describes for PageMaps embedded in HTML that the attribute value
+             * is given as element attribute named "value". For sitemaps it
+             * should be given as character data. However, some PageMaps
+             * sitemaps in the wild also use the HTML mechanism. We fall back to
+             * the HTML mechanism if there is no or white space only character
+             * data. The field is assigned unconditionally (possibly null) so
+             * that the "value" of a preceding Attribute element is never
+             * reused.
+             */
+            currAttrValFromAttr = attributes.getValue("value");
+            break;
+        default:
+            break;
+        }
+    }
+
+    @Override
+    public void endElement(String uri, String localName, String qName) throws SAXException {
+        switch (localName) {
+        case "PageMap":
+            if (currPageMap != null) {
+                attributes.add(currPageMap);
+            }
+            // a finished PageMap must not collect any further data objects
+            currPageMap = null;
+            currDataObj = null;
+            break;
+        case "DataObject":
+            // attributes following the closing tag do not belong to this object
+            currDataObj = null;
+            break;
+        case "Attribute":
+            String currAttrValStr = currAttrVal.toString().trim();
+            if (currDataObj == null) {
+                // ignore lonesome attributes
+            } else if (currAttrValStr.isEmpty() && currAttrValFromAttr != null) {
+                /*
+                 * If there is no or white space only character data, fall back
+                 * to the HTML mechanism and use the content of the attribute
+                 * "value".
+                 */
+                currDataObj.addAttribute(currAttrName, currAttrValFromAttr);
+            } else {
+                currDataObj.addAttribute(currAttrName, currAttrValStr);
+            }
+            break;
+        default:
+            break;
+        }
+    }
+
+    /**
+     * Collect character data; the buffer is only evaluated at the end of an
+     * Attribute element and cleared at its start.
+     */
+    @Override
+    public void characters(char[] ch, int start, int length) throws SAXException {
+        currAttrVal.append(ch, start, length);
+    }
+
+}
diff --git a/src/test/java/crawlercommons/sitemaps/SiteMapParserExtensionTest.java b/src/test/java/crawlercommons/sitemaps/SiteMapParserExtensionTest.java
index 0bc9f50..322530a 100644
--- a/src/test/java/crawlercommons/sitemaps/SiteMapParserExtensionTest.java
+++ b/src/test/java/crawlercommons/sitemaps/SiteMapParserExtensionTest.java
@@ -16,17 +16,32 @@
package crawlercommons.sitemaps;
-import crawlercommons.sitemaps.extension.*;
-import org.junit.jupiter.api.Test;
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertNotNull;
+import static org.junit.jupiter.api.Assertions.assertNull;
+import static org.junit.jupiter.api.Assertions.assertTrue;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.time.ZonedDateTime;
+import java.util.Arrays;
import java.util.HashMap;
import java.util.Iterator;
+import java.util.List;
+import java.util.Map.Entry;
-import static org.junit.jupiter.api.Assertions.*;
+import org.junit.jupiter.api.Test;
+
+import crawlercommons.sitemaps.extension.Extension;
+import crawlercommons.sitemaps.extension.ExtensionMetadata;
+import crawlercommons.sitemaps.extension.ImageAttributes;
+import crawlercommons.sitemaps.extension.LinkAttributes;
+import crawlercommons.sitemaps.extension.MobileAttributes;
+import crawlercommons.sitemaps.extension.NewsAttributes;
+import crawlercommons.sitemaps.extension.PageMap;
+import crawlercommons.sitemaps.extension.PageMapDataObject;
+import crawlercommons.sitemaps.extension.VideoAttributes;
public class SiteMapParserExtensionTest {
@@ -257,4 +272,56 @@ public class SiteMapParserExtensionTest {
SiteMap sm = (SiteMap) asm;
assertEquals(74, sm.getSiteMapUrls().size());
}
+
+    @Test
+    public void testPageMapSitemap() throws UnknownFormatException, IOException {
+        SiteMapParser parser = new SiteMapParser();
+        parser.setStrictNamespace(true);
+        parser.enableExtension(Extension.PAGEMAPS);
+
+        String urlStr = "http://www.example.com/pagemaps-sitemap.xml";
+        URL url = new URL(urlStr);
+        AbstractSiteMap asm = parse(parser, "src/test/resources/sitemaps/extension/pagemaps-sitemap.xml", url);
+
+        assertEquals(false, asm.isIndex());
+        assertEquals(true, asm instanceof SiteMap);
+        SiteMap sm = (SiteMap) asm;
+        assertEquals(2, sm.getSiteMapUrls().size());
+        assertEquals(urlStr, sm.getUrl().toString());
+        // each URL must carry PageMaps data, else the checks below are skipped silently
+        for (SiteMapURL u : sm.getSiteMapUrls()) {
+            assertTrue(u.getAttributes() != null && u.getAttributes().containsKey(Extension.PAGEMAPS), "no PageMaps data for " + u.getUrl());
+            for (Entry<Extension, ExtensionMetadata[]> x : u.getAttributes().entrySet()) {
+                assertEquals(Extension.PAGEMAPS, x.getKey());
+                assertTrue(x.getValue().length > 0, "no PageMap in " + Arrays.toString(x.getValue()));
+                PageMap pageMap = (PageMap) x.getValue()[0];
+                List<PageMapDataObject> dataObjects = pageMap.getPageMapDataObjects();
+                PageMapDataObject dataObject;
+                switch (u.getUrl().toString()) {
+                case "http://www.example.com/foo":
+                    assertEquals(2, dataObjects.size());
+                    dataObject = dataObjects.get(0);
+                    assertEquals("document", dataObject.getType());
+                    assertEquals("one", dataObject.getId());
+                    assertEquals("Doc One", dataObject.getAttribute("name"));
+                    assertEquals("3.5", dataObject.getAttribute("review"));
+                    dataObject = dataObjects.get(1);
+                    assertEquals("image", dataObject.getType());
+                    assertNull(dataObject.getId());
+                    assertEquals("http://www.example.com/foo.gif", dataObject.getAttribute("image_src"));
+                    break;
+                case "http://www.example.com/bar":
+                    assertEquals(1, dataObjects.size());
+                    dataObject = dataObjects.get(0);
+                    assertEquals("document", dataObject.getType());
+                    assertEquals("two", dataObject.getId());
+                    assertEquals("Doc Two", dataObject.getAttribute("name"));
+                    assertEquals("4.0", dataObject.getAttribute("review"));
+                    break;
+                default:
+                    org.junit.jupiter.api.Assertions.fail("unexpected URL in sitemap: " + u.getUrl());
+                }
+            }
+        }
+    }
}
diff --git a/src/test/java/crawlercommons/sitemaps/extension/PageMapsTest.java b/src/test/java/crawlercommons/sitemaps/extension/PageMapsTest.java
new file mode 100644
index 0000000..9a72837
--- /dev/null
+++ b/src/test/java/crawlercommons/sitemaps/extension/PageMapsTest.java
@@ -0,0 +1,77 @@
+/**
+ * Copyright 2023 Crawler-Commons
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package crawlercommons.sitemaps.extension;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertNotEquals;
+import static org.junit.jupiter.api.Assertions.assertNotNull;
+
+import org.junit.jupiter.api.Test;
+
+/**
+ * Tests equality, accessors and the map representation of {@link PageMap}
+ * and {@link PageMapDataObject}.
+ */
+public class PageMapsTest {
+
+    @Test
+    public void testPageMapAttributesEquals() {
+        PageMap a = new PageMap();
+        PageMapDataObject da = new PageMapDataObject("test", "a");
+        da.addAttribute("foo", "bar");
+        a.addDataObject(da);
+        assertEquals(a, a);
+        assertNotNull(a.toString());
+        assertEquals(1, a.getPageMapDataObjects().size());
+        assertEquals("test", a.getPageMapDataObjects().get(0).getType());
+        assertEquals("a", a.getPageMapDataObjects().get(0).getId());
+        assertEquals("bar", a.getPageMapDataObjects().get(0).getAttribute("foo"));
+
+        // b has the same content as a until a second attribute is added
+        PageMap b = new PageMap();
+        PageMapDataObject db = new PageMapDataObject("test", "a");
+        db.addAttribute("foo", "bar");
+        b.addDataObject(db);
+        assertEquals(da, db);
+        assertEquals(a, b);
+        db.addAttribute("hello", "world");
+        assertNotEquals(da, db);
+        assertNotEquals(a, b);
+        assertEquals(b, b);
+        assertNotNull(b.toString());
+        assertEquals(1, b.getPageMapDataObjects().size());
+        assertEquals("test", b.getPageMapDataObjects().get(0).getType());
+        assertEquals("a", b.getPageMapDataObjects().get(0).getId());
+        assertEquals("bar", b.getPageMapDataObjects().get(0).getAttribute("foo"));
+        assertEquals("world", b.getPageMapDataObjects().get(0).getAttribute("hello"));
+        assertEquals(1, b.asMap().size());
+        assertNotNull(b.asMap().get("test::a"));
+        assertEquals(2, b.asMap().get("test::a").length);
+
+        // c differs from a in the id and the attributes of its data object
+        PageMap c = new PageMap();
+        PageMapDataObject dc = new PageMapDataObject("test", "c");
+        dc.addAttribute("abc", "xyz");
+        c.addDataObject(dc);
+        assertEquals(c, c);
+        assertNotEquals(a, c);
+        assertNotNull(c.toString());
+        assertEquals(1, c.getPageMapDataObjects().size());
+        assertEquals("test", c.getPageMapDataObjects().get(0).getType());
+        assertEquals("c", c.getPageMapDataObjects().get(0).getId());
+        assertEquals("xyz", c.getPageMapDataObjects().get(0).getAttribute("abc"));
+    }
+}
\ No newline at end of file
diff --git a/src/test/resources/sitemaps/extension/pagemaps-sitemap.xml b/src/test/resources/sitemaps/extension/pagemaps-sitemap.xml
new file mode 100644
index 0000000..5306bde
--- /dev/null
+++ b/src/test/resources/sitemaps/extension/pagemaps-sitemap.xml
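@@ -0,0 +1,25 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"
+        xmlns:sc="http://www.google.com/schemas/sitemap-pagemap/1.0">
+  <url>
+    <loc>http://www.example.com/foo</loc>
+    <sc:PageMap>
+      <sc:DataObject type="document" id="one">
+        <sc:Attribute name="name">Doc One</sc:Attribute>
+        <sc:Attribute name="review">3.5</sc:Attribute>
+      </sc:DataObject>
+      <sc:DataObject type="image">
+        <sc:Attribute name="image_src">http://www.example.com/foo.gif</sc:Attribute>
+      </sc:DataObject>
+    </sc:PageMap>
+  </url>
+  <url>
+    <loc>http://www.example.com/bar</loc>
+    <sc:PageMap>
+      <sc:DataObject type="document" id="two">
+        <sc:Attribute name="name">Doc Two</sc:Attribute>
+        <sc:Attribute name="review">4.0</sc:Attribute>
+      </sc:DataObject>
+    </sc:PageMap>
+  </url>
+</urlset>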
\ No newline at end of file