diff --git a/src/main/java/crawlercommons/sitemaps/Namespace.java b/src/main/java/crawlercommons/sitemaps/Namespace.java index be554c3..93a1d9a 100644 --- a/src/main/java/crawlercommons/sitemaps/Namespace.java +++ b/src/main/java/crawlercommons/sitemaps/Namespace.java @@ -72,6 +72,8 @@ public class Namespace { public static final String LINKS = "http://www.w3.org/1999/xhtml"; + public static final String PAGEMAPS = "http://www.google.com/schemas/sitemap-pagemap/1.0"; + /** * In contradiction to the protocol specification ("The Sitemap must ... * [s]pecify the namespace (protocol standard) within the <urlset> @@ -101,6 +103,7 @@ public class Namespace { SITEMAP_SUPPORTED_NAMESPACES.addAll(Arrays.asList(VIDEO)); SITEMAP_SUPPORTED_NAMESPACES.addAll(Arrays.asList(NEWS)); SITEMAP_SUPPORTED_NAMESPACES.add(LINKS); + SITEMAP_SUPPORTED_NAMESPACES.add(PAGEMAPS); } /** @@ -120,5 +123,6 @@ public class Namespace { SITEMAP_EXTENSION_NAMESPACES.put(Extension.VIDEO, Arrays.asList(VIDEO)); SITEMAP_EXTENSION_NAMESPACES.put(Extension.MOBILE, Arrays.asList(MOBILE)); SITEMAP_EXTENSION_NAMESPACES.put(Extension.LINKS, Arrays.asList(LINKS)); + SITEMAP_EXTENSION_NAMESPACES.put(Extension.PAGEMAPS, Arrays.asList(PAGEMAPS)); } } diff --git a/src/main/java/crawlercommons/sitemaps/extension/Extension.java b/src/main/java/crawlercommons/sitemaps/extension/Extension.java index 2e8a49a..0d90e7a 100644 --- a/src/main/java/crawlercommons/sitemaps/extension/Extension.java +++ b/src/main/java/crawlercommons/sitemaps/extension/Extension.java @@ -44,5 +44,11 @@ public enum Extension { * URL as having mobile content, cf. * http://www.google.com/schemas/sitemap-mobile/1.0 */ - MOBILE + MOBILE, + /** + * PageMaps is a structured data format that Google created to enable + * website creators to embed data and notes in their webpages., cf. 
+ * https://support.google.com/programmable-search/answer/1628213 + */ + PAGEMAPS } diff --git a/src/main/java/crawlercommons/sitemaps/extension/PageMap.java b/src/main/java/crawlercommons/sitemaps/extension/PageMap.java new file mode 100644 index 0000000..036f887 --- /dev/null +++ b/src/main/java/crawlercommons/sitemaps/extension/PageMap.java @@ -0,0 +1,89 @@ +/** + * Copyright 2023 Crawler-Commons + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package crawlercommons.sitemaps.extension; + +import java.util.ArrayList; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; +import java.util.Map.Entry; +import java.util.Objects; + +/** + * Data model for the PageMaps + * extension to the sitemap protocol used for Google's Programmable Search + * Engine. + * + * A PageMap holds a list of {@link PageMapDataObject}s, each PageMapDataObject + * a map of attributes (pairs of name and value). 
+ */
+@SuppressWarnings("serial")
+public class PageMap extends ExtensionMetadata {
+
+    /** Data objects of this PageMap, in the order they were added. */
+    private List<PageMapDataObject> dataObjects = new ArrayList<>();
+
+    /**
+     * Returns the data objects held by this PageMap.
+     *
+     * @return the internal list itself (not a copy), in insertion order
+     */
+    public List<PageMapDataObject> getPageMapDataObjects() {
+        return dataObjects;
+    }
+
+    /**
+     * Appends a data object to this PageMap.
+     *
+     * @param d
+     *            the data object to add
+     */
+    public void addDataObject(PageMapDataObject d) {
+        dataObjects.add(d);
+    }
+
+    /**
+     * Flattens all data objects into a single map by combining the maps
+     * returned by {@link PageMapDataObject#asMap()}.
+     *
+     * If several data objects produce the same key (for instance, two objects
+     * of the same type without an id), their value arrays are concatenated in
+     * insertion order instead of the later one replacing the earlier one, so
+     * that no attribute is lost.
+     *
+     * @return a new insertion-ordered map from key to attribute strings
+     */
+    @Override
+    public Map<String, String[]> asMap() {
+        Map<String, String[]> map = new LinkedHashMap<>();
+        for (PageMapDataObject dobj : dataObjects) {
+            for (Entry<String, String[]> e : dobj.asMap().entrySet()) {
+                map.merge(e.getKey(), e.getValue(), PageMap::concat);
+            }
+        }
+        return map;
+    }
+
+    /** Returns a new array holding the elements of first followed by those of second. */
+    private static String[] concat(String[] first, String[] second) {
+        String[] joined = new String[first.length + second.length];
+        System.arraycopy(first, 0, joined, 0, first.length);
+        System.arraycopy(second, 0, joined, first.length, second.length);
+        return joined;
+    }
+
+    /**
+     * Renders the PageMap as "PageMap: [" followed by one data object per
+     * line (each terminated by a comma) and a closing bracket.
+     */
+    @Override
+    public String toString() {
+        StringBuilder sb = new StringBuilder();
+        sb.append("PageMap: [");
+        if (!dataObjects.isEmpty()) {
+            sb.append('\n');
+        }
+        for (PageMapDataObject dobj : dataObjects) {
+            sb.append(dobj.toString()).append(",\n");
+        }
+        sb.append(']');
+        return sb.toString();
+    }
+
+    /** Hash code derived from the list of data objects, matching {@link #equals(Object)}. */
+    @Override
+    public int hashCode() {
+        return Objects.hash(dataObjects);
+    }
+
+    /**
+     * Two PageMaps are equal if they are of the same class and hold equal
+     * lists of data objects.
+     */
+    @Override
+    public boolean equals(Object obj) {
+        if (this == obj)
+            return true;
+        if (obj == null)
+            return false;
+        if (getClass() != obj.getClass())
+            return false;
+        PageMap other = (PageMap) obj;
+        return Objects.equals(dataObjects, other.dataObjects);
+    }
+
+}
diff --git a/src/main/java/crawlercommons/sitemaps/extension/PageMapDataObject.java b/src/main/java/crawlercommons/sitemaps/extension/PageMapDataObject.java
new file mode 100644
index 0000000..9cb38f2
--- /dev/null
+++ b/src/main/java/crawlercommons/sitemaps/extension/PageMapDataObject.java
@@ -0,0 +1,90 @@
+/**
+ * Copyright 2023 Crawler-Commons
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package crawlercommons.sitemaps.extension;
+
+import java.util.LinkedHashMap;
+import java.util.Locale;
+import java.util.Map;
+import java.util.Objects;
+import java.util.Map.Entry;
+
+/**
+ * A single data object of a PageMap: a type, an id and a map of attributes
+ * (pairs of name and value) kept in insertion order.
+ */
+@SuppressWarnings("serial")
+public class PageMapDataObject extends ExtensionMetadata {
+    private String type;
+    private String id;
+    private Map<String, String> attributes;
+
+    /**
+     * Creates a data object without attributes.
+     *
+     * @param type
+     *            type of the data object, may be null
+     * @param id
+     *            id of the data object, may be null
+     */
+    public PageMapDataObject(String type, String id) {
+        this.type = type;
+        this.id = id;
+        attributes = new LinkedHashMap<>();
+    }
+
+    /** @return the type of this data object, or null if none was given */
+    public String getType() {
+        return type;
+    }
+
+    /** @return the id of this data object, or null if none was given */
+    public String getId() {
+        return id;
+    }
+
+    /** @return the internal attribute map itself (not a copy), in insertion order */
+    public Map<String, String> getAttributes() {
+        return attributes;
+    }
+
+    /**
+     * @param name
+     *            attribute name
+     * @return true if an attribute of this name has been added
+     */
+    public boolean hasAttribute(String name) {
+        return attributes.containsKey(name);
+    }
+
+    /**
+     * @param name
+     *            attribute name
+     * @return the value stored for the name, or null if there is none
+     */
+    public String getAttribute(String name) {
+        return attributes.get(name);
+    }
+
+    /**
+     * Adds an attribute, replacing any value stored under the same name.
+     *
+     * @param name
+     *            attribute name
+     * @param value
+     *            attribute value
+     * @return the value previously stored under the name, or null
+     */
+    public String addAttribute(String name, String value) {
+        return attributes.put(name, value);
+    }
+
+    /**
+     * Two data objects are equal if they are of the same class and have equal
+     * attributes, id and type.
+     */
+    @Override
+    public boolean equals(Object obj) {
+        if (this == obj)
+            return true;
+        if (obj == null)
+            return false;
+        if (getClass() != obj.getClass())
+            return false;
+        PageMapDataObject other = (PageMapDataObject) obj;
+        return Objects.equals(attributes, other.attributes) && Objects.equals(id, other.id) && Objects.equals(type, other.type);
+    }
+
+    /**
+     * Hash code over the same fields compared by {@link #equals(Object)}, so
+     * that equal data objects hash alike (required for hash-based collections
+     * and for hash codes of lists holding data objects).
+     */
+    @Override
+    public int hashCode() {
+        return Objects.hash(attributes, id, type);
+    }
+
+    @Override
+    public String toString() {
+        StringBuilder sb = new StringBuilder();
+        sb.append("{type = ").append(type);
+        sb.append(", id = ").append(id);
+        sb.append(", attributes = ").append(attributes);
+        sb.append("}");
+        return sb.toString();
+    }
+
+    /**
+     * Represents this data object as a single-entry map. The key has the form
+     * "type::id" (a null type or id is written as an empty string), the value
+     * holds one "name: value" string per attribute in insertion order.
+     *
+     * @return an unmodifiable map with exactly one entry
+     */
+    @Override
+    public Map<String, String[]> asMap() {
+        String keyFormat = "%s::%s";
+        String valueFormat = "%s: %s";
+        String key = String.format(Locale.ROOT, keyFormat, (getType() == null ? "" : getType()), (getId() == null ? "" : getId()));
+        String[] values = getAttributes().entrySet().stream().map((Entry<String, String> e) -> String.format(Locale.ROOT, valueFormat, e.getKey(), e.getValue())).toArray(String[]::new);
+        return Map.of(key, values);
+    }
+}
diff --git a/src/main/java/crawlercommons/sitemaps/sax/extension/ExtensionHandler.java b/src/main/java/crawlercommons/sitemaps/sax/extension/ExtensionHandler.java
index 4f4b454..14e4d3a 100644
--- a/src/main/java/crawlercommons/sitemaps/sax/extension/ExtensionHandler.java
+++ b/src/main/java/crawlercommons/sitemaps/sax/extension/ExtensionHandler.java
@@ -55,6 +55,8 @@ public abstract class ExtensionHandler extends DefaultHandler {
             return new LinksHandler();
         case MOBILE:
             return new MobileHandler();
+        case PAGEMAPS:
+            return new PageMapsHandler();
         default:
             return null;
         }
diff --git a/src/main/java/crawlercommons/sitemaps/sax/extension/PageMapsHandler.java b/src/main/java/crawlercommons/sitemaps/sax/extension/PageMapsHandler.java
new file mode 100644
index 0000000..bd3cbc1
--- /dev/null
+++ b/src/main/java/crawlercommons/sitemaps/sax/extension/PageMapsHandler.java
@@ -0,0 +1,119 @@
+/**
+ * Copyright 2023 Crawler-Commons
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package crawlercommons.sitemaps.sax.extension;
+
+import org.xml.sax.Attributes;
+import org.xml.sax.SAXException;
+
+import crawlercommons.sitemaps.extension.PageMap;
+import crawlercommons.sitemaps.extension.PageMapDataObject;
+
+/**
+ * Handle SAX events in the Google's Programmable Search Engine PageMaps
+ * extension namespace.
+ *
+ * The handler tracks the PageMap, DataObject and Attribute element currently
+ * being parsed. DataObject elements outside of a PageMap and Attribute
+ * elements outside of a DataObject are ignored.
+ */
+public class PageMapsHandler extends ExtensionHandler {
+
+    /** PageMap currently open, null outside of a PageMap element. */
+    private PageMap currPageMap;
+    /** DataObject currently open, null outside of a DataObject element. */
+    private PageMapDataObject currDataObj;
+    /** Content of the "name" attribute of the current Attribute element. */
+    private String currAttrName;
+    /** Character data collected for the current Attribute element. */
+    private StringBuilder currAttrVal = new StringBuilder();
+    /** Content of the "value" attribute of the current Attribute element, or null. */
+    private String currAttrValFromAttr;
+
+    public PageMapsHandler() {
+        reset();
+    }
+
+    @Override
+    public void reset() {
+        super.reset();
+        resetCurrent();
+    }
+
+    /** Forgets all state about the elements currently being parsed. */
+    private void resetCurrent() {
+        currPageMap = null;
+        currDataObj = null;
+        currAttrName = null;
+        currAttrVal.setLength(0);
+        currAttrValFromAttr = null;
+    }
+
+    @Override
+    public void startElement(String uri, String localName, String qName, Attributes attributes) throws SAXException {
+        switch (localName) {
+        case "PageMap":
+            currPageMap = new PageMap();
+            currDataObj = null;
+            break;
+        case "DataObject":
+            if (currPageMap == null) {
+                // ignore lonesome DataObject elements (and the attributes
+                // they contain)
+                currDataObj = null;
+            } else {
+                currDataObj = new PageMapDataObject(attributes.getValue("type"), attributes.getValue("id"));
+                currPageMap.addDataObject(currDataObj);
+            }
+            break;
+        case "Attribute":
+            currAttrVal.setLength(0);
+            currAttrName = attributes.getValue("name");
+            /*
+             * The PageMaps specification
+             * (https://support.google.com/programmable-search/answer/
+             * 1628213) describes for PageMaps embedded in HTML that the
+             * attribute value is given as element attribute named "value".
+             * For sitemaps it should be given as character data. However,
+             * some PageMaps sitemaps in the wild also use the HTML
+             * mechanism. We fall back to the HTML mechanism if there is no
+             * or white space only character data.
+             *
+             * The field is assigned unconditionally (null if there is no
+             * "value" attribute) so that the value of a preceding Attribute
+             * element is never reused for this one.
+             */
+            currAttrValFromAttr = attributes.getValue("value");
+            break;
+        }
+    }
+
+    @Override
+    public void endElement(String uri, String localName, String qName) throws SAXException {
+        switch (localName) {
+        case "PageMap":
+            if (currPageMap != null) {
+                // "attributes" is the field inherited from the superclass
+                attributes.add(currPageMap);
+            }
+            // elements following the closed PageMap are lonesome
+            currPageMap = null;
+            currDataObj = null;
+            break;
+        case "DataObject":
+            // Attribute elements following the closed DataObject are lonesome
+            currDataObj = null;
+            break;
+        case "Attribute":
+            String currAttrValStr = currAttrVal.toString().trim();
+            if (currDataObj == null) {
+                // ignore lonesome attributes
+            } else if (currAttrValStr.isEmpty() && currAttrValFromAttr != null) {
+                /*
+                 * If there is no or white space only character data, fall back
+                 * to the HTML mechanism and use the content of the attribute
+                 * "value".
+                 */
+                currDataObj.addAttribute(currAttrName, currAttrValFromAttr);
+            } else {
+                currDataObj.addAttribute(currAttrName, currAttrValStr);
+            }
+            break;
+        }
+    }
+
+    /**
+     * Collects character data. The buffer is cleared at the start of every
+     * Attribute element and only read at its end.
+     */
+    @Override
+    public void characters(char[] ch, int start, int length) throws SAXException {
+        currAttrVal.append(ch, start, length);
+    }
+
+}
diff --git a/src/test/java/crawlercommons/sitemaps/SiteMapParserExtensionTest.java b/src/test/java/crawlercommons/sitemaps/SiteMapParserExtensionTest.java
index 0bc9f50..322530a 100644
--- a/src/test/java/crawlercommons/sitemaps/SiteMapParserExtensionTest.java
+++ b/src/test/java/crawlercommons/sitemaps/SiteMapParserExtensionTest.java
@@ -16,17 +16,32 @@
 
 package crawlercommons.sitemaps;
 
-import crawlercommons.sitemaps.extension.*;
-import org.junit.jupiter.api.Test;
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertNotNull;
+import static org.junit.jupiter.api.Assertions.assertNull;
+import static org.junit.jupiter.api.Assertions.assertTrue;
 
 import java.io.IOException;
 import java.net.MalformedURLException;
 import java.net.URL;
 import java.time.ZonedDateTime;
+import java.util.Arrays;
 import java.util.HashMap;
 import java.util.Iterator;
+import java.util.List;
+import java.util.Map.Entry;
 
-import static org.junit.jupiter.api.Assertions.*;
+import 
org.junit.jupiter.api.Test;
+
+import crawlercommons.sitemaps.extension.Extension;
+import crawlercommons.sitemaps.extension.ExtensionMetadata;
+import crawlercommons.sitemaps.extension.ImageAttributes;
+import crawlercommons.sitemaps.extension.LinkAttributes;
+import crawlercommons.sitemaps.extension.MobileAttributes;
+import crawlercommons.sitemaps.extension.NewsAttributes;
+import crawlercommons.sitemaps.extension.PageMap;
+import crawlercommons.sitemaps.extension.PageMapDataObject;
+import crawlercommons.sitemaps.extension.VideoAttributes;
 
 public class SiteMapParserExtensionTest {
 
@@ -257,4 +272,58 @@ public class SiteMapParserExtensionTest {
         SiteMap sm = (SiteMap) asm;
         assertEquals(74, sm.getSiteMapUrls().size());
     }
+
+    /**
+     * Parses a sitemap with PageMaps extension data and checks the data
+     * objects and attributes attached to each URL.
+     */
+    @Test
+    public void testPageMapSitemap() throws UnknownFormatException, IOException {
+        SiteMapParser parser = new SiteMapParser();
+        parser.setStrictNamespace(true);
+        parser.enableExtension(Extension.PAGEMAPS);
+
+        String urlStr = "http://www.example.com/pagemaps-sitemap.xml";
+        URL url = new URL(urlStr);
+        AbstractSiteMap asm = parse(parser, "src/test/resources/sitemaps/extension/pagemaps-sitemap.xml", url);
+
+        assertEquals(false, asm.isIndex());
+        assertEquals(true, asm instanceof SiteMap);
+        SiteMap sm = (SiteMap) asm;
+        assertEquals(2, sm.getSiteMapUrls().size());
+        assertEquals(urlStr, sm.getUrl().toString());
+        for (SiteMapURL u : sm.getSiteMapUrls()) {
+            for (Entry<Extension, ExtensionMetadata[]> x : u.getAttributes().entrySet()) {
+                assertEquals(Extension.PAGEMAPS, x.getKey());
+                PageMap pageMap = (PageMap) x.getValue()[0];
+                List<PageMapDataObject> dataObjects = pageMap.getPageMapDataObjects();
+                PageMapDataObject dataObject;
+                switch (u.getUrl().toString()) {
+                case "http://www.example.com/foo":
+                    assertEquals(2, dataObjects.size());
+                    dataObject = dataObjects.get(0);
+                    assertEquals("document", dataObject.getType());
+                    assertEquals("one", dataObject.getId());
+                    assertEquals("Doc One", dataObject.getAttribute("name"));
+                    assertEquals("3.5", dataObject.getAttribute("review"));
+                    dataObject = dataObjects.get(1);
+                    assertEquals("image", dataObject.getType());
+                    assertNull(dataObject.getId());
+                    assertEquals("http://www.example.com/foo.gif", dataObject.getAttribute("image_src"));
+                    break;
+                case "http://www.example.com/bar":
+                    assertEquals(1, dataObjects.size());
+                    dataObject = dataObjects.get(0);
+                    assertEquals("document", dataObject.getType());
+                    assertEquals("two", dataObject.getId());
+                    assertEquals("Doc Two", dataObject.getAttribute("name"));
+                    assertEquals("4.0", dataObject.getAttribute("review"));
+                    break;
+                default:
+                    // fail instead of silently skipping the assertions above
+                    throw new AssertionError("Unexpected URL in sitemap: " + u.getUrl());
+                }
+            }
+        }
+    }
 }
diff --git a/src/test/java/crawlercommons/sitemaps/extension/PageMapsTest.java b/src/test/java/crawlercommons/sitemaps/extension/PageMapsTest.java
new file mode 100644
index 0000000..9a72837
--- /dev/null
+++ b/src/test/java/crawlercommons/sitemaps/extension/PageMapsTest.java
@@ -0,0 +1,71 @@
+/**
+ * Copyright 2023 Crawler-Commons
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package crawlercommons.sitemaps.extension;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertNotEquals;
+import static org.junit.jupiter.api.Assertions.assertNotNull;
+
+import org.junit.jupiter.api.Test;
+
+/**
+ * Unit tests of the PageMaps data model ({@link PageMap} and
+ * {@link PageMapDataObject}).
+ */
+public class PageMapsTest {
+
+    /**
+     * Checks equality, accessors, toString and asMap of PageMaps which are
+     * built programmatically.
+     */
+    @Test
+    public void testPageMapAttributesEquals() {
+        PageMap a = new PageMap();
+        PageMapDataObject da = new PageMapDataObject("test", "a");
+        da.addAttribute("foo", "bar");
+        a.addDataObject(da);
+        assertEquals(a, a);
+        assertNotNull(a.toString());
+        assertEquals(1, a.getPageMapDataObjects().size());
+        assertEquals("test", a.getPageMapDataObjects().get(0).getType());
+        assertEquals("a", a.getPageMapDataObjects().get(0).getId());
+        assertEquals("bar", a.getPageMapDataObjects().get(0).getAttribute("foo"));
+
+        // b starts out equal to a and diverges once an attribute is added
+        PageMap b = new PageMap();
+        PageMapDataObject db = new PageMapDataObject("test", "a");
+        db.addAttribute("foo", "bar");
+        b.addDataObject(db);
+        assertEquals(da, db);
+        assertEquals(a, b);
+        db.addAttribute("hello", "world");
+        assertNotEquals(da, db);
+        assertNotEquals(a, b);
+        assertEquals(b, b);
+        assertNotNull(b.toString());
+        assertEquals(1, b.getPageMapDataObjects().size());
+        assertEquals("test", b.getPageMapDataObjects().get(0).getType());
+        assertEquals("a", b.getPageMapDataObjects().get(0).getId());
+        assertEquals("bar", b.getPageMapDataObjects().get(0).getAttribute("foo"));
+        assertEquals("world", b.getPageMapDataObjects().get(0).getAttribute("hello"));
+        assertEquals(1, b.asMap().size());
+        assertNotNull(b.asMap().get("test::a"));
+        assertEquals(2, b.asMap().get("test::a").length);
+
+        // c differs from a in both the id and the attributes
+        PageMap c = new PageMap();
+        PageMapDataObject dc = new PageMapDataObject("test", "c");
+        dc.addAttribute("abc", "xyz");
+        c.addDataObject(dc);
+        assertEquals(c, c);
+        assertNotEquals(a, c);
+        assertNotNull(c.toString());
+        assertEquals(1, c.getPageMapDataObjects().size());
+        assertEquals("test", c.getPageMapDataObjects().get(0).getType());
+        assertEquals("c", c.getPageMapDataObjects().get(0).getId());
+        assertEquals("xyz", c.getPageMapDataObjects().get(0).getAttribute("abc"));
+    }
+}
\ No newline at end of file
diff --git a/src/test/resources/sitemaps/extension/pagemaps-sitemap.xml b/src/test/resources/sitemaps/extension/pagemaps-sitemap.xml
new file mode 100644
index 0000000..5306bde
--- /dev/null
+++ b/src/test/resources/sitemaps/extension/pagemaps-sitemap.xml
@@ -0,0 +1,30 @@
+ + + + + http://www.example.com/foo + + + Doc One + 3.5 + + + http://www.example.com/foo.gif + + + + + http://www.example.com/bar + + + Doc Two + 4.0 + + + +
\ No newline at end of file