1
0
Fork 0
mirror of https://github.com/crawler-commons/crawler-commons synced 2024-04-27 17:25:03 +02:00

[Sitemaps] Google Sitemap PageMap extensions, implements #388 (#442)

This commit is contained in:
Sebastian Nagel 2023-10-28 17:09:45 +02:00 committed by GitHub
parent ed1cebeff7
commit 54576e810d
Signed by: GitHub
GPG Key ID: 4AEE18F83AFDEB23
9 changed files with 482 additions and 4 deletions

View File

@ -72,6 +72,8 @@ public class Namespace {
public static final String LINKS = "http://www.w3.org/1999/xhtml";
public static final String PAGEMAPS = "http://www.google.com/schemas/sitemap-pagemap/1.0";
/**
* In contradiction to the protocol specification ("The Sitemap must ...
* [s]pecify the namespace (protocol standard) within the <urlset>
@ -101,6 +103,7 @@ public class Namespace {
SITEMAP_SUPPORTED_NAMESPACES.addAll(Arrays.asList(VIDEO));
SITEMAP_SUPPORTED_NAMESPACES.addAll(Arrays.asList(NEWS));
SITEMAP_SUPPORTED_NAMESPACES.add(LINKS);
SITEMAP_SUPPORTED_NAMESPACES.add(PAGEMAPS);
}
/**
@ -120,5 +123,6 @@ public class Namespace {
SITEMAP_EXTENSION_NAMESPACES.put(Extension.VIDEO, Arrays.asList(VIDEO));
SITEMAP_EXTENSION_NAMESPACES.put(Extension.MOBILE, Arrays.asList(MOBILE));
SITEMAP_EXTENSION_NAMESPACES.put(Extension.LINKS, Arrays.asList(LINKS));
SITEMAP_EXTENSION_NAMESPACES.put(Extension.PAGEMAPS, Arrays.asList(PAGEMAPS));
}
}

View File

@ -44,5 +44,11 @@ public enum Extension {
* URL as having mobile content</cite>, cf.
* http://www.google.com/schemas/sitemap-mobile/1.0
*/
MOBILE
MOBILE,
/**
* <cite>PageMaps is a structured data format that Google created to enable
* website creators to embed data and notes in their webpages.</cite>, cf.
* https://support.google.com/programmable-search/answer/1628213
*/
PAGEMAPS
}

View File

@ -0,0 +1,89 @@
/**
* Copyright 2023 Crawler-Commons
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package crawlercommons.sitemaps.extension;
import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Objects;
/**
 * Data model for the <a href=
 * "https://support.google.com/programmable-search/answer/1628213">PageMaps</a>
 * extension to the sitemap protocol used for Google's Programmable Search
 * Engine.
 *
 * <p>
 * A PageMap holds a list of {@link PageMapDataObject}s, each PageMapDataObject
 * a map of attributes (pairs of name and value).
 * </p>
 */
@SuppressWarnings("serial")
public class PageMap extends ExtensionMetadata {

    private List<PageMapDataObject> dataObjects = new ArrayList<>();

    /**
     * @return the data objects of this PageMap in the order they were added.
     *         The returned list is the internal list, not a copy.
     */
    public List<PageMapDataObject> getPageMapDataObjects() {
        return dataObjects;
    }

    /**
     * Append a data object to this PageMap.
     *
     * @param d
     *            the data object to add
     */
    public void addDataObject(PageMapDataObject d) {
        dataObjects.add(d);
    }

    /**
     * Flatten all data objects into a single map, see
     * {@link PageMapDataObject#asMap()} for the format of keys and values.
     *
     * <p>
     * Data objects are not required to have a unique combination of type and
     * id, so several of them may map to the same key. In that case the values
     * are concatenated (in the order of the data objects) instead of letting
     * the last data object overwrite the earlier ones.
     * </p>
     *
     * @return map of data object keys to formatted attribute values
     */
    @Override
    public Map<String, String[]> asMap() {
        Map<String, String[]> map = new LinkedHashMap<>();
        for (PageMapDataObject dobj : dataObjects) {
            for (Entry<String, String[]> e : dobj.asMap().entrySet()) {
                map.merge(e.getKey(), e.getValue(), PageMap::concat);
            }
        }
        return map;
    }

    /** Return a new array holding the elements of both arrays, in order. */
    private static String[] concat(String[] first, String[] second) {
        String[] joined = new String[first.length + second.length];
        System.arraycopy(first, 0, joined, 0, first.length);
        System.arraycopy(second, 0, joined, first.length, second.length);
        return joined;
    }

    @Override
    public String toString() {
        StringBuilder sb = new StringBuilder();
        sb.append("PageMap: [");
        if (!dataObjects.isEmpty()) {
            sb.append('\n');
        }
        for (PageMapDataObject dobj : dataObjects) {
            sb.append(dobj.toString()).append(",\n");
        }
        sb.append(']');
        return sb.toString();
    }

    @Override
    public int hashCode() {
        return Objects.hash(dataObjects);
    }

    @Override
    public boolean equals(Object obj) {
        if (this == obj)
            return true;
        if (obj == null)
            return false;
        if (getClass() != obj.getClass())
            return false;
        PageMap other = (PageMap) obj;
        return Objects.equals(dataObjects, other.dataObjects);
    }
}

View File

@ -0,0 +1,90 @@
/**
* Copyright 2023 Crawler-Commons
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package crawlercommons.sitemaps.extension;
import java.util.LinkedHashMap;
import java.util.Locale;
import java.util.Map;
import java.util.Objects;
import java.util.Map.Entry;
/**
 * A single data object of a {@link PageMap}: an optional type, an optional id
 * and a map of attributes (name/value pairs) kept in insertion order.
 */
@SuppressWarnings("serial")
public class PageMapDataObject extends ExtensionMetadata {

    private final String type;
    private final String id;
    private final Map<String, String> attributes;

    /**
     * @param type
     *            type of the data object, may be null
     * @param id
     *            id of the data object, may be null
     */
    public PageMapDataObject(String type, String id) {
        this.type = type;
        this.id = id;
        attributes = new LinkedHashMap<>();
    }

    /** @return the type of the data object, or null if none was given */
    public String getType() {
        return type;
    }

    /** @return the id of the data object, or null if none was given */
    public String getId() {
        return id;
    }

    /**
     * @return the attributes in insertion order. The returned map is the
     *         internal map, not a copy.
     */
    public Map<String, String> getAttributes() {
        return attributes;
    }

    /**
     * @param name
     *            attribute name
     * @return true if an attribute with this name has been added
     */
    public boolean hasAttribute(String name) {
        return attributes.containsKey(name);
    }

    /**
     * @param name
     *            attribute name
     * @return the value of the attribute, or null if there is no such
     *         attribute
     */
    public String getAttribute(String name) {
        return attributes.get(name);
    }

    /**
     * Add an attribute, replacing an existing attribute of the same name.
     *
     * @param name
     *            attribute name
     * @param value
     *            attribute value
     * @return the value previously associated with the name, or null
     */
    public String addAttribute(String name, String value) {
        return attributes.put(name, value);
    }

    /**
     * Hash code consistent with {@link #equals(Object)}. Because attributes
     * can be added after construction, the hash code changes when the object
     * is modified.
     */
    @Override
    public int hashCode() {
        return Objects.hash(type, id, attributes);
    }

    @Override
    public boolean equals(Object obj) {
        if (this == obj)
            return true;
        if (obj == null)
            return false;
        if (getClass() != obj.getClass())
            return false;
        PageMapDataObject other = (PageMapDataObject) obj;
        return Objects.equals(attributes, other.attributes) && Objects.equals(id, other.id) && Objects.equals(type, other.type);
    }

    @Override
    public String toString() {
        StringBuilder sb = new StringBuilder();
        sb.append("{type = ").append(type);
        sb.append(", id = ").append(id);
        sb.append(", attributes = ").append(attributes);
        sb.append("}");
        return sb.toString();
    }

    /**
     * Represent the data object as a single-entry, unmodifiable map.
     *
     * @return map with the key <code>type::id</code> (a null type or id is
     *         written as empty string) and one value per attribute formatted
     *         as <code>name: value</code>
     */
    @Override
    public Map<String, String[]> asMap() {
        String keyFormat = "%s::%s";
        String valueFormat = "%s: %s";
        String key = String.format(Locale.ROOT, keyFormat, (getType() == null ? "" : getType()), (getId() == null ? "" : getId()));
        String[] values = getAttributes().entrySet().stream().map((Entry<String, String> e) -> String.format(Locale.ROOT, valueFormat, e.getKey(), e.getValue())).toArray(String[]::new);
        return Map.of(key, values);
    }
}

View File

@ -55,6 +55,8 @@ public abstract class ExtensionHandler extends DefaultHandler {
return new LinksHandler();
case MOBILE:
return new MobileHandler();
case PAGEMAPS:
return new PageMapsHandler();
default:
return null;
}

View File

@ -0,0 +1,119 @@
/**
* Copyright 2023 Crawler-Commons
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package crawlercommons.sitemaps.sax.extension;
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import crawlercommons.sitemaps.extension.PageMap;
import crawlercommons.sitemaps.extension.PageMapDataObject;
/**
 * Handle SAX events in the Google's Programmable Search Engine <a href=
 * "https://support.google.com/programmable-search/answer/1628213">PageMaps</a>
 * extension namespace.
 *
 * <p>
 * The handler keeps track of the currently open <code>PageMap</code>,
 * <code>DataObject</code> and <code>Attribute</code> elements. The state of an
 * element is dropped as soon as the element is closed, so that stray
 * (lonesome) child elements are never attached to an element which has
 * already been closed.
 * </p>
 */
public class PageMapsHandler extends ExtensionHandler {

    /** PageMap currently open, null outside of a PageMap element */
    private PageMap currPageMap;
    /** DataObject currently open, null outside of a DataObject element */
    private PageMapDataObject currDataObj;
    /** content of the attribute "name" of the current Attribute element */
    private String currAttrName;
    /** character data collected for the current Attribute element */
    private StringBuilder currAttrVal = new StringBuilder();
    /** content of the attribute "value" of the current Attribute element */
    private String currAttrValFromAttr;

    public PageMapsHandler() {
        reset();
    }

    @Override
    public void reset() {
        super.reset();
        resetCurrent();
    }

    /** Forget all elements currently being parsed. */
    private void resetCurrent() {
        currPageMap = null;
        currDataObj = null;
        resetCurrentAttribute();
    }

    /** Forget name and value of the Attribute element currently being parsed. */
    private void resetCurrentAttribute() {
        currAttrName = null;
        currAttrVal.setLength(0);
        currAttrValFromAttr = null;
    }

    @Override
    public void startElement(String uri, String localName, String qName, Attributes attributes) throws SAXException {
        switch (localName) {
        case "PageMap":
            currPageMap = new PageMap();
            currDataObj = null;
            break;
        case "DataObject":
            if (currPageMap == null) {
                // ignore lonesome DataObject elements (and their attributes)
                currDataObj = null;
            } else {
                currDataObj = new PageMapDataObject(attributes.getValue("type"), attributes.getValue("id"));
                currPageMap.addDataObject(currDataObj);
            }
            break;
        case "Attribute":
            currAttrVal.setLength(0);
            currAttrName = attributes.getValue("name");
            /*
             * The PageMaps specification
             * (https://support.google.com/programmable-search/answer/
             * 1628213) describes for PageMaps embedded in HTML that the
             * attribute value is given as element attribute named "value".
             * For sitemaps it should be given as character data. However,
             * some PageMaps sitemaps in the wild also use the HTML
             * mechanism. We fall back to the HTML mechanism if there is no
             * or white space only character data.
             *
             * The field is assigned unconditionally (null if there is no
             * attribute "value"), otherwise the value of a preceding
             * Attribute element would be reused.
             */
            currAttrValFromAttr = attributes.getValue("value");
            break;
        default:
            break;
        }
    }

    @Override
    public void endElement(String uri, String localName, String qName) throws SAXException {
        switch (localName) {
        case "PageMap":
            if (currPageMap != null) {
                attributes.add(currPageMap);
            }
            currPageMap = null;
            currDataObj = null;
            break;
        case "DataObject":
            // attributes following the closing tag do not belong to this object
            currDataObj = null;
            break;
        case "Attribute":
            if (currDataObj != null) {
                String currAttrValStr = currAttrVal.toString().trim();
                if (currAttrValStr.isEmpty() && currAttrValFromAttr != null) {
                    /*
                     * If there is no or white space only character data, fall
                     * back to the HTML mechanism and use the content of the
                     * attribute "value".
                     */
                    currDataObj.addAttribute(currAttrName, currAttrValFromAttr);
                } else {
                    currDataObj.addAttribute(currAttrName, currAttrValStr);
                }
            } // else: ignore lonesome attributes
            resetCurrentAttribute();
            break;
        default:
            break;
        }
    }

    @Override
    public void characters(char[] ch, int start, int length) throws SAXException {
        // append directly, no need to create an intermediate String
        currAttrVal.append(ch, start, length);
    }
}

View File

@ -16,17 +16,32 @@
package crawlercommons.sitemaps;
import crawlercommons.sitemaps.extension.*;
import org.junit.jupiter.api.Test;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertNotNull;
import static org.junit.jupiter.api.Assertions.assertNull;
import static org.junit.jupiter.api.Assertions.assertTrue;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.time.ZonedDateTime;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map.Entry;
import static org.junit.jupiter.api.Assertions.*;
import org.junit.jupiter.api.Test;
import crawlercommons.sitemaps.extension.Extension;
import crawlercommons.sitemaps.extension.ExtensionMetadata;
import crawlercommons.sitemaps.extension.ImageAttributes;
import crawlercommons.sitemaps.extension.LinkAttributes;
import crawlercommons.sitemaps.extension.MobileAttributes;
import crawlercommons.sitemaps.extension.NewsAttributes;
import crawlercommons.sitemaps.extension.PageMap;
import crawlercommons.sitemaps.extension.PageMapDataObject;
import crawlercommons.sitemaps.extension.VideoAttributes;
public class SiteMapParserExtensionTest {
@ -257,4 +272,56 @@ public class SiteMapParserExtensionTest {
SiteMap sm = (SiteMap) asm;
assertEquals(74, sm.getSiteMapUrls().size());
}
/**
 * Parse a sitemap using the PageMaps extension and verify the data objects
 * and attributes of every URL. The test counts the verified URLs so that it
 * fails (instead of passing vacuously) if an URL has no PageMap attached or
 * its location does not match one of the expected URLs.
 */
@Test
public void testPageMapSitemap() throws UnknownFormatException, IOException {
    SiteMapParser parser = new SiteMapParser();
    parser.setStrictNamespace(true);
    parser.enableExtension(Extension.PAGEMAPS);
    String urlStr = "http://www.example.com/pagemaps-sitemap.xml";
    URL url = new URL(urlStr);
    AbstractSiteMap asm = parse(parser, "src/test/resources/sitemaps/extension/pagemaps-sitemap.xml", url);
    assertEquals(false, asm.isIndex());
    assertTrue(asm instanceof SiteMap);
    SiteMap sm = (SiteMap) asm;
    assertEquals(2, sm.getSiteMapUrls().size());
    assertEquals(urlStr, sm.getUrl().toString());

    int verifiedUrls = 0;
    for (SiteMapURL u : sm.getSiteMapUrls()) {
        for (Entry<Extension, ExtensionMetadata[]> x : u.getAttributes().entrySet()) {
            assertEquals(Extension.PAGEMAPS, x.getKey());
            // every URL in the test sitemap holds exactly one PageMap
            assertEquals(1, x.getValue().length);
            PageMap pageMap = (PageMap) x.getValue()[0];
            List<PageMapDataObject> dataObjects = pageMap.getPageMapDataObjects();
            PageMapDataObject dataObject;
            switch (u.getUrl().toString()) {
            case "http://www.example.com/foo":
                assertEquals(2, dataObjects.size());
                dataObject = dataObjects.get(0);
                assertEquals("document", dataObject.getType());
                assertEquals("one", dataObject.getId());
                assertEquals("Doc One", dataObject.getAttribute("name"));
                assertEquals("3.5", dataObject.getAttribute("review"));
                dataObject = dataObjects.get(1);
                assertEquals("image", dataObject.getType());
                assertNull(dataObject.getId());
                assertEquals("http://www.example.com/foo.gif", dataObject.getAttribute("image_src"));
                verifiedUrls++;
                break;
            case "http://www.example.com/bar":
                assertEquals(1, dataObjects.size());
                dataObject = dataObjects.get(0);
                assertEquals("document", dataObject.getType());
                assertEquals("two", dataObject.getId());
                assertEquals("Doc Two", dataObject.getAttribute("name"));
                assertEquals("4.0", dataObject.getAttribute("review"));
                verifiedUrls++;
                break;
            }
        }
    }
    // both URLs must have been matched and verified
    assertEquals(2, verifiedUrls);
}
}

View File

@ -0,0 +1,71 @@
/**
* Copyright 2023 Crawler-Commons
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package crawlercommons.sitemaps.extension;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertNotEquals;
import static org.junit.jupiter.api.Assertions.assertNotNull;
import org.junit.jupiter.api.Test;
/**
 * Unit tests for the data model classes {@link PageMap} and
 * {@link PageMapDataObject}: accessors, equality and the map representation.
 */
public class PageMapsTest {

    @Test
    public void testPageMapAttributesEquals() {
        // PageMap with a single data object holding a single attribute
        PageMap a = new PageMap();
        PageMapDataObject da = new PageMapDataObject("test", "a");
        da.addAttribute("foo", "bar");
        a.addDataObject(da);
        assertEquals(a, a);
        assertNotNull(a.toString());
        assertEquals(1, a.getPageMapDataObjects().size());
        assertEquals("test", a.getPageMapDataObjects().get(0).getType());
        assertEquals("a", a.getPageMapDataObjects().get(0).getId());
        assertEquals("bar", a.getPageMapDataObjects().get(0).getAttribute("foo"));

        // same content: must be equal, until another attribute is added
        PageMap b = new PageMap();
        PageMapDataObject db = new PageMapDataObject("test", "a");
        db.addAttribute("foo", "bar");
        b.addDataObject(db);
        assertEquals(da, db);
        assertEquals(a, b);
        db.addAttribute("hello", "world");
        assertNotEquals(da, db);
        assertNotEquals(a, b);
        assertEquals(b, b);
        assertNotNull(b.toString());
        assertEquals(1, b.getPageMapDataObjects().size());
        assertEquals("test", b.getPageMapDataObjects().get(0).getType());
        assertEquals("a", b.getPageMapDataObjects().get(0).getId());
        assertEquals("bar", b.getPageMapDataObjects().get(0).getAttribute("foo"));
        assertEquals("world", b.getPageMapDataObjects().get(0).getAttribute("hello"));

        // map representation: key "type::id", values "name: value"
        assertEquals(1, b.asMap().size());
        assertNotNull(b.asMap().get("test::a"));
        assertEquals(2, b.asMap().get("test::a").length);
        assertEquals("foo: bar", b.asMap().get("test::a")[0]);
        assertEquals("hello: world", b.asMap().get("test::a")[1]);

        // different id and attributes
        PageMap c = new PageMap();
        PageMapDataObject dc = new PageMapDataObject("test", "c");
        dc.addAttribute("abc", "xyz");
        c.addDataObject(dc);
        assertEquals(c, c);
        assertNotEquals(a, c);
        assertNotNull(c.toString());
        assertEquals(1, c.getPageMapDataObjects().size());
        assertEquals("test", c.getPageMapDataObjects().get(0).getType());
        assertEquals("c", c.getPageMapDataObjects().get(0).getId());
        assertEquals("xyz", c.getPageMapDataObjects().get(0).getAttribute("abc"));
    }
}

View File

@ -0,0 +1,30 @@
<?xml version="1.0" encoding="UTF-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"
xmlns:content="http://www.google.com/schemas/sitemap-content/1.0">
<!-- source: Google's Programmable Search Engine documentation - PageMaps
https://support.google.com/programmable-search/answer/1628213
and Providing Structured Data
https://developers.google.com/custom-search/docs/structured_data
-->
<url>
<loc>http://www.example.com/foo</loc>
<PageMap xmlns="http://www.google.com/schemas/sitemap-pagemap/1.0">
<DataObject type="document" id="one">
<Attribute name="name">Doc One</Attribute>
<Attribute name="review">3.5</Attribute>
</DataObject>
<DataObject type="image">
<Attribute name="image_src">http://www.example.com/foo.gif</Attribute>
</DataObject>
</PageMap>
</url>
<url>
<loc>http://www.example.com/bar</loc>
<PageMap xmlns="http://www.google.com/schemas/sitemap-pagemap/1.0">
<DataObject type="document" id="two">
<Attribute name="name">Doc Two</Attribute>
<Attribute name="review">4.0</Attribute>
</DataObject>
</PageMap>
</url>
</urlset>