mirror of
https://github.com/crawler-commons/crawler-commons
synced 2024-05-09 23:56:04 +02:00
parent
ed1cebeff7
commit
54576e810d
|
@ -72,6 +72,8 @@ public class Namespace {
|
|||
|
||||
public static final String LINKS = "http://www.w3.org/1999/xhtml";
|
||||
|
||||
public static final String PAGEMAPS = "http://www.google.com/schemas/sitemap-pagemap/1.0";
|
||||
|
||||
/**
|
||||
* In contradiction to the protocol specification ("The Sitemap must ...
|
||||
* [s]pecify the namespace (protocol standard) within the <urlset>
|
||||
|
@ -101,6 +103,7 @@ public class Namespace {
|
|||
SITEMAP_SUPPORTED_NAMESPACES.addAll(Arrays.asList(VIDEO));
|
||||
SITEMAP_SUPPORTED_NAMESPACES.addAll(Arrays.asList(NEWS));
|
||||
SITEMAP_SUPPORTED_NAMESPACES.add(LINKS);
|
||||
SITEMAP_SUPPORTED_NAMESPACES.add(PAGEMAPS);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -120,5 +123,6 @@ public class Namespace {
|
|||
SITEMAP_EXTENSION_NAMESPACES.put(Extension.VIDEO, Arrays.asList(VIDEO));
|
||||
SITEMAP_EXTENSION_NAMESPACES.put(Extension.MOBILE, Arrays.asList(MOBILE));
|
||||
SITEMAP_EXTENSION_NAMESPACES.put(Extension.LINKS, Arrays.asList(LINKS));
|
||||
SITEMAP_EXTENSION_NAMESPACES.put(Extension.PAGEMAPS, Arrays.asList(PAGEMAPS));
|
||||
}
|
||||
}
|
||||
|
|
|
@ -44,5 +44,11 @@ public enum Extension {
|
|||
* URL as having mobile content</cite>, cf.
|
||||
* http://www.google.com/schemas/sitemap-mobile/1.0
|
||||
*/
|
||||
MOBILE
|
||||
MOBILE,
|
||||
/**
|
||||
* <cite>PageMaps is a structured data format that Google created to enable
|
||||
* website creators to embed data and notes in their webpages.</cite>, cf.
|
||||
* https://support.google.com/programmable-search/answer/1628213
|
||||
*/
|
||||
PAGEMAPS
|
||||
}
|
||||
|
|
|
@ -0,0 +1,89 @@
|
|||
/**
|
||||
* Copyright 2023 Crawler-Commons
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package crawlercommons.sitemaps.extension;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.LinkedHashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Map.Entry;
|
||||
import java.util.Objects;
|
||||
|
||||
/**
|
||||
* Data model for the <a href=
|
||||
* "https://support.google.com/programmable-search/answer/1628213">PageMaps</a>
|
||||
* extension to the sitemap protocol used for Google's Programmable Search
|
||||
* Engine.
|
||||
*
|
||||
* A PageMap holds a list of {@link PageMapDataObject}s, each PageMapDataObject
|
||||
* a map of attributes (pairs of name and value).
|
||||
*/
|
||||
@SuppressWarnings("serial")
|
||||
public class PageMap extends ExtensionMetadata {
|
||||
|
||||
private List<PageMapDataObject> dataObjects = new ArrayList<>();
|
||||
|
||||
public List<PageMapDataObject> getPageMapDataObjects() {
|
||||
return dataObjects;
|
||||
}
|
||||
|
||||
public void addDataObject(PageMapDataObject d) {
|
||||
dataObjects.add(d);
|
||||
}
|
||||
|
||||
@Override
|
||||
public Map<String, String[]> asMap() {
|
||||
Map<String, String[]> map = new LinkedHashMap<>();
|
||||
for (PageMapDataObject dobj : dataObjects) {
|
||||
for (Entry<String, String[]> e : dobj.asMap().entrySet()) {
|
||||
map.put(e.getKey(), e.getValue());
|
||||
}
|
||||
}
|
||||
return map;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
StringBuilder sb = new StringBuilder();
|
||||
sb.append("PageMap: [");
|
||||
if (!dataObjects.isEmpty()) {
|
||||
sb.append('\n');
|
||||
}
|
||||
for (PageMapDataObject dobj : dataObjects) {
|
||||
sb.append(dobj.toString()).append(",\n");
|
||||
}
|
||||
sb.append(']');
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
@Override
|
||||
public int hashCode() {
|
||||
return Objects.hash(dataObjects);
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean equals(Object obj) {
|
||||
if (this == obj)
|
||||
return true;
|
||||
if (obj == null)
|
||||
return false;
|
||||
if (getClass() != obj.getClass())
|
||||
return false;
|
||||
PageMap other = (PageMap) obj;
|
||||
return Objects.equals(dataObjects, other.dataObjects);
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,90 @@
|
|||
/**
|
||||
* Copyright 2023 Crawler-Commons
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package crawlercommons.sitemaps.extension;
|
||||
|
||||
import java.util.LinkedHashMap;
|
||||
import java.util.Locale;
|
||||
import java.util.Map;
|
||||
import java.util.Objects;
|
||||
import java.util.Map.Entry;
|
||||
|
||||
@SuppressWarnings("serial")
|
||||
public class PageMapDataObject extends ExtensionMetadata {
|
||||
private String type;
|
||||
private String id;
|
||||
private Map<String, String> attributes;
|
||||
|
||||
public PageMapDataObject(String type, String id) {
|
||||
this.type = type;
|
||||
this.id = id;
|
||||
attributes = new LinkedHashMap<>();
|
||||
}
|
||||
|
||||
public String getType() {
|
||||
return type;
|
||||
}
|
||||
|
||||
public String getId() {
|
||||
return id;
|
||||
}
|
||||
|
||||
public Map<String, String> getAttributes() {
|
||||
return attributes;
|
||||
}
|
||||
|
||||
public boolean hasAttribute(String name) {
|
||||
return attributes.containsKey(name);
|
||||
}
|
||||
|
||||
public String getAttribute(String name) {
|
||||
return attributes.get(name);
|
||||
}
|
||||
|
||||
public String addAttribute(String name, String value) {
|
||||
return attributes.put(name, value);
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean equals(Object obj) {
|
||||
if (this == obj)
|
||||
return true;
|
||||
if (obj == null)
|
||||
return false;
|
||||
if (getClass() != obj.getClass())
|
||||
return false;
|
||||
PageMapDataObject other = (PageMapDataObject) obj;
|
||||
return Objects.equals(attributes, other.attributes) && Objects.equals(id, other.id) && Objects.equals(type, other.type);
|
||||
}
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
StringBuilder sb = new StringBuilder();
|
||||
sb.append("{type = ").append(type);
|
||||
sb.append(", id = ").append(id);
|
||||
sb.append(", attributes = ").append(attributes);
|
||||
sb.append("}");
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
@Override
|
||||
public Map<String, String[]> asMap() {
|
||||
String keyFormat = "%s::%s";
|
||||
String valueFormat = "%s: %s";
|
||||
String key = String.format(Locale.ROOT, keyFormat, (getType() == null ? "" : getType()), (getId() == null ? "" : getId()));
|
||||
String[] values = getAttributes().entrySet().stream().map((Entry<String, String> e) -> String.format(Locale.ROOT, valueFormat, e.getKey(), e.getValue())).toArray(String[]::new);
|
||||
return Map.of(key, values);
|
||||
}
|
||||
}
|
|
@ -55,6 +55,8 @@ public abstract class ExtensionHandler extends DefaultHandler {
|
|||
return new LinksHandler();
|
||||
case MOBILE:
|
||||
return new MobileHandler();
|
||||
case PAGEMAPS:
|
||||
return new PageMapsHandler();
|
||||
default:
|
||||
return null;
|
||||
}
|
||||
|
|
|
@ -0,0 +1,119 @@
|
|||
/**
|
||||
* Copyright 2023 Crawler-Commons
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package crawlercommons.sitemaps.sax.extension;
|
||||
|
||||
import org.xml.sax.Attributes;
|
||||
import org.xml.sax.SAXException;
|
||||
|
||||
import crawlercommons.sitemaps.extension.PageMap;
|
||||
import crawlercommons.sitemaps.extension.PageMapDataObject;
|
||||
|
||||
/**
|
||||
* Handle SAX events in the Google's Programmable Search Engine <a href=
|
||||
* "https://support.google.com/programmable-search/answer/1628213">PageMaps</a>
|
||||
* extension namespace.
|
||||
*/
|
||||
public class PageMapsHandler extends ExtensionHandler {
|
||||
|
||||
private PageMap currPageMap;
|
||||
private PageMapDataObject currDataObj;
|
||||
private String currAttrName;
|
||||
private StringBuilder currAttrVal = new StringBuilder();
|
||||
private String currAttrValFromAttr;
|
||||
|
||||
public PageMapsHandler() {
|
||||
reset();
|
||||
}
|
||||
|
||||
@Override
|
||||
public void reset() {
|
||||
super.reset();
|
||||
resetCurrent();
|
||||
}
|
||||
|
||||
private void resetCurrent() {
|
||||
currDataObj = null;
|
||||
currAttrName = null;
|
||||
currAttrVal.setLength(0);
|
||||
currAttrValFromAttr = null;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void startElement(String uri, String localName, String qName, Attributes attributes) throws SAXException {
|
||||
switch (localName) {
|
||||
case "PageMap":
|
||||
currPageMap = new PageMap();
|
||||
break;
|
||||
case "DataObject":
|
||||
currDataObj = new PageMapDataObject(attributes.getValue("type"), attributes.getValue("id"));
|
||||
if (currPageMap == null) {
|
||||
// ignore lonesome DataObject elements
|
||||
} else {
|
||||
currPageMap.addDataObject(currDataObj);
|
||||
}
|
||||
break;
|
||||
case "Attribute":
|
||||
currAttrVal.setLength(0);
|
||||
currAttrName = attributes.getValue("name");
|
||||
if (attributes.getValue("value") != null) {
|
||||
/*
|
||||
* The PageMaps specification
|
||||
* (https://support.google.com/programmable-search/answer/
|
||||
* 1628213) describes for PageMaps embedded in HTML that the
|
||||
* attribute value is given as element attribute named "value".
|
||||
* For sitemaps it should be given as character data. However,
|
||||
* some PageMaps sitemaps in the wild also use the HTML
|
||||
* mechanism. We fall back to the HTML mechanism if there is no
|
||||
* or white space only character data.
|
||||
*/
|
||||
currAttrValFromAttr = attributes.getValue("value");
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void endElement(String uri, String localName, String qName) throws SAXException {
|
||||
switch (localName) {
|
||||
case "PageMap":
|
||||
attributes.add(currPageMap);
|
||||
break;
|
||||
case "DataObject":
|
||||
break;
|
||||
case "Attribute":
|
||||
String currAttrValStr = currAttrVal.toString().trim();
|
||||
if (currDataObj == null) {
|
||||
// ignore lonesome attributes
|
||||
} else if (currAttrValStr.isEmpty() && currAttrValFromAttr != null) {
|
||||
/*
|
||||
* If there is no or white space only character data, fall back
|
||||
* to the HTML mechanism and use the content of the attribute
|
||||
* "value".
|
||||
*/
|
||||
currDataObj.addAttribute(currAttrName, currAttrValFromAttr);
|
||||
} else {
|
||||
currDataObj.addAttribute(currAttrName, currAttrValStr);
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void characters(char[] ch, int start, int length) throws SAXException {
|
||||
currAttrVal.append(String.valueOf(ch, start, length));
|
||||
}
|
||||
|
||||
}
|
|
@ -16,17 +16,32 @@
|
|||
|
||||
package crawlercommons.sitemaps;
|
||||
|
||||
import crawlercommons.sitemaps.extension.*;
|
||||
import org.junit.jupiter.api.Test;
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
import static org.junit.jupiter.api.Assertions.assertNotNull;
|
||||
import static org.junit.jupiter.api.Assertions.assertNull;
|
||||
import static org.junit.jupiter.api.Assertions.assertTrue;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.net.MalformedURLException;
|
||||
import java.net.URL;
|
||||
import java.time.ZonedDateTime;
|
||||
import java.util.Arrays;
|
||||
import java.util.HashMap;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
import java.util.Map.Entry;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.*;
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
import crawlercommons.sitemaps.extension.Extension;
|
||||
import crawlercommons.sitemaps.extension.ExtensionMetadata;
|
||||
import crawlercommons.sitemaps.extension.ImageAttributes;
|
||||
import crawlercommons.sitemaps.extension.LinkAttributes;
|
||||
import crawlercommons.sitemaps.extension.MobileAttributes;
|
||||
import crawlercommons.sitemaps.extension.NewsAttributes;
|
||||
import crawlercommons.sitemaps.extension.PageMap;
|
||||
import crawlercommons.sitemaps.extension.PageMapDataObject;
|
||||
import crawlercommons.sitemaps.extension.VideoAttributes;
|
||||
|
||||
public class SiteMapParserExtensionTest {
|
||||
|
||||
|
@ -257,4 +272,56 @@ public class SiteMapParserExtensionTest {
|
|||
SiteMap sm = (SiteMap) asm;
|
||||
assertEquals(74, sm.getSiteMapUrls().size());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testPageMapSitemap() throws UnknownFormatException, IOException {
|
||||
SiteMapParser parser = new SiteMapParser();
|
||||
parser.setStrictNamespace(true);
|
||||
parser.enableExtension(Extension.PAGEMAPS);
|
||||
|
||||
String urlStr = "http://www.example.com/pagemaps-sitemap.xml";
|
||||
URL url = new URL(urlStr);
|
||||
AbstractSiteMap asm = parse(parser, "src/test/resources/sitemaps/extension/pagemaps-sitemap.xml", url);
|
||||
|
||||
assertEquals(false, asm.isIndex());
|
||||
assertEquals(true, asm instanceof SiteMap);
|
||||
SiteMap sm = (SiteMap) asm;
|
||||
assertEquals(2, sm.getSiteMapUrls().size());
|
||||
assertEquals(urlStr, sm.getUrl().toString());
|
||||
// System.out.println(sm.toString());
|
||||
for (SiteMapURL u : sm.getSiteMapUrls()) {
|
||||
System.out.println(u.toString());
|
||||
for (Entry<Extension, ExtensionMetadata[]> x : u.getAttributes().entrySet()) {
|
||||
assertEquals(Extension.PAGEMAPS, x.getKey());
|
||||
System.out.println(x.getValue().getClass());
|
||||
PageMap pageMap = (PageMap) x.getValue()[0];
|
||||
List<PageMapDataObject> dataObjects = pageMap.getPageMapDataObjects();
|
||||
PageMapDataObject dataObject;
|
||||
switch (u.getUrl().toString()) {
|
||||
case "http://www.example.com/foo":
|
||||
assertEquals(2, dataObjects.size());
|
||||
dataObject = dataObjects.get(0);
|
||||
assertEquals("document", dataObject.getType());
|
||||
assertEquals("one", dataObject.getId());
|
||||
assertEquals("Doc One", dataObject.getAttribute("name"));
|
||||
assertEquals("3.5", dataObject.getAttribute("review"));
|
||||
dataObject = dataObjects.get(1);
|
||||
assertEquals("image", dataObject.getType());
|
||||
assertNull(dataObject.getId());
|
||||
assertEquals("http://www.example.com/foo.gif", dataObject.getAttribute("image_src"));
|
||||
break;
|
||||
case "http://www.example.com/bar":
|
||||
assertEquals(1, dataObjects.size());
|
||||
dataObject = dataObjects.get(0);
|
||||
assertEquals("document", dataObject.getType());
|
||||
assertEquals("two", dataObject.getId());
|
||||
assertEquals("Doc Two", dataObject.getAttribute("name"));
|
||||
assertEquals("4.0", dataObject.getAttribute("review"));
|
||||
break;
|
||||
}
|
||||
System.out.println(x.getKey() + ": " + Arrays.toString(x.getValue()));
|
||||
System.out.println(x.getValue().length);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -0,0 +1,71 @@
|
|||
/**
|
||||
* Copyright 2023 Crawler-Commons
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package crawlercommons.sitemaps.extension;
|
||||
|
||||
import static org.junit.jupiter.api.Assertions.assertEquals;
|
||||
import static org.junit.jupiter.api.Assertions.assertNotEquals;
|
||||
import static org.junit.jupiter.api.Assertions.assertNotNull;
|
||||
|
||||
import org.junit.jupiter.api.Test;
|
||||
|
||||
public class PageMapsTest {
|
||||
|
||||
@Test
|
||||
public void testPageMapAttributesEquals() {
|
||||
PageMap a = new PageMap();
|
||||
PageMapDataObject da = new PageMapDataObject("test", "a");
|
||||
da.addAttribute("foo", "bar");
|
||||
a.addDataObject(da);
|
||||
assertEquals(a, a);
|
||||
assertNotNull(a.toString());
|
||||
assertEquals(1, a.getPageMapDataObjects().size());
|
||||
assertEquals("test", a.getPageMapDataObjects().get(0).getType());
|
||||
assertEquals("a", a.getPageMapDataObjects().get(0).getId());
|
||||
assertEquals("bar", a.getPageMapDataObjects().get(0).getAttribute("foo"));
|
||||
|
||||
PageMap b = new PageMap();
|
||||
PageMapDataObject db = new PageMapDataObject("test", "a");
|
||||
db.addAttribute("foo", "bar");
|
||||
b.addDataObject(db);
|
||||
assertEquals(da, db);
|
||||
assertEquals(a, b);
|
||||
db.addAttribute("hello", "world");
|
||||
assertNotEquals(da, db);
|
||||
assertNotEquals(a, b);
|
||||
assertEquals(b, b);
|
||||
assertNotNull(a.toString());
|
||||
assertEquals(1, b.getPageMapDataObjects().size());
|
||||
assertEquals("test", b.getPageMapDataObjects().get(0).getType());
|
||||
assertEquals("a", b.getPageMapDataObjects().get(0).getId());
|
||||
assertEquals("bar", b.getPageMapDataObjects().get(0).getAttribute("foo"));
|
||||
assertEquals("world", b.getPageMapDataObjects().get(0).getAttribute("hello"));
|
||||
assertEquals(1, b.asMap().size());
|
||||
assertNotNull(b.asMap().get("test::a"));
|
||||
assertEquals(2, b.asMap().get("test::a").length);
|
||||
|
||||
PageMap c = new PageMap();
|
||||
PageMapDataObject dc = new PageMapDataObject("test", "c");
|
||||
dc.addAttribute("abc", "xyz");
|
||||
c.addDataObject(dc);
|
||||
assertEquals(c, c);
|
||||
assertNotEquals(a, c);
|
||||
assertNotNull(a.toString());
|
||||
assertEquals(1, c.getPageMapDataObjects().size());
|
||||
assertEquals("test", c.getPageMapDataObjects().get(0).getType());
|
||||
assertEquals("c", c.getPageMapDataObjects().get(0).getId());
|
||||
assertEquals("xyz", c.getPageMapDataObjects().get(0).getAttribute("abc"));
|
||||
}
|
||||
}
|
|
@ -0,0 +1,30 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"
|
||||
xmlns:content="http://www.google.com/schemas/sitemap-content/1.0">
|
||||
<!-- source: Google's Programmable Search Engine documentation - PageMaps
|
||||
https://support.google.com/programmable-search/answer/1628213
|
||||
and Providing Structured Data
|
||||
https://developers.google.com/custom-search/docs/structured_data
|
||||
-->
|
||||
<url>
|
||||
<loc>http://www.example.com/foo</loc>
|
||||
<PageMap xmlns="http://www.google.com/schemas/sitemap-pagemap/1.0">
|
||||
<DataObject type="document" id="one">
|
||||
<Attribute name="name">Doc One</Attribute>
|
||||
<Attribute name="review">3.5</Attribute>
|
||||
</DataObject>
|
||||
<DataObject type="image">
|
||||
<Attribute name="image_src">http://www.example.com/foo.gif</Attribute>
|
||||
</DataObject>
|
||||
</PageMap>
|
||||
</url>
|
||||
<url>
|
||||
<loc>http://www.example.com/bar</loc>
|
||||
<PageMap xmlns="http://www.google.com/schemas/sitemap-pagemap/1.0">
|
||||
<DataObject type="document" id="two">
|
||||
<Attribute name="name">Doc Two</Attribute>
|
||||
<Attribute name="review">4.0</Attribute>
|
||||
</DataObject>
|
||||
</PageMap>
|
||||
</url>
|
||||
</urlset>
|
Loading…
Reference in New Issue