mirror of
https://github.com/crawler-commons/crawler-commons
synced 2024-05-08 15:36:04 +02:00
Upgrade to JDK 1.8
This commit is contained in:
parent
fc3378cb95
commit
18bbae908c
|
@ -1,7 +1,9 @@
|
|||
language: java
|
||||
|
||||
jdk:
|
||||
- oraclejdk8
|
||||
|
||||
script:
|
||||
- jdk_switcher use oraclejdk8
|
||||
- mvn install javadoc:aggregate
|
||||
|
||||
notifications:
|
||||
|
|
|
@ -108,9 +108,11 @@ public class EffectiveTldFinder {
|
|||
}
|
||||
|
||||
/**
|
||||
* @param hostname the hostname for which to find the
|
||||
* {@link crawlercommons.domains.EffectiveTldFinder.EffectiveTLD}
|
||||
* @return the {@link crawlercommons.domains.EffectiveTldFinder.EffectiveTLD}
|
||||
* @param hostname
|
||||
* the hostname for which to find the
|
||||
* {@link crawlercommons.domains.EffectiveTldFinder.EffectiveTLD}
|
||||
* @return the
|
||||
* {@link crawlercommons.domains.EffectiveTldFinder.EffectiveTLD}
|
||||
*/
|
||||
public static EffectiveTLD getEffectiveTLD(String hostname) {
|
||||
if (getInstance().domains.containsKey(hostname)) {
|
||||
|
@ -145,7 +147,8 @@ public class EffectiveTldFinder {
|
|||
* This method uses the effective TLD to determine which component of a FQDN
|
||||
* is the NIC-assigned domain name.
|
||||
*
|
||||
* @param hostname a string for which to obtain a NIC-assigned domain name
|
||||
* @param hostname
|
||||
* a string for which to obtain a NIC-assigned domain name
|
||||
* @return the NIC-assigned domain name
|
||||
*/
|
||||
public static String getAssignedDomain(String hostname) {
|
||||
|
|
|
@ -54,9 +54,12 @@ public class UserAgent implements Serializable {
|
|||
/**
|
||||
* Set user agent characteristics
|
||||
*
|
||||
* @param agentName an agent name string to associate with the crawler
|
||||
* @param emailAddress an agent email address string to associate with the crawler
|
||||
* @param webAddress a Web address string to associate with the crawler
|
||||
* @param agentName
|
||||
* an agent name string to associate with the crawler
|
||||
* @param emailAddress
|
||||
* an agent email address string to associate with the crawler
|
||||
* @param webAddress
|
||||
* a Web address string to associate with the crawler
|
||||
*/
|
||||
public UserAgent(String agentName, String emailAddress, String webAddress) {
|
||||
this(agentName, emailAddress, webAddress, DEFAULT_BROWSER_VERSION);
|
||||
|
@ -65,10 +68,14 @@ public class UserAgent implements Serializable {
|
|||
/**
|
||||
* Set user agent characteristics
|
||||
*
|
||||
* @param agentName an agent name string to associate with the crawler
|
||||
* @param emailAddress an agent email address string to associate with the crawler
|
||||
* @param webAddress a Web address string to associate with the crawler
|
||||
* @param browserVersion a browser version to mimic
|
||||
* @param agentName
|
||||
* an agent name string to associate with the crawler
|
||||
* @param emailAddress
|
||||
* an agent email address string to associate with the crawler
|
||||
* @param webAddress
|
||||
* a Web address string to associate with the crawler
|
||||
* @param browserVersion
|
||||
* a browser version to mimic
|
||||
*/
|
||||
public UserAgent(String agentName, String emailAddress, String webAddress, String browserVersion) {
|
||||
this(agentName, emailAddress, webAddress, browserVersion, DEFAULT_CRAWLER_VERSION);
|
||||
|
@ -77,11 +84,16 @@ public class UserAgent implements Serializable {
|
|||
/**
|
||||
* Set user agent characteristics
|
||||
*
|
||||
* @param agentName an agent name string to associate with the crawler
|
||||
* @param emailAddress an agent email address string to associate with the crawler
|
||||
* @param webAddress a Web address string to associate with the crawler
|
||||
* @param browserVersion a browser version to mimic
|
||||
* @param crawlerVersion the version of your crawler/crawl agent
|
||||
* @param agentName
|
||||
* an agent name string to associate with the crawler
|
||||
* @param emailAddress
|
||||
* an agent email address string to associate with the crawler
|
||||
* @param webAddress
|
||||
* a Web address string to associate with the crawler
|
||||
* @param browserVersion
|
||||
* a browser version to mimic
|
||||
* @param crawlerVersion
|
||||
* the version of your crawler/crawl agent
|
||||
*/
|
||||
public UserAgent(String agentName, String emailAddress, String webAddress, String browserVersion, String crawlerVersion) {
|
||||
this.agentName = agentName;
|
||||
|
@ -106,7 +118,8 @@ public class UserAgent implements Serializable {
|
|||
* @return User Agent String
|
||||
*/
|
||||
public String getUserAgentString() {
|
||||
// Mozilla/5.0 (compatible; mycrawler/1.0; +http://www.mydomain.com; mycrawler@mydomain.com)
|
||||
// Mozilla/5.0 (compatible; mycrawler/1.0; +http://www.mydomain.com;
|
||||
// mycrawler@mydomain.com)
|
||||
return String.format(Locale.getDefault(), "%s (compatible; %s%s; +%s; %s)", browserVersion, getAgentName(), crawlerConfiguration, webAddress, emailAddress);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -21,7 +21,9 @@ public abstract class URLFilter {
|
|||
/**
|
||||
* Returns a modified version of the input URL or null if the URL should be
|
||||
* removed
|
||||
* @param urlString a URL string to check against filter(s)
|
||||
*
|
||||
* @param urlString
|
||||
* a URL string to check against filter(s)
|
||||
* @return a filtered URL
|
||||
**/
|
||||
public abstract String filter(String urlString);
|
||||
|
|
|
@ -22,11 +22,11 @@ import java.io.Serializable;
|
|||
public abstract class BaseRobotsParser implements Serializable {
|
||||
|
||||
/**
|
||||
* Parse the robots.txt file in <i>content</i>, and return rules appropriate for
|
||||
* processing paths by <i>userAgent</i>. Note that multiple agent names may be
|
||||
* provided as comma-separated values; the order of these shouldn't matter,
|
||||
* as the file is parsed in order, and each agent name found in the file
|
||||
* will be compared to every agent name found in robotNames.
|
||||
* Parse the robots.txt file in <i>content</i>, and return rules appropriate
|
||||
* for processing paths by <i>userAgent</i>. Note that multiple agent names
|
||||
* may be provided as comma-separated values; the order of these shouldn't
|
||||
* matter, as the file is parsed in order, and each agent name found in the
|
||||
* file will be compared to every agent name found in robotNames.
|
||||
*
|
||||
* Also note that names are lower-cased before comparison, and that any
|
||||
* robot name you pass shouldn't contain commas or spaces; if the name has
|
||||
|
|
|
@ -86,7 +86,8 @@ public class RobotUtils {
|
|||
* @param fetcher
|
||||
* Fetcher for downloading robots.txt file
|
||||
* @param parser
|
||||
* a {@link crawlercommons.robots.BaseRobotsParser} to use for obtaining appropriate rules
|
||||
* a {@link crawlercommons.robots.BaseRobotsParser} to use for
|
||||
* obtaining appropriate rules
|
||||
* @param robotsUrl
|
||||
* URL to robots.txt file
|
||||
* @return Robot rules
|
||||
|
|
|
@ -65,7 +65,7 @@ public class SimpleRobotRules extends BaseRobotRules {
|
|||
|
||||
/*
|
||||
* (non-Javadoc)
|
||||
*
|
||||
*
|
||||
* @see java.lang.Object#hashCode()
|
||||
*/
|
||||
@Override
|
||||
|
@ -79,7 +79,7 @@ public class SimpleRobotRules extends BaseRobotRules {
|
|||
|
||||
/*
|
||||
* (non-Javadoc)
|
||||
*
|
||||
*
|
||||
* @see java.lang.Object#equals(java.lang.Object)
|
||||
*/
|
||||
@Override
|
||||
|
@ -275,7 +275,7 @@ public class SimpleRobotRules extends BaseRobotRules {
|
|||
|
||||
/**
|
||||
* Is our ruleset set up to allow all access?
|
||||
*
|
||||
*
|
||||
* @return true if all URLs are allowed.
|
||||
*/
|
||||
@Override
|
||||
|
@ -285,7 +285,7 @@ public class SimpleRobotRules extends BaseRobotRules {
|
|||
|
||||
/**
|
||||
* Is our ruleset set up to disallow all access?
|
||||
*
|
||||
*
|
||||
* @return true if no URLs are allowed.
|
||||
*/
|
||||
@Override
|
||||
|
|
|
@ -104,7 +104,8 @@ public class SiteMapURL {
|
|||
/**
|
||||
* Set the URL.
|
||||
*
|
||||
* @param url of the sitemap
|
||||
* @param url
|
||||
* of the sitemap
|
||||
*/
|
||||
public void setUrl(URL url) {
|
||||
this.url = url;
|
||||
|
@ -138,7 +139,8 @@ public class SiteMapURL {
|
|||
/**
|
||||
* Set when this URL was last modified.
|
||||
*
|
||||
* @param lastModified the last time the sitemap was modified
|
||||
* @param lastModified
|
||||
* the last time the sitemap was modified
|
||||
*/
|
||||
public void setLastModified(String lastModified) {
|
||||
this.lastModified = SiteMap.convertToDate(lastModified);
|
||||
|
@ -147,7 +149,8 @@ public class SiteMapURL {
|
|||
/**
|
||||
* Set when this URL was last modified.
|
||||
*
|
||||
* @param lastModified the last time the sitemap was modified
|
||||
* @param lastModified
|
||||
* the last time the sitemap was modified
|
||||
*/
|
||||
public void setLastModified(Date lastModified) {
|
||||
this.lastModified = lastModified;
|
||||
|
@ -166,7 +169,8 @@ public class SiteMapURL {
|
|||
* Set the URL's priority to a value between [0.0 - 1.0] (Default Priority
|
||||
* is used if the given priority is out of range).
|
||||
*
|
||||
* @param priority a value between [0.0 - 1.0]
|
||||
* @param priority
|
||||
* a value between [0.0 - 1.0]
|
||||
*/
|
||||
public void setPriority(double priority) {
|
||||
|
||||
|
@ -183,7 +187,8 @@ public class SiteMapURL {
|
|||
* Set the URL's priority to a value between [0.0 - 1.0] (Default Priority
|
||||
* is used if the given priority is missing or is out of range).
|
||||
*
|
||||
* @param priorityStr a value between [0.0 - 1.0]
|
||||
* @param priorityStr
|
||||
* a value between [0.0 - 1.0]
|
||||
*/
|
||||
public void setPriority(String priorityStr) {
|
||||
try {
|
||||
|
@ -211,8 +216,9 @@ public class SiteMapURL {
|
|||
/**
|
||||
* Set the URL's change frequency
|
||||
*
|
||||
* @param changeFreq a {@link crawlercommons.sitemaps.SiteMapURL.ChangeFrequency}
|
||||
* for this sitemap
|
||||
* @param changeFreq
|
||||
* a {@link crawlercommons.sitemaps.SiteMapURL.ChangeFrequency}
|
||||
* for this sitemap
|
||||
*/
|
||||
public void setChangeFrequency(ChangeFrequency changeFreq) {
|
||||
this.changeFreq = changeFreq;
|
||||
|
@ -222,8 +228,10 @@ public class SiteMapURL {
|
|||
* Set the URL's change frequency. In case of a bad ChangeFrequency, the
|
||||
* current frequency in this instance will be set to NULL
|
||||
*
|
||||
* @param changeFreq a string representing a
|
||||
* {@link crawlercommons.sitemaps.SiteMapURL.ChangeFrequency} for this sitemap
|
||||
* @param changeFreq
|
||||
* a string representing a
|
||||
* {@link crawlercommons.sitemaps.SiteMapURL.ChangeFrequency} for
|
||||
* this sitemap
|
||||
*/
|
||||
public void setChangeFrequency(String changeFreq) {
|
||||
|
||||
|
@ -253,7 +261,9 @@ public class SiteMapURL {
|
|||
/**
|
||||
* Valid means that it follows the official guidelines that the siteMapURL
|
||||
* must be under the base url
|
||||
* @param valid whether the Sitemap is valid syntax or not
|
||||
*
|
||||
* @param valid
|
||||
* whether the Sitemap is valid syntax or not
|
||||
*/
|
||||
public void setValid(boolean valid) {
|
||||
this.valid = valid;
|
||||
|
@ -261,6 +271,7 @@ public class SiteMapURL {
|
|||
|
||||
/**
|
||||
* Is the siteMapURL under the base url ?
|
||||
*
|
||||
* @return true if the syntax is valid, false otherwise
|
||||
*/
|
||||
public boolean isValid() {
|
||||
|
|
|
@ -30,7 +30,9 @@ public class UnknownFormatException extends Exception {
|
|||
/**
|
||||
* Constructor receives some kind of message that is saved in an instance
|
||||
* variable.
|
||||
* @param err a String object to use within the Exception
|
||||
*
|
||||
* @param err
|
||||
* a String object to use within the Exception
|
||||
*/
|
||||
public UnknownFormatException(String err) {
|
||||
super(err);
|
||||
|
@ -40,6 +42,7 @@ public class UnknownFormatException extends Exception {
|
|||
/**
|
||||
* Public method, callable by the exception catcher. It returns the error
|
||||
* message.
|
||||
*
|
||||
* @return a populated Exception as a String
|
||||
*/
|
||||
public String getError() {
|
||||
|
|
Loading…
Reference in New Issue