1
0
mirror of https://github.com/crawler-commons/crawler-commons synced 2024-09-20 08:42:15 +02:00

EffectiveTldFinder to log loading of public suffix list, fixes #284

- log the location of the public suffix list (effective_tld_names.dat)
  during initialization
- log errors while loading as errors
- improve documentation about public suffix list and class initialization
This commit is contained in:
Sebastian Nagel 2020-02-17 15:49:22 +01:00
parent 8522cfdd34
commit c070453c5e
3 changed files with 26 additions and 19 deletions

View File

@ -1,6 +1,7 @@
Crawler-Commons Change Log
Current Development 1.1-SNAPSHOT (yyyy-mm-dd)
- EffectiveTldFinder to log loading of public suffix list (sebastian-nagel) #284
- SiteMapParser getPublicationDate in VideoAttributes may throw NPE (panthony, sebastian-nagel) #283
- SimpleRobotRulesParser: Trim log messages (jnioche, sebastian-nagel) #281
- SimpleRobotRulesParser: counter _numWarnings not thread-safe (sebastian-nagel, kkrugler) #278

View File

@ -23,6 +23,7 @@ import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.IDN;
import java.net.URL;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
@ -39,26 +40,30 @@ import org.slf4j.LoggerFactory;
* of the various domain registrars and their assignment policies. The best
* publicly available knowledge base is the public suffix list maintained and
* available at <a href="https://publicsuffix.org/">publicsuffix.org</a>. This
* class implements the <a
* href="https://publicsuffix.org/list/">publicsuffix.org ruleset</a> and uses a
* copy of the public suffix list. data file format.
* class implements the
* <a href="https://publicsuffix.org/list/">publicsuffix.org ruleset</a> and
* uses a copy of the public suffix list.
*
* For more information, see
* <ul>
* <li><a href="http://www.publicsuffix.org">publicsuffix.org</a></li>
* <li><a href="https://www.publicsuffix.org/">publicsuffix.org</a></li>
* <li><a href="https://en.wikipedia.org/wiki/Public_Suffix_List">Wikipedia
* article about the public suffix list</a></li>
* <li>Mozilla's <a
* href="http://wiki.mozilla.org/Gecko:Effective_TLD_Service">Effective TLD
* <li>Mozilla's
* <a href="https://wiki.mozilla.org/Gecko:Effective_TLD_Service">Effective TLD
* Service</a>: for historic reasons the class name stems from the term
* &quot;effective top-level domain&quot; (eTLD)</li>
* </ul>
*
* This class just needs "effective_tld_names.dat" in the classpath. If you want
* to configure it with other data, call
* {@link EffectiveTldFinder#getInstance() EffectiveTldFinder.getInstance()}
* {@link EffectiveTldFinder#initialize(InputStream) .initialize(InputStream)}.
* Updates to the public suffix list can be found here:
* EffectiveTldFinder loads the public suffix list as file
* "effective_tld_names.dat" from the Java classpath. Make sure your classpath
* does not contain any other file with the same name, e.g. an outdated list
* shipped with a third party library. To force EffectiveTldFinder to load an
* updated or modified public suffix list, call
* {@link EffectiveTldFinder#getInstance()
* EffectiveTldFinder.getInstance()}{@link EffectiveTldFinder#initialize(InputStream)
* .initialize(InputStream)}. Updates to the public suffix list can be found
* here:
* <ul>
* <li><a href= "https://publicsuffix.org/list/public_suffix_list.dat"
* >https://publicsuffix.org/list/public_suffix_list.dat</a></li>
@ -115,10 +120,16 @@ public class EffectiveTldFinder {
private boolean configured = false;
/**
 * Private constructor of the singleton: loads the public suffix list
 * ({@code ETLD_DATA}) from the Java class path.
 *
 * The location the list is loaded from is logged, so that an unexpected
 * copy of the file found on the class path can be spotted. If the resource
 * cannot be found or cannot be read, the failure is logged as an error
 * instead of surfacing as a NullPointerException.
 */
private EffectiveTldFinder() {
    URL publicSuffixList = this.getClass().getResource(ETLD_DATA);
    if (publicSuffixList == null) {
        // getResource() returns null when the resource is missing; without
        // this guard openStream() below would throw a NullPointerException.
        LOGGER.error("Public suffix list {} not found on class path", ETLD_DATA);
        return;
    }
    LOGGER.info("Loading public suffix list from class path: {}", publicSuffixList);
    // Initialize exactly once, and let try-with-resources close the stream.
    try (InputStream is = publicSuffixList.openStream()) {
        initialize(is);
    } catch (IOException e) {
        // The trailing exception is taken by SLF4J as the throwable (stack
        // trace), so the message needs only one placeholder.
        LOGGER.error("Failed to load public suffix list {} from class path", publicSuffixList, e);
    }
}
/**
@ -167,9 +178,7 @@ public class EffectiveTldFinder {
}
configured = true;
} catch (IOException e) {
if (LOGGER.isDebugEnabled()) {
LOGGER.debug("EffectiveTldFinder configuration failed: ", e);
}
LOGGER.error("EffectiveTldFinder configuration failed: ", e);
configured = false;
}
return configured;

View File

@ -22,7 +22,4 @@ log4j.appender.console.target=System.out
log4j.appender.console.layout=org.apache.log4j.PatternLayout
log4j.appender.console.layout.ConversionPattern=${cc.console.pattern}
# Custom Logging levels
log4j.logger.crawlercommons.url.EffectiveTldFinder=WARN