mirror of
https://github.com/crawler-commons/crawler-commons
synced 2024-05-25 11:46:03 +02:00
- address TODO of #184: do not use private suffixes
for PaidLevelDomain detection - adapt unit tests: 'de.com' is a private suffix, consequently 'xxx.de.com' isn't a PaidLevelDomain - rebase on master, apply code format
This commit is contained in:
parent
cda1bdee82
commit
f598af8d28
|
@ -82,14 +82,14 @@ import org.slf4j.LoggerFactory;
|
|||
*/
|
||||
public class EffectiveTldFinder {
|
||||
private static final Logger LOGGER = LoggerFactory.getLogger(EffectiveTldFinder.class);
|
||||
|
||||
|
||||
public static final String ETLD_DATA = "/effective_tld_names.dat";
|
||||
public static final String COMMENT = "//";
|
||||
public static final String DOT_REGEX = "\\.";
|
||||
public static final String EXCEPTION = "!";
|
||||
public static final String WILD_CARD = "*.";
|
||||
public static final char DOT = '.';
|
||||
|
||||
|
||||
private static EffectiveTldFinder instance = null;
|
||||
private Map<String, EffectiveTLD> domains = null;
|
||||
private boolean configured = false;
|
||||
|
@ -184,7 +184,7 @@ public class EffectiveTldFinder {
|
|||
public static EffectiveTLD getEffectiveTLD(String hostname, boolean excludePrivate) {
|
||||
if (getInstance().domains.containsKey(hostname)) {
|
||||
EffectiveTLD foundTld = getInstance().domains.get(hostname);
|
||||
if (!excludePrivate || !foundTld.isPrivate) {
|
||||
if (!(excludePrivate && foundTld.isPrivate)) {
|
||||
return foundTld;
|
||||
}
|
||||
}
|
||||
|
@ -329,7 +329,7 @@ public class EffectiveTldFinder {
|
|||
} else {
|
||||
domain = line;
|
||||
}
|
||||
|
||||
|
||||
domain = normalizeName(domain);
|
||||
isPrivate = isPrivateDomain;
|
||||
}
|
||||
|
|
|
@ -54,12 +54,12 @@ public class PaidLevelDomain {
|
|||
}
|
||||
|
||||
// Now use support in EffectiveTldFinder
|
||||
String result = EffectiveTldFinder.getAssignedDomain(hostname, true);
|
||||
String result = EffectiveTldFinder.getAssignedDomain(hostname, true, true);
|
||||
if (result == null) {
|
||||
LOGGER.debug("Hostname {} isn't a valid FQDN", hostname);
|
||||
return hostname;
|
||||
LOGGER.debug("Hostname {} isn't a valid FQDN", hostname);
|
||||
return hostname;
|
||||
} else {
|
||||
return result;
|
||||
return result;
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -40,11 +40,11 @@ public class PaidLevelDomainTest {
|
|||
|
||||
@Test
|
||||
public void testInvalidFQDN() {
|
||||
assertEquals("blah", PaidLevelDomain.getPLD("blah"));
|
||||
assertEquals("1.2.3", PaidLevelDomain.getPLD("1.2.3"));
|
||||
assertEquals("me.i", PaidLevelDomain.getPLD("me.i"));
|
||||
assertEquals("blah", PaidLevelDomain.getPLD("blah"));
|
||||
assertEquals("1.2.3", PaidLevelDomain.getPLD("1.2.3"));
|
||||
assertEquals("me.i", PaidLevelDomain.getPLD("me.i"));
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
public final void testIPv6() throws MalformedURLException, UnknownHostException {
|
||||
InetAddress inet = InetAddress.getByName("1080:0:0:0:8:800:200c:417a");
|
||||
|
@ -74,11 +74,12 @@ public class PaidLevelDomainTest {
|
|||
assertEquals("xxx.ne.jp", PaidLevelDomain.getPLD("www.xxx.ne.jp"));
|
||||
}
|
||||
|
||||
// In Germany you can have xxx.de.com
|
||||
// de.com (and com.de) are domains registered by CentralNic,
|
||||
// xxx.de.com and xxx.com.de are private domains
|
||||
@Test
|
||||
public final void testGermanDomains() {
|
||||
assertEquals("xxx.de.com", PaidLevelDomain.getPLD("xxx.de.com"));
|
||||
assertEquals("xxx.de.com", PaidLevelDomain.getPLD("www.xxx.de.com"));
|
||||
assertEquals("de.com", PaidLevelDomain.getPLD("xxx.de.com"));
|
||||
assertEquals("de.com", PaidLevelDomain.getPLD("www.xxx.de.com"));
|
||||
}
|
||||
|
||||
// Typical international domains look like xxx.it. So xxx.com.it is
|
||||
|
@ -89,18 +90,19 @@ public class PaidLevelDomainTest {
|
|||
assertEquals("xxx.it", PaidLevelDomain.getPLD("www.xxx.it"));
|
||||
assertEquals("com.it", PaidLevelDomain.getPLD("xxx.com.it"));
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
public final void testFinnishDomains() {
|
||||
assertEquals("fi.com", PaidLevelDomain.getPLD("www.fi.com"));
|
||||
}
|
||||
|
||||
// TODO enable this test when getPLD uses new TLD support to exclude
|
||||
// private domains (See https://github.com/crawler-commons/crawler-commons/pull/186)
|
||||
@Ignore
|
||||
|
||||
@Test
|
||||
public final void testPrivateDomains() {
|
||||
assertEquals("blogspot.com", PaidLevelDomain.getPLD("myblog.blogspot.com"));
|
||||
/*
|
||||
* do not match "private" domains (based on public suffixes from the
|
||||
* private section of the public suffix list)
|
||||
*/
|
||||
assertEquals("blogspot.com", PaidLevelDomain.getPLD("myblog.blogspot.com"));
|
||||
}
|
||||
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue