1
0
Fork 0
mirror of https://github.com/crawler-commons/crawler-commons synced 2024-05-25 11:46:03 +02:00

- address TODO of #184: do not use private suffixes
  for PaidLevelDomain detection
- adapt unit tests: 'de.com' is a private suffix,
  consequently 'xxx.de.com' isn't a PaidLevelDomain
- rebase on master, apply code format
This commit is contained in:
Sebastian Nagel 2017-11-06 13:33:36 +01:00
parent cda1bdee82
commit f598af8d28
3 changed files with 23 additions and 21 deletions

View File

@ -82,14 +82,14 @@ import org.slf4j.LoggerFactory;
*/
public class EffectiveTldFinder {
private static final Logger LOGGER = LoggerFactory.getLogger(EffectiveTldFinder.class);
public static final String ETLD_DATA = "/effective_tld_names.dat";
public static final String COMMENT = "//";
public static final String DOT_REGEX = "\\.";
public static final String EXCEPTION = "!";
public static final String WILD_CARD = "*.";
public static final char DOT = '.';
private static EffectiveTldFinder instance = null;
private Map<String, EffectiveTLD> domains = null;
private boolean configured = false;
@ -184,7 +184,7 @@ public class EffectiveTldFinder {
public static EffectiveTLD getEffectiveTLD(String hostname, boolean excludePrivate) {
if (getInstance().domains.containsKey(hostname)) {
EffectiveTLD foundTld = getInstance().domains.get(hostname);
if (!excludePrivate || !foundTld.isPrivate) {
if (!(excludePrivate && foundTld.isPrivate)) {
return foundTld;
}
}
@ -329,7 +329,7 @@ public class EffectiveTldFinder {
} else {
domain = line;
}
domain = normalizeName(domain);
isPrivate = isPrivateDomain;
}

View File

@ -54,12 +54,12 @@ public class PaidLevelDomain {
}
// Now use support in EffectiveTldFinder
String result = EffectiveTldFinder.getAssignedDomain(hostname, true);
String result = EffectiveTldFinder.getAssignedDomain(hostname, true, true);
if (result == null) {
LOGGER.debug("Hostname {} isn't a valid FQDN", hostname);
return hostname;
LOGGER.debug("Hostname {} isn't a valid FQDN", hostname);
return hostname;
} else {
return result;
return result;
}
}

View File

@ -40,11 +40,11 @@ public class PaidLevelDomainTest {
@Test
public void testInvalidFQDN() {
assertEquals("blah", PaidLevelDomain.getPLD("blah"));
assertEquals("1.2.3", PaidLevelDomain.getPLD("1.2.3"));
assertEquals("me.i", PaidLevelDomain.getPLD("me.i"));
assertEquals("blah", PaidLevelDomain.getPLD("blah"));
assertEquals("1.2.3", PaidLevelDomain.getPLD("1.2.3"));
assertEquals("me.i", PaidLevelDomain.getPLD("me.i"));
}
@Test
public final void testIPv6() throws MalformedURLException, UnknownHostException {
InetAddress inet = InetAddress.getByName("1080:0:0:0:8:800:200c:417a");
@ -74,11 +74,12 @@ public class PaidLevelDomainTest {
assertEquals("xxx.ne.jp", PaidLevelDomain.getPLD("www.xxx.ne.jp"));
}
// In Germany you can have xxx.de.com
// de.com (and com.de) are domains registered by CentralNic,
// xxx.de.com and xxx.com.de are private domains
@Test
public final void testGermanDomains() {
assertEquals("xxx.de.com", PaidLevelDomain.getPLD("xxx.de.com"));
assertEquals("xxx.de.com", PaidLevelDomain.getPLD("www.xxx.de.com"));
assertEquals("de.com", PaidLevelDomain.getPLD("xxx.de.com"));
assertEquals("de.com", PaidLevelDomain.getPLD("www.xxx.de.com"));
}
// Typical international domains look like xxx.it. So xxx.com.it is
@ -89,18 +90,19 @@ public class PaidLevelDomainTest {
assertEquals("xxx.it", PaidLevelDomain.getPLD("www.xxx.it"));
assertEquals("com.it", PaidLevelDomain.getPLD("xxx.com.it"));
}
// Checks that getPLD("www.fi.com") yields "fi.com".
@Test
public final void testFinnishDomains() {
assertEquals("fi.com", PaidLevelDomain.getPLD("www.fi.com"));
}
// TODO enable this test when getPLD uses new TLD support to exclude
// private domains (See https://github.com/crawler-commons/crawler-commons/pull/186)
@Ignore
@Test
public final void testPrivateDomains() {
assertEquals("blogspot.com", PaidLevelDomain.getPLD("myblog.blogspot.com"));
/*
* do not match "private" domains (based on public suffixes from the
* private section of the public suffix list)
*/
assertEquals("blogspot.com", PaidLevelDomain.getPLD("myblog.blogspot.com"));
}
}