1
0
Fork 0
mirror of https://github.com/crawler-commons/crawler-commons synced 2024-06-09 23:36:05 +02:00
crawler-commons/src/main/java/crawlercommons/url/PaidLevelDomain.java

135 lines
5.5 KiB
Java

/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package crawlercommons.url;
import java.net.URL;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Locale;
import java.util.Set;
import java.util.regex.Pattern;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
* Routines to extract the PLD (paid-level domain, as per the IRLbot paper) from
* a hostname or URL.
*
*/
public class PaidLevelDomain {
private static final Logger LOGGER = LoggerFactory.getLogger(PaidLevelDomain.class);
private static final String CC_TLDS = "ac ad ae af ag ai al am an ao aq ar as at au aw ax az ba bb bd be bf bg bh bi "
+ "bj bl bm bn bo br bs bt bv bw by bz ca cc cd cf cg ch ci ck cl cm cn co cr cu " + "cv cx cy cz de dj dk dm do dz ec ee eg eh er es et eu fi fj fk fm fo fr ga gb "
+ "gd ge gf gg gh gi gl gm gn gp gq gr gs gt gu gw gy hk hm hn hr ht hu id ie il " + "im in io iq ir is it je jm jo jp ke kg kh ki km kn kp kr kw ky kz la lb lc li "
+ "lk lr ls lt lu lv ly ma mc md me mf mg mh mk ml mm mn mo mp mq mr ms mt mu mv " + "mw mx my mz na nc ne nf ng ni nl no np nr nu nz om pa pe pf pg ph pk pl pm pn "
+ "pr ps pt pw py qa re ro rs ru rw sa sb sc sd se sg sh si sj sk sl sm sn so sr " + "st su sv sy sz tc td tf tg th tj tk tl tm tn to tp tr tt tv tw tz ua ug uk um "
+ "us uy uz va vc ve vg vi vn vu wf ws ye yt yu za zm zw";
private static final String G_TLDS = "aero arpa asia biz cat com coop edu gov info int jobs me mil mobi museum name net " + "org pro rs tel";
private static final Set<String> ccTLDs = new HashSet<String>(Arrays.asList(CC_TLDS.split(" ")));
private static final Set<String> gTLDs = new HashSet<String>(Arrays.asList(G_TLDS.split(" ")));
private static final Pattern IPV4_ADDRESS_PATTERN = Pattern.compile("(?:\\d{1,3}\\.){3}\\d{1,3}");
/**
* Extract the PLD (paid-level domain) from the hostname. If the format
* isn't recognized, the original hostname is returned.
*
* @param hostname
* - hostname from URL, e.g. www.domain.com.it
* @return - PLD, e.g. domain.com.it
*/
public static String getPLD(String hostname) {
// First, check for weird [HHHH:HH::H] IPv6 format.
if (hostname.startsWith("[") && hostname.endsWith("]")) {
return hostname;
}
String[] subNames = hostname.split("\\.");
int numPieces = subNames.length;
if (numPieces <= 2) {
return hostname;
}
// Check for ddd.ddd.ddd.ddd IPv4 format
if ((numPieces == 4) && (IPV4_ADDRESS_PATTERN.matcher(hostname).matches())) {
return hostname;
}
int firstHostPiece = 0;
if (ccTLDs.contains(subNames[numPieces - 1].toLowerCase(Locale.getDefault()))) {
// We have a country code at the end. See if the preceding piece is
// either
// a two-letter name (country code or funky short gTLD), or one of
// the
// "well-known" gTLDs.
if (subNames[numPieces - 2].length() <= 2) {
// Must be xxx.co.jp format
firstHostPiece = numPieces - 3;
} else if (gTLDs.contains(subNames[numPieces - 2].toLowerCase(Locale.getDefault()))) {
// Must be xxx.com.mx format
firstHostPiece = numPieces - 3;
} else {
// Must be xxx.it format
firstHostPiece = numPieces - 2;
}
} else if (gTLDs.contains(subNames[numPieces - 1].toLowerCase(Locale.getDefault()))) {
if (ccTLDs.contains(subNames[numPieces - 2].toLowerCase(Locale.getDefault()))) {
// Must be xxx.de.com format
firstHostPiece = numPieces - 3;
} else {
// Must be xxx.com format
firstHostPiece = numPieces - 2;
}
} else {
LOGGER.debug("Unknown format for hostname: " + hostname);
}
if (firstHostPiece == 0) {
return hostname;
} else {
// Build the result from the firstHostPiece to numPices pieces.
StringBuilder result = new StringBuilder();
for (int i = firstHostPiece; i < numPieces; i++) {
result.append(subNames[i]);
result.append('.');
}
// Trim off final '.'
return result.deleteCharAt(result.length() - 1).toString();
}
} // getPLD
/**
* Extract the PLD (paid-level domain) from the URL.
*
* @param url
* - Valid URL, e.g. http://www.domain.com.it
* @return - PLD e.g. domain.com.it
*/
public static String getPLD(URL url) {
return getPLD(url.getHost());
} // getPLD
}