1
0
Fork 0
mirror of https://github.com/crawler-commons/crawler-commons synced 2024-05-03 22:26:15 +02:00

Initial commit of build system, plus some paid-level domain extraction code from Bixo.

This commit is contained in:
kkrugler_lists@transpac.com 2009-12-04 04:13:38 +00:00
commit 288dca1504
9 changed files with 906 additions and 0 deletions

54
build.properties Normal file
View File

@ -0,0 +1,54 @@
# Build settings loaded by build.xml through <property file="${root.dir}/build.properties"/>.

# Name used for the distribution directory, the release tarball and the JavaDoc titles.
# NOTE(review): build.xml declares <project name="crawler-commons">, so artifacts named
# from ${ant.project.name} (jar.name, job.name) are spelled differently from those named
# from ${name}; confirm this is intended.
name=crawlercommons
# Must be kept in sync with <project><version> value in pom.xml
version=0.1-SNAPSHOT
# For location of local cache of dependent jars
# Default is ~/.m2/repository
# maven.repo.local=<path to directory>
# For JavaDoc
javadoc.package=crawlercommons
javadoc.year=2009
javadoc.title=Crawler-commons
javadoc.link.java=http://java.sun.com/j2se/1.6.0/docs/api/
# File name of the jar written by the "jar" target and installed by "mvn-install".
jar.name=${ant.project.name}-${version}.jar
# NOTE(review): job.name is not referenced by the visible build.xml; presumably reserved
# for a job jar target. Confirm before removing.
job.name=${ant.project.name}-job-${version}.jar
# Source and resource roots.
main.src.dir=src/main/java
test.src.dir=src/test/java
main.res.dir=src/main/resources
test.res.dir=src/test/resources
# Directory holding jars kept with the project (build.xml loads the Maven Ant tasks jar from here).
lib.dir=lib
# Root of all generated output; the "clean" target deletes its contents except the
# classes-*-eclipse directories.
build.dir=build
build.dir.main-classes=${build.dir}/classes-main
build.dir.test-classes=${build.dir}/classes-test
# Output directories configured for Eclipse by the "eclipse" target.
build.dir.main-classes-eclipse=${build.dir}/classes-main-eclipse
build.dir.test-classes-eclipse=${build.dir}/classes-test-eclipse
# Plain-text JUnit reports written by the "test" target.
build.dir.test-reports=${build.dir}/test
# JavaDoc output of the "doc" target.
build.javadoc=${build.dir}/java-doc
# Staging directory and tarball produced by the "dist" target.
build.dir.dist=${build.dir}/${name}-dist-${version}
build.release.file=${build.dir}/${name}-dist-${version}.tgz
# Instrumented classes are deposited into this directory
instrumented.dir=${build.dir}/instrumented
# All reports go into this directory
reports.dir=${build.dir}/reports
# Coverage reports are deposited into these directories
coverage.html.dir=${reports.dir}/cobertura-html
# Options passed to every <javac> invocation in build.xml.
javac.debug=on
javac.optimize=on
javac.deprecation=off
# Used for both the source and target attributes of <javac>.
javac.version=1.6
javac.args=
javac.args.warnings=-Xlint:none
# Source file encoding passed to <javac>.
# NOTE(review): the "eclipse" target configures resources encoding="UTF-8"; confirm
# which encoding the sources are actually stored in.
build.encoding=ISO-8859-1

311
build.xml Normal file
View File

@ -0,0 +1,311 @@
<project name="crawler-commons" default="compile">
<property name="root.dir" value="${basedir}" />
<property file="${root.dir}/build.properties" />
<!-- ================================================================== -->
<!-- General cleaning sources -->
<!-- ================================================================== -->
<!--
  Deletes the contents of ${build.dir}, leaving the classes-*-eclipse output
  directories in place. failonerror is off, so delete errors do not stop the build.
-->
<target description="--> clean the project" name="clean">
    <echo>cleaning ${ant.project.name}</echo>
    <delete failonerror="false" includeemptydirs="true">
        <fileset excludes="classes-*-eclipse/" dir="${build.dir}" />
    </delete>
</target>
<!-- ================================================================== -->
<!-- Maven -->
<!-- ================================================================== -->
<!--
  Loads the Maven Ant tasks from ${lib.dir}, reads pom.xml and resolves its
  dependencies three times (compile, test and runtime scope), publishing each
  result as a path id and a fileset id for the other targets.
  The target is skipped when a property named compile.classpath is set.
-->
<target xmlns:artifact="urn:maven-artifact-ant" unless="compile.classpath" name="mvn-init">
    <!-- Make the artifact:* tasks available. -->
    <path path="${lib.dir}/maven-ant-tasks-2.0.10.jar" id="maven.ant.tasks.classpath" />
    <typedef uri="urn:maven-artifact-ant"
             resource="org/apache/maven/artifact/ant/antlib.xml"
             classpathref="maven.ant.tasks.classpath" />

    <!-- Fall back to the default local repository when maven.repo.local is not set. -->
    <condition else="${user.home}/.m2/repository"
               value="${maven.repo.local}"
               property="maven.repo.local">
        <isset property="maven.repo.local" />
    </condition>
    <echo>maven.repo.local=${maven.repo.local}</echo>

    <artifact:localRepository path="${maven.repo.local}" id="local.repository" />
    <artifact:pom id="maven.project" file="pom.xml" />

    <!-- Compile scope. -->
    <artifact:dependencies useScope="compile"
                           pathId="compile.classpath"
                           filesetId="compile.fileset">
        <pom refid="maven.project" />
        <localRepository refid="local.repository" />
    </artifact:dependencies>

    <!-- Test scope. -->
    <artifact:dependencies useScope="test"
                           pathId="test.classpath"
                           filesetId="test.fileset">
        <pom refid="maven.project" />
        <localRepository refid="local.repository" />
    </artifact:dependencies>

    <!-- Runtime scope. -->
    <artifact:dependencies useScope="runtime"
                           pathId="runtime.classpath"
                           filesetId="runtime.fileset">
        <pom refid="maven.project" />
        <localRepository refid="local.repository" />
    </artifact:dependencies>
</target>
<!--
  Installs ${build.dir}/${jar.name} into the local Maven repository using the
  POM loaded by mvn-init.
  The target depends on "jar" as well as "mvn-init": the file being installed
  is only produced by the jar target, and without that dependency a build on a
  clean tree (for example "ant install") reaches this point before the jar exists.
-->
<target name="mvn-install" xmlns:artifact="urn:maven-artifact-ant" depends="mvn-init, jar">
    <artifact:install file="${build.dir}/${jar.name}">
        <pom refid="maven.project"/>
    </artifact:install>
</target>
<!-- ================================================================== -->
<!-- Build sources -->
<!-- ================================================================== -->
<!--
  Compiles every Java file under ${main.src.dir} into ${build.dir.main-classes}
  against the compile-scope classpath resolved by mvn-init.
-->
<target description="--> compile main classes" depends="mvn-init" name="compile">
    <mkdir dir="${build.dir.main-classes}" />
    <javac srcdir="${main.src.dir}"
           destdir="${build.dir.main-classes}"
           includes="**/*.java"
           source="${javac.version}"
           target="${javac.version}"
           encoding="${build.encoding}"
           debug="${javac.debug}"
           optimize="${javac.optimize}"
           deprecation="${javac.deprecation}">
        <compilerarg line="${javac.args} ${javac.args.warnings}" />
        <classpath refid="compile.classpath" />
    </javac>
</target>
<!-- ================================================================== -->
<!-- Unit Tests -->
<!-- ================================================================== -->
<!--
  Compiles the unit tests under ${test.src.dir} into ${build.dir.test-classes}.
  The classpath is the test-scope dependencies followed by the compiled main classes.
-->
<target depends="compile" name="compile-test">
    <echo>*** Building Unit Tests Sources ***</echo>
    <mkdir dir="${build.dir.test-classes}" />

    <!-- Compiled main classes, exposed under the id test.path. -->
    <path id="test.path">
        <pathelement location="${build.dir.main-classes}" />
    </path>

    <javac srcdir="${test.src.dir}"
           destdir="${build.dir.test-classes}"
           includes="**/*.java"
           source="${javac.version}"
           target="${javac.version}"
           encoding="${build.encoding}"
           debug="${javac.debug}"
           optimize="${javac.optimize}"
           deprecation="${javac.deprecation}">
        <compilerarg line="${javac.args} ${javac.args.warnings}" />
        <classpath>
            <path refid="test.classpath" />
            <path refid="test.path" />
        </classpath>
    </javac>
</target>
<!--
  Runs the unit tests in a forked JVM and writes plain-text reports to
  ${build.dir.test-reports}. Test classes are selected from ${test.src.dir}
  by the *Test.java naming pattern, excluding Abstract*.java.
-->
<target description="--> run unit tests" depends="compile-test" name="test">
    <!-- Start from an empty report directory. -->
    <delete dir="${build.dir.test-reports}" />
    <mkdir dir="${build.dir.test-reports}" />

    <!-- haltonfailure is off; errors and failures are both recorded in tests.failed. -->
    <junit fork="yes"
           dir="${basedir}"
           maxmemory="256m"
           printsummary="yes"
           showoutput="false"
           haltonfailure="no"
           failureProperty="tests.failed"
           errorProperty="tests.failed">
        <sysproperty file="${reports.dir}/crawlercommons_coverage.ser"
                     key="net.sourceforge.cobertura.datafile" />
        <!-- ${instrumented.dir} is listed ahead of the plain main classes. -->
        <classpath>
            <pathelement location="${instrumented.dir}" />
            <pathelement location="${build.dir.main-classes}" />
            <pathelement location="${build.dir.test-classes}" />
            <pathelement location="${test.res.dir}" />
            <path refid="test.classpath" />
        </classpath>
        <formatter type="plain" />
        <batchtest todir="${build.dir.test-reports}" fork="yes">
            <fileset dir="${test.src.dir}">
                <include name="**/*Test.java" />
                <exclude name="**/Abstract*.java" />
            </fileset>
        </batchtest>
    </junit>

    <!-- Fail the build only after the junit task has finished. -->
    <fail if="tests.failed">Tests failed!</fail>
</target>
<!-- ================================================================== -->
<!-- Build jar of compiled main classes -->
<!-- ================================================================== -->
<!--
  Packages ${build.dir.main-classes} into ${build.dir}/${jar.name}. The manifest
  gets a "crawler-commons" section recording the title, version, build time and
  the user who ran the build.
-->
<target depends="compile" name="jar">
    <!-- Build time recorded in the manifest. -->
    <tstamp>
        <format pattern="MMM dd yyyy, HH:mm:ss" property="timestamp" />
    </tstamp>

    <jar basedir="${build.dir.main-classes}" jarfile="${build.dir}/${jar.name}">
        <manifest>
            <section name="crawler-commons">
                <attribute value="${ant.project.name}" name="Implementation-Title" />
                <attribute value="${version}" name="Implementation-Version" />
                <attribute value="${timestamp}" name="Compile-Time" />
                <attribute value="${user.name}" name="Compiled-By" />
            </section>
        </manifest>
    </jar>
</target>
<!-- ================================================================== -->
<!-- Java Doc -->
<!-- ================================================================== -->
<!--
  Generates the API documentation for ${javadoc.package} from ${main.src.dir}
  into ${build.javadoc}, linking to the JDK API docs at ${javadoc.link.java}.
-->
<target description="--> create javadoc" depends="compile" name="doc">
    <mkdir dir="${build.javadoc}" />
    <javadoc destdir="${build.javadoc}"
             packagenames="${javadoc.package}"
             windowtitle="${name} ${version} API"
             doctitle="${name} ${version} API"
             use="true"
             version="true"
             author="true">
        <packageset dir="${main.src.dir}" />
        <link href="${javadoc.link.java}" />
        <classpath>
            <path refid="compile.classpath" />
        </classpath>
        <group packages="${javadoc.package}*" title="${javadoc.title}" />
    </javadoc>
</target>
<!-- ================================================================== -->
<!-- Install in local Maven repository -->
<!-- ================================================================== -->
<!-- Aggregate target with no tasks of its own: runs "test", then "mvn-install". -->
<target description="--> install SNAPSHOT jar to local repository"
        depends="test, mvn-install"
        name="install">
</target>
<!-- ================================================================== -->
<!-- Generate a distribution -->
<!-- ================================================================== -->
<!--
  Assembles a distribution under ${build.dir.dist} (javadoc, libraries, sources,
  the project jar, README, formatter profile and licenses) and packs it into the
  gzip-compressed tarball ${build.release.file}.
-->
<target name="dist"
        depends="test, jar, doc"
        description="--> create a tarball distribution">
    <!-- Always start from an empty staging directory. -->
    <delete dir="${build.dir.dist}" />

    <!-- create target directory -->
    <mkdir dir="${build.dir.dist}" />
    <mkdir dir="${build.dir.dist}/bin" />
    <mkdir dir="${build.dir.dist}/docs" />
    <mkdir dir="${build.dir.dist}/docs/licenses" />
    <mkdir dir="${build.dir.dist}/docs/javadoc" />
    <mkdir dir="${build.dir.dist}/docs/reports" />
    <mkdir dir="${build.dir.dist}/lib" />
    <mkdir dir="${build.dir.dist}/src" />

    <!-- copy javadoc to target dir -->
    <copy todir="${build.dir.dist}/docs/javadoc">
        <fileset dir="${build.javadoc}" />
    </copy>

    <!-- copy any raw libs; flatten drops the repository directory structure -->
    <copy todir="${build.dir.dist}/lib" flatten="true">
        <fileset dir="${lib.dir}" />
        <path refId="compile.classpath" />
        <path refId="runtime.classpath" />
    </copy>

    <!-- copy src -->
    <copy todir="${build.dir.dist}/src">
        <fileset dir="${basedir}/src" />
    </copy>

    <!-- copy project jar to dist dir -->
    <copy todir="${build.dir.dist}">
        <fileset file="${build.dir}/${jar.name}" />
    </copy>

    <!--
      copy documents
      The README is looked up in the doc directory as well as in the project
      root. Looking only in the root silently produced a distribution without
      a README when the file is kept under doc/.
    -->
    <copy todir="${build.dir.dist}">
        <fileset file="${basedir}/README" />
        <fileset file="${basedir}/doc/README" />
    </copy>
    <copy todir="${build.dir.dist}/docs">
        <fileset file="${basedir}/doc/eclipse-formatter.xml" />
    </copy>
    <copy todir="${build.dir.dist}/docs/licenses">
        <fileset dir="${basedir}/doc/licenses" />
    </copy>

    <!-- longfile="gnu" keeps long path names intact in the archive. -->
    <tar longfile="gnu" compression="gzip" destfile="${build.release.file}">
        <tarfileset dir="${build.dir.dist}" />
    </tar>
</target>
<!-- ================================================================== -->
<!-- Generating eclipse file -->
<!-- ================================================================== -->
<!--
  Regenerates the Eclipse project files: removes the old ones (clean-eclipse),
  runs the ant-eclipse task with the four src directories and the test-scope
  libraries, then post-processes .settings/org.eclipse.jdt.core.prefs.
-->
<target description="--> create the Eclipse project files"
        depends="mvn-init, clean-eclipse"
        name="eclipse">
    <!-- The task class is loaded from the compile-scope dependencies. -->
    <taskdef classpathref="compile.classpath"
             classname="prantl.ant.eclipse.EclipseTask"
             name="eclipse" />

    <mkdir dir="${build.dir.main-classes-eclipse}" />
    <mkdir dir="${build.dir.test-classes-eclipse}" />

    <eclipse>
        <settings>
            <jdtcore compilercompliance="6.0" />
            <resources encoding="UTF-8" />
        </settings>
        <project name="${ant.project.name}" />
        <classpath>
            <container path="org.eclipse.jdt.launching.JRE_CONTAINER/org.eclipse.jdt.internal.debug.ui.launcher.StandardVMType/JavaSE-1.6" />
            <source output="${build.dir.main-classes-eclipse}"
                    path="${basedir}/src/main/java" />
            <source output="${build.dir.main-classes-eclipse}"
                    path="${basedir}/src/main/resources" />
            <source output="${build.dir.test-classes-eclipse}"
                    path="${basedir}/src/test/java" />
            <source output="${build.dir.test-classes-eclipse}"
                    path="${basedir}/src/test/resources" />
            <output path="${build.dir.main-classes-eclipse}" />
            <library exported="false" pathref="test.classpath" />
        </classpath>
    </eclipse>

    <!--
      Append the formatter settings to the generated preferences file.
      NOTE(review): doc/eclipse-formatter.xml is an XML profile while the
      destination is a .prefs file; confirm Eclipse accepts the appended content.
    -->
    <concat append="true"
            destfile="${root.dir}/.settings/org.eclipse.jdt.core.prefs">
        <filelist files="eclipse-formatter.xml" dir="${root.dir}/doc/" />
    </concat>

    <!-- Rewrite the compliance value written above from "6.0" to "1.6". -->
    <replace value="org.eclipse.jdt.core.compiler.compliance=1.6"
             token="org.eclipse.jdt.core.compiler.compliance=6.0"
             file="${root.dir}/.settings/org.eclipse.jdt.core.prefs" />
</target>
<!-- Removes the Eclipse metadata (.classpath, .eclipse, .project, .settings) from the project directory. -->
<target description="--> clean the Eclipse project files" name="clean-eclipse">
    <delete dir=".settings" />
    <delete file=".project" />
    <delete file=".eclipse" />
    <delete file=".classpath" />
</target>
</project>

1
doc/README Normal file
View File

@ -0,0 +1 @@
Placeholder for real README

267
doc/eclipse-formatter.xml Normal file
View File

@ -0,0 +1,267 @@
<?xml version="1.0" encoding="UTF-8"?>
<profiles version="11">
<profile kind="CodeFormatterProfile" name="Bixo" version="11">
<setting id="org.eclipse.jdt.core.formatter.insert_space_before_closing_paren_in_if" value="do not insert"/>
<setting id="org.eclipse.jdt.core.formatter.insert_space_after_colon_in_assert" value="insert"/>
<setting id="org.eclipse.jdt.core.formatter.brace_position_for_enum_constant" value="end_of_line"/>
<setting id="org.eclipse.jdt.core.formatter.insert_space_before_semicolon" value="do not insert"/>
<setting id="org.eclipse.jdt.core.formatter.align_type_members_on_columns" value="false"/>
<setting id="org.eclipse.jdt.core.formatter.insert_space_before_colon_in_case" value="do not insert"/>
<setting id="org.eclipse.jdt.core.formatter.comment.format_line_comments" value="true"/>
<setting id="org.eclipse.jdt.core.formatter.number_of_empty_lines_to_preserve" value="1"/>
<setting id="org.eclipse.jdt.core.formatter.insert_space_between_brackets_in_array_type_reference" value="do not insert"/>
<setting id="org.eclipse.jdt.core.formatter.insert_space_after_opening_paren_in_switch" value="do not insert"/>
<setting id="org.eclipse.jdt.core.formatter.blank_lines_between_type_declarations" value="1"/>
<setting id="org.eclipse.jdt.core.formatter.insert_space_before_parenthesized_expression_in_return" value="insert"/>
<setting id="org.eclipse.jdt.core.formatter.insert_new_line_in_empty_method_body" value="insert"/>
<setting id="org.eclipse.jdt.core.formatter.insert_space_before_opening_brace_in_annotation_type_declaration" value="insert"/>
<setting id="org.eclipse.jdt.core.formatter.indent_statements_compare_to_body" value="true"/>
<setting id="org.eclipse.jdt.core.formatter.insert_new_line_after_opening_brace_in_array_initializer" value="do not insert"/>
<setting id="org.eclipse.jdt.core.formatter.insert_new_line_after_annotation_on_member" value="insert"/>
<setting id="org.eclipse.jdt.core.formatter.format_guardian_clause_on_one_line" value="false"/>
<setting id="org.eclipse.jdt.core.formatter.comment.insert_new_line_before_root_tags" value="insert"/>
<setting id="org.eclipse.jdt.core.formatter.insert_space_after_colon_in_for" value="insert"/>
<setting id="org.eclipse.jdt.core.formatter.tabulation.size" value="4"/>
<setting id="org.eclipse.jdt.core.formatter.insert_space_after_opening_angle_bracket_in_type_parameters" value="do not insert"/>
<setting id="org.eclipse.jdt.core.formatter.blank_lines_before_imports" value="1"/>
<setting id="org.eclipse.jdt.core.formatter.insert_space_after_colon_in_case" value="insert"/>
<setting id="org.eclipse.jdt.core.formatter.insert_space_before_comma_in_enum_constant_arguments" value="do not insert"/>
<setting id="org.eclipse.jdt.core.formatter.blank_lines_before_new_chunk" value="1"/>
<setting id="org.eclipse.jdt.core.formatter.continuation_indentation" value="4"/>
<setting id="org.eclipse.jdt.core.formatter.insert_space_before_binary_operator" value="insert"/>
<setting id="org.eclipse.jdt.core.formatter.insert_space_after_comma_in_constructor_declaration_parameters" value="insert"/>
<setting id="org.eclipse.jdt.core.formatter.insert_space_after_opening_paren_in_for" value="do not insert"/>
<setting id="org.eclipse.jdt.core.formatter.insert_space_before_comma_in_superinterfaces" value="do not insert"/>
<setting id="org.eclipse.jdt.core.formatter.alignment_for_parameters_in_method_declaration" value="16"/>
<setting id="org.eclipse.jdt.core.formatter.alignment_for_assignment" value="0"/>
<setting id="org.eclipse.jdt.core.formatter.blank_lines_before_member_type" value="1"/>
<setting id="org.eclipse.jdt.core.formatter.insert_space_before_comma_in_constructor_declaration_throws" value="do not insert"/>
<setting id="org.eclipse.jdt.core.formatter.alignment_for_conditional_expression" value="80"/>
<setting id="org.eclipse.jdt.core.formatter.insert_space_before_opening_paren_in_while" value="insert"/>
<setting id="org.eclipse.jdt.core.formatter.comment.indent_parameter_description" value="true"/>
<setting id="org.eclipse.jdt.core.formatter.comment.format_html" value="true"/>
<setting id="org.eclipse.jdt.core.formatter.insert_space_after_comma_in_allocation_expression" value="insert"/>
<setting id="org.eclipse.jdt.core.formatter.insert_space_after_comma_in_method_declaration_throws" value="insert"/>
<setting id="org.eclipse.jdt.core.formatter.insert_space_before_closing_paren_in_enum_constant" value="do not insert"/>
<setting id="org.eclipse.jdt.core.formatter.comment.format_source_code" value="true"/>
<setting id="org.eclipse.jdt.core.formatter.insert_space_after_comma_in_enum_declarations" value="insert"/>
<setting id="org.eclipse.jdt.core.formatter.insert_space_before_closing_angle_bracket_in_parameterized_type_reference" value="do not insert"/>
<setting id="org.eclipse.jdt.core.formatter.insert_space_before_closing_paren_in_annotation" value="do not insert"/>
<setting id="org.eclipse.jdt.core.formatter.insert_space_between_empty_parens_in_method_declaration" value="do not insert"/>
<setting id="org.eclipse.jdt.core.formatter.insert_space_before_colon_in_conditional" value="insert"/>
<setting id="org.eclipse.jdt.core.formatter.insert_space_before_unary_operator" value="do not insert"/>
<setting id="org.eclipse.jdt.core.formatter.insert_space_after_question_in_conditional" value="insert"/>
<setting id="org.eclipse.jdt.core.formatter.insert_new_line_in_empty_annotation_declaration" value="insert"/>
<setting id="org.eclipse.jdt.core.formatter.indentation.size" value="4"/>
<setting id="org.eclipse.jdt.core.formatter.insert_space_after_comma_in_multiple_local_declarations" value="insert"/>
<setting id="org.eclipse.jdt.core.formatter.insert_space_after_postfix_operator" value="do not insert"/>
<setting id="org.eclipse.jdt.core.formatter.alignment_for_superinterfaces_in_enum_declaration" value="16"/>
<setting id="org.eclipse.jdt.core.formatter.insert_space_after_comma_in_enum_constant_arguments" value="insert"/>
<setting id="org.eclipse.jdt.core.formatter.insert_space_before_semicolon_in_for" value="do not insert"/>
<setting id="org.eclipse.jdt.core.formatter.insert_space_before_closing_paren_in_constructor_declaration" value="do not insert"/>
<setting id="org.eclipse.jdt.core.formatter.insert_space_after_at_in_annotation_type_declaration" value="do not insert"/>
<setting id="org.eclipse.jdt.core.formatter.insert_space_after_comma_in_explicitconstructorcall_arguments" value="insert"/>
<setting id="org.eclipse.jdt.core.formatter.brace_position_for_anonymous_type_declaration" value="end_of_line"/>
<setting id="org.eclipse.jdt.core.formatter.lineSplit" value="200"/>
<setting id="org.eclipse.jdt.core.formatter.insert_space_before_opening_brace_in_type_declaration" value="insert"/>
<setting id="org.eclipse.jdt.core.formatter.insert_space_before_opening_brace_in_block" value="insert"/>
<setting id="org.eclipse.jdt.core.formatter.insert_new_line_in_empty_type_declaration" value="insert"/>
<setting id="org.eclipse.jdt.core.formatter.insert_space_before_comma_in_method_invocation_arguments" value="do not insert"/>
<setting id="org.eclipse.jdt.core.formatter.insert_space_before_closing_paren_in_while" value="do not insert"/>
<setting id="org.eclipse.jdt.core.formatter.insert_space_before_opening_brace_in_enum_constant" value="insert"/>
<setting id="org.eclipse.jdt.core.formatter.comment.clear_blank_lines_in_block_comment" value="false"/>
<setting id="org.eclipse.jdt.core.formatter.insert_space_before_at_in_annotation_type_declaration" value="insert"/>
<setting id="org.eclipse.jdt.core.formatter.insert_new_line_in_empty_enum_constant" value="insert"/>
<setting id="org.eclipse.jdt.core.formatter.insert_space_after_opening_angle_bracket_in_type_arguments" value="do not insert"/>
<setting id="org.eclipse.jdt.core.formatter.insert_space_before_opening_angle_bracket_in_type_parameters" value="do not insert"/>
<setting id="org.eclipse.jdt.core.formatter.insert_new_line_before_closing_brace_in_array_initializer" value="do not insert"/>
<setting id="org.eclipse.jdt.core.formatter.brace_position_for_array_initializer" value="end_of_line"/>
<setting id="org.eclipse.jdt.core.formatter.alignment_for_superclass_in_type_declaration" value="16"/>
<setting id="org.eclipse.jdt.core.formatter.insert_space_after_opening_paren_in_cast" value="do not insert"/>
<setting id="org.eclipse.jdt.core.formatter.insert_new_line_in_empty_enum_declaration" value="insert"/>
<setting id="org.eclipse.jdt.core.formatter.insert_space_before_closing_paren_in_synchronized" value="do not insert"/>
<setting id="org.eclipse.jdt.core.formatter.comment.format_header" value="false"/>
<setting id="org.eclipse.jdt.core.formatter.insert_space_before_colon_in_for" value="insert"/>
<setting id="org.eclipse.jdt.core.formatter.insert_space_after_at_in_annotation" value="do not insert"/>
<setting id="org.eclipse.jdt.core.formatter.insert_new_line_before_else_in_if_statement" value="do not insert"/>
<setting id="org.eclipse.jdt.core.formatter.alignment_for_arguments_in_explicit_constructor_call" value="16"/>
<setting id="org.eclipse.jdt.core.formatter.insert_space_after_opening_paren_in_method_declaration" value="do not insert"/>
<setting id="org.eclipse.jdt.core.formatter.insert_space_before_comma_in_allocation_expression" value="do not insert"/>
<setting id="org.eclipse.jdt.core.formatter.alignment_for_multiple_fields" value="16"/>
<setting id="org.eclipse.jdt.core.formatter.insert_new_line_at_end_of_file_if_missing" value="do not insert"/>
<setting id="org.eclipse.jdt.core.formatter.insert_space_before_comma_in_explicitconstructorcall_arguments" value="do not insert"/>
<setting id="org.eclipse.jdt.core.formatter.insert_new_line_in_empty_block" value="insert"/>
<setting id="org.eclipse.jdt.core.formatter.insert_space_after_closing_paren_in_cast" value="insert"/>
<setting id="org.eclipse.jdt.core.formatter.insert_new_line_before_finally_in_try_statement" value="do not insert"/>
<setting id="org.eclipse.jdt.core.formatter.keep_then_statement_on_same_line" value="false"/>
<setting id="org.eclipse.jdt.core.formatter.insert_space_after_binary_operator" value="insert"/>
<setting id="org.eclipse.jdt.core.formatter.indent_body_declarations_compare_to_annotation_declaration_header" value="true"/>
<setting id="org.eclipse.jdt.core.formatter.insert_space_before_opening_brace_in_constructor_declaration" value="insert"/>
<setting id="org.eclipse.jdt.core.formatter.insert_space_before_closing_paren_in_method_declaration" value="do not insert"/>
<setting id="org.eclipse.jdt.core.formatter.alignment_for_expressions_in_array_initializer" value="16"/>
<setting id="org.eclipse.jdt.core.formatter.insert_space_before_comma_in_method_declaration_parameters" value="do not insert"/>
<setting id="org.eclipse.jdt.core.formatter.brace_position_for_method_declaration" value="end_of_line"/>
<setting id="org.eclipse.jdt.core.formatter.insert_space_after_opening_paren_in_enum_constant" value="do not insert"/>
<setting id="org.eclipse.jdt.core.formatter.insert_space_between_empty_parens_in_annotation_type_member_declaration" value="do not insert"/>
<setting id="org.eclipse.jdt.core.formatter.insert_space_before_opening_angle_bracket_in_type_arguments" value="do not insert"/>
<setting id="org.eclipse.jdt.core.formatter.insert_space_after_opening_angle_bracket_in_parameterized_type_reference" value="do not insert"/>
<setting id="org.eclipse.jdt.core.formatter.insert_space_before_opening_paren_in_annotation_type_member_declaration" value="do not insert"/>
<setting id="org.eclipse.jdt.core.formatter.blank_lines_before_field" value="0"/>
<setting id="org.eclipse.jdt.core.formatter.alignment_for_throws_clause_in_method_declaration" value="16"/>
<setting id="org.eclipse.jdt.core.formatter.insert_space_before_opening_paren_in_method_declaration" value="do not insert"/>
<setting id="org.eclipse.jdt.core.formatter.insert_space_before_comma_in_constructor_declaration_parameters" value="do not insert"/>
<setting id="org.eclipse.jdt.core.formatter.insert_space_after_comma_in_type_parameters" value="insert"/>
<setting id="org.eclipse.jdt.core.formatter.insert_space_before_opening_paren_in_switch" value="insert"/>
<setting id="org.eclipse.jdt.core.formatter.comment.format_javadoc_comments" value="true"/>
<setting id="org.eclipse.jdt.core.formatter.insert_space_after_opening_bracket_in_array_allocation_expression" value="do not insert"/>
<setting id="org.eclipse.jdt.core.formatter.insert_space_before_opening_paren_in_annotation" value="do not insert"/>
<setting id="org.eclipse.jdt.core.formatter.comment.format_block_comments" value="true"/>
<setting id="org.eclipse.jdt.core.formatter.insert_space_before_opening_brace_in_array_initializer" value="insert"/>
<setting id="org.eclipse.jdt.core.formatter.insert_new_line_in_empty_anonymous_type_declaration" value="insert"/>
<setting id="org.eclipse.jdt.core.formatter.alignment_for_binary_expression" value="16"/>
<setting id="org.eclipse.jdt.core.formatter.insert_space_between_empty_braces_in_array_initializer" value="do not insert"/>
<setting id="org.eclipse.jdt.core.formatter.wrap_before_binary_operator" value="true"/>
<setting id="org.eclipse.jdt.core.formatter.blank_lines_after_package" value="1"/>
<setting id="org.eclipse.jdt.core.formatter.insert_space_before_opening_paren_in_catch" value="insert"/>
<setting id="org.eclipse.jdt.core.formatter.alignment_for_superinterfaces_in_type_declaration" value="16"/>
<setting id="org.eclipse.jdt.core.formatter.insert_space_before_colon_in_labeled_statement" value="do not insert"/>
<setting id="org.eclipse.jdt.core.compiler.codegen.inlineJsrBytecode" value="enabled"/>
<setting id="org.eclipse.jdt.core.formatter.insert_space_after_semicolon_in_for" value="insert"/>
<setting id="org.eclipse.jdt.core.formatter.insert_space_after_and_in_type_parameter" value="insert"/>
<setting id="org.eclipse.jdt.core.formatter.insert_space_after_opening_paren_in_catch" value="do not insert"/>
<setting id="org.eclipse.jdt.core.formatter.insert_new_line_before_while_in_do_statement" value="do not insert"/>
<setting id="org.eclipse.jdt.core.formatter.blank_lines_between_import_groups" value="1"/>
<setting id="org.eclipse.jdt.core.formatter.insert_space_before_comma_in_method_declaration_throws" value="do not insert"/>
<setting id="org.eclipse.jdt.core.formatter.insert_space_after_prefix_operator" value="do not insert"/>
<setting id="org.eclipse.jdt.core.formatter.insert_space_before_ellipsis" value="do not insert"/>
<setting id="org.eclipse.jdt.core.formatter.brace_position_for_constructor_declaration" value="end_of_line"/>
<setting id="org.eclipse.jdt.core.formatter.insert_space_after_question_in_wildcard" value="do not insert"/>
<setting id="org.eclipse.jdt.core.formatter.comment.clear_blank_lines_in_javadoc_comment" value="false"/>
<setting id="org.eclipse.jdt.core.formatter.alignment_for_arguments_in_allocation_expression" value="16"/>
<setting id="org.eclipse.jdt.core.formatter.insert_space_before_comma_in_type_parameters" value="do not insert"/>
<setting id="org.eclipse.jdt.core.formatter.blank_lines_after_imports" value="1"/>
<setting id="org.eclipse.jdt.core.formatter.insert_space_after_colon_in_conditional" value="insert"/>
<setting id="org.eclipse.jdt.core.formatter.insert_space_before_opening_brace_in_enum_declaration" value="insert"/>
<setting id="org.eclipse.jdt.core.formatter.insert_space_after_comma_in_parameterized_type_reference" value="insert"/>
<setting id="org.eclipse.jdt.core.formatter.insert_new_line_before_catch_in_try_statement" value="do not insert"/>
<setting id="org.eclipse.jdt.core.compiler.problem.assertIdentifier" value="error"/>
<setting id="org.eclipse.jdt.core.formatter.alignment_for_arguments_in_enum_constant" value="16"/>
<setting id="org.eclipse.jdt.core.formatter.brace_position_for_block_in_case" value="end_of_line"/>
<setting id="org.eclipse.jdt.core.formatter.brace_position_for_enum_declaration" value="end_of_line"/>
<setting id="org.eclipse.jdt.core.formatter.insert_space_before_comma_in_for_increments" value="do not insert"/>
<setting id="org.eclipse.jdt.core.formatter.insert_space_before_opening_paren_in_for" value="insert"/>
<setting id="org.eclipse.jdt.core.formatter.blank_lines_before_first_class_body_declaration" value="0"/>
<setting id="org.eclipse.jdt.core.formatter.keep_else_statement_on_same_line" value="false"/>
<setting id="org.eclipse.jdt.core.formatter.indent_empty_lines" value="false"/>
<setting id="org.eclipse.jdt.core.formatter.comment.insert_new_line_for_parameter" value="insert"/>
<setting id="org.eclipse.jdt.core.formatter.insert_space_before_parenthesized_expression_in_throw" value="insert"/>
<setting id="org.eclipse.jdt.core.formatter.insert_space_after_opening_paren_in_while" value="do not insert"/>
<setting id="org.eclipse.jdt.core.formatter.insert_space_after_closing_brace_in_block" value="insert"/>
<setting id="org.eclipse.jdt.core.compiler.source" value="1.5"/>
<setting id="org.eclipse.jdt.core.formatter.insert_space_after_comma_in_for_increments" value="insert"/>
<setting id="org.eclipse.jdt.core.formatter.indent_body_declarations_compare_to_enum_declaration_header" value="true"/>
<setting id="org.eclipse.jdt.core.formatter.insert_space_between_empty_parens_in_constructor_declaration" value="do not insert"/>
<setting id="org.eclipse.jdt.core.formatter.comment.line_length" value="80"/>
<setting id="org.eclipse.jdt.core.formatter.insert_space_before_prefix_operator" value="do not insert"/>
<setting id="org.eclipse.jdt.core.formatter.brace_position_for_type_declaration" value="end_of_line"/>
<setting id="org.eclipse.jdt.core.formatter.insert_space_after_assignment_operator" value="insert"/>
<setting id="org.eclipse.jdt.core.formatter.insert_new_line_after_annotation_on_parameter" value="do not insert"/>
<setting id="org.eclipse.jdt.core.compiler.compliance" value="1.5"/>
<setting id="org.eclipse.jdt.core.formatter.insert_space_before_opening_paren_in_method_invocation" value="do not insert"/>
<setting id="org.eclipse.jdt.core.formatter.insert_space_before_closing_angle_bracket_in_type_arguments" value="do not insert"/>
<setting id="org.eclipse.jdt.core.formatter.compact_else_if" value="true"/>
<setting id="org.eclipse.jdt.core.formatter.insert_space_before_opening_bracket_in_array_reference" value="do not insert"/>
<setting id="org.eclipse.jdt.core.formatter.insert_space_before_comma_in_enum_declarations" value="do not insert"/>
<setting id="org.eclipse.jdt.core.formatter.insert_space_before_question_in_conditional" value="insert"/>
<setting id="org.eclipse.jdt.core.formatter.insert_space_before_closing_angle_bracket_in_type_parameters" value="do not insert"/>
<setting id="org.eclipse.jdt.core.formatter.insert_space_after_opening_paren_in_method_invocation" value="do not insert"/>
<setting id="org.eclipse.jdt.core.formatter.use_tabs_only_for_leading_indentations" value="false"/>
<setting id="org.eclipse.jdt.core.formatter.insert_space_after_comma_in_type_arguments" value="insert"/>
<setting id="org.eclipse.jdt.core.formatter.brace_position_for_switch" value="end_of_line"/>
<setting id="org.eclipse.jdt.core.formatter.alignment_for_parameters_in_constructor_declaration" value="16"/>
<setting id="org.eclipse.jdt.core.formatter.insert_space_between_empty_brackets_in_array_allocation_expression" value="do not insert"/>
<setting id="org.eclipse.jdt.core.formatter.insert_space_before_closing_paren_in_for" value="do not insert"/>
<setting id="org.eclipse.jdt.core.formatter.insert_space_before_opening_paren_in_constructor_declaration" value="do not insert"/>
<setting id="org.eclipse.jdt.core.formatter.insert_space_before_opening_paren_in_synchronized" value="insert"/>
<setting id="org.eclipse.jdt.core.formatter.number_of_blank_lines_at_beginning_of_method_body" value="0"/>
<setting id="org.eclipse.jdt.core.formatter.insert_space_after_comma_in_annotation" value="insert"/>
<setting id="org.eclipse.jdt.core.formatter.insert_space_before_opening_angle_bracket_in_parameterized_type_reference" value="do not insert"/>
<setting id="org.eclipse.jdt.core.formatter.indent_switchstatements_compare_to_switch" value="true"/>
<setting id="org.eclipse.jdt.core.formatter.insert_space_after_opening_paren_in_constructor_declaration" value="do not insert"/>
<setting id="org.eclipse.jdt.core.formatter.insert_space_after_opening_paren_in_annotation" value="do not insert"/>
<setting id="org.eclipse.jdt.core.formatter.insert_space_after_opening_paren_in_if" value="do not insert"/>
<setting id="org.eclipse.jdt.core.formatter.insert_space_before_colon_in_default" value="do not insert"/>
<setting id="org.eclipse.jdt.core.compiler.problem.enumIdentifier" value="error"/>
<setting id="org.eclipse.jdt.core.formatter.insert_space_before_comma_in_annotation" value="do not insert"/>
<setting id="org.eclipse.jdt.core.formatter.insert_space_between_empty_parens_in_enum_constant" value="do not insert"/>
<setting id="org.eclipse.jdt.core.formatter.insert_space_after_opening_bracket_in_array_reference" value="do not insert"/>
<setting id="org.eclipse.jdt.core.formatter.insert_new_line_after_annotation_on_local_variable" value="insert"/>
<setting id="org.eclipse.jdt.core.formatter.insert_space_before_opening_bracket_in_array_type_reference" value="do not insert"/>
<setting id="org.eclipse.jdt.core.formatter.insert_space_after_comma_in_array_initializer" value="insert"/>
<setting id="org.eclipse.jdt.core.formatter.insert_space_before_closing_paren_in_catch" value="do not insert"/>
<setting id="org.eclipse.jdt.core.formatter.insert_space_after_opening_paren_in_synchronized" value="do not insert"/>
<setting id="org.eclipse.jdt.core.formatter.keep_empty_array_initializer_on_one_line" value="false"/>
<setting id="org.eclipse.jdt.core.compiler.codegen.targetPlatform" value="1.5"/>
<setting id="org.eclipse.jdt.core.formatter.insert_space_before_closing_bracket_in_array_reference" value="do not insert"/>
<setting id="org.eclipse.jdt.core.formatter.insert_space_before_closing_paren_in_switch" value="do not insert"/>
<setting id="org.eclipse.jdt.core.formatter.insert_space_before_closing_brace_in_array_initializer" value="insert"/>
<setting id="org.eclipse.jdt.core.formatter.alignment_for_compact_if" value="16"/>
<setting id="org.eclipse.jdt.core.formatter.insert_space_before_question_in_wildcard" value="do not insert"/>
<setting id="org.eclipse.jdt.core.formatter.insert_space_before_colon_in_assert" value="insert"/>
<setting id="org.eclipse.jdt.core.formatter.insert_space_before_opening_brace_in_method_declaration" value="insert"/>
<setting id="org.eclipse.jdt.core.formatter.insert_space_after_ellipsis" value="insert"/>
<setting id="org.eclipse.jdt.core.formatter.alignment_for_arguments_in_qualified_allocation_expression" value="16"/>
<setting id="org.eclipse.jdt.core.formatter.indent_statements_compare_to_block" value="true"/>
<setting id="org.eclipse.jdt.core.formatter.insert_space_after_opening_paren_in_parenthesized_expression" value="do not insert"/>
<setting id="org.eclipse.jdt.core.formatter.indent_body_declarations_compare_to_type_header" value="true"/>
<setting id="org.eclipse.jdt.core.formatter.insert_space_after_closing_angle_bracket_in_type_parameters" value="insert"/>
<setting id="org.eclipse.jdt.core.formatter.insert_space_before_comma_in_type_arguments" value="do not insert"/>
<setting id="org.eclipse.jdt.core.formatter.keep_imple_if_on_one_line" value="false"/>
<setting id="org.eclipse.jdt.core.formatter.insert_space_between_empty_parens_in_method_invocation" value="do not insert"/>
<setting id="org.eclipse.jdt.core.formatter.insert_space_before_comma_in_multiple_local_declarations" value="do not insert"/>
<setting id="org.eclipse.jdt.core.formatter.brace_position_for_annotation_type_declaration" value="end_of_line"/>
<setting id="org.eclipse.jdt.core.formatter.alignment_for_selector_in_method_invocation" value="16"/>
<setting id="org.eclipse.jdt.core.formatter.indent_body_declarations_compare_to_enum_constant_header" value="true"/>
<setting id="org.eclipse.jdt.core.formatter.insert_space_before_opening_brace_in_switch" value="insert"/>
<setting id="org.eclipse.jdt.core.formatter.insert_space_before_assignment_operator" value="insert"/>
<setting id="org.eclipse.jdt.core.formatter.never_indent_line_comments_on_first_column" value="false"/>
<setting id="org.eclipse.jdt.core.formatter.insert_space_after_unary_operator" value="do not insert"/>
<setting id="org.eclipse.jdt.core.formatter.insert_space_before_opening_paren_in_if" value="insert"/>
<setting id="org.eclipse.jdt.core.formatter.insert_space_after_colon_in_labeled_statement" value="insert"/>
<setting id="org.eclipse.jdt.core.formatter.insert_space_before_opening_bracket_in_array_allocation_expression" value="do not insert"/>
<setting id="org.eclipse.jdt.core.formatter.indent_switchstatements_compare_to_cases" value="false"/>
<setting id="org.eclipse.jdt.core.formatter.continuation_indentation_for_array_initializer" value="4"/>
<setting id="org.eclipse.jdt.core.formatter.comment.indent_root_tags" value="true"/>
<setting id="org.eclipse.jdt.core.formatter.alignment_for_enum_constants" value="0"/>
<setting id="org.eclipse.jdt.core.formatter.insert_space_before_comma_in_parameterized_type_reference" value="do not insert"/>
<setting id="org.eclipse.jdt.core.formatter.insert_space_before_closing_paren_in_parenthesized_expression" value="do not insert"/>
<setting id="org.eclipse.jdt.core.formatter.insert_space_after_comma_in_constructor_declaration_throws" value="insert"/>
<setting id="org.eclipse.jdt.core.formatter.alignment_for_throws_clause_in_constructor_declaration" value="16"/>
<setting id="org.eclipse.jdt.core.formatter.alignment_for_arguments_in_method_invocation" value="16"/>
<setting id="org.eclipse.jdt.core.formatter.tabulation.char" value="space"/>
<setting id="org.eclipse.jdt.core.formatter.blank_lines_before_package" value="0"/>
<setting id="org.eclipse.jdt.core.formatter.insert_space_after_comma_in_method_invocation_arguments" value="insert"/>
<setting id="org.eclipse.jdt.core.formatter.indent_breaks_compare_to_cases" value="true"/>
<setting id="org.eclipse.jdt.core.formatter.insert_space_before_comma_in_for_inits" value="do not insert"/>
<setting id="org.eclipse.jdt.core.formatter.insert_space_before_comma_in_multiple_field_declarations" value="do not insert"/>
<setting id="org.eclipse.jdt.core.formatter.insert_space_after_comma_in_superinterfaces" value="insert"/>
<setting id="org.eclipse.jdt.core.formatter.put_empty_statement_on_new_line" value="true"/>
<setting id="org.eclipse.jdt.core.formatter.insert_space_after_comma_in_method_declaration_parameters" value="insert"/>
<setting id="org.eclipse.jdt.core.formatter.blank_lines_before_method" value="1"/>
<setting id="org.eclipse.jdt.core.formatter.insert_space_after_comma_in_multiple_field_declarations" value="insert"/>
<setting id="org.eclipse.jdt.core.formatter.insert_space_after_comma_in_for_inits" value="insert"/>
<setting id="org.eclipse.jdt.core.formatter.insert_space_before_opening_brace_in_anonymous_type_declaration" value="insert"/>
<setting id="org.eclipse.jdt.core.formatter.insert_space_before_closing_paren_in_cast" value="do not insert"/>
<setting id="org.eclipse.jdt.core.formatter.insert_space_after_closing_angle_bracket_in_type_arguments" value="insert"/>
<setting id="org.eclipse.jdt.core.formatter.never_indent_block_comments_on_first_column" value="false"/>
<setting id="org.eclipse.jdt.core.formatter.insert_space_before_comma_in_array_initializer" value="do not insert"/>
<setting id="org.eclipse.jdt.core.formatter.insert_space_before_opening_paren_in_parenthesized_expression" value="do not insert"/>
<setting id="org.eclipse.jdt.core.formatter.insert_space_before_opening_paren_in_enum_constant" value="do not insert"/>
<setting id="org.eclipse.jdt.core.formatter.insert_space_before_closing_paren_in_method_invocation" value="do not insert"/>
<setting id="org.eclipse.jdt.core.formatter.insert_space_before_postfix_operator" value="do not insert"/>
<setting id="org.eclipse.jdt.core.formatter.brace_position_for_block" value="end_of_line"/>
<setting id="org.eclipse.jdt.core.formatter.insert_space_after_opening_brace_in_array_initializer" value="insert"/>
<setting id="org.eclipse.jdt.core.formatter.insert_space_before_closing_bracket_in_array_allocation_expression" value="do not insert"/>
<setting id="org.eclipse.jdt.core.formatter.insert_space_before_and_in_type_parameter" value="insert"/>
</profile>
</profiles>

1
doc/licenses/README Normal file
View File

@ -0,0 +1 @@
This directory should contain licenses for all of the dependent jars used by the project.

Binary file not shown.

95
pom.xml Normal file
View File

@ -0,0 +1,95 @@
<?xml version="1.0" encoding="UTF-8"?>
<!-- Maven project descriptor for crawler-commons: project coordinates, license, SCM location, extra repositories and dependencies. -->
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
<modelVersion>4.0.0</modelVersion>
<!-- Project coordinates (the version element appears further down, next to the packaging element) -->
<groupId>crawlercommons</groupId>
<artifactId>crawler-commons</artifactId>
<name>Crawler-commons</name>
<licenses>
<license>
<name>Apache License, Version 2.0</name>
<url>http://www.opensource.org/licenses/apache2.0.php</url>
</license>
</licenses>
<url>http://code.google.com/p/crawler-commons/</url>
<description>crawler-commons is a set of reusable Java components that implement functionality common to any web crawler.</description>
<scm>
<connection>scm:svn:http://crawler-commons.googlecode.com/svn/trunk/</connection>
<!-- NOTE(review): this url repeats the scm: connection string; Maven's POM reference describes scm/url as a browsable repository URL - confirm and adjust if needed -->
<url>scm:svn:http://crawler-commons.googlecode.com/svn/trunk/</url>
</scm>
<packaging>jar</packaging>
<!-- Must be kept in sync with "version" value in build.properties -->
<version>0.1-SNAPSHOT</version>
<!-- Additional repository consulted when resolving dependencies -->
<repositories>
<repository>
<id>Apache Snapshots</id>
<name>Apache snapshot repository</name>
<url>https://repository.apache.org/content/groups/snapshots-group/</url>
</repository>
</repositories>
<dependencies>
<!-- Compile time dependencies -->
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
<version>4.0</version>
</dependency>
<dependency>
<groupId>log4j</groupId>
<artifactId>log4j</artifactId>
<version>1.2.14</version>
</dependency>
<!-- Test dependencies -->
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>4.7</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.mockito</groupId>
<artifactId>mockito-core</artifactId>
<version>1.8.0</version>
<scope>test</scope>
</dependency>
<dependency>
<groupId>jetty</groupId>
<artifactId>jetty</artifactId>
<!-- we'd like to use 6.0.2, but the version in central is missing the pom -->
<version>5.1.10</version>
<scope>test</scope>
</dependency>
<!-- Provided dependencies -->
<dependency>
<groupId>javax.servlet</groupId>
<artifactId>servlet-api</artifactId>
<version>2.5</version>
<scope>provided</scope>
</dependency>
<!-- Eclipse project generation dependencies -->
<!-- NOTE(review): "eclipse" is not one of Maven's standard dependency scopes; presumably it is a custom scope picked up by the Ant build (build.xml) - confirm -->
<dependency>
<groupId>ant-eclipse</groupId>
<artifactId>ant-eclipse-jvm1.2</artifactId>
<version>1.0</version>
<scope>eclipse</scope>
</dependency>
</dependencies>
</project>

View File

@ -0,0 +1,118 @@
package crawlercommons.url;
import java.net.URL;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Locale;
import java.util.Set;
import java.util.regex.Pattern;

import org.apache.log4j.Logger;
/**
 * Routines to extract the PLD (paid-level domain, as per the IRLbot paper) from a hostname or URL.
 *
 * <p>The extraction is a heuristic driven by two built-in lists of top-level domains (country-code
 * and generic). TLD labels are matched case-insensitively, using locale-independent lower-casing so
 * that the result does not depend on the JVM's default locale. The returned PLD keeps the
 * character case of the input hostname.
 */
public class PaidLevelDomain {

    private static final Logger LOGGER = Logger.getLogger(PaidLevelDomain.class);

    /** Space-separated list of country-code TLDs recognized by {@link #getPLD(String)}. */
    private static final String CC_TLDS =
        "ac ad ae af ag ai al am an ao aq ar as at au aw ax az ba bb bd be bf bg bh bi " +
        "bj bl bm bn bo br bs bt bv bw by bz ca cc cd cf cg ch ci ck cl cm cn co cr cu " +
        "cv cx cy cz de dj dk dm do dz ec ee eg eh er es et eu fi fj fk fm fo fr ga gb " +
        "gd ge gf gg gh gi gl gm gn gp gq gr gs gt gu gw gy hk hm hn hr ht hu id ie il " +
        "im in io iq ir is it je jm jo jp ke kg kh ki km kn kp kr kw ky kz la lb lc li " +
        "lk lr ls lt lu lv ly ma mc md me mf mg mh mk ml mm mn mo mp mq mr ms mt mu mv " +
        "mw mx my mz na nc ne nf ng ni nl no np nr nu nz om pa pe pf pg ph pk pl pm pn " +
        "pr ps pt pw py qa re ro rs ru rw sa sb sc sd se sg sh si sj sk sl sm sn so sr " +
        "st su sv sy sz tc td tf tg th tj tk tl tm tn to tp tr tt tv tw tz ua ug uk um " +
        "us uy uz va vc ve vg vi vn vu wf ws ye yt yu za zm zw";

    /** Space-separated list of generic TLDs recognized by {@link #getPLD(String)}. */
    private static final String G_TLDS =
        "aero arpa asia biz cat com coop edu gov info int jobs mil mobi museum name net " +
        "org pro tel";

    private static final Set<String> ccTLDs = new HashSet<String>(Arrays.asList(CC_TLDS.split(" ")));
    private static final Set<String> gTLDs = new HashSet<String>(Arrays.asList(G_TLDS.split(" ")));

    /** Matches dotted-quad text such as 1.2.3.4 (the numeric range of each part is not checked). */
    private static final Pattern IPV4_ADDRESS_PATTERN = Pattern.compile("(?:\\d{1,3}\\.){3}\\d{1,3}");

    /**
     * Extract the PLD (paid-level domain) from the hostname. If the format isn't recognized,
     * the original hostname is returned.
     *
     * <p>Bracketed IPv6 literals, dotted-quad IPv4 addresses and hostnames with at most two
     * labels are returned unchanged.
     *
     * @param hostname - hostname from URL, e.g. www.domain.com.it
     * @return - PLD, e.g. domain.com.it
     * @throws NullPointerException if hostname is null
     */
    public static String getPLD(String hostname) {
        // First, check for weird [HHHH:HH::H] IPv6 format.
        if (hostname.startsWith("[") && hostname.endsWith("]")) {
            return hostname;
        }

        String[] subNames = hostname.split("\\.");
        int numPieces = subNames.length;
        if (numPieces <= 2) {
            return hostname;
        }

        // Check for ddd.ddd.ddd.ddd IPv4 format
        if ((numPieces == 4) && (IPV4_ADDRESS_PATTERN.matcher(hostname).matches())) {
            return hostname;
        }

        // Lower-case with a fixed locale: the no-argument toLowerCase() uses the default
        // locale, and under e.g. a Turkish default "IT" or "INFO" would not map to the ASCII
        // entries in the TLD sets.
        String lastPiece = subNames[numPieces - 1].toLowerCase(Locale.ROOT);
        String nextToLastPiece = subNames[numPieces - 2].toLowerCase(Locale.ROOT);

        int firstHostPiece = 0;
        if (ccTLDs.contains(lastPiece)) {
            // We have a country code at the end. See if the preceding piece is either
            // a two-letter name (country code or funky short gTLD), or one of the
            // "well-known" gTLDs.
            if (subNames[numPieces - 2].length() <= 2) {
                // Must be xxx.co.jp format
                firstHostPiece = numPieces - 3;
            } else if (gTLDs.contains(nextToLastPiece)) {
                // Must be xxx.com.mx format
                firstHostPiece = numPieces - 3;
            } else {
                // Must be xxx.it format
                firstHostPiece = numPieces - 2;
            }
        } else if (gTLDs.contains(lastPiece)) {
            if (ccTLDs.contains(nextToLastPiece)) {
                // Must be xxx.de.com format
                firstHostPiece = numPieces - 3;
            } else {
                // Must be xxx.com format
                firstHostPiece = numPieces - 2;
            }
        } else {
            LOGGER.debug("Unknown format for hostname: " + hostname);
        }

        // Index 0 means either the format was not recognized or the whole hostname already
        // is the PLD; in both cases the input is handed back untouched.
        if (firstHostPiece == 0) {
            return hostname;
        }

        // Build the result from the firstHostPiece to numPieces pieces, joined by '.'.
        StringBuilder result = new StringBuilder(subNames[firstHostPiece]);
        for (int i = firstHostPiece + 1; i < numPieces; i++) {
            result.append('.').append(subNames[i]);
        }
        return result.toString();
    } // getPLD

    /**
     * Extract the PLD (paid-level domain) from the URL, by applying
     * {@link #getPLD(String)} to the URL's host.
     *
     * @param url - Valid URL, e.g. http://www.domain.com.it
     * @return - PLD e.g. domain.com.it
     */
    public static String getPLD(URL url) {
        return getPLD(url.getHost());
    } // getPLD
}

View File

@ -0,0 +1,59 @@
package crawlercommons.url;
import static org.junit.Assert.assertEquals;
import java.net.InetAddress;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.UnknownHostException;
import org.junit.Test;
/**
 * Unit tests for {@link PaidLevelDomain#getPLD(String)} and {@link PaidLevelDomain#getPLD(URL)}.
 *
 * <p>Every test method carries {@code @Test}; JUnit 4 only runs annotated methods, so a method
 * without the annotation is silently skipped.
 */
public class PaidLevelDomainTest {

    // Dotted-quad IPv4 addresses are returned unchanged, both as a string and via a URL.
    @Test
    public final void testIPv4() throws MalformedURLException {
        assertEquals("1.2.3.4", PaidLevelDomain.getPLD("1.2.3.4"));

        URL url = new URL("http://1.2.3.4:8080/a/b/c?_queue=1");
        assertEquals("1.2.3.4", PaidLevelDomain.getPLD(url));
    }

    // An IPv6 host comes back from the URL in bracketed form and is returned as-is.
    @Test
    public final void testIPv6() throws MalformedURLException, UnknownHostException {
        InetAddress inet = InetAddress.getByName("1080:0:0:0:8:800:200c:417a");
        URL url = new URL("http", inet.getHostAddress(), 8080, "a/b/c");
        assertEquals("[1080:0:0:0:8:800:200c:417a]", PaidLevelDomain.getPLD(url));
    }

    @Test
    public final void testStandardDomains() throws MalformedURLException {
        assertEquals("xxx.com", PaidLevelDomain.getPLD("xxx.com"));
        assertEquals("xxx.com", PaidLevelDomain.getPLD("www.xxx.com"));
        assertEquals("xxx.com", PaidLevelDomain.getPLD("www.zzz.xxx.com"));
        assertEquals("xxx.com", PaidLevelDomain.getPLD(new URL("https://www.zzz.xxx.com:9000/a/b?c=d")));
    }

    @Test
    public final void testBizDomains() {
        assertEquals("xxx.biz", PaidLevelDomain.getPLD("xxx.biz"));
        assertEquals("xxx.biz", PaidLevelDomain.getPLD("www.xxx.biz"));
    }

    // Japan (and uk) have shortened gTLDs before the country code.
    @Test
    public final void testJapaneseDomains() {
        assertEquals("xxx.co.jp", PaidLevelDomain.getPLD("xxx.co.jp"));
        assertEquals("xxx.co.jp", PaidLevelDomain.getPLD("www.xxx.co.jp"));
        assertEquals("xxx.ne.jp", PaidLevelDomain.getPLD("www.xxx.ne.jp"));
    }

    // In Germany you can have xxx.de.com
    @Test
    public final void testGermanDomains() {
        assertEquals("xxx.de.com", PaidLevelDomain.getPLD("xxx.de.com"));
        assertEquals("xxx.de.com", PaidLevelDomain.getPLD("www.xxx.de.com"));
    }

    // Typical international domains look like xxx.com.it
    @Test
    public final void testItalianDomains() {
        assertEquals("xxx.com.it", PaidLevelDomain.getPLD("xxx.com.it"));
        assertEquals("xxx.com.it", PaidLevelDomain.getPLD("www.xxx.com.it"));
    }
}