From 7419e96d7426664e05898b9024955194a40db417 Mon Sep 17 00:00:00 2001 From: Sebastian Nagel Date: Sat, 9 Nov 2019 14:49:45 +0100 Subject: [PATCH] [BasicNormalizer] Empty path to be normalized to / even if there is an (empty) query, fixes #247 --- CHANGES.txt | 2 +- .../crawlercommons/filters/basic/BasicURLNormalizer.java | 5 +++++ src/test/resources/normalizer/weirdToNormalizedUrls.csv | 2 ++ 3 files changed, 8 insertions(+), 1 deletion(-) diff --git a/CHANGES.txt b/CHANGES.txt index d28e267..b35d421 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -1,7 +1,7 @@ Crawler-Commons Change Log Current Development 1.1-SNAPSHOT (yyyy-mm-dd) -- [BasicNormalizer] Trailing question mark in url query test should be added (Chaiavi, sebastian-nagel) #247 +- [BasicNormalizer] Empty path before query to be normalized to `/` (Chaiavi, sebastian-nagel) #247 - EffectiveTldFinder to validate returned domain names for length restrictions (sebastian-nagel, Chaiavi) #251 - Upgrade unit tests to use JUnit v5.x and parameterized tests (Chaiavi) #249, #253, #255 - [Robots] Robots parser to always handle absolute sitemap URL even without valid base URL (pr3mar, kkrugler, sebastian-nagel) #240 diff --git a/src/main/java/crawlercommons/filters/basic/BasicURLNormalizer.java b/src/main/java/crawlercommons/filters/basic/BasicURLNormalizer.java index 5a4a72f..0ba8576 100644 --- a/src/main/java/crawlercommons/filters/basic/BasicURLNormalizer.java +++ b/src/main/java/crawlercommons/filters/basic/BasicURLNormalizer.java @@ -130,6 +130,9 @@ public class BasicURLNormalizer extends URLFilter { if (file == null || "".equals(file)) { // add a slash file = "/"; changed = true; + } else if (!file.startsWith("/")) { + file = "/" + file; + changed = true; } if (url.getRef() != null) { // remove the ref @@ -196,6 +199,8 @@ public class BasicURLNormalizer extends URLFilter { // if path is empty return a single slash if (file.isEmpty()) { file = "/"; + } else if (!file.startsWith("/")) { + file = "/" + file; } return file; diff --git a/src/test/resources/normalizer/weirdToNormalizedUrls.csv b/src/test/resources/normalizer/weirdToNormalizedUrls.csv index 3b20d8f..8c1f523 100644 --- a/src/test/resources/normalizer/weirdToNormalizedUrls.csv +++ b/src/test/resources/normalizer/weirdToNormalizedUrls.csv @@ -130,3 +130,5 @@ http:///////, http:/ # empty path with trailing question mark (empty query) #247 http://example.com?,http://example.com/? +http://example.com?a=1,http://example.com/?a=1 +