1
0
mirror of https://github.com/crawler-commons/crawler-commons synced 2024-09-20 08:42:15 +02:00

Upgrade the toString() method of the Base/Simple RobotRules #264

This commit is contained in:
Avi Hayun 2020-04-29 20:00:43 +03:00
parent 95207d0928
commit cb21c29a57
3 changed files with 16 additions and 13 deletions

View File

@ -1,15 +1,17 @@
Crawler-Commons Change Log
Current Development 1.1-SNAPSHOT (yyyy-mm-dd)
- [Robots] Upgrade the toString() method of the Base/Simple RobotRules (Avi Hayun) #264
- Upgrade GitIgnore (Avi Hayun) #260
- [Robots] Deduplicate sitemap links (sebastian-nagel) #261
- EffectiveTldFinder to log loading of public suffix list (sebastian-nagel) #284
- SiteMapParser getPublicationDate in VideoAttributes may throw NPE (panthony, sebastian-nagel) #283
- SimpleRobotRulesParser: Trim log messages (jnioche, sebastian-nagel) #281
- SimpleRobotRulesParser: counter _numWarnings not thread-safe (sebastian-nagel, kkrugler) #278
- ParameterizedTest not executed by mvn builds (sebastian-nagel) #273
- [BasicNormalizer] Empty path before query to be normalized to `/` (Chaiavi, sebastian-nagel) #247
- EffectiveTldFinder to validate returned domain names for length restrictions (sebastian-nagel, Chaiavi) #251
- Upgrade unit tests to use JUnit v5.x and parameterized tests (Chaiavi) #249, #253, #255
- [BasicNormalizer] Empty path before query to be normalized to `/` (Avi Hayun, sebastian-nagel) #247
- EffectiveTldFinder to validate returned domain names for length restrictions (sebastian-nagel, Avi Hayun) #251
- Upgrade unit tests to use JUnit v5.x and parameterized tests (Avi Hayun) #249, #253, #255
- [Robots] Robots parser to always handle absolute sitemap URL even without valid base URL (pr3mar, kkrugler, sebastian-nagel) #240
Release 1.0 (2019-03-19)

View File

@ -41,7 +41,7 @@ public abstract class BaseRobotRules implements Serializable {
private LinkedHashSet<String> _sitemaps;
public BaseRobotRules() {
_sitemaps = new LinkedHashSet<String>();
_sitemaps = new LinkedHashSet<>();
}
public long getCrawlDelay() {
@ -116,13 +116,15 @@ public abstract class BaseRobotRules implements Serializable {
sb.append(" - crawl delay: ").append(delay).append('\n');
}
int nSitemaps = getSitemaps().size();
List<String> sitemaps = getSitemaps();
int nSitemaps = sitemaps.size();
if (nSitemaps == 0) {
sb.append(" - no sitemap URLs\n");
} else {
sb.append(" - number of sitemap URLs: ").append(nSitemaps).append('\n');
if (nSitemaps <= 10) {
sb.append(String.join("\n", getSitemaps())).append("\n\n");
int numOfSitemapsToShow = Math.min(nSitemaps, 10);
for (int i = 0; i < numOfSitemapsToShow; i++) {
sb.append(sitemaps.get(i)).append("\n");
}
}

View File

@ -129,7 +129,7 @@ public class SimpleRobotRules extends BaseRobotRules {
super();
_mode = mode;
_rules = new ArrayList<RobotRule>();
_rules = new ArrayList<>();
}
public void clearRules() {
@ -366,11 +366,10 @@ public class SimpleRobotRules extends BaseRobotRules {
sb.append('\n');
} else {
sb.append(" - number of rules: ").append(nRules).append('\n');
if (nRules <= 10) {
for (int i = 0; i < nRules; i++) {
RobotRule r = _rules.get(i);
sb.append(r._allow ? " A" : " Disa").append("llow: ").append(r._prefix).append('\n');
}
int numOfRulesToShow = Math.min(nRules, 10);
for (int i = 0; i < numOfRulesToShow; i++) {
RobotRule r = _rules.get(i);
sb.append(r._allow ? " A" : " Disa").append("llow: ").append(r._prefix).append('\n');
}
}
return sb.toString();