1
0
mirror of https://github.com/crawler-commons/crawler-commons synced 2024-09-21 17:01:35 +02:00

Add HTTP status code and reason to FetchedResult.

https://code.google.com/p/crawler-commons/issues/detail?id=6
This commit is contained in:
kkrugler_lists@transpac.com 2013-01-23 23:05:21 +00:00
parent 4653ced854
commit 7300003e13
5 changed files with 51 additions and 9 deletions

View File

@ -33,7 +33,8 @@ public class FetchedResult {
private final String _newBaseUrl;
private final int _numRedirects;
private final String _hostAddress;
private final int _statusCode; // HTTP status code
private final String _reasonPhrase; // HTTP reason phrase, or null
private Payload _payload;
public FetchedResult( String baseUrl,
@ -46,7 +47,9 @@ public class FetchedResult {
Payload payload,
String newBaseUrl,
int numRedirects,
String hostAddress){
String hostAddress,
int statusCode,
String reasonPhrase) {
_payload = payload;
if (baseUrl == null) {
@ -83,6 +86,8 @@ public class FetchedResult {
_newBaseUrl = newBaseUrl;
_numRedirects = numRedirects;
_hostAddress = hostAddress;
_statusCode = statusCode;
_reasonPhrase = reasonPhrase;
}
public Payload getPayload() {
@ -136,4 +141,12 @@ public class FetchedResult {
/**
 * Returns the host address that was supplied when this result was constructed.
 *
 * @return the stored host address value
 */
public String getHostAddress() {
    return this._hostAddress;
}
/**
 * Returns the HTTP status code that was supplied when this result was constructed.
 *
 * @return the stored HTTP status code
 */
public int getStatusCode() {
    return this._statusCode;
}
/**
 * Returns the HTTP reason phrase that was supplied when this result was constructed.
 *
 * @return the stored reason phrase, or {@code null} if none was provided
 */
public String getReasonPhrase() {
    return this._reasonPhrase;
}
}

View File

@ -86,7 +86,7 @@ public class SimpleFileFetcher extends BaseFetcher {
long responseRate = (content.length * 1000L) / totalReadTime;
String contentType = "application/octet-stream";
return new FetchedResult(url, url, System.currentTimeMillis(), new Metadata(), content, contentType,
(int)responseRate, payload, url, 0, "localhost");
(int)responseRate, payload, url, 0, "localhost", HttpStatus.SC_OK, null);
} catch (FileNotFoundException e) {
throw new HttpFetchException(url, "Error fetching " + url, HttpStatus.SC_NOT_FOUND, new Metadata());
} catch (IOException e) {

View File

@ -490,6 +490,8 @@ public class SimpleHttpFetcher extends BaseHttpFetcher {
String contentType = "";
String mimeType = "";
String hostAddress = null;
int statusCode = HttpStatus.SC_INTERNAL_SERVER_ERROR;
String reasonPhrase = null;
// Create a local instance of cookie store, and bind to local context
// Without this we get killed w/lots of threads, due to sync() on single cookie store.
@ -513,9 +515,11 @@ public class SimpleHttpFetcher extends BaseHttpFetcher {
headerMap.add(header.getName(), header.getValue());
}
int httpStatus = response.getStatusLine().getStatusCode();
statusCode = response.getStatusLine().getStatusCode();
reasonPhrase = response.getStatusLine().getReasonPhrase();
if (LOGGER.isTraceEnabled()) {
fetchTrace.append("; status code: " + httpStatus);
fetchTrace.append("; status code: " + statusCode);
if (headerMap.get(HttpHeaders.CONTENT_LENGTH) != null) {
fetchTrace.append("; Content-Length: " + headerMap.get(HttpHeaders.CONTENT_LENGTH));
}
@ -525,9 +529,9 @@ public class SimpleHttpFetcher extends BaseHttpFetcher {
}
}
if ((httpStatus < 200) || (httpStatus >= 300)) {
if ((statusCode < 200) || (statusCode >= 300)) {
// We can't just check against SC_OK, as some wackos return 201, 202, etc
throw new HttpFetchException(url, "Error fetching " + url, httpStatus, headerMap);
throw new HttpFetchException(url, "Error fetching " + url + " due to \"" + reasonPhrase + "\"", statusCode, headerMap);
}
redirectedUrl = extractRedirectedUrl(url, localContext);
@ -754,7 +758,9 @@ public class SimpleHttpFetcher extends BaseHttpFetcher {
payload,
newBaseUrl,
numRedirects,
hostAddress);
hostAddress,
statusCode,
reasonPhrase);
}
private boolean isTextMimeType(String mimeType) {

View File

@ -40,6 +40,7 @@ public class SimpleFileFetcherTest {
SimpleFileFetcher fetcher = new SimpleFileFetcher();
FetchedResult result = fetcher.get(url);
Assert.assertEquals(0, result.getNumRedirects());
Assert.assertEquals(HttpStatus.SC_OK, result.getStatusCode());
String fetchedContent = new String(result.getContent(), "us-ascii");
Assert.assertEquals("Now is the time for all good men to come to the aid of their country.", fetchedContent);
@ -66,6 +67,7 @@ public class SimpleFileFetcherTest {
SimpleFileFetcher fetcher = new SimpleFileFetcher();
FetchedResult result = fetcher.get(url);
Assert.assertEquals(HttpStatus.SC_OK, result.getStatusCode());
Assert.assertEquals(0, result.getContentLength());
Assert.assertEquals(0, result.getResponseRate());
}

View File

@ -47,6 +47,7 @@ import crawlercommons.fetcher.AbortedFetchException;
import crawlercommons.fetcher.AbortedFetchReason;
import crawlercommons.fetcher.BaseFetcher;
import crawlercommons.fetcher.FetchedResult;
import crawlercommons.fetcher.HttpFetchException;
import crawlercommons.fetcher.IOFetchException;
import crawlercommons.fetcher.Payload;
import crawlercommons.fetcher.RedirectFetchException;
@ -84,6 +85,7 @@ public class SimpleHttpFetcherTest {
return _webServer.getServer();
}
// TODO - merge this code with RedirectResponseHandler class in crawlercommons.test package.
@SuppressWarnings("serial")
private class RedirectResponseHandler extends AbstractHttpHandler {
@ -245,7 +247,7 @@ public class SimpleHttpFetcherTest {
String url = "http://localhost:8089/test.html";
FetchedResult result = fetcher.get(url);
assertEquals(HttpStatus.SC_OK, result.getStatusCode());
assertTrue("Content size should be truncated", result.getContent().length <= fetcher.getDefaultMaxContentSize());
}
@ -259,7 +261,9 @@ public class SimpleHttpFetcherTest {
String urlToFetch = "http://localhost:8089/karlie.html";
FetchedResult result1 = fetcher.get(urlToFetch);
assertEquals(HttpStatus.SC_OK, result1.getStatusCode());
FetchedResult result2 = fetcher.get(urlToFetch);
assertEquals(HttpStatus.SC_OK, result2.getStatusCode());
// Verify that we got the same data from each fetch request.
assertEquals(1000, result1.getContent().length);
@ -445,4 +449,21 @@ public class SimpleHttpFetcherTest {
assertEquals("127.0.0.1", hostAddress);
}
/**
 * Requests a URL that the test expects to be absent from the local test
 * server and verifies that the fetch fails with an {@link HttpFetchException}
 * reporting a 404 status and a message containing the reason phrase.
 */
@Test
public final void testMissingPage() throws Exception {
    startServer(new ResourcesResponseHandler(), 8089);

    final BaseFetcher httpFetcher = new SimpleHttpFetcher(1, TestUtils.CC_TEST_AGENT);
    final String missingUrl = "http://localhost:8089/this-page-will-not-exist.html";

    try {
        httpFetcher.get(missingUrl);
        fail("Should have thrown exception");
    } catch (HttpFetchException fetchError) {
        assertEquals(HttpStatus.SC_NOT_FOUND, fetchError.getHttpStatus());

        // The reason phrase is expected to be embedded in the exception message.
        final String failureMessage = fetchError.getMessage();
        assertTrue(failureMessage.contains("Not Found"));
    }
}
}