chore: extend argparse + reword exception messages

* also add an example urls2crawl text file with one url per line
This commit is contained in:
surtur 2020-12-12 21:25:34 +01:00
parent a82ff94e6a
commit 4d468d13ef
Signed by: wanderer
GPG Key ID: 19CE1EC1D9E0486D
2 changed files with 41 additions and 6 deletions

View File

@ -89,8 +89,8 @@ class ParserCallback extends HTMLEditorKit.ParserCallback {
System.err.println("Adding URI: "+uri.toString());
}
} catch (Exception e) {
System.err.println("Nalezeno nekorektn<74> URI: "+href);
e.printStackTrace();
System.err.println("Bad bad URI found: "+href);
System.err.println(e);
}
}
@ -117,16 +117,35 @@ class ParserCallback extends HTMLEditorKit.ParserCallback {
public class crawler {
public static void main(String[] args) {
if (args.length < 1) {
System.err.println("Missing parameter - start URL");
System.err.println("Not enough parameters");
return;
}
LinkedList<URIinfo> foundURIs=new LinkedList<URIinfo>();
HashSet<URI> visitedURIs=new HashSet<URI>();
URI uri;
try {
uri = new URI(args[0]+"/");
foundURIs.add(new URIinfo(uri, 0));
visitedURIs.add(uri);
/*
 * Seed the crawl queue.
 * With "-f <file>", every non-blank line of <file> is taken as a start URL;
 * otherwise args[0] itself is the start URL. A trailing "/" is appended to
 * any seed that lacks one.
 */
if (args[0].equals("-f")) {
    /* "-f" needs a file name after it; bail out instead of hitting an AIOOBE */
    if (args.length < 2) {
        System.err.println("Not enough parameters");
        return;
    }
    Scanner sc = new Scanner(new File(args[1]));
    try {
        while (sc.hasNextLine()) {
            String nuurlpart = sc.nextLine().trim();
            /* a blank line would otherwise turn into the bogus seed "/" */
            if (nuurlpart.isEmpty()) {
                continue;
            }
            if (!nuurlpart.endsWith("/")) {
                nuurlpart += "/";
            }
            uri = new URI(nuurlpart);
            /* Set.add returns false for a repeat, so each seed is queued once */
            if (visitedURIs.add(uri)) {
                foundURIs.add(new URIinfo(uri, 0));
            }
        }
    } finally {
        /* release the file handle even when a line is not a valid URI */
        sc.close();
    }
} else {
    String nuurlpart = args[0];
    if (!nuurlpart.endsWith("/")) {
        nuurlpart += "/";
    }
    /* use the normalised string, not the raw args[0] */
    uri = new URI(nuurlpart);
    foundURIs.add(new URIinfo(uri, 0));
    visitedURIs.add(uri);
}
/**
* process the remaining parameters here - maxDepth and debugLevel...
*/
@ -144,6 +163,20 @@ public class crawler {
reader.close();
} catch (FileNotFoundException e) {
System.err.println("Error loading page - does it exist?");
System.err.println(e);
} catch (ConnectException e){
System.err.println("Dang - connection refused");
System.err.println(e);
} catch (UnknownHostException e) {
if (e.getMessage().endsWith(".onion")){
System.err.println("This host is not on clearweb - skipping for now.");
} else {
System.err.println("The DNS record for this host might no longer exist.");
}
System.err.println(e);
} catch (IOException e) {
System.err.println("This host returned a 403.");
System.err.println(e);
}
}
} catch (Exception e) {

2
urls2crawl.txt Normal file
View File

@ -0,0 +1,2 @@
https://stalluminati.neocities.org/matrix/
https://git.dotya.ml