
Commit

some more stuff is in the trash
anuejn committed Jul 8, 2015
1 parent 7e4edc1 commit 265903e
Showing 4 changed files with 58 additions and 34 deletions.
11 changes: 11 additions & 0 deletions src/de/sfn_kassel/plone_crawler/test/Crawler.java
@@ -3,6 +3,7 @@
public class Crawler extends Thread{
OnTaskQueEmptyListener onTaskQueEmptyListener;
OnTaskFinished onTaskFinished;
String startname;

public Crawler(OnTaskQueEmptyListener onTaskQueEmptyListener, OnTaskFinished onTaskFinished) {
this.onTaskFinished = onTaskFinished;
@@ -11,14 +12,24 @@ public Crawler(OnTaskQueEmptyListener onTaskQueEmptyListener, OnTaskFinished onT

@Override
public void run() {
startname = this.getName();
setNamePostfix("initializing");
int i = 0;
while (true) {
try {
Page currentPage = onTaskQueEmptyListener.taskQueEmpty();
setNamePostfix("downloading" + " (" + i + ")");
currentPage.loadPage();
i++;
setNamePostfix("waiting" + " (" + i + ")");
onTaskFinished.taskQueEmpty(currentPage);
} catch(Exception e) {
// e.printStackTrace();
}
}
}

private void setNamePostfix(String s) {
this.setName(startname + " [" + s + "]");
}
}
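
The new startname field and setNamePostfix() helper tag each crawler thread's name with its current state, so a thread dump (for instance via jstack) shows at a glance what every worker is doing and how many pages it has fetched. A minimal sketch of the same pattern, using a hypothetical Worker class rather than the crawler's own:

public class Worker extends Thread {
    private String baseName; // captured once run() starts

    @Override
    public void run() {
        baseName = getName();
        setPhase("initializing");
        // ... real work happens here, updating the phase as it changes ...
        setPhase("done");
    }

    // Appends the current phase to the base name, so the thread shows up
    // as e.g. "Worker-0 [downloading (3)]" in a debugger or thread dump.
    private void setPhase(String phase) {
        setName(baseName + " [" + phase + "]");
    }
}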
34 changes: 34 additions & 0 deletions src/de/sfn_kassel/plone_crawler/test/HashLink.java
@@ -1,9 +1,16 @@
package de.sfn_kassel.plone_crawler.test;

import java.net.URL;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;

public class HashLink {
URL url;

public HashLink(URL url) {
this.url = url;
}

public static String hash(String original) throws NoSuchAlgorithmException {
MessageDigest md = MessageDigest.getInstance("MD5");
md.update(original.getBytes());
@@ -14,4 +21,31 @@ public static String hash(String original) throws NoSuchAlgorithmException {
}
return sb.toString();
}

public String getNameHash(String rootName) {
// System.err.print(url.toString() + " -> ");
if (!url.toString().equals(rootName)) {
String end = url.toString().substring(url.toString().lastIndexOf('.'));
if (end.length() <= 4 && !end.equals(".de")) {
String beginning = url.toString().substring(0, url.toString().lastIndexOf('.') - 1);
try {
// System.out.println(HashLink.hash(beginning) + end);
return HashLink.hash(beginning) + end;
} catch (NoSuchAlgorithmException e) {
e.printStackTrace();
return null;
}
} else {
try {
// System.out.println(HashLink.hash(url.toString()) + ".html");
return HashLink.hash(url.toString()) + ".html";
} catch (NoSuchAlgorithmException e) {
e.printStackTrace();
return null;
}
}
} else {
return "index.html";
}
}
}
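
getNameHash() maps each crawled URL to a local file name: the start page becomes index.html, URLs ending in a short extension keep that extension appended to an MD5 hash, and everything else gets an .html suffix. The diff collapses the middle of hash() itself, so for reference, here is a self-contained MD5-to-hex helper along the same lines; the digest loop is an assumption, since those exact lines are hidden above:

import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;

public class Md5Hex {
    // Hash a string to lowercase hex, mirroring what HashLink.hash() appears to do.
    static String hash(String original) throws NoSuchAlgorithmException {
        MessageDigest md = MessageDigest.getInstance("MD5");
        md.update(original.getBytes());
        StringBuilder sb = new StringBuilder();
        for (byte b : md.digest()) {
            sb.append(String.format("%02x", b)); // two hex digits per byte
        }
        return sb.toString();
    }

    public static void main(String[] args) throws NoSuchAlgorithmException {
        System.out.println(hash("http://www.example.de/page")); // 32 hex chars
    }
}

Note also that substring(0, lastIndexOf('.') - 1) in getNameHash() trims the character just before the last dot as well, so two URLs differing only in that character would map to the same file name.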
26 changes: 5 additions & 21 deletions src/de/sfn_kassel/plone_crawler/test/Page.java
@@ -5,10 +5,7 @@
import java.net.URLConnection;
import java.security.NoSuchAlgorithmException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Scanner;
import java.util.stream.Stream;

import de.sfn_kassel.plone_crawler.test.PageContent.Type;

public class Page {
@@ -48,9 +45,11 @@ public Page[] getLinks() {
for (int i = 0; i < src.length; i++) {
src[i] = src[i].split("\"")[0];
}
System.out.println("ping");
String[] raw = (String[]) Stream.concat(Arrays.stream(href), Arrays.stream(src)).toArray();
System.out.println("pong");
// System.out.println("ping");
// String[] raw = (String[]) Stream.concat(Arrays.stream(href),
// Arrays.stream(href)).toArray();
String[] raw = href; // TODO: Fix me
// System.out.println("pong");
ArrayList<Page> links = new ArrayList<Page>();
for (int i = 0; i < raw.length; i++) {
try {
@@ -72,21 +71,6 @@ public Page[] getLinks() {
return links.toArray(new Page[1]);
}

public String getNameHash() {
String beginning = url.toString().substring(0, url.toString().lastIndexOf('.') - 1);
String end = url.toString().substring(url.toString().lastIndexOf('.'));
if (end.equals(".de"))
return "index.html";

try {
return HashLink.hash(beginning) + end;
} catch (NoSuchAlgorithmException e) {
e.printStackTrace();
return null;
}
}


@Override
public String toString() {
return "[Page: " + this.url.toString() + " loaded: " + (content != null) + "]";
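
The commented-out Stream.concat line above (now replaced by the href-only fallback and a TODO) trips over a real pitfall: Stream.toArray() with no argument returns Object[], so the (String[]) cast fails at runtime with a ClassCastException; the commented variant also streams href twice, dropping src entirely. One way the TODO could be resolved, as a sketch:

import java.util.Arrays;
import java.util.stream.Stream;

public class ConcatFix {
    public static void main(String[] args) {
        String[] href = { "a.html", "b.html" };
        String[] src  = { "c.png" };
        // Passing an array generator to toArray() yields a genuine String[]
        // instead of the Object[] that the bare toArray() call returns.
        String[] raw = Stream.concat(Arrays.stream(href), Arrays.stream(src))
                .toArray(String[]::new);
        System.out.println(Arrays.toString(raw)); // [a.html, b.html, c.png]
    }
}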
21 changes: 8 additions & 13 deletions src/de/sfn_kassel/plone_crawler/test/Test.java
@@ -20,12 +20,14 @@ public boolean check(URL object) {
boolean exsist = false;
synchronized (Urls) {
for (String page : Urls) {
if (page.equals(object.toString())) {
if (page.equals(new HashLink(object).getNameHash(startpage))) {
exsist = true;
break;
}
}
}

// System.err.println(exsist ? "true " : "false " + object.toString());
return !object.toString().contains("#") && object.toString().contains(startpage)
&& !object.toString().contains("@") && !exsist;
}
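
check() now compares candidates against hashed names, but it still walks the whole Urls list under the lock for every link, which is O(n) per check and grows as the crawl proceeds. A hash-based set gives the same deduplication in O(1); a minimal sketch, with names that are illustrative rather than taken from the crawler:

import java.util.Collections;
import java.util.HashSet;
import java.util.Set;

public class VisitedNames {
    // Hashed page names already queued or saved.
    private final Set<String> seen = Collections.synchronizedSet(new HashSet<>());

    // Returns true only the first time a name is offered, folding the
    // contains-check and the add into a single atomic step.
    public boolean markNew(String nameHash) {
        return seen.add(nameHash);
    }
}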
@@ -38,29 +40,22 @@ public synchronized void taskQueEmpty(Page finished) {
// System.out.println("done: " + donePages.size() + " | todo: "
// + futurePages.size() + " "
// + finished.url.toString());
System.out.println("1");
int before = futurePages.size() + 1;
synchronized (donePages) {
System.out.println("2");
donePages.add(finished);
System.out.println(finished.url.toString() + " -> " + new HashLink(finished.url).getNameHash(startpage));
synchronized (futurePages) {
System.out.println("db");
for (Page p : finished.getLinks()) {
System.out.println("db2");
if (URLChecker.check(p.url)) {
System.out.println("deadlock");
synchronized (Urls) {
System.out.println("2.5");
Urls.add(p.url.toString());
Urls.add(new HashLink(p.url).getNameHash(startpage));
}
futurePages.add(p);
}
System.out.println("3");
}
System.out.println("jo");
}
}
System.out.println(before + "," + (futurePages.size() - before) + "," + donePages.size());
// System.out.println(before + "," + (fguturePages.size() - before) + "," + donePages.size());
}
};

Expand All @@ -77,14 +72,14 @@ public synchronized Page taskQueEmpty() {
}
Page returnPage = futurePages.get(0);
futurePages.remove(0);
System.out.println(returnPage.getNameHash());
// System.out.println(returnPage.url.toString());
return returnPage;
}
}
}
};
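
This listener pops work off futurePages under nested synchronized blocks. A BlockingQueue would provide the same producer/consumer hand-off with the blocking built in; a sketch under the assumption that tasks are the crawler's Page objects (typed generically here so it compiles on its own):

import java.util.concurrent.BlockingQueue;
import java.util.concurrent.LinkedBlockingQueue;

public class TaskQueue<T> {
    private final BlockingQueue<T> pending = new LinkedBlockingQueue<>();

    // Producer side: called whenever new links are discovered.
    public void submit(T task) {
        pending.add(task);
    }

    // Consumer side: blocks until work is available instead of retrying.
    public T next() throws InterruptedException {
        return pending.take();
    }
}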

for (int i = 0; i < 1; i++) {
for (int i = 0; i < 10; i++) {
Crawler c = new Crawler(onTaskQueEmptyListener, onTaskFinished);
c.setName("Crawler Thread " + i);
c.start();
