-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
d30bbb2
commit 58361fe
Showing
7 changed files
with
1,191 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,161 @@ | ||
package CrawlerAndQueryEngine; | ||
|
||
import java.util.*; | ||
import java.net.*; | ||
import org.attoparser.simple.*; | ||
|
||
/** | ||
* A markup handler which is called by the Attoparser markup parser as it parses the input; | ||
* responsible for building the actual web index. | ||
*/ | ||
public class CrawlingMarkupHandler extends AbstractSimpleMarkupHandler { | ||
|
||
//The WebIndex object used to index all pages | ||
private WebIndex index = new WebIndex(); | ||
//Queue of text representing every word on a page | ||
private Queue<String> text = new LinkedList<>(); | ||
//The last word being built in the page | ||
private StringBuilder lastWord = new StringBuilder(); | ||
//Set of all URL's that have been visited | ||
private HashMap<URL, Page> pastPages = new HashMap<>(); | ||
//New URL's that still need to be visited | ||
private List<URL> newURLS = new LinkedList<>(); | ||
//The URL currently being parsed | ||
private Page currentPage; | ||
|
||
|
||
public CrawlingMarkupHandler() {} | ||
|
||
/** | ||
* This method returns the complete index that has been crawled thus far when called. | ||
*/ | ||
public Index getIndex() { | ||
// TODO: Implement this! | ||
return index; | ||
} | ||
|
||
/** | ||
* This method returns any new URLs found to the Crawler; upon being called, the set of new URLs | ||
* should be cleared. | ||
*/ | ||
public List<URL> newURLs() { | ||
List<URL> list = newURLS; | ||
newURLS = new LinkedList<>(); | ||
return list; | ||
} | ||
|
||
/** | ||
* Called when the parser first starts reading a document. | ||
* @param startTimeNanos the current time (in nanoseconds) when parsing starts | ||
* @param line the line of the document where parsing starts | ||
* @param col the column of the document where parsing starts | ||
*/ | ||
public void handleDocumentStart(long startTimeNanos, int line, int col) { | ||
// TODO: Implement this. | ||
text = new LinkedList<>(); | ||
lastWord = new StringBuilder(); | ||
} | ||
|
||
/** | ||
* Called when the parser finishes reading a document. | ||
* @param endTimeNanos the current time (in nanoseconds) when parsing ends | ||
* @param totalTimeNanos the difference between current times at the start | ||
* and end of parsing | ||
* @param line the line of the document where parsing ends | ||
* @param col the column of the document where the parsing ends | ||
*/ | ||
public void handleDocumentEnd(long endTimeNanos, long totalTimeNanos, int line, int col) { | ||
// TODO: Implement this. | ||
//System.out.println("End of document"); | ||
if(lastWord.length() > 0){ | ||
text.add(lastWord.toString()); | ||
} | ||
lastWord = new StringBuilder(); | ||
index.addPhrase(text); | ||
} | ||
|
||
/** | ||
* Called at the start of any tag. | ||
* @param elementName the element name (such as "div") | ||
* @param attributes the element attributes map, or null if it has no attributes | ||
* @param line the line in the document where this elements appears | ||
* @param col the column in the document where this element appears | ||
*/ | ||
public void handleOpenElement(String elementName, Map<String, String> attributes, int line, int col) { | ||
// TODO: Implement this. | ||
if(attributes == null || attributes.keySet() == null) {return;} | ||
|
||
//If the element contains a URl that hasn't been visited, | ||
//add it to newURLS | ||
if(elementName.toLowerCase().equals("a")) { | ||
for(String s: attributes.keySet()) { | ||
if(s.toLowerCase().equals("href")) { | ||
String fin_link = attributes.get(s); | ||
URL next = null; | ||
try { | ||
next = new URL(currentPage.getURL(), fin_link); | ||
|
||
if(!pastPages.keySet().contains(next)) { | ||
String urlString = next.toString(); | ||
if(urlString.endsWith(".html") || urlString.endsWith(".htm")){ | ||
newURLS.add(next); | ||
pastPages.put(next, new Page(next)); | ||
} | ||
} else { | ||
pastPages.get(next).increment(); | ||
} | ||
} catch(MalformedURLException e) { | ||
} | ||
} | ||
} | ||
} | ||
|
||
} | ||
|
||
/** | ||
* Called at the end of any tag. | ||
* @param elementName the element name (such as "div"). | ||
* @param line the line in the document where this elements appears. | ||
* @param col the column in the document where this element appears. | ||
*/ | ||
public void handleCloseElement(String elementName, int line, int col) { | ||
// TODO: Implement this. | ||
//System.out.println("End element: " + elementName); | ||
} | ||
|
||
/** | ||
* Called whenever characters are found inside a tag. Note that the parser is not | ||
* required to return all characters in the tag in a single chunk. Whitespace is | ||
* also returned as characters. | ||
* @param ch buffer containint characters; do not modify this buffer | ||
* @param start location of 1st character in ch | ||
* @param length number of characters in ch | ||
*/ | ||
public void handleText(char ch[], int start, int length, int line, int col) { | ||
// TODO: Implement this. | ||
//System.out.print("Characters: \""); | ||
|
||
for(int i = start; i < start + length; i++) { | ||
//Tokenize the incoming stream of characters | ||
if(Character.isLetterOrDigit(ch[i])) { | ||
lastWord.append(Character.toLowerCase(ch[i])); | ||
} else { | ||
if(lastWord.length() > 0){ | ||
text.add(lastWord.toString()); | ||
lastWord = new StringBuilder(); | ||
} | ||
} | ||
} | ||
|
||
} | ||
|
||
public void setCurrentURL(URL currentURL){ | ||
Page p = pastPages.get(currentURL); | ||
if(p == null) { | ||
p = new Page(currentURL); | ||
pastPages.put(currentURL, p); | ||
} | ||
index.setCurrentPage(p); | ||
this.currentPage = p; | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,37 @@ | ||
package CrawlerAndQueryEngine; | ||
|
||
import java.io.*; | ||
|
||
/**
 * A serializable index backed by Java's native Serializable interface and
 * ObjectStreams. Provides helpers to persist an index to disk and restore it.
 */
public class Index implements Serializable {
    private static final long serialVersionUID = 1L;

    /**
     * Restores an Index from the given file, throwing an exception if anything
     * goes wrong while reading. The concrete runtime type of the Index is
     * determined automatically by the ObjectStream.
     * @param filename The file to load the index from.
     * @return the deserialized Index
     * @throws IOException if the file cannot be opened or read
     * @throws ClassNotFoundException if the serialized class is unavailable
     */
    public static Index load(String filename) throws IOException, ClassNotFoundException {
        // try-with-resources guarantees the stream is closed on every path,
        // whether loading succeeds or fails.
        try (ObjectInputStream in = new ObjectInputStream(new FileInputStream(filename))) {
            Object restored = in.readObject();
            return (Index) restored;
        }
    }

    /**
     * Persists this Index to the given file, throwing an exception if anything
     * goes wrong while writing.
     * @param filename The file to save the index to.
     * @throws IOException if the file cannot be opened or written
     */
    public void save(String filename) throws IOException {
        // try-with-resources closes the stream automatically on success or failure.
        try (ObjectOutputStream out = new ObjectOutputStream(new FileOutputStream(filename))) {
            out.writeObject(this);
        }
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,53 @@ | ||
package CrawlerAndQueryEngine; | ||
import java.io.Serializable; | ||
import java.net.URL; | ||
|
||
public class Page implements Serializable, Comparable { | ||
private static final long serialVersionUID = 1L; | ||
|
||
// The URL the page was located at. | ||
private URL url; | ||
|
||
private int connectedness = 1; | ||
public void increment() { | ||
connectedness++; | ||
} | ||
public int getConnectedness() { | ||
return connectedness; | ||
} | ||
|
||
private int ID; | ||
public Page(URL url, int ID) { | ||
this.url = url; | ||
this.ID = ID; | ||
} | ||
public void setID(int ID) { | ||
this.ID = ID; | ||
} | ||
|
||
|
||
|
||
/** | ||
* Creates a Page with a given URL. | ||
* @param url The url of the page. | ||
*/ | ||
public Page(URL url) { | ||
this.url = url; | ||
} | ||
|
||
/** | ||
* @return the URL of the page. | ||
*/ | ||
public URL getURL() { return url; } | ||
|
||
|
||
@Override | ||
public int hashCode(){ | ||
return ID; | ||
//return url.hashCode(); | ||
} | ||
|
||
public int compareTo(Object other){ | ||
return ((Page) other).connectedness - connectedness; | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,63 @@ | ||
package CrawlerAndQueryEngine; | ||
|
||
import java.io.*; | ||
import java.net.*; | ||
import java.util.*; | ||
|
||
import org.attoparser.simple.*; | ||
import org.attoparser.config.ParseConfiguration; | ||
|
||
/** | ||
* The entry-point for WebCrawler; takes in a list of URLs to start crawling from and saves an index | ||
* to index.db. | ||
*/ | ||
public class WebCrawler { | ||
|
||
public static void main(String[] args) { | ||
// Basic usage information | ||
if (args.length == 0) { | ||
System.err.println("Error: No URLs specified."); | ||
System.exit(1); | ||
} | ||
|
||
// We'll throw all of the args into a queue for processing. | ||
Queue<URL> remaining = new LinkedList<>(); | ||
for (String url : args) { | ||
try { | ||
remaining.add(new URL(url)); | ||
} catch (MalformedURLException e) { | ||
// Throw this one out! | ||
System.err.printf("Error: URL '%s' was malformed and will be ignored!%n", url); | ||
} | ||
} | ||
|
||
// Create a parser from the attoparser library, and our handler for markup. | ||
ISimpleMarkupParser parser = new SimpleMarkupParser(ParseConfiguration.htmlConfiguration()); | ||
CrawlingMarkupHandler handler = new CrawlingMarkupHandler(); | ||
|
||
// Try to start crawling, adding new URLS as we see them. | ||
try { | ||
while (!remaining.isEmpty()) { | ||
// Parse the next URL's page | ||
//parser.parse(new InputStreamReader(remaining.poll().openStream()), handler); | ||
|
||
URL nextURL = remaining.remove(); | ||
handler.setCurrentURL(nextURL); | ||
try { | ||
parser.parse(new InputStreamReader(nextURL.openStream()), handler); | ||
} catch (FileNotFoundException e) { | ||
|
||
} | ||
// Add any new URLs | ||
remaining.addAll(handler.newURLs()); | ||
} | ||
|
||
handler.getIndex().save("index.db"); | ||
} catch (Exception e) { | ||
// Bad exception handling :( | ||
System.err.println("Error: Index generation failed!"); | ||
e.printStackTrace(); | ||
System.exit(1); | ||
} | ||
} | ||
} |
Oops, something went wrong.