Commit 58361fe

Add files via upload

ishanashah committed Jan 3, 2019
1 parent d30bbb2 commit 58361fe
Showing 7 changed files with 1,191 additions and 0 deletions.
161 changes: 161 additions & 0 deletions CrawlerAndQueryEngine/CrawlingMarkupHandler.java
@@ -0,0 +1,161 @@
package CrawlerAndQueryEngine;

import java.util.*;
import java.net.*;
import org.attoparser.simple.*;

/**
* A markup handler which is called by the Attoparser markup parser as it parses the input;
* responsible for building the actual web index.
*/
public class CrawlingMarkupHandler extends AbstractSimpleMarkupHandler {

    // The WebIndex object used to index all pages
    private WebIndex index = new WebIndex();
    // Queue of text representing every word on a page
    private Queue<String> text = new LinkedList<>();
    // The last word being built in the page
    private StringBuilder lastWord = new StringBuilder();
    // Map from each URL that has been visited to its Page
    private HashMap<URL, Page> pastPages = new HashMap<>();
    // New URLs that still need to be visited
    private List<URL> newURLS = new LinkedList<>();
    // The page currently being parsed
    private Page currentPage;


public CrawlingMarkupHandler() {}

/**
     * Returns the complete index that has been crawled thus far.
*/
public Index getIndex() {
return index;
}

/**
     * Returns any new URLs found since the last call; the set of new URLs is
     * cleared after each call, so each URL is only reported once.
*/
public List<URL> newURLs() {
List<URL> list = newURLS;
newURLS = new LinkedList<>();
return list;
}

/**
* Called when the parser first starts reading a document.
* @param startTimeNanos the current time (in nanoseconds) when parsing starts
* @param line the line of the document where parsing starts
* @param col the column of the document where parsing starts
*/
public void handleDocumentStart(long startTimeNanos, int line, int col) {
        // Reset the per-document text queue and word buffer.
text = new LinkedList<>();
lastWord = new StringBuilder();
}

/**
* Called when the parser finishes reading a document.
* @param endTimeNanos the current time (in nanoseconds) when parsing ends
* @param totalTimeNanos the difference between current times at the start
* and end of parsing
* @param line the line of the document where parsing ends
* @param col the column of the document where the parsing ends
*/
public void handleDocumentEnd(long endTimeNanos, long totalTimeNanos, int line, int col) {
        // Flush the final word, then hand the page's text to the index.
if(lastWord.length() > 0){
text.add(lastWord.toString());
}
lastWord = new StringBuilder();
index.addPhrase(text);
}

/**
* Called at the start of any tag.
* @param elementName the element name (such as "div")
* @param attributes the element attributes map, or null if it has no attributes
     * @param line the line in the document where this element appears
* @param col the column in the document where this element appears
*/
public void handleOpenElement(String elementName, Map<String, String> attributes, int line, int col) {
        if (attributes == null) { return; }

        // If the element is an anchor whose href points to a page that
        // hasn't been visited, add that URL to newURLS.
        if (elementName.equalsIgnoreCase("a")) {
            for (String s : attributes.keySet()) {
                if (s.equalsIgnoreCase("href")) {
                    String link = attributes.get(s);
                    try {
                        URL next = new URL(currentPage.getURL(), link);

                        if (!pastPages.containsKey(next)) {
                            String urlString = next.toString();
                            if (urlString.endsWith(".html") || urlString.endsWith(".htm")) {
                                newURLS.add(next);
                                pastPages.put(next, new Page(next));
                            }
                        } else {
                            pastPages.get(next).increment();
                        }
                    } catch (MalformedURLException e) {
                        // Skip links that don't resolve to a valid URL.
                    }
}
}
}

}

/**
* Called at the end of any tag.
* @param elementName the element name (such as "div").
     * @param line the line in the document where this element appears.
* @param col the column in the document where this element appears.
*/
public void handleCloseElement(String elementName, int line, int col) {
        // No per-tag state needs to be updated when a tag closes.
}

/**
* Called whenever characters are found inside a tag. Note that the parser is not
* required to return all characters in the tag in a single chunk. Whitespace is
* also returned as characters.
     * @param ch buffer containing characters; do not modify this buffer
     * @param start location of the first character in ch
     * @param length number of characters in ch
     * @param line the line in the document where this text appears
     * @param col the column in the document where this text appears
*/
public void handleText(char ch[], int start, int length, int line, int col) {
        // Tokenize the incoming stream of characters.
        for (int i = start; i < start + length; i++) {
if(Character.isLetterOrDigit(ch[i])) {
lastWord.append(Character.toLowerCase(ch[i]));
} else {
if(lastWord.length() > 0){
text.add(lastWord.toString());
lastWord = new StringBuilder();
}
}
}

}

    /**
     * Registers the URL about to be parsed, creating its Page on first visit.
     */
    public void setCurrentURL(URL currentURL){
Page p = pastPages.get(currentURL);
if(p == null) {
p = new Page(currentURL);
pastPages.put(currentURL, p);
}
index.setCurrentPage(p);
this.currentPage = p;
}
}
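
A minimal sketch of how this handler tokenizes text, assuming the commit's remaining files (notably WebIndex) are on the classpath; TokenizeDemo and the input string are hypothetical:

package CrawlerAndQueryEngine;

public class TokenizeDemo {
    public static void main(String[] args) {
        CrawlingMarkupHandler handler = new CrawlingMarkupHandler();
        handler.handleDocumentStart(System.nanoTime(), 1, 1);
        char[] buf = "Hello, World-42!".toCharArray();
        // handleText splits on non-alphanumeric characters and lowercases,
        // so the handler's internal queue now holds "hello", "world", "42".
        handler.handleText(buf, 0, buf.length, 1, 1);
    }
}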
37 changes: 37 additions & 0 deletions CrawlerAndQueryEngine/Index.java
@@ -0,0 +1,37 @@
package CrawlerAndQueryEngine;

import java.io.*;

/**
 * A serializable index, using Java's native Serializable interface and object streams. Provides
* methods to load and save indexes.
*/
public class Index implements Serializable {
private static final long serialVersionUID = 1L;

/**
* Loads an Index from the given file, throwing an exception if there is an error during the
* loading process. The actual concrete type of the Index will be automatically determined by
* the ObjectStream.
* @param filename The file to load the index from.
*/
public static Index load(String filename) throws IOException, ClassNotFoundException {
// Uses Java 7's try-with-resources to attempt to open the file, automatically closing it
// upon completion or failure.
try(ObjectInputStream oin = new ObjectInputStream(new FileInputStream(filename))) {
return (Index) oin.readObject();
}
}

/**
* Saves an Index to the given file, throwing an exception if there is an error during the
* saving process.
*/
public void save(String filename) throws IOException {
// Uses Java 7's try-with-resources to attempt to open the file, automatically closing it
// upon completion or failure.
try(ObjectOutputStream oout = new ObjectOutputStream(new FileOutputStream(filename))) {
oout.writeObject(this);
}
}
}
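
A minimal round-trip sketch for Index; IndexRoundTrip is a hypothetical driver, and index.db is the file WebCrawler writes:

package CrawlerAndQueryEngine;

public class IndexRoundTrip {
    public static void main(String[] args) throws Exception {
        // The concrete subclass that was saved (e.g. the crawler's WebIndex)
        // is restored automatically by the ObjectInputStream.
        Index index = Index.load("index.db");
        // ... query or update the index ...
        index.save("index.db");
    }
}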
53 changes: 53 additions & 0 deletions CrawlerAndQueryEngine/Page.java
@@ -0,0 +1,53 @@
package CrawlerAndQueryEngine;
import java.io.Serializable;
import java.net.URL;

public class Page implements Serializable, Comparable<Page> {
private static final long serialVersionUID = 1L;

// The URL the page was located at.
private URL url;

    // The number of links seen that point to this page.
    private int connectedness = 1;

    public void increment() {
        connectedness++;
    }

    public int getConnectedness() {
        return connectedness;
    }

    // Numeric identifier for this page; used as its hash code.
    private int ID;
public Page(URL url, int ID) {
this.url = url;
this.ID = ID;
}
public void setID(int ID) {
this.ID = ID;
}



/**
* Creates a Page with a given URL.
* @param url The url of the page.
*/
public Page(URL url) {
this.url = url;
}

/**
* @return the URL of the page.
*/
public URL getURL() { return url; }


    @Override
    public int hashCode(){
        return ID;
    }

    @Override
    public int compareTo(Page other){
        // Sort in descending order of connectedness, most-linked pages first.
        return other.connectedness - connectedness;
    }
}
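
Because compareTo orders pages by descending connectedness, a list of Pages can be ranked directly. A minimal sketch (RankPages and the URLs are hypothetical):

package CrawlerAndQueryEngine;

import java.net.URL;
import java.util.*;

public class RankPages {
    public static void main(String[] args) throws Exception {
        Page a = new Page(new URL("http://example.com/a.html"));
        Page b = new Page(new URL("http://example.com/b.html"));
        b.increment(); // b is now linked twice, a only once

        List<Page> pages = new ArrayList<>(Arrays.asList(a, b));
        Collections.sort(pages);

        // Prints b before a: compareTo orders by descending connectedness.
        for (Page p : pages) {
            System.out.println(p.getURL() + " -> " + p.getConnectedness());
        }
    }
}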
63 changes: 63 additions & 0 deletions CrawlerAndQueryEngine/WebCrawler.java
@@ -0,0 +1,63 @@
package CrawlerAndQueryEngine;

import java.io.*;
import java.net.*;
import java.util.*;

import org.attoparser.simple.*;
import org.attoparser.config.ParseConfiguration;

/**
* The entry-point for WebCrawler; takes in a list of URLs to start crawling from and saves an index
* to index.db.
*/
public class WebCrawler {

public static void main(String[] args) {
// Basic usage information
if (args.length == 0) {
System.err.println("Error: No URLs specified.");
System.exit(1);
}

// We'll throw all of the args into a queue for processing.
Queue<URL> remaining = new LinkedList<>();
for (String url : args) {
try {
remaining.add(new URL(url));
} catch (MalformedURLException e) {
// Throw this one out!
System.err.printf("Error: URL '%s' was malformed and will be ignored!%n", url);
}
}

// Create a parser from the attoparser library, and our handler for markup.
ISimpleMarkupParser parser = new SimpleMarkupParser(ParseConfiguration.htmlConfiguration());
CrawlingMarkupHandler handler = new CrawlingMarkupHandler();

        // Try to start crawling, adding new URLs as we see them.
try {
while (!remaining.isEmpty()) {
                // Parse the next URL's page.

URL nextURL = remaining.remove();
handler.setCurrentURL(nextURL);
try {
parser.parse(new InputStreamReader(nextURL.openStream()), handler);
                } catch (FileNotFoundException e) {
                    // Skip URLs whose pages can't be fetched.
                }
// Add any new URLs
remaining.addAll(handler.newURLs());
}

handler.getIndex().save("index.db");
} catch (Exception e) {
// Bad exception handling :(
System.err.println("Error: Index generation failed!");
e.printStackTrace();
System.exit(1);
}
}
}
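
To build and run the crawler end to end (a sketch; the attoparser jar name and seed URL are hypothetical):

    javac -cp attoparser.jar CrawlerAndQueryEngine/*.java
    java -cp .:attoparser.jar CrawlerAndQueryEngine.WebCrawler http://example.com/index.html

The crawl saves its index to index.db, which Index.load("index.db") can later read back for querying.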
