-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
d30bbb2
commit 58361fe
Showing
7 changed files
with
1,191 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,161 @@ | ||
package CrawlerAndQueryEngine; | ||
|
||
import java.util.*; | ||
import java.net.*; | ||
import org.attoparser.simple.*; | ||
|
||
/** | ||
* A markup handler which is called by the Attoparser markup parser as it parses the input; | ||
* responsible for building the actual web index. | ||
*/ | ||
public class CrawlingMarkupHandler extends AbstractSimpleMarkupHandler { | ||
|
||
//The WebIndex object used to index all pages | ||
private WebIndex index = new WebIndex(); | ||
//Queue of text representing every word on a page | ||
private Queue<String> text = new LinkedList<>(); | ||
//The last word being built in the page | ||
private StringBuilder lastWord = new StringBuilder(); | ||
//Set of all URL's that have been visited | ||
private HashMap<URL, Page> pastPages = new HashMap<>(); | ||
//New URL's that still need to be visited | ||
private List<URL> newURLS = new LinkedList<>(); | ||
//The URL currently being parsed | ||
private Page currentPage; | ||
|
||
|
||
public CrawlingMarkupHandler() {} | ||
|
||
/** | ||
* This method returns the complete index that has been crawled thus far when called. | ||
*/ | ||
public Index getIndex() { | ||
// TODO: Implement this! | ||
return index; | ||
} | ||
|
||
/** | ||
* This method returns any new URLs found to the Crawler; upon being called, the set of new URLs | ||
* should be cleared. | ||
*/ | ||
public List<URL> newURLs() { | ||
List<URL> list = newURLS; | ||
newURLS = new LinkedList<>(); | ||
return list; | ||
} | ||
|
||
/** | ||
* Called when the parser first starts reading a document. | ||
* @param startTimeNanos the current time (in nanoseconds) when parsing starts | ||
* @param line the line of the document where parsing starts | ||
* @param col the column of the document where parsing starts | ||
*/ | ||
public void handleDocumentStart(long startTimeNanos, int line, int col) { | ||
// TODO: Implement this. | ||
text = new LinkedList<>(); | ||
lastWord = new StringBuilder(); | ||
} | ||
|
||
/** | ||
* Called when the parser finishes reading a document. | ||
* @param endTimeNanos the current time (in nanoseconds) when parsing ends | ||
* @param totalTimeNanos the difference between current times at the start | ||
* and end of parsing | ||
* @param line the line of the document where parsing ends | ||
* @param col the column of the document where the parsing ends | ||
*/ | ||
public void handleDocumentEnd(long endTimeNanos, long totalTimeNanos, int line, int col) { | ||
// TODO: Implement this. | ||
//System.out.println("End of document"); | ||
if(lastWord.length() > 0){ | ||
text.add(lastWord.toString()); | ||
} | ||
lastWord = new StringBuilder(); | ||
index.addPhrase(text); | ||
} | ||
|
||
/** | ||
* Called at the start of any tag. | ||
* @param elementName the element name (such as "div") | ||
* @param attributes the element attributes map, or null if it has no attributes | ||
* @param line the line in the document where this elements appears | ||
* @param col the column in the document where this element appears | ||
*/ | ||
public void handleOpenElement(String elementName, Map<String, String> attributes, int line, int col) { | ||
// TODO: Implement this. | ||
if(attributes == null || attributes.keySet() == null) {return;} | ||
|
||
//If the element contains a URl that hasn't been visited, | ||
//add it to newURLS | ||
if(elementName.toLowerCase().equals("a")) { | ||
for(String s: attributes.keySet()) { | ||
if(s.toLowerCase().equals("href")) { | ||
String fin_link = attributes.get(s); | ||
URL next = null; | ||
try { | ||
next = new URL(currentPage.getURL(), fin_link); | ||
|
||
if(!pastPages.keySet().contains(next)) { | ||
String urlString = next.toString(); | ||
if(urlString.endsWith(".html") || urlString.endsWith(".htm")){ | ||
newURLS.add(next); | ||
pastPages.put(next, new Page(next)); | ||
} | ||
} else { | ||
pastPages.get(next).increment(); | ||
} | ||
} catch(MalformedURLException e) { | ||
} | ||
} | ||
} | ||
} | ||
|
||
} | ||
|
||
/** | ||
* Called at the end of any tag. | ||
* @param elementName the element name (such as "div"). | ||
* @param line the line in the document where this elements appears. | ||
* @param col the column in the document where this element appears. | ||
*/ | ||
public void handleCloseElement(String elementName, int line, int col) { | ||
// TODO: Implement this. | ||
//System.out.println("End element: " + elementName); | ||
} | ||
|
||
/** | ||
* Called whenever characters are found inside a tag. Note that the parser is not | ||
* required to return all characters in the tag in a single chunk. Whitespace is | ||
* also returned as characters. | ||
* @param ch buffer containint characters; do not modify this buffer | ||
* @param start location of 1st character in ch | ||
* @param length number of characters in ch | ||
*/ | ||
public void handleText(char ch[], int start, int length, int line, int col) { | ||
// TODO: Implement this. | ||
//System.out.print("Characters: \""); | ||
|
||
for(int i = start; i < start + length; i++) { | ||
//Tokenize the incoming stream of characters | ||
if(Character.isLetterOrDigit(ch[i])) { | ||
lastWord.append(Character.toLowerCase(ch[i])); | ||
} else { | ||
if(lastWord.length() > 0){ | ||
text.add(lastWord.toString()); | ||
lastWord = new StringBuilder(); | ||
} | ||
} | ||
} | ||
|
||
} | ||
|
||
public void setCurrentURL(URL currentURL){ | ||
Page p = pastPages.get(currentURL); | ||
if(p == null) { | ||
p = new Page(currentURL); | ||
pastPages.put(currentURL, p); | ||
} | ||
index.setCurrentPage(p); | ||
this.currentPage = p; | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,37 @@ | ||
package CrawlerAndQueryEngine; | ||
|
||
import java.io.*; | ||
|
||
/**
 * A serializable index backed by Java's native Serializable interface and
 * ObjectStreams. Provides helpers to persist an index to disk and restore it.
 */
public class Index implements Serializable {
    private static final long serialVersionUID = 1L;

    /**
     * Restores an Index from the given file, throwing an exception if anything
     * goes wrong while reading. The concrete runtime type of the Index is
     * determined automatically by the ObjectStream.
     * @param filename The file to load the index from.
     * @return the deserialized Index
     * @throws IOException if the file cannot be opened or read
     * @throws ClassNotFoundException if the serialized class is unavailable
     */
    public static Index load(String filename) throws IOException, ClassNotFoundException {
        // try-with-resources guarantees the stream is closed on every path,
        // whether loading succeeds or fails.
        try (ObjectInputStream in = new ObjectInputStream(new FileInputStream(filename))) {
            Object restored = in.readObject();
            return (Index) restored;
        }
    }

    /**
     * Persists this Index to the given file, throwing an exception if anything
     * goes wrong while writing.
     * @param filename The file to save the index to.
     * @throws IOException if the file cannot be opened or written
     */
    public void save(String filename) throws IOException {
        // try-with-resources closes the stream automatically on success or failure.
        try (ObjectOutputStream out = new ObjectOutputStream(new FileOutputStream(filename))) {
            out.writeObject(this);
        }
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,53 @@ | ||
package CrawlerAndQueryEngine; | ||
import java.io.Serializable; | ||
import java.net.URL; | ||
|
||
public class Page implements Serializable, Comparable { | ||
private static final long serialVersionUID = 1L; | ||
|
||
// The URL the page was located at. | ||
private URL url; | ||
|
||
private int connectedness = 1; | ||
public void increment() { | ||
connectedness++; | ||
} | ||
public int getConnectedness() { | ||
return connectedness; | ||
} | ||
|
||
private int ID; | ||
public Page(URL url, int ID) { | ||
this.url = url; | ||
this.ID = ID; | ||
} | ||
public void setID(int ID) { | ||
this.ID = ID; | ||
} | ||
|
||
|
||
|
||
/** | ||
* Creates a Page with a given URL. | ||
* @param url The url of the page. | ||
*/ | ||
public Page(URL url) { | ||
this.url = url; | ||
} | ||
|
||
/** | ||
* @return the URL of the page. | ||
*/ | ||
public URL getURL() { return url; } | ||
|
||
|
||
@Override | ||
public int hashCode(){ | ||
return ID; | ||
//return url.hashCode(); | ||
} | ||
|
||
public int compareTo(Object other){ | ||
return ((Page) other).connectedness - connectedness; | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,63 @@ | ||
package CrawlerAndQueryEngine; | ||
|
||
import java.io.*; | ||
import java.net.*; | ||
import java.util.*; | ||
|
||
import org.attoparser.simple.*; | ||
import org.attoparser.config.ParseConfiguration; | ||
|
||
/** | ||
* The entry-point for WebCrawler; takes in a list of URLs to start crawling from and saves an index | ||
* to index.db. | ||
*/ | ||
public class WebCrawler { | ||
|
||
public static void main(String[] args) { | ||
// Basic usage information | ||
if (args.length == 0) { | ||
System.err.println("Error: No URLs specified."); | ||
System.exit(1); | ||
} | ||
|
||
// We'll throw all of the args into a queue for processing. | ||
Queue<URL> remaining = new LinkedList<>(); | ||
for (String url : args) { | ||
try { | ||
remaining.add(new URL(url)); | ||
} catch (MalformedURLException e) { | ||
// Throw this one out! | ||
System.err.printf("Error: URL '%s' was malformed and will be ignored!%n", url); | ||
} | ||
} | ||
|
||
// Create a parser from the attoparser library, and our handler for markup. | ||
ISimpleMarkupParser parser = new SimpleMarkupParser(ParseConfiguration.htmlConfiguration()); | ||
CrawlingMarkupHandler handler = new CrawlingMarkupHandler(); | ||
|
||
// Try to start crawling, adding new URLS as we see them. | ||
try { | ||
while (!remaining.isEmpty()) { | ||
// Parse the next URL's page | ||
//parser.parse(new InputStreamReader(remaining.poll().openStream()), handler); | ||
|
||
URL nextURL = remaining.remove(); | ||
handler.setCurrentURL(nextURL); | ||
try { | ||
parser.parse(new InputStreamReader(nextURL.openStream()), handler); | ||
} catch (FileNotFoundException e) { | ||
|
||
} | ||
// Add any new URLs | ||
remaining.addAll(handler.newURLs()); | ||
} | ||
|
||
handler.getIndex().save("index.db"); | ||
} catch (Exception e) { | ||
// Bad exception handling :( | ||
System.err.println("Error: Index generation failed!"); | ||
e.printStackTrace(); | ||
System.exit(1); | ||
} | ||
} | ||
} |
Oops, something went wrong.