Html support (becheran#7)

* Fix benches * First test * WIP * Fix unit tests * Working html support * Update docs * Fix markup types empty * Update readme
hoijui · Jan 11, 2020 · fc67389 · fc67389
1 parent 9bdb785
commit fc67389
Show file tree

Hide file tree

Showing 40 changed files with 252 additions and 30 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -14,6 +14,10 @@ Types for Changes:
 
 ## [Unreleased] - ReleaseDate
 
+## Added
+
+* HTML support
+
 ## Fixed
 
 * No panic for not UTF-8 encoded files

diff --git a/README.md b/README.md
@@ -5,7 +5,7 @@
 [![Build Status](https://gitlab.com/becheran/mlc_ci/badges/master/pipeline.svg)](https://gitlab.com/becheran/mlc_ci/pipelines)
 [![](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
 
-Check for broken links in markdown files. Can easily be integrated in your CI/CD pipeline to prevent broken links in your docs.
+Check for broken links in markup files. Currently `html` and `markdown` files are supported. The Markup Link Checker can easily be integrated into your CI/CD pipeline to prevent broken links in your docs.
 
 ## Install
 
@@ -67,10 +67,3 @@ All contributions and comments welcome! Open an issue or create a Pull Request w
 ## License
 
 This project is licensed under the MIT License - see the [LICENSE file](https://github.com/becheran/mlc/blob/master/LICENSE) for details.
-
-## Planned Features
-
-- Timeout for requests as cl argument
-- Improve speed
-- Add .ignore file support
-- Support other markup files such as tex or html
diff --git a/benches/benchmark/html/many_links.html b/benches/benchmark/html/many_links.html
@@ -0,0 +1,14 @@
+<html>
+<h1>Hello, world!</h1>
+<p>bla bla <a hreflang="en" href="https://www.w3schools.com">Visit W3Schools.com!</a> bla bla </p>
+<p>bla bla <a href  = "https://www.w3schools.com">Visit W3Schools.com!</a> bla bla </p>
+<p>multiline
+    <a 
+    href=  
+    "https://www.w3schools.com"   >Visit W3Schools.com!
+    </> bla bla
+</p>
+sdjklf slfkj <!--
+<p>commented </p>
+-->
+</html>
diff --git a/...s/benchmark/md_file_endings/NotMardown.nm → benches/benchmark/html/no_links.html b/...s/benchmark/md_file_endings/NotMardown.nm → benches/benchmark/html/no_links.html
diff --git a/benches/benchmark/html/xhtml.xhtml b/benches/benchmark/html/xhtml.xhtml
@@ -0,0 +1,15 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
+"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+
+<html xmlns="http://www.w3.org/1999/xhtml">
+
+<head>
+  <title>Title of document</title>
+  <a href  = "https://www.w3schools.com">Visit W3Schools.com!</a>
+</head>
+
+<body>
+  some content
+</body>
+
+</html>
diff --git a/...k/many_links/many_links (another copy).md → ...n/many_links/many_links (another copy).md b/...k/many_links/many_links (another copy).md → ...n/many_links/many_links (another copy).md
diff --git a/...benchmark/many_links/many_links (copy).md → .../markdown/many_links/many_links (copy).md b/...benchmark/many_links/many_links (copy).md → .../markdown/many_links/many_links (copy).md
diff --git a/benches/benchmark/many_links/many_links.md → ...nchmark/markdown/many_links/many_links.md b/benches/benchmark/many_links/many_links.md → ...nchmark/markdown/many_links/many_links.md
diff --git a/...nchmark/md_file_endings/F3_with_umlaut.md → ...arkdown/md_file_endings/F3_with_umlaut.md b/...nchmark/md_file_endings/F3_with_umlaut.md → ...arkdown/md_file_endings/F3_with_umlaut.md
diff --git a/benches/benchmark/md_file_endings/f10.text → ...rk/markdown/md_file_endings/NotMardown.nm b/benches/benchmark/md_file_endings/f10.text → ...rk/markdown/md_file_endings/NotMardown.nm
diff --git a/benches/benchmark/md_file_endings/f1.md → .../benchmark/markdown/md_file_endings/f1.md b/benches/benchmark/md_file_endings/f1.md → .../benchmark/markdown/md_file_endings/f1.md
diff --git a/benches/benchmark/md_file_endings/f11.Rmd → ...nchmark/markdown/md_file_endings/f10.text b/benches/benchmark/md_file_endings/f11.Rmd → ...nchmark/markdown/md_file_endings/f10.text
diff --git a/benches/benchmark/md_file_endings/f12.mkd → ...enchmark/markdown/md_file_endings/f11.Rmd b/benches/benchmark/md_file_endings/f12.mkd → ...enchmark/markdown/md_file_endings/f11.Rmd
diff --git a/...hes/benchmark/md_file_endings/f4.markdown → ...enchmark/markdown/md_file_endings/f12.mkd b/...hes/benchmark/md_file_endings/f4.markdown → ...enchmark/markdown/md_file_endings/f12.mkd
diff --git a/benches/benchmark/md_file_endings/f2.MD → .../benchmark/markdown/md_file_endings/f2.MD b/benches/benchmark/md_file_endings/f2.MD → .../benchmark/markdown/md_file_endings/f2.MD
diff --git a/benches/benchmark/md_file_endings/f5.mkdown → ...mark/markdown/md_file_endings/f4.markdown b/benches/benchmark/md_file_endings/f5.mkdown → ...mark/markdown/md_file_endings/f4.markdown
diff --git a/benches/benchmark/md_file_endings/f6.mkdn → ...chmark/markdown/md_file_endings/f5.mkdown b/benches/benchmark/md_file_endings/f6.mkdn → ...chmark/markdown/md_file_endings/f5.mkdown
diff --git a/benches/benchmark/md_file_endings/f7.mdwn → ...enchmark/markdown/md_file_endings/f6.mkdn b/benches/benchmark/md_file_endings/f7.mdwn → ...enchmark/markdown/md_file_endings/f6.mkdn
diff --git a/benches/benchmark/md_file_endings/f9.mdtext → ...enchmark/markdown/md_file_endings/f7.mdwn b/benches/benchmark/md_file_endings/f9.mdtext → ...enchmark/markdown/md_file_endings/f7.mdwn
diff --git a/benches/benchmark/md_file_endings/f8.mdtxt → ...nchmark/markdown/md_file_endings/f8.mdtxt b/benches/benchmark/md_file_endings/f8.mdtxt → ...nchmark/markdown/md_file_endings/f8.mdtxt
diff --git a/benches/benchmark/markdown/md_file_endings/f9.mdtext b/benches/benchmark/markdown/md_file_endings/f9.mdtext
diff --git a/benches/benchmark/md_file_endings/notmd → .../benchmark/markdown/md_file_endings/notmd b/benches/benchmark/md_file_endings/notmd → .../benchmark/markdown/md_file_endings/notmd
diff --git a/...benchmark/no_links/no_links (3rd copy).md → .../markdown/no_links/no_links (3rd copy).md b/...benchmark/no_links/no_links (3rd copy).md → .../markdown/no_links/no_links (3rd copy).md
diff --git a/...benchmark/no_links/no_links (4th copy).md → .../markdown/no_links/no_links (4th copy).md b/...benchmark/no_links/no_links (4th copy).md → .../markdown/no_links/no_links (4th copy).md
diff --git a/...benchmark/no_links/no_links (5th copy).md → .../markdown/no_links/no_links (5th copy).md b/...benchmark/no_links/no_links (5th copy).md → .../markdown/no_links/no_links (5th copy).md
diff --git a/...benchmark/no_links/no_links (6th copy).md → .../markdown/no_links/no_links (6th copy).md b/...benchmark/no_links/no_links (6th copy).md → .../markdown/no_links/no_links (6th copy).md
diff --git a/...benchmark/no_links/no_links (7th copy).md → .../markdown/no_links/no_links (7th copy).md b/...benchmark/no_links/no_links (7th copy).md → .../markdown/no_links/no_links (7th copy).md
diff --git a/...hmark/no_links/no_links (another copy).md → ...kdown/no_links/no_links (another copy).md b/...hmark/no_links/no_links (another copy).md → ...kdown/no_links/no_links (another copy).md
diff --git a/...hes/benchmark/no_links/no_links (copy).md → ...mark/markdown/no_links/no_links (copy).md b/...hes/benchmark/no_links/no_links (copy).md → ...mark/markdown/no_links/no_links (copy).md
diff --git a/benches/benchmark/no_links/no_links.md → ...s/benchmark/markdown/no_links/no_links.md b/benches/benchmark/no_links/no_links.md → ...s/benchmark/markdown/no_links/no_links.md
diff --git a/...enchmark/withUmlaut_ö/LinksWithUmläuts.md → ...markdown/withUmlaut_ö/LinksWithUmläuts.md b/...enchmark/withUmlaut_ö/LinksWithUmläuts.md → ...markdown/withUmlaut_ö/LinksWithUmläuts.md
diff --git a/benches/benchmarks.rs b/benches/benchmarks.rs
@@ -11,6 +11,8 @@ fn end_to_end_benchmark() {
         folder: String::from("./benches/benchmark"),
         log_level: logger::LogLevel::Debug,
         markup_types: vec![MarkupType::Markdown],
+        ignore_links: vec![],
+        no_web_links: false,
     };
     let _ = mlc::run(&config);
 }

diff --git a/docs/reference.md b/docs/reference.md
@@ -2,11 +2,12 @@
 
 ## CLI Arguments
 
-| Argument        | Short | Description |
-|-----------------|-------|-------------|
-| `--help`        | `-h`  | Print help |
-| `--debug`       | `-d`  | Show verbose debug information |
-| `--no-web-links`|       | Do not check any web links |
-| `--version`     | `-V`  | Print current version of mlc |
-| `--ignore-links`|       | List of links which shall be ignored. Use simple `?` and `*` wildcards. For example `--ignore-links "http*://crates.io*"` will skipp all links to the crates.io website. See the [used lib](https://github.com/becheran/wildmatch) for more information.  |
-| `<directory>`   |       | Path to directory which shall be checked with all sub-dirs. Can also be a specific filename which shall be checked. |
+| Argument         | Short | Description |
+|------------------|-------|-------------|
+| `--help`         | `-h`  | Print help |
+| `--debug`        | `-d`  | Show verbose debug information |
+| `--no-web-links` |       | Do not check any web links |
+| `--version`      | `-V`  | Print current version of mlc |
+| `--ignore-links` |       | List of links which shall be ignored. Use simple `?` and `*` wildcards. For example `--ignore-links "http*://crates.io*"` will skip all links to the crates.io website. See the [used lib](https://github.com/becheran/wildmatch) for more information.  |
+| `--markup-types` | `-t`  | List of markup types which shall be checked [possible values: md, html] |
+| `<directory>`    |       | Path to directory which shall be checked with all sub-dirs. Can also be a specific filename which shall be checked. |
diff --git a/src/cli.rs b/src/cli.rs
@@ -32,6 +32,15 @@ pub fn parse_args() -> Config {
                 .min_values(1)
                 .required(false),
         )
+        .arg(
+            Arg::with_name("markup_types")
+                .long("markup-types")
+                .short("t")
+                .help("List of markup types which shall be checked")
+                .min_values(1)
+                .possible_values(&["md", "html"])
+                .required(false),
+        )
         .version(crate_version!())
         .author(crate_authors!())
         .about(crate_description!())
@@ -42,8 +51,17 @@ pub fn parse_args() -> Config {
     } else {
         logger::LogLevel::Warn
     };
-    let folder = matches.value_of("directory").unwrap_or("./").parse().unwrap();
-    let markup_types = vec![MarkupType::Markdown];
+    let folder = matches
+        .value_of("directory")
+        .unwrap_or("./")
+        .parse()
+        .unwrap();
+
+    let mut markup_types = vec![MarkupType::Markdown, MarkupType::HTML];
+    if let Some(types) = matches.values_of("markup_types") {
+        markup_types = types.map(|x| x.parse().unwrap()).collect();
+    }
+
     let no_web_links = matches.is_present("no_web_links");
     let ignore_links: Vec<WildMatch> = matches
         .values_of("ignore_links")

diff --git a/src/link_extractors/html_link_extractor.rs b/src/link_extractors/html_link_extractor.rs
@@ -0,0 +1,152 @@
+use crate::link_extractors::link_extractor::LinkExtractor;
+use crate::link_extractors::link_extractor::MarkupLink;
+
+/// Link extractor for HTML markup. Carries no data; the extraction logic
+/// lives in its `LinkExtractor` implementation.
+pub struct HtmlLinkExtractor();
+
+/// States of the character scanner used by `HtmlLinkExtractor::find_links`.
+enum ParserState {
+    /// Outside of any comment or anchor tag; looking for `<!--` or `<a`.
+    Text,
+    /// Inside a `<!-- ... -->` comment; looking for the closing `-->`.
+    Comment,
+    /// After `<a`; looking for the characters `href`.
+    Anchor,
+    /// After `href`; skipping whitespace until `=` is found.
+    EqualSign,
+    /// After `=`; looking for the first character of the link target.
+    Link,
+}
+impl LinkExtractor for HtmlLinkExtractor {
+    /// Scans `text` character by character and returns the `href` targets
+    /// of anchor tags together with their position.
+    ///
+    /// The scanner is a small state machine (see `ParserState`) whose state
+    /// is kept across line breaks, so an anchor may span several lines.
+    /// Everything between `<!--` and `-->` is skipped.
+    ///
+    /// A target starts at the first character after `=` that is neither
+    /// whitespace nor `"` and ends at the next whitespace, `"` or end of
+    /// line. The reported `line` and `column` are 1-based and point at the
+    /// first character of the target; columns count `char`s, not bytes.
+    fn find_links(&self, text: &str) -> Vec<MarkupLink> {
+        const COMMENT_OPEN: &[char] = &['<', '!', '-', '-'];
+        const COMMENT_CLOSE: &[char] = &['-', '-', '>'];
+        const ANCHOR_OPEN: &[char] = &['<', 'a'];
+        const HREF: &[char] = &['h', 'r', 'e', 'f'];
+
+        let mut result: Vec<MarkupLink> = Vec::new();
+        let mut state: ParserState = ParserState::Text;
+        for (line, line_str) in text.lines().enumerate() {
+            let line_chars: Vec<char> = line_str.chars().collect();
+            let mut column: usize = 0;
+            while let Some(&current) = line_chars.get(column) {
+                // Unconsumed remainder of the line, starting at `current`.
+                let rest = &line_chars[column..];
+                match state {
+                    ParserState::Comment => {
+                        if rest.starts_with(COMMENT_CLOSE) {
+                            // Skip the marker; its last char is consumed by
+                            // the shared `column += 1` below.
+                            column += COMMENT_CLOSE.len() - 1;
+                            state = ParserState::Text;
+                        }
+                    }
+                    ParserState::Text => {
+                        if rest.starts_with(COMMENT_OPEN) {
+                            column += COMMENT_OPEN.len() - 1;
+                            state = ParserState::Comment;
+                        } else if rest.starts_with(ANCHOR_OPEN) {
+                            column += ANCHOR_OPEN.len() - 1;
+                            state = ParserState::Anchor;
+                        }
+                    }
+                    ParserState::Anchor => {
+                        if rest.starts_with(HREF) {
+                            column += HREF.len() - 1;
+                            state = ParserState::EqualSign;
+                        }
+                    }
+                    ParserState::EqualSign => {
+                        if current == '=' {
+                            state = ParserState::Link;
+                        } else if !current.is_whitespace() {
+                            // Something like `hreflang`: not the attribute
+                            // we want, keep searching inside the anchor.
+                            state = ParserState::Anchor;
+                        }
+                    }
+                    ParserState::Link => {
+                        if !current.is_whitespace() && current != '"' {
+                            let link_column = column;
+                            // Single scan to the end of the target.
+                            column += rest
+                                .iter()
+                                .take_while(|c| !c.is_whitespace() && **c != '"')
+                                .count();
+                            let target: String =
+                                line_chars[link_column..column].iter().collect();
+                            result.push(MarkupLink {
+                                column: link_column + 1,
+                                line: line + 1,
+                                target,
+                            });
+                            state = ParserState::Text;
+                        }
+                    }
+                }
+                column += 1;
+            }
+        }
+        result
+    }
+}
+
+// Unit tests for the HTML link extractor.
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use ntest::test_case;
+
+    // Input without any `<a` tag yields no links.
+    #[test]
+    fn no_link() {
+        let le = HtmlLinkExtractor();
+        let input = "]This is not a <has> no link <h1>Bla</h1> attribute.";
+        let result = le.find_links(&input);
+        assert!(result.is_empty());
+    }
+
+    // An anchor placed inside an HTML comment is ignored.
+    #[test]
+    fn commented() {
+        let le = HtmlLinkExtractor();
+        let input = "df <!-- <a href=\"http://wiki.selfhtml.org\"> haha</a> -->";
+        let result = le.find_links(&input);
+        assert!(result.is_empty());
+    }
+
+    // Each case supplies the input followed by the expected 1-based line and
+    // column of the extracted target: a plain anchor, an anchor spread over
+    // several lines, an anchor with a preceding `hreflang` attribute, and an
+    // anchor directly after a closed comment.
+    #[test_case(
+        "<a href=\"https://www.w3schools.com\">Visit W3Schools.com!</a>",
+        1,
+        10
+    )]
+    #[test_case(
+        "<a\nhref\n=\n  \"https://www.w3schools.com\">\nVisit W3Schools.com!\n</a>",
+        4,
+        4
+    )]
+    #[test_case(
+        "<a hreflang=\"en\" href=\"https://www.w3schools.com\">Visit W3Schools.com!</a>",
+        1,
+        24
+    )]
+    #[test_case(
+        "<!--comment--><a href=\"https://www.w3schools.com\">Visit W3Schools.com!</a>",
+        1,
+        24
+    )]
+    fn links(input: &str, line: usize, column: usize) {
+        let le = HtmlLinkExtractor();
+        let result = le.find_links(&input);
+        let expected = MarkupLink {
+            target: "https://www.w3schools.com".to_string(),
+            line: line,
+            column: column,
+        };
+        assert_eq!(vec![expected], result);
+    }
+}
diff --git a/src/link_extractors/link_extractor.rs b/src/link_extractors/link_extractor.rs
@@ -1,7 +1,8 @@
+use super::html_link_extractor::HtmlLinkExtractor;
 use super::markdown_link_extractor::MarkdownLinkExtractor;
 use crate::markup::{MarkupFile, MarkupType};
-use std::fs;
 use std::fmt;
+use std::fs;
 
 /// Links found in markup files
 #[derive(PartialEq)]
@@ -16,7 +17,11 @@ pub struct MarkupLink {
 
 impl fmt::Debug for MarkupLink {
     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
-        write!(f, "{} (line {}, column {})", self.target, self.line, self.column)
+        write!(
+            f,
+            "{} (line {}, column {})",
+            self.target, self.line, self.column
+        )
     }
 }
 
@@ -28,16 +33,19 @@ pub fn find_links(file: &MarkupFile) -> Vec<MarkupLink> {
     match fs::read_to_string(path) {
         Ok(text) => link_extractor.find_links(&text),
         Err(e) => {
-            warn!("File '{}'. IO Error: \"{}\". Check your file encoding.", path, e);
+            warn!(
+                "File '{}'. IO Error: \"{}\". Check your file encoding.",
+                path, e
+            );
             vec![]
         }
     }
 }
 
-fn link_extractor_factory(markup_type: &MarkupType) -> impl LinkExtractor {
+fn link_extractor_factory(markup_type: &MarkupType) -> Box<dyn LinkExtractor> {
     match markup_type {
-        MarkupType::Markdown => MarkdownLinkExtractor(),
-        MarkupType::HTML => unimplemented!(),
+        MarkupType::Markdown => Box::new(MarkdownLinkExtractor()),
+        MarkupType::HTML => Box::new(HtmlLinkExtractor()),
     }
 }
 

diff --git a/src/link_extractors/mod.rs b/src/link_extractors/mod.rs
@@ -1,2 +1,3 @@
 pub mod link_extractor;
-mod markdown_link_extractor;
+mod markdown_link_extractor;
+mod html_link_extractor;
diff --git a/src/markup.rs b/src/markup.rs
@@ -1,3 +1,5 @@
+use std::str::FromStr;
+
 #[derive(Debug)]
 pub struct MarkupFile {
     pub markup_type: MarkupType,
@@ -10,6 +12,18 @@ pub enum MarkupType {
     HTML,
 }
 
+impl FromStr for MarkupType {
+    type Err = ();
+
+    /// Converts a markup type identifier into a `MarkupType`.
+    ///
+    /// `"md"` maps to `Markdown` and `"html"` maps to `HTML`; the comparison
+    /// is exact. Any other input yields `Err(())`.
+    fn from_str(s: &str) -> Result<MarkupType, ()> {
+        if s == "md" {
+            return Ok(MarkupType::Markdown);
+        }
+        if s == "html" {
+            return Ok(MarkupType::HTML);
+        }
+        Err(())
+    }
+}
+
 impl MarkupType {
     pub fn file_extensions(&self) -> Vec<String> {
         match self {
@@ -28,4 +42,4 @@ impl MarkupType {
             MarkupType::HTML => vec!["html".to_string(), "xhtml".to_string()],
         }
     }
-}
+}
diff --git a/tests/file_traversal.rs b/tests/file_traversal.rs
@@ -5,7 +5,7 @@ use mlc::Config;
 
 #[test]
 fn find_markdown_files() {
-    let path = "./benches/benchmark/md_file_endings".to_string();
+    let path = "./benches/benchmark/markdown/md_file_endings".to_string();
     let config: Config = Config {
         folder: path,
         markup_types: vec![MarkupType::Markdown],
@@ -19,7 +19,7 @@ fn find_markdown_files() {
 
 #[test]
 fn empty_folder() {
-    let path = "./benches/benchmark/empty".to_string();
+    let path = "./benches/benchmark/markdown/empty".to_string();
     let config: Config = Config {
         folder: path,
         markup_types: vec![MarkupType::Markdown],