From fcc9aa789257b59fc12e4826f958d08a4e245ac9 Mon Sep 17 00:00:00 2001
From: "Spannbauer, Adam M" <u775749@ntmcon02.emn.com>
Date: Mon, 11 Dec 2017 10:53:25 -0500
Subject: [PATCH] fixed #12; ie give option to bypass assumption that each
 row/vecelement are different documents

---
 R/bind_lexrank.R                         |  2 +-
 R/sentenceParse.R                        | 23 ++++++++++------
 R/sentenceSimil.R                        |  2 +-
 R/tokenize.R                             |  7 ++++-
 R/unnest_sentences.R                     | 26 +++++++++++++++---
 man/unnest_sentences_.Rd                 |  8 ++++--
 tests/testthat/test-bind_lexrank.R       |  8 ++----
 tests/testthat/test-bind_lexrank_.R      | 16 +++++------
 tests/testthat/test-lexRankFromSimil.R   | 12 ++++----
 tests/testthat/test-sentenceSimil.R      | 35 +++++++++++++++++++++---
 tests/testthat/test-sentenceTokenParse.R |  2 +-
 tests/testthat/test-unnest_sentences.R   | 18 ++++++++++++
 tests/testthat/test-unnest_sentences_.R  | 20 +++++++++++++-
 13 files changed, 136 insertions(+), 43 deletions(-)

diff --git a/R/bind_lexrank.R b/R/bind_lexrank.R
index f87d142..7892744 100644
--- a/R/bind_lexrank.R
+++ b/R/bind_lexrank.R
@@ -48,7 +48,7 @@ bind_lexrank_ <- function(tbl, text, doc_id, sent_id=NULL, level=c("sentences",
   if(!(doc_id %in% names(tbl))) stop("doc_id column not found in tbl")
   if(!is.character(level)) stop("level must be character")
   if(length(level) > 1) {
-    warning("only first element of level will be used")
+    # warning("only first element of level will be used")
     level = level[1]
   }
   if(!(level %in% c("sentences", "tokens"))) stop("invalid value of level; accepted values for level are 'sentences' and 'tokens'")
diff --git a/R/sentenceParse.R b/R/sentenceParse.R
index 2a0cdc2..39bfdcb 100644
--- a/R/sentenceParse.R
+++ b/R/sentenceParse.R
@@ -16,22 +16,29 @@ sentenceParse <- function(text, docId = "create") {
   if(length(text) < 1) stop("text must be at least length 1")
   docId <- as.character(docId)
   if(length(docId)==1 & docId[1]=="create") {
-      createDocIds <- TRUE
-    } else if(length(docId)==length(text)) {
-      createDocIds <- FALSE
-    } else if(length(docId)!=length(text)) stop("docId vector must be same length as text vector")
-
-
+    createDocIds <- TRUE
+  } else if(length(docId)==length(text)) {
+    createDocIds <- FALSE
+  } else if(length(docId)!=length(text)) stop("docId vector must be same length as text vector")
+  
+  
   sentences <- sentence_parser(text)
   sentenceDfList <- lapply(seq_along(sentences), function(i) {
     sentVec <- trimws(sentences[[i]])
     if(createDocIds) {
       data.frame(docId=i, sentenceId=paste0(i,"_",seq_along(sentVec)), sentence=sentVec, stringsAsFactors = FALSE)
     } else if(!createDocIds) {
-      data.frame(docId=docId[i], sentenceId=paste0(docId[i],"_",seq_along(sentVec)), sentence=sentVec, stringsAsFactors = FALSE)
+      data.frame(docId=docId[i], sentence=sentVec, stringsAsFactors = FALSE)
     }
   })
-  sentenceDf <- dplyr::bind_rows(sentenceDfList)
+  sentenceDf <- do.call('rbind', sentenceDfList)
+  sentenceDfList <- split(sentenceDf, sentenceDf$docId)
+  sentenceDfList <- lapply(sentenceDfList, function(dfi) {
+    dfi$sentenceId <- paste0(dfi$docId, "_", 1:nrow(dfi))
+    dfi[,c("docId","sentenceId","sentence")]
+  })
+  sentenceDf <- do.call('rbind', sentenceDfList)
   class(sentenceDf) <- "data.frame"
+  rownames(sentenceDf) <- NULL
   return(sentenceDf)
 }
diff --git a/R/sentenceSimil.R b/R/sentenceSimil.R
index d2d2147..600b404 100644
--- a/R/sentenceSimil.R
+++ b/R/sentenceSimil.R
@@ -46,7 +46,7 @@ sentenceSimil <- function(sentenceId, token, docId=NULL, sentencesAsDocs=FALSE){
     dplyr::summarise(tf = n()) %>%
     dplyr::ungroup() %>%
     dplyr::group_by(token) %>%
-    dplyr::mutate(idf = 1+log(ndoc/n())) %>%
+    dplyr::mutate(idf = 1+log(ndoc/length(unique(docId)))) %>%
     dplyr::mutate(tfidf = tf*idf) %>%
     dplyr::ungroup()
 
diff --git a/R/tokenize.R b/R/tokenize.R
index b15a5b0..1ca1708 100644
--- a/R/tokenize.R
+++ b/R/tokenize.R
@@ -58,7 +58,12 @@ tokenize <- function(text, removePunc=TRUE, removeNum=TRUE, toLower=TRUE, stemWo
     if(length(nonStopTok) == 0) NA_character_ else nonStopTok
   })
   if(stemWords) {
-    text <- lapply(text, SnowballC::wordStem)
+    text <- lapply(text, function(w) {
+      w_na = which(is.na(w))
+      out = SnowballC::wordStem(w)
+      out[w_na] = NA
+      out
+    })
   }
 
   tokenList <- lapply(text, function(tokens) {
diff --git a/R/unnest_sentences.R b/R/unnest_sentences.R
index f5fffa3..81279de 100644
--- a/R/unnest_sentences.R
+++ b/R/unnest_sentences.R
@@ -4,6 +4,7 @@
 #' @param tbl dataframe containing column of text to be split into sentences
 #' @param output name of column to be created to store parsed sentences
 #' @param input name of input column of text to be parsed into sentences
+#' @param doc_id column of document ids; if not provided it will be assumed that each row is a different document
 #' @param output_id name of column to be created to store sentence ids
 #' @param drop whether original input column should get dropped
 #' @return A data.frame of parsed sentences and sentence ids
@@ -22,7 +23,7 @@
 #'   unnest_sentences(sents, text)
 
 #' @export
-unnest_sentences_ <- function(tbl, output, input, output_id="sent_id", drop=TRUE) {
+unnest_sentences_ <- function(tbl, output, input, doc_id=NULL, output_id="sent_id", drop=TRUE) {
   if(!is.data.frame(tbl)) stop("tbl must be a dataframe")
   if(!(input %in% names(tbl))) stop("input column not found in tbl")
   if(!is.character(tbl[[input]])) stop("input column must be character")
@@ -31,6 +32,9 @@ unnest_sentences_ <- function(tbl, output, input, output_id="sent_id", drop=TRUE
     output_id <- output_id[1]
   }
   if(!is.logical(drop)) stop("drop must be logical")
+  if(!is.null(doc_id)) {
+    if(!(doc_id %in% names(tbl))) stop("doc_id column not found in tbl")
+  }
   
   text <- tbl[[input]]
   parsed_sents <- sentence_parser(text)
@@ -43,15 +47,29 @@ unnest_sentences_ <- function(tbl, output, input, output_id="sent_id", drop=TRUE
   tbl[[output_id]] <- sent_ids
   tbl[[output]]    <- parsed_sents
   
-  tidyr::unnest(tbl)
+  out = tidyr::unnest(tbl)
+  if(!is.null(doc_id)) {
+    out_tbl_list = split(out, out[[doc_id]])
+    out_tbl_list = lapply(out_tbl_list, function(dfi) {
+      dfi[[output_id]] = seq_along(dfi[[output_id]])
+      dfi
+    })
+    out = do.call('rbind', out_tbl_list)
+  }
+  rownames(out) = NULL
+  return(out)
 }
 
 #' @rdname unnest_sentences_
 #' @export
-unnest_sentences <- function(tbl, output, input, output_id='sent_id', drop=TRUE) {
+unnest_sentences <- function(tbl, output, input, doc_id=NULL, output_id='sent_id', drop=TRUE) {
   output_str <- as.character(substitute(output))
   input_str  <- as.character(substitute(input))
   out_id_str <- as.character(substitute(output_id))
+  doc_id <- as.character(substitute(doc_id))
+  if (length(doc_id) == 0) doc_id = NULL
   
-  unnest_sentences_(tbl, output_str, input_str, out_id_str, drop)
+  unnest_sentences_(tbl=tbl, output = output_str, 
+                    input = input_str, doc_id = doc_id,
+                    output_id = out_id_str, drop = drop)
 }
diff --git a/man/unnest_sentences_.Rd b/man/unnest_sentences_.Rd
index 57cf63b..901fc96 100644
--- a/man/unnest_sentences_.Rd
+++ b/man/unnest_sentences_.Rd
@@ -5,9 +5,11 @@
 \alias{unnest_sentences}
 \title{Split a column of text into sentences}
 \usage{
-unnest_sentences_(tbl, output, input, output_id = "sent_id", drop = TRUE)
+unnest_sentences_(tbl, output, input, doc_id = NULL, output_id = "sent_id",
+  drop = TRUE)
 
-unnest_sentences(tbl, output, input, output_id = "sent_id", drop = TRUE)
+unnest_sentences(tbl, output, input, doc_id = NULL, output_id = "sent_id",
+  drop = TRUE)
 }
 \arguments{
 \item{tbl}{dataframe containing column of text to be split into sentences}
@@ -16,6 +18,8 @@ unnest_sentences(tbl, output, input, output_id = "sent_id", drop = TRUE)
 
 \item{input}{name of input column of text to be parsed into sentences}
 
+\item{doc_id}{column of document ids; if not provided it will be assumed that each row is a different document}
+
 \item{output_id}{name of column to be created to store sentence ids}
 
 \item{drop}{whether original input column should get dropped}
diff --git a/tests/testthat/test-bind_lexrank.R b/tests/testthat/test-bind_lexrank.R
index d3d5463..5179802 100644
--- a/tests/testthat/test-bind_lexrank.R
+++ b/tests/testthat/test-bind_lexrank.R
@@ -54,7 +54,7 @@ test_that("test input checking", {
   expect_error(bind_lexrank(df, sents, fake))
   expect_error(bind_lexrank(NULL, sents, doc_id))
   expect_error(bind_lexrank(df, sents, doc_id, level="fake"))
-  expect_warning(bind_lexrank(df, sents, doc_id, level=c("sentences","tokens")))
+  # expect_warning(bind_lexrank(df, sents, doc_id, level=c("sentences","tokens")))
   
   df <- data.frame(doc_id = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L), 
                    sent_id = c(1L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), 
@@ -70,7 +70,7 @@ test_that("test input checking", {
   
   expect_error(bind_lexrank(df, tokens, doc_id, fake, level="tokens"))
   expect_error(bind_lexrank(df, tokens, doc_id, level="tokens"))
-  expect_warning(bind_lexrank(df, tokens, doc_id, sent_id, level=c("tokens","sentences")))
+  # expect_warning(bind_lexrank(df, tokens, doc_id, sent_id, level=c("tokens","sentences")))
 })
 
 # test output val ------------------------------------------------------
@@ -116,9 +116,7 @@ test_that("output value", {
                                           "Documents will be parsed and lexranked.", "Documents will be parsed and lexranked.", "Documents will be parsed and lexranked."),
                                 tokens = c("testing", "the", "system", "second", "sentence", "for", "you", "system", "testing", "the", 
                                            "tidy", "documents", "df", "documents", "will", "be", "parsed", "and", "lexranked"),
-                                lexrank = c(0.07143, 0.07143, 0.07143, 0.07143, NA, 0.07143, 0.07143, 0.07143, 
-                                            0.07143, 0.07143, NA, 0.07143, NA, 0.07143, 0.07143, 0.07143, 
-                                            NA, 0.07143, NA),
+                                lexrank = c(0.16667, NA, 0.16667, NA, NA, NA, NA, 0.16667, 0.16667, NA, NA, 0.16667, NA, 0.16667, NA, NA, NA, NA, NA),
                                 stringsAsFactors = FALSE)
   
   expect_equal(test_result, expected_result)
diff --git a/tests/testthat/test-bind_lexrank_.R b/tests/testthat/test-bind_lexrank_.R
index 044e887..524ec4b 100644
--- a/tests/testthat/test-bind_lexrank_.R
+++ b/tests/testthat/test-bind_lexrank_.R
@@ -54,7 +54,7 @@ test_that("test input checking", {
   expect_error(bind_lexrank_(df, "sents", "fake"))
   expect_error(bind_lexrank_(NULL, "sents", "doc_id"))
   expect_error(bind_lexrank_(df, "sents", "doc_id", level="fake"))
-  expect_warning(bind_lexrank_(df, "sents", "doc_id", level=c("sentences","tokens")))
+  # expect_warning(bind_lexrank_(df, "sents", "doc_id", level=c("sentences","tokens")))
   
   df <- data.frame(doc_id = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L), 
                    sent_id = c(1L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), 
@@ -70,16 +70,16 @@ test_that("test input checking", {
   
   expect_error(bind_lexrank_(df, "tokens", "doc_id", "fake", level="tokens"))
   expect_error(bind_lexrank_(df, "tokens", "doc_id", level="tokens"))
-  expect_warning(bind_lexrank_(df, "tokens", "doc_id", "sent_id", level=c("tokens","sentences")))
+  # expect_warning(bind_lexrank_(df, "tokens", "doc_id", "sent_id", level=c("tokens","sentences")))
 })
 
 # test output val ------------------------------------------------------
 test_that("output value", {
   df <- data.frame(doc_id = 1:3, 
-                    text = c("Testing the system. Second sentence for you.", 
-                             "System testing the tidy documents df.", 
-                             "Documents will be parsed and lexranked."),
-                    stringsAsFactors = FALSE) %>% 
+                   text = c("Testing the system. Second sentence for you.", 
+                            "System testing the tidy documents df.", 
+                            "Documents will be parsed and lexranked."),
+                   stringsAsFactors = FALSE) %>% 
     unnest_sentences(sents, text)
   
   test_result     <- bind_lexrank_(df, "sents", "doc_id", level="sentences")
@@ -116,9 +116,7 @@ test_that("output value", {
                                           "Documents will be parsed and lexranked.", "Documents will be parsed and lexranked.", "Documents will be parsed and lexranked."),
                                 tokens = c("testing", "the", "system", "second", "sentence", "for", "you", "system", "testing", "the", 
                                            "tidy", "documents", "df", "documents", "will", "be", "parsed", "and", "lexranked"),
-                                lexrank = c(0.07143, 0.07143, 0.07143, 0.07143, NA, 0.07143, 0.07143, 0.07143, 
-                                            0.07143, 0.07143, NA, 0.07143, NA, 0.07143, 0.07143, 0.07143, 
-                                            NA, 0.07143, NA),
+                                lexrank = c(0.16667, NA, 0.16667, NA, NA, NA, NA, 0.16667, 0.16667, NA, NA, 0.16667, NA, 0.16667, NA, NA, NA, NA, NA),
                                 stringsAsFactors = FALSE)
   
   expect_equal(test_result, expected_result)
diff --git a/tests/testthat/test-lexRankFromSimil.R b/tests/testthat/test-lexRankFromSimil.R
index c04fd99..f8dd04a 100644
--- a/tests/testthat/test-lexRankFromSimil.R
+++ b/tests/testthat/test-lexRankFromSimil.R
@@ -49,8 +49,8 @@ test_that("object out value", {
                            token = tokenDf$token,
                            docId = tokenDf$docId)
   
-  testResult <- lexRankFromSimil(similDf$sent1, similDf$sent2, similDf$similVal) %>% 
-    dplyr::mutate(value = round(value, 5))
+  testResult <- lexRankFromSimil(similDf$sent1, similDf$sent2, similDf$similVal)
+  testResult$value = round(testResult$value, 5)
   
   expectedResult <- data.frame(sentenceId = c("1_1", "2_1", "3_1"),
                                value = c(0.25676, 0.48649, 0.25676),
@@ -58,8 +58,8 @@ test_that("object out value", {
   
   expect_identical(testResult, expectedResult)
   
-  testResult <- lexRankFromSimil(similDf$sent1, similDf$sent2, similDf$similVal, continuous = TRUE) %>% 
-    dplyr::mutate(value = round(value, 5))
+  testResult <- lexRankFromSimil(similDf$sent1, similDf$sent2, similDf$similVal, continuous = TRUE)
+  testResult$value = round(testResult$value, 5)
   
   expectedResult <- data.frame(sentenceId = c("1_1", "2_1", "3_1"),
                                value = c(0.25676, 0.48649, 0.25676),
@@ -67,8 +67,8 @@ test_that("object out value", {
   
   expect_identical(testResult, expectedResult)
   
-  testResult <- lexRankFromSimil(similDf$sent1, similDf$sent2, similDf$similVal, usePageRank = FALSE) %>% 
-    dplyr::mutate(value = round(value, 5))
+  testResult <- lexRankFromSimil(similDf$sent1, similDf$sent2, similDf$similVal, usePageRank = FALSE)
+  testResult$value = round(testResult$value, 5)
   
   expectedResult <- data.frame(sentenceId = c("2_1", "1_1", "3_1"),
                                value = c(2, 1, 1),
diff --git a/tests/testthat/test-sentenceSimil.R b/tests/testthat/test-sentenceSimil.R
index 8e332b6..760f8a3 100644
--- a/tests/testthat/test-sentenceSimil.R
+++ b/tests/testthat/test-sentenceSimil.R
@@ -9,7 +9,8 @@ test_that("testing result str and class", {
   
   testResult <- sentenceSimil(sentenceId = tokenDf$sentenceId,
                               token = tokenDf$token,
-                              docId = tokenDf$docId)
+                              docId = tokenDf$docId,
+                              sentencesAsDocs = FALSE)
   
   expect_equal(class(testResult), "data.frame")
   expect_equal(names(testResult), c("sent1","sent2","similVal"))
@@ -17,8 +18,20 @@ test_that("testing result str and class", {
   expect_true(is.character(testResult$sent1))
   expect_true(is.character(testResult$sent2))
   expect_true(is.numeric(testResult$similVal))
-})
   
+  testResult <- sentenceSimil(sentenceId = tokenDf$sentenceId,
+                              token = tokenDf$token,
+                              docId = tokenDf$docId,
+                              sentencesAsDocs = TRUE)
+  
+  expect_equal(class(testResult), "data.frame")
+  expect_equal(names(testResult), c("sent1","sent2","similVal"))
+  
+  expect_true(is.character(testResult$sent1))
+  expect_true(is.character(testResult$sent2))
+  expect_true(is.numeric(testResult$similVal))
+})
+
 
 test_that("bad input", {
   expect_error(sentenceSimil(sentenceId = c("1_1"),
@@ -55,8 +68,22 @@ test_that("output value check", {
   
   testResult <- sentenceSimil(sentenceId = tokenDf$sentenceId,
                               token = tokenDf$token,
-                              docId = tokenDf$docId) %>% 
-    dplyr::mutate(similVal = round(similVal, 5))
+                              docId = tokenDf$docId,
+                              sentencesAsDocs = FALSE)
+  testResult$similVal = round(testResult$similVal, 5)
+  
+  expectedResult <- data.frame(sent1 = c("1_1", "1_1", "2_1"),
+                               sent2 = c("2_1", "3_1", "3_1"),
+                               similVal = c(0.48624, 0, 0.48624),
+                               stringsAsFactors = FALSE)
+  
+  expect_equal(testResult, expectedResult)
+  
+  testResult <- sentenceSimil(sentenceId = tokenDf$sentenceId,
+                              token = tokenDf$token,
+                              docId = tokenDf$docId,
+                              sentencesAsDocs = TRUE)
+  testResult$similVal = round(testResult$similVal, 5)
   
   expectedResult <- data.frame(sent1 = c("1_1", "1_1", "2_1"),
                                sent2 = c("2_1", "3_1", "3_1"),
diff --git a/tests/testthat/test-sentenceTokenParse.R b/tests/testthat/test-sentenceTokenParse.R
index 29e7fec..a0317f8 100644
--- a/tests/testthat/test-sentenceTokenParse.R
+++ b/tests/testthat/test-sentenceTokenParse.R
@@ -31,7 +31,7 @@ test_that("All clean options TRUE", {
   expectedResultTokens <- lexRankr::tokenize(testDocs) %>% 
     unlist() %>% 
     .[which(!is.na(.))]
-
+  
   expect_equal(testResult$sentences, expectedResultSentences)
   expect_equal(testResult$tokens$token, expectedResultTokens)
   
diff --git a/tests/testthat/test-unnest_sentences.R b/tests/testthat/test-unnest_sentences.R
index ebc65c4..82ec421 100644
--- a/tests/testthat/test-unnest_sentences.R
+++ b/tests/testthat/test-unnest_sentences.R
@@ -31,6 +31,7 @@ test_that("test input checking", {
   expect_error(unnest_sentences(df, out, fake))
   expect_error(unnest_sentences(NULL, out, text))
   expect_error(unnest_sentences(df, out, text, drop = NULL))
+  expect_error(unnest_sentences(df, out, text, doc_id = fake))
 })
 
 # test output val ------------------------------------------------------
@@ -51,5 +52,22 @@ test_that("output value", {
                                 stringsAsFactors = FALSE)
   
   expect_equal(test_result, expected_result)
+  
+  df <- data.frame(doc_id = c(1,1,3), 
+                   text = c("Testing the system. Second sentence for you.", 
+                            "System testing the tidy documents df.", 
+                            "Documents will be parsed and lexranked."),
+                   stringsAsFactors = FALSE)
+  
+  test_result     <- unnest_sentences(df, out, text, doc_id = doc_id)
+  expected_result <- data.frame(doc_id = c(1L, 1L, 1L, 3L), 
+                                sent_id = c(1L, 2L, 3L, 1L), 
+                                out = c("Testing the system.", 
+                                        "Second sentence for you.", 
+                                        "System testing the tidy documents df.", 
+                                        "Documents will be parsed and lexranked."),
+                                stringsAsFactors = FALSE)
+  
+  expect_equal(test_result, expected_result)
 })
 
diff --git a/tests/testthat/test-unnest_sentences_.R b/tests/testthat/test-unnest_sentences_.R
index 825d7ad..18b9824 100644
--- a/tests/testthat/test-unnest_sentences_.R
+++ b/tests/testthat/test-unnest_sentences_.R
@@ -31,7 +31,8 @@ test_that("test input checking", {
   expect_error(unnest_sentences_(df, "out", "fake"))
   expect_error(unnest_sentences_(NULL, "out", "text"))
   expect_error(unnest_sentences_(df, "out", "text", drop = NULL))
-  expect_warning(unnest_sentences_(df, "out", "text", c("test","test2")))
+  expect_error(unnest_sentences(df, "out", "text", doc_id = "fake"))
+  expect_warning(unnest_sentences_(df, "out", "text", output_id=c("test","test2")))
 })
 
 # test output val ------------------------------------------------------
@@ -52,5 +53,22 @@ test_that("output value", {
                                 stringsAsFactors = FALSE)
   
   expect_equal(test_result, expected_result)
+  
+  df <- data.frame(doc_id = c(1,1,3), 
+                   text = c("Testing the system. Second sentence for you.", 
+                            "System testing the tidy documents df.", 
+                            "Documents will be parsed and lexranked."),
+                   stringsAsFactors = FALSE)
+  
+  test_result     <- unnest_sentences_(df, "out", "text", doc_id = "doc_id")
+  expected_result <- data.frame(doc_id = c(1L, 1L, 1L, 3L), 
+                                sent_id = c(1L, 2L, 3L, 1L), 
+                                out = c("Testing the system.", 
+                                        "Second sentence for you.", 
+                                        "System testing the tidy documents df.", 
+                                        "Documents will be parsed and lexranked."),
+                                stringsAsFactors = FALSE)
+  
+  expect_equal(test_result, expected_result)
 })