fixed #12; ie give option to bypass assumption that each row/veceleme…

…nt are different documents
AdamSpannbauer · Dec 11, 2017 · fcc9aa7 · fcc9aa7
1 parent 8e43ecf
commit fcc9aa7
Show file tree

Hide file tree

Showing 13 changed files with 136 additions and 43 deletions.
diff --git a/R/bind_lexrank.R b/R/bind_lexrank.R
@@ -48,7 +48,7 @@ bind_lexrank_ <- function(tbl, text, doc_id, sent_id=NULL, level=c("sentences",
   if(!(doc_id %in% names(tbl))) stop("doc_id column not found in tbl")
   if(!is.character(level)) stop("level must be character")
   if(length(level) > 1) {
-    warning("only first element of level will be used")
+    # warning("only first element of level will be used")
     level = level[1]
   }
   if(!(level %in% c("sentences", "tokens"))) stop("invalid value of level; accepted values for level are 'sentences' and 'tokens'")

diff --git a/R/sentenceParse.R b/R/sentenceParse.R
@@ -16,22 +16,29 @@ sentenceParse <- function(text, docId = "create") {
   if(length(text) < 1) stop("text must be at least length 1")
   docId <- as.character(docId)
   if(length(docId)==1 & docId[1]=="create") {
-      createDocIds <- TRUE
-    } else if(length(docId)==length(text)) {
-      createDocIds <- FALSE
-    } else if(length(docId)!=length(text)) stop("docId vector must be same length as text vector")
-
-
+    createDocIds <- TRUE
+  } else if(length(docId)==length(text)) {
+    createDocIds <- FALSE
+  } else if(length(docId)!=length(text)) stop("docId vector must be same length as text vector")
+  
+  
   sentences <- sentence_parser(text)
   sentenceDfList <- lapply(seq_along(sentences), function(i) {
     sentVec <- trimws(sentences[[i]])
     if(createDocIds) {
       data.frame(docId=i, sentenceId=paste0(i,"_",seq_along(sentVec)), sentence=sentVec, stringsAsFactors = FALSE)
     } else if(!createDocIds) {
-      data.frame(docId=docId[i], sentenceId=paste0(docId[i],"_",seq_along(sentVec)), sentence=sentVec, stringsAsFactors = FALSE)
+      data.frame(docId=docId[i], sentence=sentVec, stringsAsFactors = FALSE)
     }
   })
-  sentenceDf <- dplyr::bind_rows(sentenceDfList)
+  sentenceDf <- do.call('rbind', sentenceDfList)
+  sentenceDfList <- split(sentenceDf, sentenceDf$docId)
+  sentenceDfList <- lapply(sentenceDfList, function(dfi) {
+    dfi$sentenceId <- paste0(dfi$docId, "_", 1:nrow(dfi))
+    dfi[,c("docId","sentenceId","sentence")]
+  })
+  sentenceDf <- do.call('rbind', sentenceDfList)
   class(sentenceDf) <- "data.frame"
+  rownames(sentenceDf) <- NULL
   return(sentenceDf)
 }
diff --git a/R/sentenceSimil.R b/R/sentenceSimil.R
@@ -46,7 +46,7 @@ sentenceSimil <- function(sentenceId, token, docId=NULL, sentencesAsDocs=FALSE){
     dplyr::summarise(tf = n()) %>%
     dplyr::ungroup() %>%
     dplyr::group_by(token) %>%
-    dplyr::mutate(idf = 1+log(ndoc/n())) %>%
+    dplyr::mutate(idf = 1+log(ndoc/length(unique(docId)))) %>%
     dplyr::mutate(tfidf = tf*idf) %>%
     dplyr::ungroup()
 

diff --git a/R/tokenize.R b/R/tokenize.R
@@ -58,7 +58,12 @@ tokenize <- function(text, removePunc=TRUE, removeNum=TRUE, toLower=TRUE, stemWo
     if(length(nonStopTok) == 0) NA_character_ else nonStopTok
   })
   if(stemWords) {
-    text <- lapply(text, SnowballC::wordStem)
+    text <- lapply(text, function(w) {
+      w_na = which(is.na(w))
+      out = SnowballC::wordStem(w)
+      out[w_na] = NA
+      out
+    })
   }
 
   tokenList <- lapply(text, function(tokens) {

diff --git a/R/unnest_sentences.R b/R/unnest_sentences.R
@@ -4,6 +4,7 @@
 #' @param tbl dataframe containing column of text to be split into sentences
 #' @param output name of column to be created to store parsed sentences
 #' @param input name of input column of text to be parsed into sentences
+#' @param doc_id column of document ids; if not provided it will be assumed that each row is a different document
 #' @param output_id name of column to be created to store sentence ids
 #' @param drop whether original input column should get dropped
 #' @return A data.frame of parsed sentences and sentence ids
@@ -22,7 +23,7 @@
 #'   unnest_sentences(sents, text)
 
 #' @export
-unnest_sentences_ <- function(tbl, output, input, output_id="sent_id", drop=TRUE) {
+unnest_sentences_ <- function(tbl, output, input, doc_id=NULL, output_id="sent_id", drop=TRUE) {
   if(!is.data.frame(tbl)) stop("tbl must be a dataframe")
   if(!(input %in% names(tbl))) stop("input column not found in tbl")
   if(!is.character(tbl[[input]])) stop("input column must be character")
@@ -31,6 +32,9 @@ unnest_sentences_ <- function(tbl, output, input, output_id="sent_id", drop=TRUE
     output_id <- output_id[1]
   }
   if(!is.logical(drop)) stop("drop must be logical")
+  if(!is.null(doc_id)) {
+    if(!(doc_id %in% names(tbl))) stop("doc_id column not found in tbl")
+  }
 
   text <- tbl[[input]]
   parsed_sents <- sentence_parser(text)
@@ -43,15 +47,29 @@ unnest_sentences_ <- function(tbl, output, input, output_id="sent_id", drop=TRUE
   tbl[[output_id]] <- sent_ids
   tbl[[output]]    <- parsed_sents
 
-  tidyr::unnest(tbl)
+  out = tidyr::unnest(tbl)
+  if(!is.null(doc_id)) {
+    out_tbl_list = split(out, out[[doc_id]])
+    out_tbl_list = lapply(out_tbl_list, function(dfi) {
+      dfi[[output_id]] = seq_along(dfi[[output_id]])
+      dfi
+    })
+    out = do.call('rbind', out_tbl_list)
+  }
+  rownames(out) = NULL
+  return(out)
 }
 
 #' @rdname unnest_sentences_
 #' @export
-unnest_sentences <- function(tbl, output, input, output_id='sent_id', drop=TRUE) {
+unnest_sentences <- function(tbl, output, input, doc_id=NULL, output_id='sent_id', drop=TRUE) {
   output_str <- as.character(substitute(output))
   input_str  <- as.character(substitute(input))
   out_id_str <- as.character(substitute(output_id))
+  doc_id <- as.character(substitute(doc_id))
+  if (length(doc_id) == 0) doc_id = NULL
 
-  unnest_sentences_(tbl, output_str, input_str, out_id_str, drop)
+  unnest_sentences_(tbl=tbl, output = output_str, 
+                    input = input_str, doc_id = doc_id,
+                    output_id = out_id_str, drop = drop)
 }
diff --git a/man/unnest_sentences_.Rd b/man/unnest_sentences_.Rd
diff --git a/tests/testthat/test-bind_lexrank.R b/tests/testthat/test-bind_lexrank.R
@@ -54,7 +54,7 @@ test_that("test input checking", {
   expect_error(bind_lexrank(df, sents, fake))
   expect_error(bind_lexrank(NULL, sents, doc_id))
   expect_error(bind_lexrank(df, sents, doc_id, level="fake"))
-  expect_warning(bind_lexrank(df, sents, doc_id, level=c("sentences","tokens")))
+  # expect_warning(bind_lexrank(df, sents, doc_id, level=c("sentences","tokens")))
 
   df <- data.frame(doc_id = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L), 
                    sent_id = c(1L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), 
@@ -70,7 +70,7 @@ test_that("test input checking", {
 
   expect_error(bind_lexrank(df, tokens, doc_id, fake, level="tokens"))
   expect_error(bind_lexrank(df, tokens, doc_id, level="tokens"))
-  expect_warning(bind_lexrank(df, tokens, doc_id, sent_id, level=c("tokens","sentences")))
+  # expect_warning(bind_lexrank(df, tokens, doc_id, sent_id, level=c("tokens","sentences")))
 })
 
 # test output val ------------------------------------------------------
@@ -116,9 +116,7 @@ test_that("output value", {
                                           "Documents will be parsed and lexranked.", "Documents will be parsed and lexranked.", "Documents will be parsed and lexranked."),
                                 tokens = c("testing", "the", "system", "second", "sentence", "for", "you", "system", "testing", "the", 
                                            "tidy", "documents", "df", "documents", "will", "be", "parsed", "and", "lexranked"),
-                                lexrank = c(0.07143, 0.07143, 0.07143, 0.07143, NA, 0.07143, 0.07143, 0.07143, 
-                                            0.07143, 0.07143, NA, 0.07143, NA, 0.07143, 0.07143, 0.07143, 
-                                            NA, 0.07143, NA),
+                                lexrank = c(0.16667, NA, 0.16667, NA, NA, NA, NA, 0.16667, 0.16667, NA, NA, 0.16667, NA, 0.16667, NA, NA, NA, NA, NA),
                                 stringsAsFactors = FALSE)
 
   expect_equal(test_result, expected_result)

diff --git a/tests/testthat/test-bind_lexrank_.R b/tests/testthat/test-bind_lexrank_.R
@@ -54,7 +54,7 @@ test_that("test input checking", {
   expect_error(bind_lexrank_(df, "sents", "fake"))
   expect_error(bind_lexrank_(NULL, "sents", "doc_id"))
   expect_error(bind_lexrank_(df, "sents", "doc_id", level="fake"))
-  expect_warning(bind_lexrank_(df, "sents", "doc_id", level=c("sentences","tokens")))
+  # expect_warning(bind_lexrank_(df, "sents", "doc_id", level=c("sentences","tokens")))
 
   df <- data.frame(doc_id = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L), 
                    sent_id = c(1L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), 
@@ -70,16 +70,16 @@ test_that("test input checking", {
 
   expect_error(bind_lexrank_(df, "tokens", "doc_id", "fake", level="tokens"))
   expect_error(bind_lexrank_(df, "tokens", "doc_id", level="tokens"))
-  expect_warning(bind_lexrank_(df, "tokens", "doc_id", "sent_id", level=c("tokens","sentences")))
+  # expect_warning(bind_lexrank_(df, "tokens", "doc_id", "sent_id", level=c("tokens","sentences")))
 })
 
 # test output val ------------------------------------------------------
 test_that("output value", {
   df <- data.frame(doc_id = 1:3, 
-                    text = c("Testing the system. Second sentence for you.", 
-                             "System testing the tidy documents df.", 
-                             "Documents will be parsed and lexranked."),
-                    stringsAsFactors = FALSE) %>% 
+                   text = c("Testing the system. Second sentence for you.", 
+                            "System testing the tidy documents df.", 
+                            "Documents will be parsed and lexranked."),
+                   stringsAsFactors = FALSE) %>% 
     unnest_sentences(sents, text)
 
   test_result     <- bind_lexrank_(df, "sents", "doc_id", level="sentences")
@@ -116,9 +116,7 @@ test_that("output value", {
                                           "Documents will be parsed and lexranked.", "Documents will be parsed and lexranked.", "Documents will be parsed and lexranked."),
                                 tokens = c("testing", "the", "system", "second", "sentence", "for", "you", "system", "testing", "the", 
                                            "tidy", "documents", "df", "documents", "will", "be", "parsed", "and", "lexranked"),
-                                lexrank = c(0.07143, 0.07143, 0.07143, 0.07143, NA, 0.07143, 0.07143, 0.07143, 
-                                            0.07143, 0.07143, NA, 0.07143, NA, 0.07143, 0.07143, 0.07143, 
-                                            NA, 0.07143, NA),
+                                lexrank = c(0.16667, NA, 0.16667, NA, NA, NA, NA, 0.16667, 0.16667, NA, NA, 0.16667, NA, 0.16667, NA, NA, NA, NA, NA),
                                 stringsAsFactors = FALSE)
 
   expect_equal(test_result, expected_result)

diff --git a/tests/testthat/test-lexRankFromSimil.R b/tests/testthat/test-lexRankFromSimil.R
@@ -49,26 +49,26 @@ test_that("object out value", {
                            token = tokenDf$token,
                            docId = tokenDf$docId)
 
-  testResult <- lexRankFromSimil(similDf$sent1, similDf$sent2, similDf$similVal) %>% 
-    dplyr::mutate(value = round(value, 5))
+  testResult <- lexRankFromSimil(similDf$sent1, similDf$sent2, similDf$similVal)
+  testResult$value = round(testResult$value, 5)
 
   expectedResult <- data.frame(sentenceId = c("1_1", "2_1", "3_1"),
                                value = c(0.25676, 0.48649, 0.25676),
                                stringsAsFactors = FALSE)
 
   expect_identical(testResult, expectedResult)
 
-  testResult <- lexRankFromSimil(similDf$sent1, similDf$sent2, similDf$similVal, continuous = TRUE) %>% 
-    dplyr::mutate(value = round(value, 5))
+  testResult <- lexRankFromSimil(similDf$sent1, similDf$sent2, similDf$similVal, continuous = TRUE)
+  testResult$value = round(testResult$value, 5)
 
   expectedResult <- data.frame(sentenceId = c("1_1", "2_1", "3_1"),
                                value = c(0.25676, 0.48649, 0.25676),
                                stringsAsFactors = FALSE)
 
   expect_identical(testResult, expectedResult)
 
-  testResult <- lexRankFromSimil(similDf$sent1, similDf$sent2, similDf$similVal, usePageRank = FALSE) %>% 
-    dplyr::mutate(value = round(value, 5))
+  testResult <- lexRankFromSimil(similDf$sent1, similDf$sent2, similDf$similVal, usePageRank = FALSE)
+  testResult$value = round(testResult$value, 5)
 
   expectedResult <- data.frame(sentenceId = c("2_1", "1_1", "3_1"),
                                value = c(2, 1, 1),

diff --git a/tests/testthat/test-sentenceSimil.R b/tests/testthat/test-sentenceSimil.R
@@ -9,16 +9,29 @@ test_that("testing result str and class", {
 
   testResult <- sentenceSimil(sentenceId = tokenDf$sentenceId,
                               token = tokenDf$token,
-                              docId = tokenDf$docId)
+                              docId = tokenDf$docId,
+                              sentencesAsDocs = FALSE)
 
   expect_equal(class(testResult), "data.frame")
   expect_equal(names(testResult), c("sent1","sent2","similVal"))
 
   expect_true(is.character(testResult$sent1))
   expect_true(is.character(testResult$sent2))
   expect_true(is.numeric(testResult$similVal))
-})
 
+  testResult <- sentenceSimil(sentenceId = tokenDf$sentenceId,
+                              token = tokenDf$token,
+                              docId = tokenDf$docId,
+                              sentencesAsDocs = TRUE)
+
+  expect_equal(class(testResult), "data.frame")
+  expect_equal(names(testResult), c("sent1","sent2","similVal"))
+
+  expect_true(is.character(testResult$sent1))
+  expect_true(is.character(testResult$sent2))
+  expect_true(is.numeric(testResult$similVal))
+})
+
 
 test_that("bad input", {
   expect_error(sentenceSimil(sentenceId = c("1_1"),
@@ -55,8 +68,22 @@ test_that("output value check", {
 
   testResult <- sentenceSimil(sentenceId = tokenDf$sentenceId,
                               token = tokenDf$token,
-                              docId = tokenDf$docId) %>% 
-    dplyr::mutate(similVal = round(similVal, 5))
+                              docId = tokenDf$docId,
+                              sentencesAsDocs = FALSE)
+  testResult$similVal = round(testResult$similVal, 5)
+
+  expectedResult <- data.frame(sent1 = c("1_1", "1_1", "2_1"),
+                               sent2 = c("2_1", "3_1", "3_1"),
+                               similVal = c(0.48624, 0, 0.48624),
+                               stringsAsFactors = FALSE)
+
+  expect_equal(testResult, expectedResult)
+
+  testResult <- sentenceSimil(sentenceId = tokenDf$sentenceId,
+                              token = tokenDf$token,
+                              docId = tokenDf$docId,
+                              sentencesAsDocs = TRUE)
+  testResult$similVal = round(testResult$similVal, 5)
 
   expectedResult <- data.frame(sent1 = c("1_1", "1_1", "2_1"),
                                sent2 = c("2_1", "3_1", "3_1"),

diff --git a/tests/testthat/test-sentenceTokenParse.R b/tests/testthat/test-sentenceTokenParse.R
@@ -31,7 +31,7 @@ test_that("All clean options TRUE", {
   expectedResultTokens <- lexRankr::tokenize(testDocs) %>% 
     unlist() %>% 
     .[which(!is.na(.))]
-
+  
   expect_equal(testResult$sentences, expectedResultSentences)
   expect_equal(testResult$tokens$token, expectedResultTokens)
 

diff --git a/tests/testthat/test-unnest_sentences.R b/tests/testthat/test-unnest_sentences.R
@@ -31,6 +31,7 @@ test_that("test input checking", {
   expect_error(unnest_sentences(df, out, fake))
   expect_error(unnest_sentences(NULL, out, text))
   expect_error(unnest_sentences(df, out, text, drop = NULL))
+  expect_error(unnest_sentences(df, out, text, doc_id = fake))
 })
 
 # test output val ------------------------------------------------------
@@ -51,5 +52,22 @@ test_that("output value", {
                                 stringsAsFactors = FALSE)
 
   expect_equal(test_result, expected_result)
+
+  df <- data.frame(doc_id = c(1,1,3), 
+                   text = c("Testing the system. Second sentence for you.", 
+                            "System testing the tidy documents df.", 
+                            "Documents will be parsed and lexranked."),
+                   stringsAsFactors = FALSE)
+
+  test_result     <- unnest_sentences(df, out, text, doc_id = doc_id)
+  expected_result <- data.frame(doc_id = c(1L, 1L, 1L, 3L), 
+                                sent_id = c(1L, 2L, 3L, 1L), 
+                                out = c("Testing the system.", 
+                                        "Second sentence for you.", 
+                                        "System testing the tidy documents df.", 
+                                        "Documents will be parsed and lexranked."),
+                                stringsAsFactors = FALSE)
+
+  expect_equal(test_result, expected_result)
 })
 
diff --git a/tests/testthat/test-unnest_sentences_.R b/tests/testthat/test-unnest_sentences_.R
@@ -31,7 +31,8 @@ test_that("test input checking", {
   expect_error(unnest_sentences_(df, "out", "fake"))
   expect_error(unnest_sentences_(NULL, "out", "text"))
   expect_error(unnest_sentences_(df, "out", "text", drop = NULL))
-  expect_warning(unnest_sentences_(df, "out", "text", c("test","test2")))
+  expect_error(unnest_sentences(df, "out", "text", doc_id = "fake"))
+  expect_warning(unnest_sentences_(df, "out", "text", output_id=c("test","test2")))
 })
 
 # test output val ------------------------------------------------------
@@ -52,5 +53,22 @@ test_that("output value", {
                                 stringsAsFactors = FALSE)
 
   expect_equal(test_result, expected_result)
+
+  df <- data.frame(doc_id = c(1,1,3), 
+                   text = c("Testing the system. Second sentence for you.", 
+                            "System testing the tidy documents df.", 
+                            "Documents will be parsed and lexranked."),
+                   stringsAsFactors = FALSE)
+
+  test_result     <- unnest_sentences_(df, "out", "text", doc_id = "doc_id")
+  expected_result <- data.frame(doc_id = c(1L, 1L, 1L, 3L), 
+                                sent_id = c(1L, 2L, 3L, 1L), 
+                                out = c("Testing the system.", 
+                                        "Second sentence for you.", 
+                                        "System testing the tidy documents df.", 
+                                        "Documents will be parsed and lexranked."),
+                                stringsAsFactors = FALSE)
+
+  expect_equal(test_result, expected_result)
 })