From fcc9aa789257b59fc12e4826f958d08a4e245ac9 Mon Sep 17 00:00:00 2001 From: "Spannbauer, Adam M" Date: Mon, 11 Dec 2017 10:53:25 -0500 Subject: [PATCH] fixed #12; ie give option to bypass assumption that each row/vecelement are different documents --- R/bind_lexrank.R | 2 +- R/sentenceParse.R | 23 ++++++++++------ R/sentenceSimil.R | 2 +- R/tokenize.R | 7 ++++- R/unnest_sentences.R | 26 +++++++++++++++--- man/unnest_sentences_.Rd | 8 ++++-- tests/testthat/test-bind_lexrank.R | 8 ++---- tests/testthat/test-bind_lexrank_.R | 16 +++++------ tests/testthat/test-lexRankFromSimil.R | 12 ++++---- tests/testthat/test-sentenceSimil.R | 35 +++++++++++++++++++++--- tests/testthat/test-sentenceTokenParse.R | 2 +- tests/testthat/test-unnest_sentences.R | 18 ++++++++++++ tests/testthat/test-unnest_sentences_.R | 20 +++++++++++++- 13 files changed, 136 insertions(+), 43 deletions(-) diff --git a/R/bind_lexrank.R b/R/bind_lexrank.R index f87d142..7892744 100644 --- a/R/bind_lexrank.R +++ b/R/bind_lexrank.R @@ -48,7 +48,7 @@ bind_lexrank_ <- function(tbl, text, doc_id, sent_id=NULL, level=c("sentences", if(!(doc_id %in% names(tbl))) stop("doc_id column not found in tbl") if(!is.character(level)) stop("level must be character") if(length(level) > 1) { - warning("only first element of level will be used") + # warning("only first element of level will be used") level = level[1] } if(!(level %in% c("sentences", "tokens"))) stop("invalid value of level; accepted values for level are 'sentences' and 'tokens'") diff --git a/R/sentenceParse.R b/R/sentenceParse.R index 2a0cdc2..39bfdcb 100644 --- a/R/sentenceParse.R +++ b/R/sentenceParse.R @@ -16,22 +16,29 @@ sentenceParse <- function(text, docId = "create") { if(length(text) < 1) stop("text must be at least length 1") docId <- as.character(docId) if(length(docId)==1 & docId[1]=="create") { - createDocIds <- TRUE - } else if(length(docId)==length(text)) { - createDocIds <- FALSE - } else if(length(docId)!=length(text)) stop("docId vector must be same length as text vector") - - + createDocIds <- TRUE + } else if(length(docId)==length(text)) { + createDocIds <- FALSE + } else if(length(docId)!=length(text)) stop("docId vector must be same length as text vector") + + sentences <- sentence_parser(text) sentenceDfList <- lapply(seq_along(sentences), function(i) { sentVec <- trimws(sentences[[i]]) if(createDocIds) { data.frame(docId=i, sentenceId=paste0(i,"_",seq_along(sentVec)), sentence=sentVec, stringsAsFactors = FALSE) } else if(!createDocIds) { - data.frame(docId=docId[i], sentenceId=paste0(docId[i],"_",seq_along(sentVec)), sentence=sentVec, stringsAsFactors = FALSE) + data.frame(docId=docId[i], sentence=sentVec, stringsAsFactors = FALSE) } }) - sentenceDf <- dplyr::bind_rows(sentenceDfList) + sentenceDf <- do.call('rbind', sentenceDfList) + sentenceDfList <- split(sentenceDf, sentenceDf$docId) + sentenceDfList <- lapply(sentenceDfList, function(dfi) { + dfi$sentenceId <- paste0(dfi$docId, "_", 1:nrow(dfi)) + dfi[,c("docId","sentenceId","sentence")] + }) + sentenceDf <- do.call('rbind', sentenceDfList) class(sentenceDf) <- "data.frame" + rownames(sentenceDf) <- NULL return(sentenceDf) } diff --git a/R/sentenceSimil.R b/R/sentenceSimil.R index d2d2147..600b404 100644 --- a/R/sentenceSimil.R +++ b/R/sentenceSimil.R @@ -46,7 +46,7 @@ sentenceSimil <- function(sentenceId, token, docId=NULL, sentencesAsDocs=FALSE){ dplyr::summarise(tf = n()) %>% dplyr::ungroup() %>% dplyr::group_by(token) %>% - dplyr::mutate(idf = 1+log(ndoc/n())) %>% + dplyr::mutate(idf = 1+log(ndoc/length(unique(docId)))) %>% dplyr::mutate(tfidf = tf*idf) %>% dplyr::ungroup() diff --git a/R/tokenize.R b/R/tokenize.R index b15a5b0..1ca1708 100644 --- a/R/tokenize.R +++ b/R/tokenize.R @@ -58,7 +58,12 @@ tokenize <- function(text, removePunc=TRUE, removeNum=TRUE, toLower=TRUE, stemWo if(length(nonStopTok) == 0) NA_character_ else nonStopTok }) if(stemWords) { - text <- lapply(text, SnowballC::wordStem) + text <- lapply(text, function(w) { + w_na = which(is.na(w)) + out = SnowballC::wordStem(w) + out[w_na] = NA + out + }) } tokenList <- lapply(text, function(tokens) { diff --git a/R/unnest_sentences.R b/R/unnest_sentences.R index f5fffa3..81279de 100644 --- a/R/unnest_sentences.R +++ b/R/unnest_sentences.R @@ -4,6 +4,7 @@ #' @param tbl dataframe containing column of text to be split into sentences #' @param output name of column to be created to store parsed sentences #' @param input name of input column of text to be parsed into sentences +#' @param doc_id column of document ids; if not provided it will be assumed that each row is a different document #' @param output_id name of column to be created to store sentence ids #' @param drop whether original input column should get dropped #' @return A data.frame of parsed sentences and sentence ids @@ -22,7 +23,7 @@ #' unnest_sentences(sents, text) #' @export -unnest_sentences_ <- function(tbl, output, input, output_id="sent_id", drop=TRUE) { +unnest_sentences_ <- function(tbl, output, input, doc_id=NULL, output_id="sent_id", drop=TRUE) { if(!is.data.frame(tbl)) stop("tbl must be a dataframe") if(!(input %in% names(tbl))) stop("input column not found in tbl") if(!is.character(tbl[[input]])) stop("input column must be character") @@ -31,6 +32,9 @@ unnest_sentences_ <- function(tbl, output, input, output_id="sent_id", drop=TRUE output_id <- output_id[1] } if(!is.logical(drop)) stop("drop must be logical") + if(!is.null(doc_id)) { + if(!(doc_id %in% names(tbl))) stop("doc_id column not found in tbl") + } text <- tbl[[input]] parsed_sents <- sentence_parser(text) @@ -43,15 +47,29 @@ unnest_sentences_ <- function(tbl, output, input, output_id="sent_id", drop=TRUE tbl[[output_id]] <- sent_ids tbl[[output]] <- parsed_sents - tidyr::unnest(tbl) + out = tidyr::unnest(tbl) + if(!is.null(doc_id)) { + out_tbl_list = split(out, out[[doc_id]]) + out_tbl_list = lapply(out_tbl_list, function(dfi) { + dfi[[output_id]] = seq_along(dfi[[output_id]]) + dfi + }) + out = do.call('rbind', out_tbl_list) + } + rownames(out) = NULL + return(out) } #' @rdname unnest_sentences_ #' @export -unnest_sentences <- function(tbl, output, input, output_id='sent_id', drop=TRUE) { +unnest_sentences <- function(tbl, output, input, doc_id=NULL, output_id='sent_id', drop=TRUE) { output_str <- as.character(substitute(output)) input_str <- as.character(substitute(input)) out_id_str <- as.character(substitute(output_id)) + doc_id <- as.character(substitute(doc_id)) + if (length(doc_id) == 0) doc_id = NULL - unnest_sentences_(tbl, output_str, input_str, out_id_str, drop) + unnest_sentences_(tbl=tbl, output = output_str, + input = input_str, doc_id = doc_id, + output_id = out_id_str, drop = drop) } diff --git a/man/unnest_sentences_.Rd b/man/unnest_sentences_.Rd index 57cf63b..901fc96 100644 --- a/man/unnest_sentences_.Rd +++ b/man/unnest_sentences_.Rd @@ -5,9 +5,11 @@ \alias{unnest_sentences} \title{Split a column of text into sentences} \usage{ -unnest_sentences_(tbl, output, input, output_id = "sent_id", drop = TRUE) +unnest_sentences_(tbl, output, input, doc_id = NULL, output_id = "sent_id", + drop = TRUE) -unnest_sentences(tbl, output, input, output_id = "sent_id", drop = TRUE) +unnest_sentences(tbl, output, input, doc_id = NULL, output_id = "sent_id", + drop = TRUE) } \arguments{ \item{tbl}{dataframe containing column of text to be split into sentences} @@ -16,6 +18,8 @@ unnest_sentences(tbl, output, input, output_id = "sent_id", drop = TRUE) \item{input}{name of input column of text to be parsed into sentences} +\item{doc_id}{column of document ids; if not provided it will be assumed that each row is a different document} + \item{output_id}{name of column to be created to store sentence ids} \item{drop}{whether original input column should get dropped} diff --git a/tests/testthat/test-bind_lexrank.R b/tests/testthat/test-bind_lexrank.R index d3d5463..5179802 100644 --- a/tests/testthat/test-bind_lexrank.R +++ b/tests/testthat/test-bind_lexrank.R @@ -54,7 +54,7 @@ test_that("test input checking", { expect_error(bind_lexrank(df, sents, fake)) expect_error(bind_lexrank(NULL, sents, doc_id)) expect_error(bind_lexrank(df, sents, doc_id, level="fake")) - expect_warning(bind_lexrank(df, sents, doc_id, level=c("sentences","tokens"))) + # expect_warning(bind_lexrank(df, sents, doc_id, level=c("sentences","tokens"))) df <- data.frame(doc_id = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L), sent_id = c(1L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), @@ -70,7 +70,7 @@ test_that("test input checking", { expect_error(bind_lexrank(df, tokens, doc_id, fake, level="tokens")) expect_error(bind_lexrank(df, tokens, doc_id, level="tokens")) - expect_warning(bind_lexrank(df, tokens, doc_id, sent_id, level=c("tokens","sentences"))) + # expect_warning(bind_lexrank(df, tokens, doc_id, sent_id, level=c("tokens","sentences"))) }) # test output val ------------------------------------------------------ @@ -116,9 +116,7 @@ test_that("output value", { "Documents will be parsed and lexranked.", "Documents will be parsed and lexranked.", "Documents will be parsed and lexranked."), tokens = c("testing", "the", "system", "second", "sentence", "for", "you", "system", "testing", "the", "tidy", "documents", "df", "documents", "will", "be", "parsed", "and", "lexranked"), - lexrank = c(0.07143, 0.07143, 0.07143, 0.07143, NA, 0.07143, 0.07143, 0.07143, - 0.07143, 0.07143, NA, 0.07143, NA, 0.07143, 0.07143, 0.07143, - NA, 0.07143, NA), + lexrank = c(0.16667, NA, 0.16667, NA, NA, NA, NA, 0.16667, 0.16667, NA, NA, 0.16667, NA, 0.16667, NA, NA, NA, NA, NA), stringsAsFactors = FALSE) expect_equal(test_result, expected_result) diff --git a/tests/testthat/test-bind_lexrank_.R b/tests/testthat/test-bind_lexrank_.R index 044e887..524ec4b 100644 --- a/tests/testthat/test-bind_lexrank_.R +++ b/tests/testthat/test-bind_lexrank_.R @@ -54,7 +54,7 @@ test_that("test input checking", { expect_error(bind_lexrank_(df, "sents", "fake")) expect_error(bind_lexrank_(NULL, "sents", "doc_id")) expect_error(bind_lexrank_(df, "sents", "doc_id", level="fake")) - expect_warning(bind_lexrank_(df, "sents", "doc_id", level=c("sentences","tokens"))) + # expect_warning(bind_lexrank_(df, "sents", "doc_id", level=c("sentences","tokens"))) df <- data.frame(doc_id = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L), sent_id = c(1L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), @@ -70,16 +70,16 @@ test_that("test input checking", { expect_error(bind_lexrank_(df, "tokens", "doc_id", "fake", level="tokens")) expect_error(bind_lexrank_(df, "tokens", "doc_id", level="tokens")) - expect_warning(bind_lexrank_(df, "tokens", "doc_id", "sent_id", level=c("tokens","sentences"))) + # expect_warning(bind_lexrank_(df, "tokens", "doc_id", "sent_id", level=c("tokens","sentences"))) }) # test output val ------------------------------------------------------ test_that("output value", { df <- data.frame(doc_id = 1:3, - text = c("Testing the system. Second sentence for you.", - "System testing the tidy documents df.", - "Documents will be parsed and lexranked."), - stringsAsFactors = FALSE) %>% + text = c("Testing the system. Second sentence for you.", + "System testing the tidy documents df.", + "Documents will be parsed and lexranked."), + stringsAsFactors = FALSE) %>% unnest_sentences(sents, text) test_result <- bind_lexrank_(df, "sents", "doc_id", level="sentences") @@ -116,9 +116,7 @@ test_that("output value", { "Documents will be parsed and lexranked.", "Documents will be parsed and lexranked.", "Documents will be parsed and lexranked."), tokens = c("testing", "the", "system", "second", "sentence", "for", "you", "system", "testing", "the", "tidy", "documents", "df", "documents", "will", "be", "parsed", "and", "lexranked"), - lexrank = c(0.07143, 0.07143, 0.07143, 0.07143, NA, 0.07143, 0.07143, 0.07143, - 0.07143, 0.07143, NA, 0.07143, NA, 0.07143, 0.07143, 0.07143, - NA, 0.07143, NA), + lexrank = c(0.16667, NA, 0.16667, NA, NA, NA, NA, 0.16667, 0.16667, NA, NA, 0.16667, NA, 0.16667, NA, NA, NA, NA, NA), stringsAsFactors = FALSE) expect_equal(test_result, expected_result) diff --git a/tests/testthat/test-lexRankFromSimil.R b/tests/testthat/test-lexRankFromSimil.R index c04fd99..f8dd04a 100644 --- a/tests/testthat/test-lexRankFromSimil.R +++ b/tests/testthat/test-lexRankFromSimil.R @@ -49,8 +49,8 @@ test_that("object out value", { token = tokenDf$token, docId = tokenDf$docId) - testResult <- lexRankFromSimil(similDf$sent1, similDf$sent2, similDf$similVal) %>% - dplyr::mutate(value = round(value, 5)) + testResult <- lexRankFromSimil(similDf$sent1, similDf$sent2, similDf$similVal) + testResult$value = round(testResult$value, 5) expectedResult <- data.frame(sentenceId = c("1_1", "2_1", "3_1"), value = c(0.25676, 0.48649, 0.25676), @@ -58,8 +58,8 @@ test_that("object out value", { expect_identical(testResult, expectedResult) - testResult <- lexRankFromSimil(similDf$sent1, similDf$sent2, similDf$similVal, continuous = TRUE) %>% - dplyr::mutate(value = round(value, 5)) + testResult <- lexRankFromSimil(similDf$sent1, similDf$sent2, similDf$similVal, continuous = TRUE) + testResult$value = round(testResult$value, 5) expectedResult <- data.frame(sentenceId = c("1_1", "2_1", "3_1"), value = c(0.25676, 0.48649, 0.25676), @@ -67,8 +67,8 @@ test_that("object out value", { expect_identical(testResult, expectedResult) - testResult <- lexRankFromSimil(similDf$sent1, similDf$sent2, similDf$similVal, usePageRank = FALSE) %>% - dplyr::mutate(value = round(value, 5)) + testResult <- lexRankFromSimil(similDf$sent1, similDf$sent2, similDf$similVal, usePageRank = FALSE) + testResult$value = round(testResult$value, 5) expectedResult <- data.frame(sentenceId = c("2_1", "1_1", "3_1"), value = c(2, 1, 1), diff --git a/tests/testthat/test-sentenceSimil.R b/tests/testthat/test-sentenceSimil.R index 8e332b6..760f8a3 100644 --- a/tests/testthat/test-sentenceSimil.R +++ b/tests/testthat/test-sentenceSimil.R @@ -9,7 +9,8 @@ test_that("testing result str and class", { testResult <- sentenceSimil(sentenceId = tokenDf$sentenceId, token = tokenDf$token, - docId = tokenDf$docId) + docId = tokenDf$docId, + sentencesAsDocs = FALSE) expect_equal(class(testResult), "data.frame") expect_equal(names(testResult), c("sent1","sent2","similVal")) @@ -17,8 +18,20 @@ test_that("testing result str and class", { expect_true(is.character(testResult$sent1)) expect_true(is.character(testResult$sent2)) expect_true(is.numeric(testResult$similVal)) -}) + testResult <- sentenceSimil(sentenceId = tokenDf$sentenceId, + token = tokenDf$token, + docId = tokenDf$docId, + sentencesAsDocs = TRUE) + + expect_equal(class(testResult), "data.frame") + expect_equal(names(testResult), c("sent1","sent2","similVal")) + + expect_true(is.character(testResult$sent1)) + expect_true(is.character(testResult$sent2)) + expect_true(is.numeric(testResult$similVal)) +}) + test_that("bad input", { expect_error(sentenceSimil(sentenceId = c("1_1"), @@ -55,8 +68,22 @@ test_that("output value check", { testResult <- sentenceSimil(sentenceId = tokenDf$sentenceId, token = tokenDf$token, - docId = tokenDf$docId) %>% - dplyr::mutate(similVal = round(similVal, 5)) + docId = tokenDf$docId, + sentencesAsDocs = FALSE) + testResult$similVal = round(testResult$similVal, 5) + + expectedResult <- data.frame(sent1 = c("1_1", "1_1", "2_1"), + sent2 = c("2_1", "3_1", "3_1"), + similVal = c(0.48624, 0, 0.48624), + stringsAsFactors = FALSE) + + expect_equal(testResult, expectedResult) + + testResult <- sentenceSimil(sentenceId = tokenDf$sentenceId, + token = tokenDf$token, + docId = tokenDf$docId, + sentencesAsDocs = TRUE) + testResult$similVal = round(testResult$similVal, 5) expectedResult <- data.frame(sent1 = c("1_1", "1_1", "2_1"), sent2 = c("2_1", "3_1", "3_1"), diff --git a/tests/testthat/test-sentenceTokenParse.R b/tests/testthat/test-sentenceTokenParse.R index 29e7fec..a0317f8 100644 --- a/tests/testthat/test-sentenceTokenParse.R +++ b/tests/testthat/test-sentenceTokenParse.R @@ -31,7 +31,7 @@ test_that("All clean options TRUE", { expectedResultTokens <- lexRankr::tokenize(testDocs) %>% unlist() %>% .[which(!is.na(.))] - + expect_equal(testResult$sentences, expectedResultSentences) expect_equal(testResult$tokens$token, expectedResultTokens) diff --git a/tests/testthat/test-unnest_sentences.R b/tests/testthat/test-unnest_sentences.R index ebc65c4..82ec421 100644 --- a/tests/testthat/test-unnest_sentences.R +++ b/tests/testthat/test-unnest_sentences.R @@ -31,6 +31,7 @@ test_that("test input checking", { expect_error(unnest_sentences(df, out, fake)) expect_error(unnest_sentences(NULL, out, text)) expect_error(unnest_sentences(df, out, text, drop = NULL)) + expect_error(unnest_sentences(df, out, text, doc_id = fake)) }) # test output val ------------------------------------------------------ @@ -51,5 +52,22 @@ test_that("output value", { stringsAsFactors = FALSE) expect_equal(test_result, expected_result) + + df <- data.frame(doc_id = c(1,1,3), + text = c("Testing the system. Second sentence for you.", + "System testing the tidy documents df.", + "Documents will be parsed and lexranked."), + stringsAsFactors = FALSE) + + test_result <- unnest_sentences(df, out, text, doc_id = doc_id) + expected_result <- data.frame(doc_id = c(1L, 1L, 1L, 3L), + sent_id = c(1L, 2L, 3L, 1L), + out = c("Testing the system.", + "Second sentence for you.", + "System testing the tidy documents df.", + "Documents will be parsed and lexranked."), + stringsAsFactors = FALSE) + + expect_equal(test_result, expected_result) }) diff --git a/tests/testthat/test-unnest_sentences_.R b/tests/testthat/test-unnest_sentences_.R index 825d7ad..18b9824 100644 --- a/tests/testthat/test-unnest_sentences_.R +++ b/tests/testthat/test-unnest_sentences_.R @@ -31,7 +31,8 @@ test_that("test input checking", { expect_error(unnest_sentences_(df, "out", "fake")) expect_error(unnest_sentences_(NULL, "out", "text")) expect_error(unnest_sentences_(df, "out", "text", drop = NULL)) - expect_warning(unnest_sentences_(df, "out", "text", c("test","test2"))) + expect_error(unnest_sentences(df, "out", "text", doc_id = "fake")) + expect_warning(unnest_sentences_(df, "out", "text", output_id=c("test","test2"))) }) # test output val ------------------------------------------------------ @@ -52,5 +53,22 @@ test_that("output value", { stringsAsFactors = FALSE) expect_equal(test_result, expected_result) + + df <- data.frame(doc_id = c(1,1,3), + text = c("Testing the system. Second sentence for you.", + "System testing the tidy documents df.", + "Documents will be parsed and lexranked."), + stringsAsFactors = FALSE) + + test_result <- unnest_sentences_(df, "out", "text", doc_id = "doc_id") + expected_result <- data.frame(doc_id = c(1L, 1L, 1L, 3L), + sent_id = c(1L, 2L, 3L, 1L), + out = c("Testing the system.", + "Second sentence for you.", + "System testing the tidy documents df.", + "Documents will be parsed and lexranked."), + stringsAsFactors = FALSE) + + expect_equal(test_result, expected_result) })