Skip to content

Commit

Permalink
fixed #12; ie give option to bypass assumption that each row/veceleme…
Browse files Browse the repository at this point in the history
…nt are different documents
  • Loading branch information
Spannbauer, Adam M committed Dec 11, 2017
1 parent 8e43ecf commit fcc9aa7
Show file tree
Hide file tree
Showing 13 changed files with 136 additions and 43 deletions.
2 changes: 1 addition & 1 deletion R/bind_lexrank.R
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ bind_lexrank_ <- function(tbl, text, doc_id, sent_id=NULL, level=c("sentences",
if(!(doc_id %in% names(tbl))) stop("doc_id column not found in tbl")
if(!is.character(level)) stop("level must be character")
if(length(level) > 1) {
warning("only first element of level will be used")
# warning("only first element of level will be used")
level = level[1]
}
if(!(level %in% c("sentences", "tokens"))) stop("invalid value of level; accepted values for level are 'sentences' and 'tokens'")
Expand Down
23 changes: 15 additions & 8 deletions R/sentenceParse.R
Original file line number Diff line number Diff line change
Expand Up @@ -16,22 +16,29 @@ sentenceParse <- function(text, docId = "create") {
if(length(text) < 1) stop("text must be at least length 1")
docId <- as.character(docId)
if(length(docId)==1 & docId[1]=="create") {
createDocIds <- TRUE
} else if(length(docId)==length(text)) {
createDocIds <- FALSE
} else if(length(docId)!=length(text)) stop("docId vector must be same length as text vector")


createDocIds <- TRUE
} else if(length(docId)==length(text)) {
createDocIds <- FALSE
} else if(length(docId)!=length(text)) stop("docId vector must be same length as text vector")
sentences <- sentence_parser(text)
sentenceDfList <- lapply(seq_along(sentences), function(i) {
sentVec <- trimws(sentences[[i]])
if(createDocIds) {
data.frame(docId=i, sentenceId=paste0(i,"_",seq_along(sentVec)), sentence=sentVec, stringsAsFactors = FALSE)
} else if(!createDocIds) {
data.frame(docId=docId[i], sentenceId=paste0(docId[i],"_",seq_along(sentVec)), sentence=sentVec, stringsAsFactors = FALSE)
data.frame(docId=docId[i], sentence=sentVec, stringsAsFactors = FALSE)
}
})
sentenceDf <- dplyr::bind_rows(sentenceDfList)
sentenceDf <- do.call('rbind', sentenceDfList)
sentenceDfList <- split(sentenceDf, sentenceDf$docId)
sentenceDfList <- lapply(sentenceDfList, function(dfi) {
dfi$sentenceId <- paste0(dfi$docId, "_", 1:nrow(dfi))
dfi[,c("docId","sentenceId","sentence")]
})
sentenceDf <- do.call('rbind', sentenceDfList)
class(sentenceDf) <- "data.frame"
rownames(sentenceDf) <- NULL
return(sentenceDf)
}
2 changes: 1 addition & 1 deletion R/sentenceSimil.R
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ sentenceSimil <- function(sentenceId, token, docId=NULL, sentencesAsDocs=FALSE){
dplyr::summarise(tf = n()) %>%
dplyr::ungroup() %>%
dplyr::group_by(token) %>%
dplyr::mutate(idf = 1+log(ndoc/n())) %>%
dplyr::mutate(idf = 1+log(ndoc/length(unique(docId)))) %>%
dplyr::mutate(tfidf = tf*idf) %>%
dplyr::ungroup()

Expand Down
7 changes: 6 additions & 1 deletion R/tokenize.R
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,12 @@ tokenize <- function(text, removePunc=TRUE, removeNum=TRUE, toLower=TRUE, stemWo
if(length(nonStopTok) == 0) NA_character_ else nonStopTok
})
if(stemWords) {
text <- lapply(text, SnowballC::wordStem)
text <- lapply(text, function(w) {
w_na = which(is.na(w))
out = SnowballC::wordStem(w)
out[w_na] = NA
out
})
}

tokenList <- lapply(text, function(tokens) {
Expand Down
26 changes: 22 additions & 4 deletions R/unnest_sentences.R
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
#' @param tbl dataframe containing column of text to be split into sentences
#' @param output name of column to be created to store parsed sentences
#' @param input name of input column of text to be parsed into sentences
#' @param doc_id column of document ids; if not provided it will be assumed that each row is a different document
#' @param output_id name of column to be created to store sentence ids
#' @param drop whether original input column should get dropped
#' @return A data.frame of parsed sentences and sentence ids
Expand All @@ -22,7 +23,7 @@
#' unnest_sentences(sents, text)

#' @export
unnest_sentences_ <- function(tbl, output, input, output_id="sent_id", drop=TRUE) {
unnest_sentences_ <- function(tbl, output, input, doc_id=NULL, output_id="sent_id", drop=TRUE) {
if(!is.data.frame(tbl)) stop("tbl must be a dataframe")
if(!(input %in% names(tbl))) stop("input column not found in tbl")
if(!is.character(tbl[[input]])) stop("input column must be character")
Expand All @@ -31,6 +32,9 @@ unnest_sentences_ <- function(tbl, output, input, output_id="sent_id", drop=TRUE
output_id <- output_id[1]
}
if(!is.logical(drop)) stop("drop must be logical")
if(!is.null(doc_id)) {
if(!(doc_id %in% names(tbl))) stop("doc_id column not found in tbl")
}

text <- tbl[[input]]
parsed_sents <- sentence_parser(text)
Expand All @@ -43,15 +47,29 @@ unnest_sentences_ <- function(tbl, output, input, output_id="sent_id", drop=TRUE
tbl[[output_id]] <- sent_ids
tbl[[output]] <- parsed_sents

tidyr::unnest(tbl)
out = tidyr::unnest(tbl)
if(!is.null(doc_id)) {
out_tbl_list = split(out, out[[doc_id]])
out_tbl_list = lapply(out_tbl_list, function(dfi) {
dfi[[output_id]] = seq_along(dfi[[output_id]])
dfi
})
out = do.call('rbind', out_tbl_list)
}
rownames(out) = NULL
return(out)
}

#' @rdname unnest_sentences_
#' @export
unnest_sentences <- function(tbl, output, input, output_id='sent_id', drop=TRUE) {
unnest_sentences <- function(tbl, output, input, doc_id=NULL, output_id='sent_id', drop=TRUE) {
output_str <- as.character(substitute(output))
input_str <- as.character(substitute(input))
out_id_str <- as.character(substitute(output_id))
doc_id <- as.character(substitute(doc_id))
if (length(doc_id) == 0) doc_id = NULL

unnest_sentences_(tbl, output_str, input_str, out_id_str, drop)
unnest_sentences_(tbl=tbl, output = output_str,
input = input_str, doc_id = doc_id,
output_id = out_id_str, drop = drop)
}
8 changes: 6 additions & 2 deletions man/unnest_sentences_.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

8 changes: 3 additions & 5 deletions tests/testthat/test-bind_lexrank.R
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ test_that("test input checking", {
expect_error(bind_lexrank(df, sents, fake))
expect_error(bind_lexrank(NULL, sents, doc_id))
expect_error(bind_lexrank(df, sents, doc_id, level="fake"))
expect_warning(bind_lexrank(df, sents, doc_id, level=c("sentences","tokens")))
# expect_warning(bind_lexrank(df, sents, doc_id, level=c("sentences","tokens")))

df <- data.frame(doc_id = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L),
sent_id = c(1L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L),
Expand All @@ -70,7 +70,7 @@ test_that("test input checking", {

expect_error(bind_lexrank(df, tokens, doc_id, fake, level="tokens"))
expect_error(bind_lexrank(df, tokens, doc_id, level="tokens"))
expect_warning(bind_lexrank(df, tokens, doc_id, sent_id, level=c("tokens","sentences")))
# expect_warning(bind_lexrank(df, tokens, doc_id, sent_id, level=c("tokens","sentences")))
})

# test output val ------------------------------------------------------
Expand Down Expand Up @@ -116,9 +116,7 @@ test_that("output value", {
"Documents will be parsed and lexranked.", "Documents will be parsed and lexranked.", "Documents will be parsed and lexranked."),
tokens = c("testing", "the", "system", "second", "sentence", "for", "you", "system", "testing", "the",
"tidy", "documents", "df", "documents", "will", "be", "parsed", "and", "lexranked"),
lexrank = c(0.07143, 0.07143, 0.07143, 0.07143, NA, 0.07143, 0.07143, 0.07143,
0.07143, 0.07143, NA, 0.07143, NA, 0.07143, 0.07143, 0.07143,
NA, 0.07143, NA),
lexrank = c(0.16667, NA, 0.16667, NA, NA, NA, NA, 0.16667, 0.16667, NA, NA, 0.16667, NA, 0.16667, NA, NA, NA, NA, NA),
stringsAsFactors = FALSE)

expect_equal(test_result, expected_result)
Expand Down
16 changes: 7 additions & 9 deletions tests/testthat/test-bind_lexrank_.R
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ test_that("test input checking", {
expect_error(bind_lexrank_(df, "sents", "fake"))
expect_error(bind_lexrank_(NULL, "sents", "doc_id"))
expect_error(bind_lexrank_(df, "sents", "doc_id", level="fake"))
expect_warning(bind_lexrank_(df, "sents", "doc_id", level=c("sentences","tokens")))
# expect_warning(bind_lexrank_(df, "sents", "doc_id", level=c("sentences","tokens")))

df <- data.frame(doc_id = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L),
sent_id = c(1L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L),
Expand All @@ -70,16 +70,16 @@ test_that("test input checking", {

expect_error(bind_lexrank_(df, "tokens", "doc_id", "fake", level="tokens"))
expect_error(bind_lexrank_(df, "tokens", "doc_id", level="tokens"))
expect_warning(bind_lexrank_(df, "tokens", "doc_id", "sent_id", level=c("tokens","sentences")))
# expect_warning(bind_lexrank_(df, "tokens", "doc_id", "sent_id", level=c("tokens","sentences")))
})

# test output val ------------------------------------------------------
test_that("output value", {
df <- data.frame(doc_id = 1:3,
text = c("Testing the system. Second sentence for you.",
"System testing the tidy documents df.",
"Documents will be parsed and lexranked."),
stringsAsFactors = FALSE) %>%
text = c("Testing the system. Second sentence for you.",
"System testing the tidy documents df.",
"Documents will be parsed and lexranked."),
stringsAsFactors = FALSE) %>%
unnest_sentences(sents, text)

test_result <- bind_lexrank_(df, "sents", "doc_id", level="sentences")
Expand Down Expand Up @@ -116,9 +116,7 @@ test_that("output value", {
"Documents will be parsed and lexranked.", "Documents will be parsed and lexranked.", "Documents will be parsed and lexranked."),
tokens = c("testing", "the", "system", "second", "sentence", "for", "you", "system", "testing", "the",
"tidy", "documents", "df", "documents", "will", "be", "parsed", "and", "lexranked"),
lexrank = c(0.07143, 0.07143, 0.07143, 0.07143, NA, 0.07143, 0.07143, 0.07143,
0.07143, 0.07143, NA, 0.07143, NA, 0.07143, 0.07143, 0.07143,
NA, 0.07143, NA),
lexrank = c(0.16667, NA, 0.16667, NA, NA, NA, NA, 0.16667, 0.16667, NA, NA, 0.16667, NA, 0.16667, NA, NA, NA, NA, NA),
stringsAsFactors = FALSE)

expect_equal(test_result, expected_result)
Expand Down
12 changes: 6 additions & 6 deletions tests/testthat/test-lexRankFromSimil.R
Original file line number Diff line number Diff line change
Expand Up @@ -49,26 +49,26 @@ test_that("object out value", {
token = tokenDf$token,
docId = tokenDf$docId)

testResult <- lexRankFromSimil(similDf$sent1, similDf$sent2, similDf$similVal) %>%
dplyr::mutate(value = round(value, 5))
testResult <- lexRankFromSimil(similDf$sent1, similDf$sent2, similDf$similVal)
testResult$value = round(testResult$value, 5)

expectedResult <- data.frame(sentenceId = c("1_1", "2_1", "3_1"),
value = c(0.25676, 0.48649, 0.25676),
stringsAsFactors = FALSE)

expect_identical(testResult, expectedResult)

testResult <- lexRankFromSimil(similDf$sent1, similDf$sent2, similDf$similVal, continuous = TRUE) %>%
dplyr::mutate(value = round(value, 5))
testResult <- lexRankFromSimil(similDf$sent1, similDf$sent2, similDf$similVal, continuous = TRUE)
testResult$value = round(testResult$value, 5)

expectedResult <- data.frame(sentenceId = c("1_1", "2_1", "3_1"),
value = c(0.25676, 0.48649, 0.25676),
stringsAsFactors = FALSE)

expect_identical(testResult, expectedResult)

testResult <- lexRankFromSimil(similDf$sent1, similDf$sent2, similDf$similVal, usePageRank = FALSE) %>%
dplyr::mutate(value = round(value, 5))
testResult <- lexRankFromSimil(similDf$sent1, similDf$sent2, similDf$similVal, usePageRank = FALSE)
testResult$value = round(testResult$value, 5)

expectedResult <- data.frame(sentenceId = c("2_1", "1_1", "3_1"),
value = c(2, 1, 1),
Expand Down
35 changes: 31 additions & 4 deletions tests/testthat/test-sentenceSimil.R
Original file line number Diff line number Diff line change
Expand Up @@ -9,16 +9,29 @@ test_that("testing result str and class", {

testResult <- sentenceSimil(sentenceId = tokenDf$sentenceId,
token = tokenDf$token,
docId = tokenDf$docId)
docId = tokenDf$docId,
sentencesAsDocs = FALSE)

expect_equal(class(testResult), "data.frame")
expect_equal(names(testResult), c("sent1","sent2","similVal"))

expect_true(is.character(testResult$sent1))
expect_true(is.character(testResult$sent2))
expect_true(is.numeric(testResult$similVal))
})

testResult <- sentenceSimil(sentenceId = tokenDf$sentenceId,
token = tokenDf$token,
docId = tokenDf$docId,
sentencesAsDocs = TRUE)

expect_equal(class(testResult), "data.frame")
expect_equal(names(testResult), c("sent1","sent2","similVal"))

expect_true(is.character(testResult$sent1))
expect_true(is.character(testResult$sent2))
expect_true(is.numeric(testResult$similVal))
})


test_that("bad input", {
expect_error(sentenceSimil(sentenceId = c("1_1"),
Expand Down Expand Up @@ -55,8 +68,22 @@ test_that("output value check", {

testResult <- sentenceSimil(sentenceId = tokenDf$sentenceId,
token = tokenDf$token,
docId = tokenDf$docId) %>%
dplyr::mutate(similVal = round(similVal, 5))
docId = tokenDf$docId,
sentencesAsDocs = FALSE)
testResult$similVal = round(testResult$similVal, 5)

expectedResult <- data.frame(sent1 = c("1_1", "1_1", "2_1"),
sent2 = c("2_1", "3_1", "3_1"),
similVal = c(0.48624, 0, 0.48624),
stringsAsFactors = FALSE)

expect_equal(testResult, expectedResult)

testResult <- sentenceSimil(sentenceId = tokenDf$sentenceId,
token = tokenDf$token,
docId = tokenDf$docId,
sentencesAsDocs = TRUE)
testResult$similVal = round(testResult$similVal, 5)

expectedResult <- data.frame(sent1 = c("1_1", "1_1", "2_1"),
sent2 = c("2_1", "3_1", "3_1"),
Expand Down
2 changes: 1 addition & 1 deletion tests/testthat/test-sentenceTokenParse.R
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ test_that("All clean options TRUE", {
expectedResultTokens <- lexRankr::tokenize(testDocs) %>%
unlist() %>%
.[which(!is.na(.))]

expect_equal(testResult$sentences, expectedResultSentences)
expect_equal(testResult$tokens$token, expectedResultTokens)

Expand Down
18 changes: 18 additions & 0 deletions tests/testthat/test-unnest_sentences.R
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ test_that("test input checking", {
expect_error(unnest_sentences(df, out, fake))
expect_error(unnest_sentences(NULL, out, text))
expect_error(unnest_sentences(df, out, text, drop = NULL))
expect_error(unnest_sentences(df, out, text, doc_id = fake))
})

# test output val ------------------------------------------------------
Expand All @@ -51,5 +52,22 @@ test_that("output value", {
stringsAsFactors = FALSE)

expect_equal(test_result, expected_result)

df <- data.frame(doc_id = c(1,1,3),
text = c("Testing the system. Second sentence for you.",
"System testing the tidy documents df.",
"Documents will be parsed and lexranked."),
stringsAsFactors = FALSE)

test_result <- unnest_sentences(df, out, text, doc_id = doc_id)
expected_result <- data.frame(doc_id = c(1L, 1L, 1L, 3L),
sent_id = c(1L, 2L, 3L, 1L),
out = c("Testing the system.",
"Second sentence for you.",
"System testing the tidy documents df.",
"Documents will be parsed and lexranked."),
stringsAsFactors = FALSE)

expect_equal(test_result, expected_result)
})

20 changes: 19 additions & 1 deletion tests/testthat/test-unnest_sentences_.R
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,8 @@ test_that("test input checking", {
expect_error(unnest_sentences_(df, "out", "fake"))
expect_error(unnest_sentences_(NULL, "out", "text"))
expect_error(unnest_sentences_(df, "out", "text", drop = NULL))
expect_warning(unnest_sentences_(df, "out", "text", c("test","test2")))
expect_error(unnest_sentences(df, "out", "text", doc_id = "fake"))
expect_warning(unnest_sentences_(df, "out", "text", output_id=c("test","test2")))
})

# test output val ------------------------------------------------------
Expand All @@ -52,5 +53,22 @@ test_that("output value", {
stringsAsFactors = FALSE)

expect_equal(test_result, expected_result)

df <- data.frame(doc_id = c(1,1,3),
text = c("Testing the system. Second sentence for you.",
"System testing the tidy documents df.",
"Documents will be parsed and lexranked."),
stringsAsFactors = FALSE)

test_result <- unnest_sentences_(df, "out", "text", doc_id = "doc_id")
expected_result <- data.frame(doc_id = c(1L, 1L, 1L, 3L),
sent_id = c(1L, 2L, 3L, 1L),
out = c("Testing the system.",
"Second sentence for you.",
"System testing the tidy documents df.",
"Documents will be parsed and lexranked."),
stringsAsFactors = FALSE)

expect_equal(test_result, expected_result)
})

0 comments on commit fcc9aa7

Please sign in to comment.