Skip to content

Commit

Permalink
default to partition by year and introduce cas_delete_corpus for rout…
Browse files Browse the repository at this point in the history
…ine updates
  • Loading branch information
giocomai committed Sep 23, 2023
1 parent cff86eb commit 2495caa
Show file tree
Hide file tree
Showing 7 changed files with 127 additions and 4 deletions.
3 changes: 3 additions & 0 deletions .Rbuildignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
^renv$
^renv\.lock$
^castarter2\.Rproj$
^\.Rproj\.user$
^README\.Rmd$
Expand All @@ -10,3 +12,4 @@
^pkgdown$
^doc$
^Meta$
^deploy$
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
Package: castarter
Title: Content Analysis Starter Toolkit
Version: 0.0.2.9031
Version: 0.0.2.9032
Authors@R:
c(person(given = "Giorgio",
family = "Comai",
Expand Down
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ export(cas_count)
export(cas_count_relative)
export(cas_count_total_words)
export(cas_create_db_folder)
export(cas_delete_corpus)
export(cas_delete_from_db)
export(cas_disable_db)
export(cas_disconnect_from_db)
Expand Down
76 changes: 76 additions & 0 deletions R/cas_delete_corpus.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
#' Delete previously stored corpora written with `cas_write_corpus()`.
#'
#' Typically used for file maintenance, especially when datasets are routinely
#' updated.
#'
#' @param keep Numeric, defaults to 1. Number of corpus files to keep. Only the
#'   most recent files are kept. Must be a single non-negative number; use 0 to
#'   remove all stored corpora.
#' @param ask Logical, defaults to TRUE. If TRUE, the user is asked to confirm
#'   before anything is deleted. If FALSE, deletion proceeds without asking.
#' @inheritParams cas_write_corpus
#'
#' @return Invisibly returns `NULL`. Called for its side effect of deleting
#'   corpus files from disk.
#' @export
#'
#' @examples
#' \dontrun{
#' # keep the two most recent corpora and delete the rest without asking
#' cas_delete_corpus(keep = 2, ask = FALSE)
#' }
cas_delete_corpus <- function(keep = 1,
                              ask = TRUE,
                              file_format = "parquet",
                              partition = "year",
                              token = "full_text",
                              corpus_folder = "corpus",
                              path = NULL,
                              ...) {
  if (!is.numeric(keep) || length(keep) != 1 || is.na(keep) || keep < 0) {
    cli::cli_abort("{.arg keep} must be a single non-negative number.")
  }

  # NOTE(review): the result is not used below; the call is retained from the
  # original in case it validates the options passed via `...` - confirm and
  # drop if it has no such effect.
  cas_options_l <- cas_get_options(...)

  if (is.null(path)) {
    # Corpora are looked up in the parent folder of the path where a new
    # corpus with these settings would be written.
    path <- cas_get_corpus_path(
      corpus_folder = corpus_folder,
      file_format = file_format,
      partition = partition,
      token = token,
      ...
    ) %>%
      fs::path_dir()
  }

  if (!fs::file_exists(path)) {
    cli::cli_warn("The folder {.path {path}} does not exists. No corpus to remove.")
    return(invisible(NULL))
  }

  # The last `keep` entries in listing order are kept, everything before them
  # is removed. Presumably corpus names sort chronologically, so that the last
  # entries are the most recent - TODO confirm against `cas_get_corpus_path()`.
  corpus_paths <- fs::dir_ls(path)
  n_total <- length(corpus_paths)
  n_remove <- n_total - min(keep, n_total)
  is_removed <- seq_len(n_total) <= n_remove

  # Both vectors are always defined (possibly empty), so the summary message
  # below also works when nothing is kept (`keep = 0`).
  paths_to_remove_l <- corpus_paths[is_removed]
  paths_to_keep_l <- corpus_paths[!is_removed]

  if (length(paths_to_remove_l) == 0) {
    cli::cli_inform(c(`i` = "No corpus files to be removed."))
    return(invisible(NULL))
  }

  # Names are used by `cli::cli_bullets()` to pick the bullet symbol.
  names(paths_to_remove_l) <- rep("x", length(paths_to_remove_l))
  cli::cli_inform(
    c(`!` = "{length(paths_to_remove_l)} corpus file{?s} about to be removed:")
  )
  cli::cli_bullets(paths_to_remove_l)

  if (length(paths_to_keep_l) > 0) {
    names(paths_to_keep_l) <- rep("*", length(paths_to_keep_l))
    cli::cli_inform(
      c(`i` = "{length(paths_to_keep_l)} corpus file{?s} will be kept:")
    )
    cli::cli_bullets(paths_to_keep_l)
  }

  if (isFALSE(ask)) {
    confirmed <- TRUE
  } else {
    cli::cli_inform("A total of {length(paths_to_remove_l)} corpus file{?s} will be removed and {length(paths_to_keep_l)} will be kept.")
    confirmed <- usethis::ui_yeah(x = "Proceed?")
  }

  if (isTRUE(confirmed)) {
    fs::dir_delete(path = paths_to_remove_l)
    cli::cli_alert_success(text = "Done.")
  }

  invisible(NULL)
}
4 changes: 2 additions & 2 deletions R/cas_write_corpus.R
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ cas_write_corpus <- function(corpus = NULL,
text = text,
tif_compliant = TRUE,
file_format = "parquet",
partition = NULL,
partition = "year",
token = "full_text",
corpus_folder = "corpus",
path = NULL,
Expand Down Expand Up @@ -81,7 +81,7 @@ cas_write_corpus <- function(corpus = NULL,
}

if (fs::file_exists(path)) {
cli::cli_abort("The folder {.path {path}} already exists. Please remove or rename it before writing corpus")
cli::cli_abort("The folder {.path {path}} already exists. Please remove or rename it before writing corpus.")
}

fs::dir_create(path = path)
Expand Down
43 changes: 43 additions & 0 deletions man/cas_delete_corpus.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion man/cas_write_corpus.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit 2495caa

Please sign in to comment.