From 2495caa040a94b502f666a393052d372bbc1d9f9 Mon Sep 17 00:00:00 2001 From: Giorgio Comai Date: Sat, 23 Sep 2023 15:52:59 +0200 Subject: [PATCH] default to partition by year and introduce cas_delete_corpus for routine updates --- .Rbuildignore | 3 ++ DESCRIPTION | 2 +- NAMESPACE | 1 + R/cas_delete_corpus.R | 76 ++++++++++++++++++++++++++++++++++++++++ R/cas_write_corpus.R | 4 +-- man/cas_delete_corpus.Rd | 43 +++++++++++++++++++++++ man/cas_write_corpus.Rd | 2 +- 7 files changed, 127 insertions(+), 4 deletions(-) create mode 100644 R/cas_delete_corpus.R create mode 100644 man/cas_delete_corpus.Rd diff --git a/.Rbuildignore b/.Rbuildignore index f354390..b83029d 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -1,3 +1,5 @@ +^renv$ +^renv\.lock$ ^castarter2\.Rproj$ ^\.Rproj\.user$ ^README\.Rmd$ @@ -10,3 +12,4 @@ ^pkgdown$ ^doc$ ^Meta$ +^deploy$ diff --git a/DESCRIPTION b/DESCRIPTION index 455bf32..70b0204 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: castarter Title: Content Analysis Starter Toolkit -Version: 0.0.2.9031 +Version: 0.0.2.9032 Authors@R: c(person(given = "Giorgio", family = "Comai", diff --git a/NAMESPACE b/NAMESPACE index a0ead6d..00384f5 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -20,6 +20,7 @@ export(cas_count) export(cas_count_relative) export(cas_count_total_words) export(cas_create_db_folder) +export(cas_delete_corpus) export(cas_delete_from_db) export(cas_disable_db) export(cas_disconnect_from_db) diff --git a/R/cas_delete_corpus.R b/R/cas_delete_corpus.R new file mode 100644 index 0000000..68b1f89 --- /dev/null +++ b/R/cas_delete_corpus.R @@ -0,0 +1,76 @@ +#' Delete previously stored corpora written with `cas_write_corpus()`. +#' +#' Typically used for file maintenance, especially when datasets are routinely updated. +#' +#' @param keep Numeric, defaults to 1. Number of corpus files to keep. Only the most recent files are kept. 
+#' @param ask Logical, defaults to `TRUE`. If `TRUE`, the user is asked to confirm before anything is deleted; if `FALSE`, files are deleted without confirmation.
+#' @inheritParams cas_write_corpus
+#'
+#' @return `NULL`, invisibly. Called for its side effect of deleting corpus files.
+#' @export
+#'
+#' @examples
+#' \dontrun{
+#' cas_delete_corpus(keep = 2, ask = FALSE)
+#' }
+cas_delete_corpus <- function(keep = 1,
+                              ask = TRUE,
+                              file_format = "parquet",
+                              partition = "year",
+                              token = "full_text",
+                              corpus_folder = "corpus",
+                              path = NULL,
+                              ...) {
+  if (!is.numeric(keep) || length(keep) != 1 || is.na(keep) || keep < 0) {
+    cli::cli_abort("{.arg keep} must be a single non-negative number.")
+  }
+
+  # NOTE(review): the result is unused below; call kept in case cas_get_options() validates `...` - confirm, then drop.
+  cas_options_l <- cas_get_options(...)
+
+  if (is.null(path)) {
+    path <- cas_get_corpus_path(
+      corpus_folder = corpus_folder,
+      file_format = file_format,
+      partition = partition,
+      token = token,
+      ...
+    ) %>%
+      fs::path_dir()
+  }
+
+  if (!fs::file_exists(path)) {
+    cli::cli_warn("The folder {.path {path}} does not exist. No corpus to remove.")
+    return(invisible(NULL))
+  }
+
+  corpus_path_df <- tibble::tibble(path = fs::dir_ls(path))
+  # Clamp at zero: a negative `n` makes slice_head() drop rows from the end, which would delete corpora when `keep` exceeds those stored.
+  n_to_remove <- max(0, nrow(corpus_path_df) - keep)
+  paths_to_remove_l <- dplyr::slice_head(corpus_path_df, n = n_to_remove)$path
+  paths_to_keep_l <- dplyr::slice_tail(corpus_path_df, n = keep)$path
+
+  if (length(paths_to_remove_l) == 0) {
+    cli::cli_inform(c(`i` = "No corpus files to be removed."))
+    return(invisible(NULL))
+  }
+
+  names(paths_to_remove_l) <- rep("x", length(paths_to_remove_l))
+  cli::cli_inform(c(`!` = "{length(paths_to_remove_l)} corpus file{?s} about to be removed:"))
+  cli::cli_bullets(paths_to_remove_l)
+
+  if (length(paths_to_keep_l) > 0) {
+    names(paths_to_keep_l) <- rep("*", length(paths_to_keep_l))
+    cli::cli_inform(c(`i` = "{length(paths_to_keep_l)} corpus file{?s} will be kept:"))
+    cli::cli_bullets(paths_to_keep_l)
+  }
+
+  if (ask) {
+    cli::cli_inform("A total of {length(paths_to_remove_l)} corpus file{?s} will be removed and {length(paths_to_keep_l)} will be kept.")
+  }
+  if (!ask || usethis::ui_yeah(x = "Proceed?")) {
+    fs::dir_delete(path = paths_to_remove_l)
+    cli::cli_alert_success(text = "Done.")
+  }
+  invisible(NULL)
+}
diff --git a/R/cas_write_corpus.R b/R/cas_write_corpus.R index b36c93e..f7fbd2a 100644 --- a/R/cas_write_corpus.R +++ b/R/cas_write_corpus.R @@ -46,7 +46,7 @@ cas_write_corpus <- function(corpus = NULL, text = text, tif_compliant = TRUE, file_format = "parquet", - partition = NULL, + partition = "year", token = "full_text", corpus_folder = "corpus", path = NULL, @@ -81,7 +81,7 @@ cas_write_corpus <- function(corpus = NULL, } if (fs::file_exists(path)) { - cli::cli_abort("The folder {.path {path}} already exists. Please remove or rename it before writing corpus") + cli::cli_abort("The folder {.path {path}} already exists. Please remove or rename it before writing corpus.") } fs::dir_create(path = path) diff --git a/man/cas_delete_corpus.Rd b/man/cas_delete_corpus.Rd new file mode 100644 index 0000000..db3e24f --- /dev/null +++ b/man/cas_delete_corpus.Rd @@ -0,0 +1,43 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/cas_delete_corpus.R +\name{cas_delete_corpus} +\alias{cas_delete_corpus} +\title{Delete previously stored corpora written with \code{cas_write_corpus()}.} +\usage{ +cas_delete_corpus( + keep = 1, + ask = TRUE, + file_format = "parquet", + partition = "year", + token = "full_text", + corpus_folder = "corpus", + path = NULL, + ... +) +} +\arguments{ +\item{keep}{Numeric, defaults to 1. Number of corpus files to keep. Only the most recent files are kept.} + +\item{file_format}{Defaults to "parquet". Currently, other options are not +implemented.} + +\item{partition}{Defaults to NULL. If NULL, the parquet file is not +partitioned. "year" is a common alternative: if set to "year", the parquet +file is partitioned by year. If a \code{year} column does not exist, it is +created based on the assumption that a \code{date} column exists and it is (or +can be coerced to) a vector of class \code{Date}.} + +\item{token}{Defaults to "full_text", which does not tokenise the text +column. 
If different from \code{full_text}, it is passed to +\code{tidytext::unnest_tokens} (see its help for details). Accepted values +include "words", "sentences", and "paragraphs". See +\code{?tidytext::unnest_tokens()} for details.} + +\item{path}{Defaults to NULL. If NULL, path is set to the +project/website/export/dataset/file_format folder.} + +\item{...}{Passed to \code{cas_get_db_file()}.} +} +\description{ +Typically used for file maintenance, especially when datasets are routinely updated. +} diff --git a/man/cas_write_corpus.Rd b/man/cas_write_corpus.Rd index a693f7a..3f3073c 100644 --- a/man/cas_write_corpus.Rd +++ b/man/cas_write_corpus.Rd @@ -13,7 +13,7 @@ cas_write_corpus( text = text, tif_compliant = TRUE, file_format = "parquet", - partition = NULL, + partition = "year", token = "full_text", corpus_folder = "corpus", path = NULL,