From 2495caa040a94b502f666a393052d372bbc1d9f9 Mon Sep 17 00:00:00 2001
From: Giorgio Comai <g@giorgiocomai.eu>
Date: Sat, 23 Sep 2023 15:52:59 +0200
Subject: [PATCH] default to partition by year and introduce cas_delete_corpus
 for routine updates

---
 .Rbuildignore            |  3 ++
 DESCRIPTION              |  2 +-
 NAMESPACE                |  1 +
 R/cas_delete_corpus.R    | 76 ++++++++++++++++++++++++++++++++++++++++
 R/cas_write_corpus.R     |  4 +--
 man/cas_delete_corpus.Rd | 43 +++++++++++++++++++++++
 man/cas_write_corpus.Rd  |  2 +-
 7 files changed, 127 insertions(+), 4 deletions(-)
 create mode 100644 R/cas_delete_corpus.R
 create mode 100644 man/cas_delete_corpus.Rd

diff --git a/.Rbuildignore b/.Rbuildignore
index f354390..b83029d 100644
--- a/.Rbuildignore
+++ b/.Rbuildignore
@@ -1,3 +1,5 @@
+^renv$
+^renv\.lock$
 ^castarter2\.Rproj$
 ^\.Rproj\.user$
 ^README\.Rmd$
@@ -10,3 +12,4 @@
 ^pkgdown$
 ^doc$
 ^Meta$
+^deploy$
diff --git a/DESCRIPTION b/DESCRIPTION
index 455bf32..70b0204 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: castarter
 Title: Content Analysis Starter Toolkit
-Version: 0.0.2.9031
+Version: 0.0.2.9032
 Authors@R: 
     c(person(given = "Giorgio",
              family = "Comai",
diff --git a/NAMESPACE b/NAMESPACE
index a0ead6d..00384f5 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -20,6 +20,7 @@ export(cas_count)
 export(cas_count_relative)
 export(cas_count_total_words)
 export(cas_create_db_folder)
+export(cas_delete_corpus)
 export(cas_delete_from_db)
 export(cas_disable_db)
 export(cas_disconnect_from_db)
diff --git a/R/cas_delete_corpus.R b/R/cas_delete_corpus.R
new file mode 100644
index 0000000..68b1f89
--- /dev/null
+++ b/R/cas_delete_corpus.R
@@ -0,0 +1,76 @@
+#' Delete previously stored corpora written with `cas_write_corpus()`.
+#'
+#' Typically used for file maintenance, especially when datasets are routinely updated.
+#'
+#' @param keep Numeric, defaults to 1. Number of corpus files to keep. Only the most recent files are kept.
+#' @param ask Logical, defaults to TRUE. If TRUE, asks for confirmation before deleting files.
+#' @inheritParams cas_write_corpus
+#'
+#' @return `NULL`, invisibly. Called for its side effects.
+#' @export
+cas_delete_corpus <- function(keep = 1,
+                              ask = TRUE,
+                              file_format = "parquet",
+                              partition = "year",
+                              token = "full_text",
+                              corpus_folder = "corpus",
+                              path = NULL,
+                              ...) {
+  cas_options_l <- cas_get_options(...)
+
+  # A negative `keep` would make the remove/keep slices below inconsistent.
+  if (!is.numeric(keep) || length(keep) != 1 || is.na(keep) || keep < 0) {
+    cli::cli_abort("{.arg keep} must be a single non-negative number.")
+  }
+
+  if (is.null(path)) {
+    path <- fs::path_dir(cas_get_corpus_path(
+      corpus_folder = corpus_folder,
+      file_format = file_format,
+      partition = partition,
+      token = token,
+      ...
+    ))
+  }
+
+  if (!fs::file_exists(path)) {
+    cli::cli_warn("The folder {.path {path}} does not exists. No corpus to remove.")
+    return(invisible(NULL))
+  }
+
+  # Assumes the listing order puts the most recent corpus last - TODO confirm.
+  corpus_path_df <- tibble::tibble(path = fs::dir_ls(path))
+  corpus_path_to_remove_df <- dplyr::slice_head(corpus_path_df, n = nrow(corpus_path_df) - keep)
+  corpus_path_to_keep_df <- dplyr::slice_tail(corpus_path_df, n = keep)
+
+  if (nrow(corpus_path_to_remove_df) == 0) {
+    cli::cli_inform(c(`i` = "No corpus files to be removed."))
+    return(invisible(NULL))
+  }
+
+  # Both vectors are always defined: the summary below needs them when `keep = 0`.
+  paths_to_remove_l <- corpus_path_to_remove_df$path
+  names(paths_to_remove_l) <- rep("x", length(paths_to_remove_l))
+  paths_to_keep_l <- corpus_path_to_keep_df$path
+  names(paths_to_keep_l) <- rep("*", length(paths_to_keep_l))
+
+  # Templates are passed directly to `cli_inform()` so bullet names take effect.
+  cli::cli_inform(c(`!` = "{length(paths_to_remove_l)} corpus file{?s} about to be removed:"))
+  cli::cli_bullets(paths_to_remove_l)
+  if (length(paths_to_keep_l) > 0) {
+    cli::cli_inform(c(`i` = "{length(paths_to_keep_l)} corpus file{?s} will be kept:"))
+    cli::cli_bullets(paths_to_keep_l)
+  }
+
+  confirmed <- TRUE
+  if (ask) {
+    cli::cli_inform("A total of {length(paths_to_remove_l)} corpus file{?s} will be removed and {length(paths_to_keep_l)} will be kept.")
+    confirmed <- usethis::ui_yeah(x = "Proceed?")
+  }
+
+  if (isTRUE(confirmed)) {
+    fs::dir_delete(path = paths_to_remove_l)
+    cli::cli_alert_success(text = "Done.")
+  }
+  invisible(NULL)
+}
diff --git a/R/cas_write_corpus.R b/R/cas_write_corpus.R
index b36c93e..f7fbd2a 100644
--- a/R/cas_write_corpus.R
+++ b/R/cas_write_corpus.R
@@ -46,7 +46,7 @@ cas_write_corpus <- function(corpus = NULL,
                              text = text,
                              tif_compliant = TRUE,
                              file_format = "parquet",
-                             partition = NULL,
+                             partition = "year",
                              token = "full_text",
                              corpus_folder = "corpus",
                              path = NULL,
@@ -81,7 +81,7 @@ cas_write_corpus <- function(corpus = NULL,
   }
 
   if (fs::file_exists(path)) {
-    cli::cli_abort("The folder {.path {path}} already exists. Please remove or rename it before writing corpus")
+    cli::cli_abort("The folder {.path {path}} already exists. Please remove or rename it before writing corpus.")
   }
 
   fs::dir_create(path = path)
diff --git a/man/cas_delete_corpus.Rd b/man/cas_delete_corpus.Rd
new file mode 100644
index 0000000..db3e24f
--- /dev/null
+++ b/man/cas_delete_corpus.Rd
@@ -0,0 +1,43 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/cas_delete_corpus.R
+\name{cas_delete_corpus}
+\alias{cas_delete_corpus}
+\title{Delete previously stored corpora written with \code{cas_write_corpus()}.}
+\usage{
+cas_delete_corpus(
+  keep = 1,
+  ask = TRUE,
+  file_format = "parquet",
+  partition = "year",
+  token = "full_text",
+  corpus_folder = "corpus",
+  path = NULL,
+  ...
+)
+}
+\arguments{
+\item{keep}{Numeric, defaults to 1. Number of corpus files to keep. Only the most recent files are kept.}
+
+\item{file_format}{Defaults to "parquet". Currently, other options are not
+implemented.}
+
+\item{partition}{Defaults to "year". If NULL, the parquet file is not
+partitioned. If set to "year", the parquet
+file is partitioned by year. If a \code{year} column does not exist, it is
+created based on the assumption that a \code{date} column exists and it is (or
+can be coerced to) a vector of class \code{Date}.}
+
+\item{token}{Defaults to "full_text", which does not tokenise the text
+column. If different from \code{full_text}, it is passed to
+\code{tidytext::unnest_tokens} (see its help for details). Accepted values
+include "words", "sentences", and "paragraphs". See
+\code{?tidytext::unnest_tokens()} for details.}
+
+\item{path}{Defaults to NULL. If NULL, path is set to the
+project/website/export/dataset/file_format folder.}
+
+\item{...}{Passed to \code{cas_get_db_file()}.}
+}
+\description{
+Typically used for file maintenance, especially when datasets are routinely updated.
+}
diff --git a/man/cas_write_corpus.Rd b/man/cas_write_corpus.Rd
index a693f7a..3f3073c 100644
--- a/man/cas_write_corpus.Rd
+++ b/man/cas_write_corpus.Rd
@@ -13,7 +13,7 @@ cas_write_corpus(
   text = text,
   tif_compliant = TRUE,
   file_format = "parquet",
-  partition = NULL,
+  partition = "year",
   token = "full_text",
   corpus_folder = "corpus",
   path = NULL,