Skip to content

Commit

Permalink
Update dl_read_gcp: several improvements
Browse files Browse the repository at this point in the history
- Minor refactor
- Check whether the file exists in GCP
- Improve verbosity
- Improve documentation
  • Loading branch information
biodavidjm committed May 16, 2024
1 parent 2fbf066 commit 49913fb
Show file tree
Hide file tree
Showing 3 changed files with 132 additions and 63 deletions.
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Package: MotrpacBicQC
Type: Package
Title: QC/QA functions for the MoTrPAC community
Version: 0.9.3
Version: 0.9.31
Date: 2024-03-25
Author: MoTrPAC Bioinformatics Center
Maintainer: David Jimenez-Morales <davidjm@stanford.edu>
Expand Down
139 changes: 92 additions & 47 deletions R/misc.R
Original file line number Diff line number Diff line change
Expand Up @@ -52,31 +52,52 @@ create_folder <- function(folder_name = NULL,
}

# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#' @title dl_read_gcp: Data Load, Read file from Google Cloud
#' Download and Read File from Google Cloud Storage
#'
#' @description
#' Read a single file from Google Cloud Storage (GCS) into a data table
#' This function downloads a file from Google Cloud Storage (GCS) to a local
#' directory and reads it into R as a data frame. It uses the `gsutil`
#' command-line tool to handle the file download.
#'
#' @param path (char) GCS path, i.e., starts with "gs://"
#' @param sep (char) column separator to use with [data.table::fread]
#' @param tmpdir (char) scratch directory to download files from GCS
#' @param gsutil_path (char) path to \code{gsutil} on your computer.
#' Can be "gsutil" if \code{gsutil} is in your \code{$PATH}.
#' @param check_first (bool) check if file exists in \code{tmpdir} before
#' downloading it. Read in existing file if it exists.
#' Should be set to \code{TRUE} if you are running this function in parallel.
#' @param header (bool) whether input file has a header line
#' @param verbose (logical) `TRUE` shows messages (default `FALSE`)
#' @param ... optional arguments for [data.table::fread]
#' @param path Character. The path to the file in GCS, e.g., `gs://bucket-name/file-name.csv`.
#' @param sep Character. The field separator character. Default is `\t`.
#' @param header Logical. Whether the file contains the names of the variables
#' as its first line. Default is TRUE.
#' @param tmpdir Character. The local directory to which the file will be
#' downloaded.
#' @param gsutil_path Character. The path to the `gsutil` command-line tool.
#' Default is "gsutil".
#' @param check_first Logical. Whether to check if the file already exists
#' locally before downloading. Default is TRUE.
#' @param verbose Logical. If TRUE, prints messages about the download process.
#' Default is FALSE.
#' @param ... Additional arguments passed to `readr::read_delim`.
#'
#' @details
#' This function first checks if the specified file exists in GCS. If the file
#' exists, it downloads the file to the specified local directory (`tmpdir`). If
#' the local directory does not exist, it will be created. The function handles
#' spaces in directory paths by quoting them appropriately. If the file is
#' successfully downloaded, it is read into R using `readr::read_delim`.
#'
#' @return a data table
#' If the `check_first` argument is set to TRUE, the function will first check
#' if the file already exists locally to avoid redundant downloads. If the file
#' is already present locally, it will not be downloaded again.
#'
#' @importFrom data.table fread
#' @return A data frame containing the contents of the downloaded file.
#'
#' @examples
#' \dontrun{
#' pheno = dl_read_gcp(path = "gs://your-bucket/file.txt")
#' df <- dl_read_gcp(
#' path = "gs://bucket-name/file-name.csv",
#' sep = ",",
#' header = TRUE,
#' tmpdir = "/local/path",
#' gsutil_path = "gsutil",
#' check_first = TRUE,
#' verbose = TRUE
#' )
#' }
#'
#' @export
dl_read_gcp <- function(path,
sep = "\t",
Expand All @@ -86,22 +107,50 @@ dl_read_gcp <- function(path,
check_first = TRUE,
verbose = FALSE,
...){


# Detect the operating system
os_name <- Sys.info()["sysname"]

# Default arguments for non-Windows systems (e.g., macOS, Linux)
ignore_std_err <- TRUE
ignore_std_out <- TRUE

# Change default arguments if the OS is Windows
if (os_name == "Windows") {
ignore_std_err <- FALSE
ignore_std_out <- FALSE
}

# Check if the file exists in GCP
check_cmd <- sprintf('%s ls %s', gsutil_path, path)
file_exists <- system(check_cmd,
ignore.stdout = ignore_std_out,
ignore.stderr = ignore_std_err) == 0

if(!file_exists){
stop(paste0("\nThe file `", path, "` does not exist in GCP"))
}

# Create directory
if(!dir.exists(tmpdir)){
dir.create(tmpdir)
if(verbose) message(paste0("- New folder ", tmpdir, " created successfully"))
if(verbose) message(paste0("- New folder `", tmpdir, "` created successfully"))
}else{
if(verbose) message(paste0("- Folder `", tmpdir, "` already exists"))
}

#create the normalized version of the destination path
# create the normalized version of the destination path
tmpdir_norm <- normalizePath(tmpdir)

#if the normalized path name contains spaces, add shell quotes before it is saved to tmpdir, which ultimately goes to system()
if(grepl("\\s",tmpdir_norm)){
tmpdir<-shQuote(tmpdir_norm)
# if the normalized path name contains spaces,
# add shell quotes before it is saved to tmpdir,
# which ultimately goes to system()
if(grepl("\\s", tmpdir_norm)){
tmpdir <- shQuote(tmpdir_norm)
if(verbose) message("- The temp folder has spaces")
} else{

#Otherwise, tmpdir_norm and tmpdir can remain the same
tmpdir<-tmpdir_norm
# Otherwise, tmpdir_norm and tmpdir can remain the same
tmpdir <- tmpdir_norm
}

# Check path
Expand All @@ -110,47 +159,43 @@ dl_read_gcp <- function(path,
}else{
new_path <- file.path(tmpdir_norm, basename(path))
}

# Detect the operating system
os_name <- Sys.info()["sysname"]

# Default arguments for non-Windows systems (e.g., macOS, Linux)
ignore_std_err <- TRUE
ignore_std_out <- TRUE

# Change default arguments if the OS is Windows
if (os_name == "Windows") {
ignore_std_err <- FALSE
ignore_std_out <- FALSE
}

# only download if it doesn't exist to avoid conflicts when running this script in parallel; clear scratch space when you're done
# only download if it doesn't exist to avoid conflicts when running this
# script in parallel; clear scratch space when you're done
if(check_first){
if( !file.exists(new_path) ){
# cp file from GCP
cmd <- sprintf('%s cp %s %s', gsutil_path, path, tmpdir)
if(verbose) message(paste0("- Running command ", cmd))
system(cmd,
ignore.stdout = ignore_std_out,
ignore.stderr = ignore_std_err)
message("- Downloaded file: ", new_path)
if(verbose) message("- Downloaded file: ", new_path)
}else{
if(verbose) message(paste0("- The file <", new_path, "> already exists"))
if(verbose) message(paste0("- The file `", new_path, "` already exists. LOADING EXISTING VERSION"))
}
}else{
if(verbose) message(paste("- Downloading file (from GCP) <", basename(path), ">"))
if(verbose) message(paste0("- Downloading file (from GCP): `", basename(path), "`"))
cmd <- sprintf('%s cp %s %s', gsutil_path, path, tmpdir)
system(cmd,
ignore.stdout = ignore_std_out,
ignore.stderr = ignore_std_err)
message("- Downloaded file: ", new_path)
if(verbose) message("- Downloaded file: ", new_path)
}
# read in the data as a data.table

# read in the data using readr instead of data.table
if(file.exists(new_path)){
df <- readr::read_delim(new_path, delim = sep, col_names = header, skip_empty_rows = TRUE, show_col_types = FALSE, ...)
df <- readr::read_delim(new_path,
delim = sep,
col_names = header,
skip_empty_rows = TRUE,
show_col_types = FALSE, ...)
df <- as.data.frame(df)
return(df)
}else{
stop("- Problems loading the file. Possible reason: the file does not exist in the bucket anymore. Please, validate the address. Re-run this command again with `verbose = TRUE`)")
stop("Problems loading the file.
Something might have gone wrong with the download.
Re-run this command again with `verbose = TRUE`)")
}
}

Expand Down
54 changes: 39 additions & 15 deletions man/dl_read_gcp.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit 49913fb

Please sign in to comment.