Patch performance (#16)

* Much faster, cleaner parsing of SPARQL returns. Uses getResults() method from redland package internals. This is way, way faster for returning large numbers of results. This also sidesteps the need to rectangularize query results and manually coerce types; readr instead can handle that for us (as well as one can duck type from strings).
* suggest nycflights13 data
* rdf_add can handle NA as a blank node
* c() method uses turtle to save disk space
* parser and serializer will guess format based on file extension, closes #4
* serializer also sets explicit base option
* serializer defaults to print to character string if doc is NULL.
* cleaning up as_rdf methods
* methods take vocab, base, and key
* datatype should not be assigned to blank nodes
* avoid use of `c()` by passing `rdf` argument
* add option to reconnect to an existing database
* indicate storage type in rdf() constructor
* update tests & check `goodpractice::gp()`
* update pkgdown
* skip `has_bdb` on appveyor for unknown reasons
ropensci · Feb 20, 2018 · 01f59bb · 01f59bb
1 parent 78b7d31
commit 01f59bb
Show file tree

Hide file tree

Showing 47 changed files with 1,400 additions and 546 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -28,23 +28,26 @@ Imports: redland,
     jsonld,
     methods,
     utils,
-    stringi
+    stringi,
+    readr
 RoxygenNote: 6.0.1
 Roxygen: list(markdown = TRUE)
 Suggests: magrittr,
     covr,
     testthat,
     knitr,
     rmarkdown,
-    jsonlite,
-    httr,
-    xml2,
     jqr,
-    lubridate,
     DT,
     tidyverse,
-    readr,
     dplyr,
     tidyr,
-    tibble
+    tibble,
+    purrr,
+    lubridate,
+    httr,
+    xml2,
+    jsonlite,
+    repurrrsive,
+    nycflights13
 VignetteBuilder: knitr
diff --git a/NAMESPACE b/NAMESPACE
@@ -21,6 +21,7 @@ importFrom(jsonld,jsonld_compact)
 importFrom(jsonld,jsonld_to_rdf)
 importFrom(methods,as)
 importFrom(methods,new)
+importFrom(readr,read_csv)
 importFrom(stringi,stri_unescape_unicode)
 importFrom(utils,capture.output)
 importFrom(utils,download.file)

diff --git a/NEWS.md b/NEWS.md
@@ -2,13 +2,21 @@
 
 ## New Features
 
-* adds `c()` method to concatenate `rdf` objects
+
 * `rdf()` supports BDB backend for disk-based storage for large
    triplestores [#6](https://github.com/cboettig/rdflib/issues/6)
 * `rdf_parse()` gains an argument `rdf` to append triples to existing graph
+* adds `c()` method to concatenate `rdf` objects
+* Performance improvements make it possible to handle triplestores with millions of triples
+* Two new vignettes better introduce RDF and package functions.
 
 ## Minor Improvements
 
+* `rdf_query` now bypasses the very slow iteration over `getNextResult`
+   approach and uses an internal redland function call to access all results
+   at once in csv format.
+* experimental `as_rdf` method now uses a poor-man's nquad serializer to
+  rapidly generate rdf (instead of slowly iterating over `add_rdf`).  
 
 * `rdf_add` argument for `object` can now take all atomic types
    (numeric, integer, string, Date, POSIX, logical) and 

diff --git a/R/rdf.R b/R/rdf.R
@@ -1,8 +1,10 @@
 #' Initialize an `rdf` Object
 #'
+#' @param storage Use in-memory hashes ("memory"), or disk based storage ("BDB")?
 #' @param path where should local database to store RDF triples be created, if
 #' configured for disk-based storage; see details.
-#'
+#' @param new_db logical, default FALSE. should we create a new database on disk
+#' or attempt to connect to an existing database (at the path specified)?
 #' @return an rdf object
 #' @details an rdf Object is a list of class 'rdf', consisting of
 #' three pointers to external C objects managed by the redland library.
@@ -12,8 +14,7 @@
 #' 
 #' `rdflib` defaults to an in-memory hash-based storage structure. 
 #' which should be best for most use cases. For very large triplestores,
-#' disk-based storage will be necessary.  Enable this by setting the option
-#' `options(rdflib_storage = "BDB")` before calling `rdf()` to use disk-based
+#' disk-based storage will be necessary. Set `storage="BDB"` to use disk-based
 #' storage. Specify a path with the optional `path` argument, default uses
 #' the current working directory. Disk-based storage requires redland package
 #' to be installed from source with support for the Berkeley DB 
@@ -25,19 +26,6 @@
 #' Typical use will be simply to initialize a container to which
 #' the user would manually add triples using \code{\link{rdf_add}}.
 #'
-#' Overview of configuration options
-#' rdflib_storage:
-#'   - NULL or "memory" for in memory storage. (default)
-#'   - "BDB" for disk-based storage in Berkeley Database
-#' rdflib_print_format: 
-#'   - NULL or "nquads" (default)
-#'   - any valid serializer name: e.g. "rdfxml", "jsonld", "turtle",  "ntriples"
-#' rdflib_base_uri:
-#'   - Default base URI to use (when serializing JSON-LD only at this time)
-#'     default is "localhost://"
-#'
-#'
-#'
 #' @importClassesFrom redland World Model Storage
 #' @importMethodsFrom redland freeWorld freeModel freeStorage
 #' @importFrom utils capture.output
@@ -46,14 +34,20 @@
 #' @examples
 #' x <- rdf()
 #' 
-rdf <- function(path = "."){
+rdf <- function(storage = c("memory", "BDB"), path = ".", new_db = FALSE){
   world <- new("World")
 
+
   ## Handle storage type
-  if(getOption("rdflib_storage", "memory") == "BDB"){
+  storage <- match.arg(storage)
+  if(storage == "BDB"){
     if(rdf_has_bdb()){
       ## Store in Berkeley DB
-      options <- paste0("new='yes',hash-type='bdb',dir='", path, "'") 
+      if(new_db){
+        options <- paste0("new='yes',hash-type='bdb',dir='", path, "'") 
+      } else {
+        options <- paste0("hash-type='bdb',dir='", path, "'") 
+      }
     } else {
       warning("BDB driver not found. Falling back on in-memory storage")
       options <- "hash-type='memory'"
@@ -91,7 +85,7 @@ rdf <- function(path = "."){
 #'
 #' For more information, see the Wikipedia pages for RDF, SPARQL, and JSON-LD:
 #' 
-#' #' \itemize{
+#' \itemize{
 #' \item \url{https://en.wikipedia.org/wiki/Resource_Description_Framework}
 #' \item \url{https://en.wikipedia.org/wiki/SPARQL}
 #' \item \url{https://en.wikipedia.org/wiki/JSON-LD}
@@ -100,6 +94,19 @@ rdf <- function(path = "."){
 #' To learn more about rdflib, start with the vignettes:
 #' `browseVignettes(package = "rdflib")`
 #'
+#'  Configurations via `options()`
+#' 
+#' `rdflib_print_format`:
+#'  
+#' - NULL or "nquads" (default)
+#' - any valid serializer name: e.g. "rdfxml", "jsonld", "turtle",  "ntriples"
+#'   
+#' `rdflib_base_uri`:
+#' 
+#' - Default base URI to use (when serializing JSON-LD only at this time)
+#'     default is "localhost://"
+#'
+#'
 #'
 "_PACKAGE"
 

diff --git a/R/rdf_add.R b/R/rdf_add.R
@@ -28,7 +28,8 @@
 #'  typed as resource nodes, otherwise as literals.  An empty object `""`
 #'  will be treated as blank node.  Set `subjectType` or `objectType` 
 #'  explicitly to override this behavior, e.g. to treat an object URI
-#'  as a literal string.  See examples.  
+#'  as a literal string.  NAs are also treated as blank nodes in subject
+#'  or object.  See examples for details.
 #' 
 #' @references <https://en.wikipedia.org/wiki/Uniform_Resource_Identifier>
 #' @importClassesFrom redland Statement
@@ -80,6 +81,15 @@ rdf_add <- function(rdf, subject, predicate, object,
   if(is.na(datatype_uri)){
     datatype_uri <- xs_class(object)
   }
+
+  ## treat NA subject or object as a blank node
+  if(is.na(subject)){
+    subject <- ""
+  }
+  if(is.na(object)){
+    object <- ""
+    datatype_uri <- as.character(NA)
+  }
 
 
   stmt <- new("Statement", world = rdf$world, 

diff --git a/R/rdf_has_bdb.R b/R/rdf_has_bdb.R
@@ -27,4 +27,5 @@ rdf_has_bdb <- function(){
   redland::freeWorld(world)
 
   out
-}
+}
+
diff --git a/R/rdf_methods.R b/R/rdf_methods.R
@@ -12,17 +12,19 @@ c.rdf <- function(...){
   rdf <- rdfs[[1]]
   for(i in seq_along(rdfs)){
     f <- file.path(loc,paste0(i, ".rdf"))
-    rdf_serialize(rdfs[[i]],f) 
-    rdf_parse(f, rdf = rdf)
+    rdf_serialize(rdfs[[i]],f, format = "turtle") 
+    rdf_parse(f, rdf = rdf, format = "turtle")
+    file.remove(f)
   }
+  unlink(loc)
   rdf
 }
 
 
 
 #' @export
 print.rdf <- function(x, ...){
-  cat(format.rdf(x), sep = "\n")
+  cat(format.rdf(x, ...), sep = "\n")
 }
 
 
@@ -34,7 +36,8 @@ format.rdf <- function(x,
   tmp <- tempfile()
   rdf_serialize(x, 
                 tmp,
-                format = format)
+                format = format,
+                ...)
   ## Fix encoding on nquads, ntriples 
   txt <- stringi::stri_unescape_unicode(
     paste(readLines(tmp), collapse = "\n"))

diff --git a/R/rdf_parse.R b/R/rdf_parse.R
@@ -3,9 +3,11 @@
 #' @param doc path, URL, or literal string of the rdf document to parse
 #' @param format rdf serialization format of the doc,
 #' one of "rdfxml", "nquads", "ntriples", "turtle"
-#' or "jsonld"
+#' or "jsonld". If not provided, will try to guess based
+#' on file extension and fall back on rdfxml.
 #' @param rdf an existing rdf triplestore to extend with triples from
 #' the parsed file.  Default will create a new rdf object.
+#' @param base the base URI to assume for any relative URIs (blank nodes)
 #' @param ... additional parameters (not implemented)
 #'
 #' @return an rdf object, containing the redland world
@@ -20,31 +22,41 @@
 #' rdf <- rdf_parse(doc)
 #'
 rdf_parse <- function(doc,
-                      format = c("rdfxml",
+                      format = c("guess",
+                                 "rdfxml",
                                  "nquads",
                                  "ntriples",
                                  "turtle",
                                  "jsonld"),
                       rdf = NULL,
+                      base = getOption("rdflib_base_uri", "localhost://"),
                       ...){
+
   format <- match.arg(format)
+  if(format == "guess"){
+    format <- guess_format(doc)
+  }
+
+  ## if we get a string as input, we'll store it in tmp file here
+  ## which we can later be sure to clean up.
+  tmp_string <- tempfile()
+  ## if we get json-ld, we'll need a temp location to serialize that too:
+  tmp_json <- tempfile()
 
   # convert string input or url to local file
-  doc <- text_or_url_to_doc(doc)
+  doc <- text_or_url_to_doc(doc, tmp_string)
 
   ## redlands doesn't support jsonld. So rewrite as nquads using jsonld package
   ## We use tmp to avoid altering input doc, since parsing a local file should
   ## be a read-only task!
   if(format == "jsonld"){
-    tmp <- tempfile()
-    #tmp <- add_base_uri(doc, tmp)
     x <- jsonld::jsonld_to_rdf(doc, 
                                options = 
            list(base = getOption("rdflib_base_uri", "localhost://"),
                 format = "application/nquads"))
-    writeLines(x, tmp)
+    writeLines(x, tmp_json)
     format <- "nquads"
-    doc <- tmp
+    doc <- tmp_json
   }
 
   if(is.null(rdf)){
@@ -53,9 +65,13 @@ rdf_parse <- function(doc,
 
   mimetype <- unname(rdf_mimetypes[format])
   parser <- new("Parser", rdf$world, name = format, mimeType = mimetype)
-  redland::parseFileIntoModel(parser, rdf$world, doc, rdf$model)
+  redland::parseFileIntoModel(parser, rdf$world, doc, rdf$model, baseUri = base)
+
   redland::freeParser(parser)
+  unlink(tmp_string)
+  unlink(tmp_json)  
 
+  ## return rdf object (pointer)
   rdf
 }
 

diff --git a/R/rdf_query.R b/R/rdf_query.R
@@ -32,58 +32,32 @@
 #'
 rdf_query <- function(rdf, query, data.frame = TRUE, ...){
   queryObj <- new("Query", rdf$world, query, ...)
-  # ... defaults are: base_uri=NULL, query_language="sparql", query_uri=NULL)
+
+  # ... defaults are: base_uri=NULL, query_language="sparql", query_uri=NULL
+
   queryResult <- redland::executeQuery(queryObj, rdf$model)
-
-  out <- list()
-  result <- redland::getNextResult(queryResult)
-  out <- c(out, result)
-  while(!is.null(result)){
-    result <- redland::getNextResult(queryResult)
-    out <- c(out, result)
-
-  }
+  out <- getResults(queryResult)
   redland::freeQueryResults(queryResult)
   redland::freeQuery(queryObj)
 
-  if(data.frame){
-    out <- rectangularize_query_results(out)
-  } else {
-    ## group by query variable 
-#    vars <- unique(names(out))
-#    out <- lapply(vars, function(v){ 
-#      contents <- as.character(out[names(out) == v ])
-#      type_by_datauri(contents)  
-#      })
-#  names(out) <- vars
-  }
   out
 }
 
+## Notes
+## readr does a pretty good job guessing types returned from sparql
+## character, numeric, integer, Dates, POSIXct work fine
+## logicals are denoted `true` and `false`, which readr mistakes for characters
 
-rectangularize_query_results <- function(out){
-  vars <- unique(names(out))
-
-  X <- lapply(vars, function(v){ 
-    contents <- as.character(out[names(out) == v ])
-    values <- type_by_datauri(contents)
-
-    ## use "character" if mixed type column
-    types <- vapply(values, function(x) class(x)[[1]], character(1))
-    u <- unique(types)
-    if(length(u) == 1){
-      values <- unlist(values)
-      if(u %in% c("Date", "POSIXct"))
-      class(values) <- unique(types) # Restore date class
-    } else {
-      values <- vapply(values, as.character, character(1))
-    }
-    values
-  })
-
-  names(X) <- vars
-  ## Or we could use tibble::as_data_frame for list columns w/ mixed type..
-  as.data.frame(X, stringsAsFactors=FALSE)
-}
+## Redland only exports the getNextResult parser, which is extremely slow on large returns
 
+#' @importFrom readr read_csv
+getResults <- function(queryResult, format = "csv", ...){
+  mimetype <- switch(format,
+                     "csv" = "text/csv; charset=utf-8",
+                     NULL)
+  readr::read_csv(redland:::librdf_query_results_to_string2(
+                            queryResult@librdf_query_results, 
+                            format, mimetype, NULL, NULL), 
+                  ...)
+}