use rdflib_base_uri throughout

ropensci · cboettig · Feb 20, 2018 · Feb 16, 2018 · Feb 17, 2018 · Feb 17, 2018
commit 55749382efe0c2b483934d8526369305ab9a9f1e
diff --git a/R/rdf_parse.R b/R/rdf_parse.R
@@ -7,6 +7,7 @@
 #' on file extension and fall back on rdfxml.
 #' @param rdf an existing rdf triplestore to extend with triples from
 #' the parsed file.  Default will create a new rdf object.
+#' @param base the base URI to assume for any relative URIs (blank nodes)
 #' @param ... additional parameters (not implemented)
 #'
 #' @return an rdf object, containing the redland world
@@ -28,6 +29,7 @@ rdf_parse <- function(doc,
                                  "turtle",
                                  "jsonld"),
                       rdf = NULL,
+                      base = getOption("rdflib_base_uri", "localhost://"),
                       ...){
 
   format <- match.arg(format)
@@ -63,7 +65,7 @@ rdf_parse <- function(doc,
 
   mimetype <- unname(rdf_mimetypes[format])
   parser <- new("Parser", rdf$world, name = format, mimeType = mimetype)
-  redland::parseFileIntoModel(parser, rdf$world, doc, rdf$model)
+  redland::parseFileIntoModel(parser, rdf$world, doc, rdf$model, baseUri = base)
 
   redland::freeParser(parser)
   unlink(tmp_string)

diff --git a/R/rdf_serialize.R b/R/rdf_serialize.R
@@ -38,7 +38,7 @@ rdf_serialize <- function(rdf,
                                      "jsonld"),
                           namespace = NULL,
                           prefix = NULL,
-                          base = getOption("rdflib_base_uri", as.character(NA)),
+                          base = getOption("rdflib_base_uri", "localhost://"),
                           ...){
 
   format <- match.arg(format)

diff --git a/inst/examples/as_rdf.R b/inst/examples/as_rdf.R
@@ -12,27 +12,35 @@
 as_rdf <- function(x, 
                    rdf = rdf(),
                    vocab = NULL, 
-                   base = getOption("rdflib_base_uri"), 
+                   base = getOption("rdflib_base_uri", "localhost://"), 
                    context = NULL, 
                    ...) UseMethod("as_rdf")
 
 as_rdf.list <- function(x, 
+                        rdf = NULL,
                         vocab = NULL, 
-                        base = getOption("rdflib_base_uri"), 
+                        base = getOption("rdflib_base_uri", "localhost://"), 
                         context = NULL){
 
+  if(is.null(rdf)){
+    rdf <- rdf()  
+  }
   ## unbox length-1 lists so we can apply a context successfully
   if(is(x, "list") && length(x) == 1) x <- x[[1]]
 
   json <- jsonlite::toJSON(x, pretty = TRUE, auto_unbox = TRUE, force = TRUE)
   jsonld_context <- json_context(vocab, base, context)
   json2 <- paste0('{\n"@context":', jsonld_context, ',\n',  '"@graph": ', json,  '}')
-  rdf <- rdflib::rdf_parse(json2, "jsonld", rdf = rdf)
+  rdflib::rdf_parse(json2, "jsonld", rdf = rdf)
   rdf
 }
+
+
 # helper function (identical to plyr::compact)
 compact <- function (l) Filter(Negate(is.null), l)
-json_context <- function(vocab = NULL, base = getOption("rdflib_base_uri"), context = NULL){
+json_context <- function(vocab = NULL, 
+                         base = getOption("rdflib_base_uri", "localhost://"), 
+                         context = NULL){
   jsonlite::toJSON(
     compact(c(list("@vocab" = vocab,
                    "@base" = base),
@@ -52,11 +60,14 @@ json_context <- function(vocab = NULL, base = getOption("rdflib_base_uri"), cont
 
 ## tidy data to rdf
 as_rdf.data.frame <- function(df,  
-                              rdf = rdf(), 
+                              rdf = NULL, 
                               vocab = NULL, 
-                              base = getOption("rdflib_base_uri"), 
+                              base = getOption("rdflib_base_uri", "localhost://"), 
                               context = NULL, 
                               key = NULL){
+  if(is.null(rdf)){
+    rdf <- rdf()  
+  }
 
   ## gather loses col-classes, so pre-compute them (with base R)
   col_classes <- data.frame(datatype = 
@@ -86,7 +97,7 @@ as_rdf.data.frame <- function(df,
   ## And parse text file.  Way faster than adding row by row!
   ## but still about 8 s on 800K triples, all in the C layer
 
-  rdf <- rdf_parse(loc, "nquads")
+  rdf <- rdf_parse(loc, rdf = rdf, format = "nquads")
   unlink(loc)
 
   rdf
@@ -103,23 +114,30 @@ poor_mans_nquads <- function(x, loc, vocab){
   ## However, this seems to be fast enough that it is rarely the bottleneck
 
   ## NOTE: paste0 is a little slow ~ 1 s on 800K triples
+  ## No datatype on blank (missing) nodes
+
+  blank_object <-is.na(x$object)
+  blank_subject <- is.na(x$subject)
+
+  x$datatype[blank_object] <- as.character(NA)
   ## NA needs to become a unique blank node number, could do uuid or _:r<rownum>
-  x$object[is.na(x$object)] <- paste0("_:r", which(is.na(x$object)))
-  x$subject[is.na(x$subject)] <- paste0("_:r", which(is.na(x$subject)))
+  x$object[blank_object] <- paste0("_:r", which(blank_object))
+  x$subject[blank_subject] <- paste0("_:r", which(blank_subject))
 
   ## strings and URIs do not get a datatype
   needs_type <- !is.na(x$datatype)
 
-  x$subject = paste0("<", vocab, x$subject, ">")
+  ## URIs that are not blank nodes need <>
+  x$subject[!blank_subject] = paste0("<", vocab, x$subject[!blank_subject], ">")
   ## Predicate is always a URI
   x$predicate = paste0("<", vocab, x$predicate, ">")
 
   ## Strings should be quoted
-  is_string <- !grepl("\\w+:\\w.*", x$object) & !needs_type
+  is_string <- !grepl("\\w+:\\w.*", x$object) & !needs_type & !blank_object
   x$object[is_string] <- paste0('\"', x$object[is_string] , '\"')
 
-  ## URIs should be <> instead
-  x$object <- gsub("(^\\w+:\\w.*$)", "<\\1>", x$object)
+  ## URIs should be <> instead, but not blanks!
+  x$object[!blank_object] <- gsub("(^\\w+:\\w.*$)", "<\\1>", x$object[!blank_object])
 
   ## assumes datatype is not empty (e.g. string)
   x$object[needs_type] = paste0('\"', x$object[needs_type], 

diff --git a/man/rdf_parse.Rd b/man/rdf_parse.Rd
diff --git a/man/rdf_serialize.Rd b/man/rdf_serialize.Rd
diff --git a/vignettes/data-lake.Rmd b/vignettes/data-lake.Rmd
@@ -38,6 +38,8 @@ library(jsonlite)
 library(rdflib)
 ## experimental functions for rdflib package
 source(system.file("examples/as_rdf.R", package="rdflib"))
+source(system.file("examples/tidy_schema.R", package="rdflib"))
+
 ```
 
 Configure RDF storage:
@@ -48,8 +50,7 @@ options(rdflib_storage = "memory")
 
 # Use a smaller dataset if we do not have a BDB backend: 
 if(getOption("rdflib_storage") != "BDB"){
-#flights <- flights %>% 
-#  filter(distance > 2500) # try smaller dataset
+  flights <- flights[1:1e3,]
 }
 ```
 
@@ -89,14 +90,15 @@ Similarly, when reading into RDF we have to declare the key column for the table
 and again establish a `base_uri` which will allow RDF methods to distinguish between URIs (subjects, predicates, and foreign keys) and literal strings.  
 
 ```{r write_rdf}
-system.time(
+system.time({
 
-rdf <- c(
-  as_rdf(airlines, key = "carrier", vocab = "x:"),
-  as_rdf(planes,  key = "tailnum", vocab = "x:"),
-  as_rdf(uri_flights, key = NULL, vocab = "x:"))
+rdf <- rdf()
 
-)
+as_rdf(airlines, rdf = rdf, key = "carrier", vocab = "x:")
+as_rdf(planes,  rdf = rdf, key = "tailnum", vocab = "x:")
+as_rdf(uri_flights, rdf = rdf, key = NULL, vocab = "x:")
+
+})
 ```
 
 Note that flights does not have a natural key (somewhat surprisingly, `flight` number is not a unique key for this table, as the same flight number is reused on the same route at different times.)  So, we will treat each row as a unique anonymous key by setting the key to `NULL`.

diff --git a/vignettes/rdf_intro.Rmd b/vignettes/rdf_intro.Rmd
@@ -350,6 +350,8 @@ _Still working on writing this section_
 
 ```{r}
 source(system.file("examples/as_rdf.R", package="rdflib"))
+source(system.file("examples/tidy_schema.R", package="rdflib"))
+
 ## Testing: Digest some data.frames into RDF and extract back
  library(tidyverse)
  cars <- mtcars %>% rownames_to_column("Model")