Skip to content

Commit

Permalink
fix utf-8 encoding issue
Browse files Browse the repository at this point in the history
  • Loading branch information
cboettig committed Feb 3, 2018
1 parent ab51029 commit b74627c
Show file tree
Hide file tree
Showing 7 changed files with 154 additions and 32 deletions.
3 changes: 2 additions & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,8 @@ BugReports: https://github.com/cboettig/rdflib/issues
Imports: redland,
jsonld,
methods,
utils
utils,
stringi
RoxygenNote: 6.0.1
Roxygen: list(markdown = TRUE)
Suggests: magrittr,
Expand Down
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ importFrom(jsonld,jsonld_expand)
importFrom(jsonld,jsonld_to_rdf)
importFrom(methods,as)
importFrom(methods,new)
importFrom(stringi,stri_unescape_unicode)
importFrom(utils,capture.output)
importFrom(utils,download.file)
importMethodsFrom(redland,addStatement)
Expand Down
1 change: 1 addition & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@

## Bug Fixes

* fix encoding with UTF-8 characters (coming from nquads & ntriples)
* `rdf_query` now coerces data into appropriate type
if it recognizes the data URI and can match that
to an R type (a few XMLSchema types are recognized,
Expand Down
31 changes: 21 additions & 10 deletions R/rdf.R
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,8 @@ format.rdf <- function(x,
rdf_serialize(x,
tmp,
format = format)
txt <- paste(readLines(tmp), collapse = "\n")
## Fix encoding on nquads, ntriples
txt <- utf8me(paste(readLines(tmp), collapse = "\n"))
unlink(tmp)
txt
}
Expand Down Expand Up @@ -282,6 +283,7 @@ rdf_query <- function(rdf, query, ...){
}



#' Add RDF Triples
#'
#' add a triple (subject, predicate, object) to the RDF graph
Expand All @@ -301,15 +303,18 @@ rdf_query <- function(rdf, query, ...){
#'
#' @details
#'
#' - Predicate should always be a
#' [URI](https://en.wikipedia.org/wiki/Uniform_Resource_Identifier).
#' - Subject should be either URI or a character string. If subject string
#' does not look like a URI (e.g. a URL or a `prefix:string`), it
#' will be typed as a blank node and prefixed with `_:` automatically,
#' equivalent to setting `subjectType="blank"`, See examples.
#' - Object will automatically type URIs as URIs, strings as literals,
#' and empty strings as blank nodes. Override by setting `objectType`
#' explicitly (e.g. to treat a URL as a literal; see examples)
#' `rdf_add()` will automatically 'duck type' nodes (if looks like a duck...).
#' That is, strings that look like URIs will be declared as URIs. (See
#' [URI](https://en.wikipedia.org/wiki/Uniform_Resource_Identifier)).
#' Predicate should always be a URI (e.g. URL or a `prefix:string`),
#' cannot be blank or literal. Subjects that look like strings will be
#' treated as [Blank Nodes](https://en.wikipedia.org/wiki/Blank_node) (i.e.
#' will be prefixed with `_:`). An empty subject, `""`, will create a
#' blank node with random name. Objects that look like URIs will be
#' typed as resource nodes, otherwise as literals. An empty object `""`
#' will be treated as blank node. Set `subjectType` or `objectType`
#' explicitly to override this behavior, e.g. to treat an object URI
#' as a literal string. See examples.
#'
#' @references <https://en.wikipedia.org/wiki/Uniform_Resource_Identifier>
#' @importClassesFrom redland Statement
Expand Down Expand Up @@ -354,6 +359,12 @@ rdf_add <- function(rdf, subject, predicate, object,
subjectType = as.character(NA),
objectType = as.character(NA),
datatype_uri = as.character(NA)){

## FIXME: datatype_uri should be inferred from object
## Rather than making object a character.
## Note: redland doesn't appear to support implicit datatype


stmt <- new("Statement", world = rdf$world,
subject, predicate, as.character(object),
subjectType, objectType, datatype_uri)
Expand Down
24 changes: 14 additions & 10 deletions R/utilities.R
Original file line number Diff line number Diff line change
Expand Up @@ -21,25 +21,28 @@ type_by_datauri <- function(x){
types <- get_types(x)
r_types <- vapply(get_types(x), r_class, character(length(1)))
df <- data.frame(value = get_values(x), class = r_types)
apply(df, 1, function(x) as(x[1], x[2]))
apply(df, 1, function(x) as(utf8me(x[1]), x[2]))
}

#' @importFrom stringi stri_unescape_unicode
utf8me <- function(x){
  # Two steps, innermost first:
  #   1. delete every literal double-quote character found in `x`;
  #   2. turn escape sequences such as "\\u00EB" into the characters they
  #      denote (presumably the escapes written by the nquads / ntriples
  #      serializers -- confirm against callers).
  stringi::stri_unescape_unicode(gsub("\"", "", x, fixed = TRUE))
}

## Collect query results into a data.frame with one column per distinct
## name found in `out`.
## For every distinct name, the matching elements of `out` are coerced to
## character and passed through type_by_datauri(); the resulting list is
## named after the variables and converted with as.data.frame().
rectangularize_query_results <- function(out){
## Distinct names in `out`; each one becomes a column.
vars <- unique(names(out))
## NOTE(review): the result of this first lapply() is overwritten by the
## second assignment to X just below, so it has no effect on the return
## value. It looks like the pre-change version of the same statement --
## confirm and drop one of the two.
X <- lapply(vars, function(v)
## Strip ^^TYPE typing
#gsub('\"(([^\\^])+)\"\\^*.*',
# "\\1",
type_by_datauri(as.character(out[names(out) == v ])))
## Same computation, written with a named intermediate.
X <- lapply(vars, function(v){
contents <- as.character(out[names(out) == v ])
type_by_datauri(contents)
})
names(X) <- vars
## Keep character columns as character rather than factor.
as.data.frame(X, stringsAsFactors=FALSE)
}





has_bdb <- function(){
## Unfortunately convoluted way to check if we have Berkeley DB Support
world <- new("World")
Expand Down Expand Up @@ -68,9 +71,10 @@ rdf_mimetypes <- c("nquads" = "text/x-nquads",
"trig" = "application/x-trig",
"turtle" = "text/turtle")

# trig not working right now, not clear why
# Consider adding/testing:
# - n3 (text/n3)
## My redland version does not find support for these, probably optional
## additional dependency needed when compiling redland
# - trig (application/x-trig)
# - n3 (text/n3)
# - rdfa (application/xhtml+xml, or text/html)
# - rss (application/rss+xml or text/rss)

Expand Down
73 changes: 62 additions & 11 deletions inst/examples/as_rdf.R
Original file line number Diff line number Diff line change
Expand Up @@ -49,29 +49,80 @@ xs_class <- function(x){


## Placeholder: the body is empty, so calling this returns NULL.
## NOTE(review): presumably intended as the list method of an `as_rdf`
## generic defined earlier in this file -- confirm before relying on it.
as_rdf.list <- function(x){

}


## Testing: Digest some data.frames into RDF and extract back

library(tidyverse)
cars <- mtcars[1:4, 1:4] %>% rownames_to_column("Model")


x1 <- as_rdf(iris)
x2 <- as_rdf(cars)
rdf <- c(x1,x2)


#' Build a SPARQL SELECT query with one variable per property
#'
#' Generates the text of a `SELECT ... WHERE { ... }` query in which every
#' requested property contributes one triple pattern `?s <property> ?var`
#' and one selected variable `?var`.
#'
#' @param ... names of the properties that should make up columns in the table
#' @param columns Alternatively, supply a character vector of property names.
#'  When `columns` is given, `...` is ignored.
#' @param prefix the URI string to prefix before the property names to give
#'  fully-resolved properties. Joined to each name as `prefix:name`.
#' @param na.rm logical, default TRUE. Will not return a row for any
#'  subject that does not have an object value matching the schema.
#'  When FALSE, each triple pattern is wrapped in `OPTIONAL { }` instead.
#' @return A length-one character string containing the SPARQL query.
#' @examples
#'
#' sparql <- table_schema("Species", "Sepal.Length", prefix = "iris")
#' rdf_query(rdf, sparql)
#'
#' ## use columns arg for an existing vector of names.
#' columns <- names(iris)
#' sparql <- table_schema(columns = columns, prefix = "iris")
#' rdf_query(rdf, sparql)
#'
#' ## use na.rm = FALSE to include NA if variable is not defined for some observations
#' sparql <- table_schema("Species", "Sepal.Length", "Sepal.Color",
#'                        prefix = "iris", na.rm=FALSE)
#' rdf_query(rdf, sparql)
table_schema <- function(..., columns = NULL, prefix = NULL, na.rm = TRUE){
  if (is.null(columns)) {
    columns <- c(...)
  }
  ## Fail early with a readable message; without this check the same inputs
  ## only fail later, inside basename(), with an unrelated-looking error.
  if (!is.character(columns) || length(columns) == 0L) {
    stop("Supply at least one property name, via `...` or `columns`.",
         call. = FALSE)
  }

  ## Property URIs as they appear in the triple patterns.
  ## (Not called `attributes`, to avoid masking base::attributes.)
  if (is.null(prefix)) {
    predicates <- paste0("<", columns, ">")
  } else {
    predicates <- paste0("<", prefix, ":", columns, ">")
  }

  ## Replace forbidden characters with "_",
  ## See spec: https://www.w3.org/TR/rdf-sparql-query/#rVARNAME
  ## The digit range must be 0-9: with 1-9 a name such as "X10" was
  ## rewritten to "X1_".
  vars <- gsub("[^a-zA-Z0-9_]", "_", basename(columns))
  select <- paste0("?", vars)

  patterns <- paste("?s", predicates, select)
  if (!na.rm) {
    patterns <- paste("OPTIONAL {", patterns, "}")
  } else {
    patterns <- paste(patterns, ".")
  }
  where <- paste(patterns, collapse = "\n")

  paste("SELECT", paste0(select, collapse = " "),
        "\nWHERE {\n", where, "\n}")
}


sparql <- table_schema("Sepal.Length", "Sepal.Width",
"Petal.Length", "Petal.Width",
"Species", prefix="iris")
sparql <- table_schema(columns=names(iris), prefix="iris")
rdf_query(rdf, sparql)


## Look, original data back, no filter / spread required!
## Could write helper function to construct this pattern of sparql?
## e.g. return a data.frame with all observations of a list of attributes (properties)
sparql <-
'SELECT ?Sepal_Length ?Sepal_Width ?Petal_Length ?Petal_Width ?Species
'SELECT ?Sepal_Length ?Sepal_Width ?Petal_Length ?Petal_Color ?Species
WHERE {
?s <iris:Sepal.Width> ?Sepal_Width .
?s <iris:Sepal.Length> ?Sepal_Length .
?s <iris:Petal.Width> ?Petal_Width .
?s <iris:Petal.Length> ?Petal_Length .
?s <iris:Species> ?Species .
OPTIONAL { ?s <iris:Sepal.Width> ?Sepal_Width }
OPTIONAL { ?s <iris:Sepal.Length> ?Sepal_Length }
OPTIONAL { ?s <iris:Petal.Color> ?Petal_Color }
OPTIONAL { ?s <iris:Petal.Length> ?Petal_Length }
OPTIONAL { ?s <iris:Species> ?Species }
}'
tmp <- rdf_query(rdf, sparql)
rdf_query(rdf, sparql) %>% head()
53 changes: 53 additions & 0 deletions inst/examples/debug.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@

## Reproduction script for the UTF-8 escaping issue: first with redland
## directly, then through rdflib.

library(redland)

## In-memory model to hold the test statements.
world <- new("World")
storage <- new("Storage", world, "hashes", name="", options="hash-type='memory'")
model <- new("Model", world, storage, options="")

## One literal containing a non-ASCII character ...
stmt <- new("Statement",
            world = world,
            subject="",
            predicate="http://schema.org/name",
            object="Maëlle Salmon")
addStatement(model, stmt)

## ... and one plain-ASCII literal for comparison.
stmt <- new("Statement",
            world = world,
            subject="",
            predicate="http://schema.org/name",
            object="Matt Jones")
addStatement(model, stmt)

## Query every object back out and look at the first result.
query <-'SELECT ?o WHERE { ?s ?p ?o}'
queryObj <- new("Query", world, query)
queryResult <- executeQuery(queryObj, model)
r <-getNextResult(queryResult)
r

## These two fail to encode UTF-8, I get "Ma\u00EBlle" not Maëlle
serializer <- new("Serializer", world, name = "nquads", mimeType = "text/x-nquads")
redland::serializeToFile(serializer, world, model, "test.rdf")
cat(readLines("test.rdf"))

serializer <- new("Serializer", world, name = "ntriples", mimeType = "application/n-triples")
redland::serializeToFile(serializer, world, model, "test.rdf")
cat(readLines("test.rdf"))

## As expected here
serializer <- new("Serializer", world, name = "turtle", mimeType = "text/turtle")
redland::serializeToFile(serializer, world, model, "test.rdf")
cat(readLines("test.rdf"))

## Serializer created without an explicit name / mimeType.
serializer <- new("Serializer", world)
redland::serializeToFile(serializer, world, model, "test.rdf")
cat(readLines("test.rdf"))


## Same statement through rdflib.
## (was `libary(rdflib)`, which stops the script with
## "could not find function")
library(rdflib)
r <- rdf()
rdf_add(r,
        subject="",
        predicate="http://schema.org/name",
        object="Maëlle Salmon")

r

0 comments on commit b74627c

Please sign in to comment.