From 25a366be6927cec8e742013db3d4a2344ab9e91b Mon Sep 17 00:00:00 2001 From: "Zhian N. Kamvar" Date: Fri, 10 May 2024 09:10:27 -0700 Subject: [PATCH 1/2] filter out special control characters this will fix #96 --- NEWS.md | 2 ++ R/to_xml.R | 9 ++++++--- tests/testthat/test-to_xml.R | 9 +++++++++ 3 files changed, 17 insertions(+), 3 deletions(-) diff --git a/NEWS.md b/NEWS.md index 45d39ff..a86a35b 100644 --- a/NEWS.md +++ b/NEWS.md @@ -4,6 +4,8 @@ * Inline math with single characters will no longer cause an error (issue: #101, fix: #103, @maelle) +* Special control characters are now filtered out before processing XML (issue: + #96, fix: #111, @zkamvar) ## MISC diff --git a/R/to_xml.R b/R/to_xml.R index 8f3042c..49c54e7 100644 --- a/R/to_xml.R +++ b/R/to_xml.R @@ -66,10 +66,13 @@ to_xml <- function(path, encoding = "UTF-8", sourcepos = FALSE, anchor_links = T clean_content <- function(content){ + illegal_control_chars <- "[^\u0009\u000a\u000d\u0020-\uD7FF\uE000-\uFFFD]" + smart_double_quotes <- "[\u201C\u201D]" + smart_single_quotes <- "[\u2018\u2019]" content %>% - str_replace_all("\u201C", '"') %>% - str_replace_all("\u201D", '"') %>% - str_replace_all("\u2019", "'") + str_replace_all(smart_double_quotes, '"') %>% + str_replace_all(smart_single_quotes, "'") %>% + str_replace_all(illegal_control_chars, "") } diff --git a/tests/testthat/test-to_xml.R b/tests/testthat/test-to_xml.R index c72025f..108e4da 100644 --- a/tests/testthat/test-to_xml.R +++ b/tests/testthat/test-to_xml.R @@ -23,6 +23,15 @@ test_that("to_xml works for Rmd", { }) +test_that("to_xml can parse markdown with special control characters", { + tmp <- withr::local_tempfile() + writeLines("\u2018test single\u2019 \u001C\u201Ctest double\u201D", tmp) + expect_no_error(xml <- tinkr::to_xml(tmp)) + expect_equal(xml2::xml_text(xml$body), "'test single' \"test double\"") +}) + + + test_that("to_xml will not convert numeric options to character", { txt <- "```{r txt, fig.width=4.2, fig.height=4.2, out.width='100%', purl = TRUE}\n#code\n```" con <- textConnection(txt) From c8b887c42533961fd4c2769ab7d08564768f060f Mon Sep 17 00:00:00 2001 From: "Zhian N. Kamvar" Date: Fri, 10 May 2024 09:43:30 -0700 Subject: [PATCH 2/2] add test skip for insufficient UTF-8 support --- tests/testthat/test-to_xml.R | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tests/testthat/test-to_xml.R b/tests/testthat/test-to_xml.R index 108e4da..a832b9b 100644 --- a/tests/testthat/test-to_xml.R +++ b/tests/testthat/test-to_xml.R @@ -24,6 +24,11 @@ test_that("to_xml works for Rmd", { test_that("to_xml can parse markdown with special control characters", { + # skip if we are on windows with R version lower than 4.2.0 + os <- tolower(Sys.info())[["sysname"]] + no_utf8_support <- os == "windows" && getRversion() < numeric_version('4.2.0') + skip_if(no_utf8_support, message = "this system cannot test UTF-8 output") + tmp <- withr::local_tempfile() writeLines("\u2018test single\u2019 \u001C\u201Ctest double\u201D", tmp) expect_no_error(xml <- tinkr::to_xml(tmp))