From a5f6fa3674ed91713adf390954fb7234618201fa Mon Sep 17 00:00:00 2001 From: Mohamed Thabet Date: Wed, 22 May 2024 23:20:58 +0300 Subject: [PATCH] Fix spaces around CSV quoted strings (#15727) This PR adds an option to CSV parsing to detect quotes even if they are surrounded by whitespaces. Current behavior when `options.keepquotes == false`: - `"A"` -> `A` - ` "A" ` -> ` "A" ` (The spaces around the 'A' are not removed and the quotes are kept) New behavior after enabling the new option: - `"A"` -> `A` - ` "A" ` -> `A` The new option is false by default to avoid breaking any code that relied on the old behavior. Closes #13892. Authors: - Mohamed Thabet (https://github.com/thabetx) - Shruti Shivakumar (https://github.com/shrshi) Approvers: - Shruti Shivakumar (https://github.com/shrshi) - Lawrence Mitchell (https://github.com/wence-) URL: https://github.com/rapidsai/cudf/pull/15727 --- cpp/include/cudf/io/csv.hpp | 35 ++++++++++++++++ cpp/src/io/csv/csv_gpu.cu | 16 ++++++-- cpp/src/io/csv/reader_impl.cu | 6 ++- cpp/src/io/utilities/parsing_utils.cuh | 3 ++ cpp/tests/io/csv_test.cpp | 41 +++++++++++++++++++ .../cudf/_lib/pylibcudf/libcudf/io/csv.pxd | 3 ++ 6 files changed, 99 insertions(+), 5 deletions(-) diff --git a/cpp/include/cudf/io/csv.hpp b/cpp/include/cudf/io/csv.hpp index fdceda40e92..a20f75cecd7 100644 --- a/cpp/include/cudf/io/csv.hpp +++ b/cpp/include/cudf/io/csv.hpp @@ -106,6 +106,9 @@ class csv_reader_options { char _quotechar = '"'; // Whether a quote inside a value is double-quoted bool _doublequote = true; + // Whether to detect quotes surrounded by spaces e.g. ` "data" `. This flag has no effect when + // _doublequote is true + bool _detect_whitespace_around_quotes = false; // Names of columns to read as datetime std::vector _parse_dates_names; // Indexes of columns to read as datetime @@ -375,6 +378,17 @@ class csv_reader_options { */ [[nodiscard]] bool is_enabled_doublequote() const { return _doublequote; } + /** + * @brief Whether to detect quotes surrounded by spaces e.g. ` "data" `. This flag has no + * effect when _doublequote is true + * + * @return `true` if detect_whitespace_around_quotes is enabled + */ + [[nodiscard]] bool is_enabled_detect_whitespace_around_quotes() const + { + return _detect_whitespace_around_quotes; + } + /** * @brief Returns names of columns to read as datetime. * @@ -698,6 +712,14 @@ class csv_reader_options { */ void enable_doublequote(bool val) { _doublequote = val; } + /** + * @brief Sets whether to detect quotes surrounded by spaces e.g. ` "data" `. This flag has no + * effect when _doublequote is true + * + * @param val Boolean value to enable/disable + */ + void enable_detect_whitespace_around_quotes(bool val) { _detect_whitespace_around_quotes = val; } + /** * @brief Sets names of columns to read as datetime. * @@ -1126,6 +1148,19 @@ class csv_reader_options_builder { return *this; } + /** + * @brief Sets whether to detect quotes surrounded by spaces e.g. ` "data" `. This flag has no + * effect when _doublequote is true + * + * @param val Boolean value to enable/disable + * @return this for chaining + */ + csv_reader_options_builder& detect_whitespace_around_quotes(bool val) + { + options._detect_whitespace_around_quotes = val; + return *this; + } + /** * @brief Sets names of columns to read as datetime. * diff --git a/cpp/src/io/csv/csv_gpu.cu b/cpp/src/io/csv/csv_gpu.cu index 9c186f161b3..7a05d0aebaf 100644 --- a/cpp/src/io/csv/csv_gpu.cu +++ b/cpp/src/io/csv/csv_gpu.cu @@ -351,9 +351,19 @@ CUDF_KERNEL void __launch_bounds__(csvparse_block_dim) if (dtypes[actual_col].id() == cudf::type_id::STRING) { auto end = next_delimiter; if (not options.keepquotes) { - if ((*field_start == options.quotechar) && (*(end - 1) == options.quotechar)) { - ++field_start; - --end; + if (not options.detect_whitespace_around_quotes) { + if ((*field_start == options.quotechar) && (*(end - 1) == options.quotechar)) { + ++field_start; + --end; + } + } else { + // If the string is quoted, whitespace around the quotes get removed as well + auto const trimmed_field = trim_whitespaces(field_start, end); + if ((*trimmed_field.first == options.quotechar) && + (*(trimmed_field.second - 1) == options.quotechar)) { + field_start = trimmed_field.first + 1; + end = trimmed_field.second - 1; + } } } auto str_list = static_cast*>(columns[actual_col]); diff --git a/cpp/src/io/csv/reader_impl.cu b/cpp/src/io/csv/reader_impl.cu index 67c1194578a..5dee0c17a33 100644 --- a/cpp/src/io/csv/reader_impl.cu +++ b/cpp/src/io/csv/reader_impl.cu @@ -951,8 +951,10 @@ parse_options make_parse_options(csv_reader_options const& reader_opts, parse_opts.terminator = reader_opts.get_lineterminator(); if (reader_opts.get_quotechar() != '\0' && reader_opts.get_quoting() != quote_style::NONE) { - parse_opts.quotechar = reader_opts.get_quotechar(); - parse_opts.keepquotes = false; + parse_opts.quotechar = reader_opts.get_quotechar(); + parse_opts.keepquotes = false; + parse_opts.detect_whitespace_around_quotes = + reader_opts.is_enabled_detect_whitespace_around_quotes(); parse_opts.doublequote = reader_opts.is_enabled_doublequote(); } else { parse_opts.quotechar = '\0'; diff --git a/cpp/src/io/utilities/parsing_utils.cuh b/cpp/src/io/utilities/parsing_utils.cuh index 06a0a63c0ab..faee05541cc 100644 --- a/cpp/src/io/utilities/parsing_utils.cuh +++ b/cpp/src/io/utilities/parsing_utils.cuh @@ -63,6 +63,7 @@ struct parse_options_view { char thousands; char comment; bool keepquotes; + bool detect_whitespace_around_quotes; bool doublequote; bool dayfirst; bool skipblanklines; @@ -80,6 +81,7 @@ struct parse_options { char thousands; char comment; bool keepquotes; + bool detect_whitespace_around_quotes; bool doublequote; bool dayfirst; bool skipblanklines; @@ -105,6 +107,7 @@ struct parse_options { thousands, comment, keepquotes, + detect_whitespace_around_quotes, doublequote, dayfirst, skipblanklines, diff --git a/cpp/tests/io/csv_test.cpp b/cpp/tests/io/csv_test.cpp index 8e3ecd817e4..880dc911954 100644 --- a/cpp/tests/io/csv_test.cpp +++ b/cpp/tests/io/csv_test.cpp @@ -1018,6 +1018,47 @@ TEST_F(CsvReaderTest, StringsQuotesIgnored) view.column(1)); } +TEST_F(CsvReaderTest, StringsQuotesWhitespace) +{ + std::vector names{"line", "verse"}; + + auto filepath = temp_env->get_temp_dir() + "StringsQuotesIgnored.csv"; + { + std::ofstream outfile(filepath, std::ofstream::out); + outfile << names[0] << ',' << names[1] << '\n'; + outfile << "A,a" << '\n'; // unquoted no whitespace + outfile << " B,b" << '\n'; // unquoted leading whitespace + outfile << "C ,c" << '\n'; // unquoted trailing whitespace + outfile << " D ,d" << '\n'; // unquoted leading and trailing whitespace + outfile << "\"E\",e" << '\n'; // quoted no whitespace + outfile << "\"F\" ,f" << '\n'; // quoted trailing whitespace + outfile << " \"G\",g" << '\n'; // quoted leading whitespace + outfile << " \"H\" ,h" << '\n'; // quoted leading and trailing whitespace + outfile << " \" I \" ,i" + << '\n'; // quoted leading and trailing whitespace with spaces inside quotes + } + + cudf::io::csv_reader_options in_opts = + cudf::io::csv_reader_options::builder(cudf::io::source_info{filepath}) + .names(names) + .dtypes(std::vector{dtype(), dtype()}) + .quoting(cudf::io::quote_style::ALL) + .doublequote(false) + .detect_whitespace_around_quotes(true); + auto result = cudf::io::read_csv(in_opts); + + auto const view = result.tbl->view(); + ASSERT_EQ(2, view.num_columns()); + ASSERT_EQ(type_id::STRING, view.column(0).type().id()); + ASSERT_EQ(type_id::STRING, view.column(1).type().id()); + + expect_column_data_equal( + std::vector{"A", " B", "C ", " D ", "E", "F", "G", "H", " I "}, + view.column(0)); + expect_column_data_equal(std::vector{"a", "b", "c", "d", "e", "f", "g", "h", "i"}, + view.column(1)); +} + TEST_F(CsvReaderTest, SkiprowsNrows) { auto filepath = temp_env->get_temp_dir() + "SkiprowsNrows.csv"; diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/io/csv.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/io/csv.pxd index 754dd37d53f..b5ff6558cd8 100644 --- a/python/cudf/cudf/_lib/pylibcudf/libcudf/io/csv.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/io/csv.pxd @@ -50,6 +50,7 @@ cdef extern from "cudf/io/csv.hpp" \ cudf_io_types.quote_style get_quoting() except + char get_quotechar() except + bool is_enabled_doublequote() except + + bool is_enabled_updated_quotes_detection() except + vector[string] get_parse_dates_names() except + vector[int] get_parse_dates_indexes() except + vector[string] get_parse_hex_names() except + @@ -95,6 +96,7 @@ cdef extern from "cudf/io/csv.hpp" \ void set_quoting(cudf_io_types.quote_style style) except + void set_quotechar(char val) except + void set_doublequote(bool val) except + + void set_detect_whitespace_around_quotes(bool val) except + void set_parse_dates(vector[string]) except + void set_parse_dates(vector[int]) except + void set_parse_hex(vector[string]) except + @@ -163,6 +165,7 @@ cdef extern from "cudf/io/csv.hpp" \ ) except + csv_reader_options_builder& quotechar(char val) except + csv_reader_options_builder& doublequote(bool val) except + + csv_reader_options_builder& detect_whitespace_around_quotes(bool val) except + csv_reader_options_builder& parse_dates(vector[string]) except + csv_reader_options_builder& parse_dates(vector[int]) except +