Skip to content

Commit

Permalink
Fix spaces around CSV quoted strings (#15727)
Browse files Browse the repository at this point in the history
This PR adds an option to CSV parsing to detect quotes even if they are surrounded by whitespaces.

Current behavior when `options.keepquotes == false`:
- `"A"` ->  `A`
- `  "A"  ` -> `  "A"  ` (The spaces around the 'A' are not removed and the quotes are kept)

New behavior after enabling the new option:
- `"A"` -> `A`
- `  "A"  ` -> `A`

The new option is false by default to avoid breaking any code that relied on the old behavior.

Closes #13892.

Authors:
  - Mohamed Thabet (https://github.com/thabetx)
  - Shruti Shivakumar (https://github.com/shrshi)

Approvers:
  - Shruti Shivakumar (https://github.com/shrshi)
  - Lawrence Mitchell (https://github.com/wence-)

URL: #15727
  • Loading branch information
thabetx authored May 22, 2024
1 parent b4ce6e4 commit a5f6fa3
Show file tree
Hide file tree
Showing 6 changed files with 99 additions and 5 deletions.
35 changes: 35 additions & 0 deletions cpp/include/cudf/io/csv.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,9 @@ class csv_reader_options {
char _quotechar = '"';
// Whether a quote inside a value is double-quoted
bool _doublequote = true;
// Whether to detect quotes surrounded by spaces e.g. ` "data" `. This flag has no effect when
// _doublequote is true
bool _detect_whitespace_around_quotes = false;
// Names of columns to read as datetime
std::vector<std::string> _parse_dates_names;
// Indexes of columns to read as datetime
Expand Down Expand Up @@ -375,6 +378,17 @@ class csv_reader_options {
*/
[[nodiscard]] bool is_enabled_doublequote() const { return _doublequote; }

/**
* @brief Whether to detect quotes surrounded by spaces e.g. ` "data" `. This flag has no
* effect when _doublequote is true
*
* @return `true` if detect_whitespace_around_quotes is enabled
*/
[[nodiscard]] bool is_enabled_detect_whitespace_around_quotes() const
{
return _detect_whitespace_around_quotes;
}

/**
* @brief Returns names of columns to read as datetime.
*
Expand Down Expand Up @@ -698,6 +712,14 @@ class csv_reader_options {
*/
void enable_doublequote(bool val) { _doublequote = val; }

/**
* @brief Sets whether to detect quotes surrounded by spaces e.g. ` "data" `. This flag has no
* effect when _doublequote is true
*
* @param val Boolean value to enable/disable
*/
void enable_detect_whitespace_around_quotes(bool val) { _detect_whitespace_around_quotes = val; }

/**
* @brief Sets names of columns to read as datetime.
*
Expand Down Expand Up @@ -1126,6 +1148,19 @@ class csv_reader_options_builder {
return *this;
}

/**
* @brief Sets whether to detect quotes surrounded by spaces e.g. ` "data" `. This flag has no
* effect when _doublequote is true
*
* @param val Boolean value to enable/disable
* @return this for chaining
*/
csv_reader_options_builder& detect_whitespace_around_quotes(bool val)
{
options._detect_whitespace_around_quotes = val;
return *this;
}

/**
* @brief Sets names of columns to read as datetime.
*
Expand Down
16 changes: 13 additions & 3 deletions cpp/src/io/csv/csv_gpu.cu
Original file line number Diff line number Diff line change
Expand Up @@ -351,9 +351,19 @@ CUDF_KERNEL void __launch_bounds__(csvparse_block_dim)
if (dtypes[actual_col].id() == cudf::type_id::STRING) {
auto end = next_delimiter;
if (not options.keepquotes) {
if ((*field_start == options.quotechar) && (*(end - 1) == options.quotechar)) {
++field_start;
--end;
if (not options.detect_whitespace_around_quotes) {
if ((*field_start == options.quotechar) && (*(end - 1) == options.quotechar)) {
++field_start;
--end;
}
} else {
// If the string is quoted, whitespace around the quotes get removed as well
auto const trimmed_field = trim_whitespaces(field_start, end);
if ((*trimmed_field.first == options.quotechar) &&
(*(trimmed_field.second - 1) == options.quotechar)) {
field_start = trimmed_field.first + 1;
end = trimmed_field.second - 1;
}
}
}
auto str_list = static_cast<std::pair<char const*, size_t>*>(columns[actual_col]);
Expand Down
6 changes: 4 additions & 2 deletions cpp/src/io/csv/reader_impl.cu
Original file line number Diff line number Diff line change
Expand Up @@ -951,8 +951,10 @@ parse_options make_parse_options(csv_reader_options const& reader_opts,
parse_opts.terminator = reader_opts.get_lineterminator();

if (reader_opts.get_quotechar() != '\0' && reader_opts.get_quoting() != quote_style::NONE) {
parse_opts.quotechar = reader_opts.get_quotechar();
parse_opts.keepquotes = false;
parse_opts.quotechar = reader_opts.get_quotechar();
parse_opts.keepquotes = false;
parse_opts.detect_whitespace_around_quotes =
reader_opts.is_enabled_detect_whitespace_around_quotes();
parse_opts.doublequote = reader_opts.is_enabled_doublequote();
} else {
parse_opts.quotechar = '\0';
Expand Down
3 changes: 3 additions & 0 deletions cpp/src/io/utilities/parsing_utils.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@ struct parse_options_view {
char thousands;
char comment;
bool keepquotes;
bool detect_whitespace_around_quotes;
bool doublequote;
bool dayfirst;
bool skipblanklines;
Expand All @@ -80,6 +81,7 @@ struct parse_options {
char thousands;
char comment;
bool keepquotes;
bool detect_whitespace_around_quotes;
bool doublequote;
bool dayfirst;
bool skipblanklines;
Expand All @@ -105,6 +107,7 @@ struct parse_options {
thousands,
comment,
keepquotes,
detect_whitespace_around_quotes,
doublequote,
dayfirst,
skipblanklines,
Expand Down
41 changes: 41 additions & 0 deletions cpp/tests/io/csv_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1018,6 +1018,47 @@ TEST_F(CsvReaderTest, StringsQuotesIgnored)
view.column(1));
}

TEST_F(CsvReaderTest, StringsQuotesWhitespace)
{
std::vector<std::string> names{"line", "verse"};

auto filepath = temp_env->get_temp_dir() + "StringsQuotesIgnored.csv";
{
std::ofstream outfile(filepath, std::ofstream::out);
outfile << names[0] << ',' << names[1] << '\n';
outfile << "A,a" << '\n'; // unquoted no whitespace
outfile << " B,b" << '\n'; // unquoted leading whitespace
outfile << "C ,c" << '\n'; // unquoted trailing whitespace
outfile << " D ,d" << '\n'; // unquoted leading and trailing whitespace
outfile << "\"E\",e" << '\n'; // quoted no whitespace
outfile << "\"F\" ,f" << '\n'; // quoted trailing whitespace
outfile << " \"G\",g" << '\n'; // quoted leading whitespace
outfile << " \"H\" ,h" << '\n'; // quoted leading and trailing whitespace
outfile << " \" I \" ,i"
<< '\n'; // quoted leading and trailing whitespace with spaces inside quotes
}

cudf::io::csv_reader_options in_opts =
cudf::io::csv_reader_options::builder(cudf::io::source_info{filepath})
.names(names)
.dtypes(std::vector<data_type>{dtype<cudf::string_view>(), dtype<cudf::string_view>()})
.quoting(cudf::io::quote_style::ALL)
.doublequote(false)
.detect_whitespace_around_quotes(true);
auto result = cudf::io::read_csv(in_opts);

auto const view = result.tbl->view();
ASSERT_EQ(2, view.num_columns());
ASSERT_EQ(type_id::STRING, view.column(0).type().id());
ASSERT_EQ(type_id::STRING, view.column(1).type().id());

expect_column_data_equal(
std::vector<std::string>{"A", " B", "C ", " D ", "E", "F", "G", "H", " I "},
view.column(0));
expect_column_data_equal(std::vector<std::string>{"a", "b", "c", "d", "e", "f", "g", "h", "i"},
view.column(1));
}

TEST_F(CsvReaderTest, SkiprowsNrows)
{
auto filepath = temp_env->get_temp_dir() + "SkiprowsNrows.csv";
Expand Down
3 changes: 3 additions & 0 deletions python/cudf/cudf/_lib/pylibcudf/libcudf/io/csv.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ cdef extern from "cudf/io/csv.hpp" \
cudf_io_types.quote_style get_quoting() except +
char get_quotechar() except +
bool is_enabled_doublequote() except +
bool is_enabled_updated_quotes_detection() except +
vector[string] get_parse_dates_names() except +
vector[int] get_parse_dates_indexes() except +
vector[string] get_parse_hex_names() except +
Expand Down Expand Up @@ -95,6 +96,7 @@ cdef extern from "cudf/io/csv.hpp" \
void set_quoting(cudf_io_types.quote_style style) except +
void set_quotechar(char val) except +
void set_doublequote(bool val) except +
void set_detect_whitespace_around_quotes(bool val) except +
void set_parse_dates(vector[string]) except +
void set_parse_dates(vector[int]) except +
void set_parse_hex(vector[string]) except +
Expand Down Expand Up @@ -163,6 +165,7 @@ cdef extern from "cudf/io/csv.hpp" \
) except +
csv_reader_options_builder& quotechar(char val) except +
csv_reader_options_builder& doublequote(bool val) except +
csv_reader_options_builder& detect_whitespace_around_quotes(bool val) except +
csv_reader_options_builder& parse_dates(vector[string]) except +
csv_reader_options_builder& parse_dates(vector[int]) except +

Expand Down

0 comments on commit a5f6fa3

Please sign in to comment.