Skip to content

Commit

Permalink
fix: check overflow numbers while inferring type for csv files (#6481)
Browse files Browse the repository at this point in the history
* refactor: detect overflow for type inference

* chore: fallback to utf8 and tests
  • Loading branch information
CookiePieWw authored Oct 2, 2024
1 parent 581c647 commit 4389cf9
Showing 1 changed file with 8 additions and 1 deletion.
9 changes: 8 additions & 1 deletion arrow-csv/src/reader/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -215,7 +215,12 @@ impl InferredDataType {
self.packed |= if string.starts_with('"') {
1 << 8 // Utf8
} else if let Some(m) = REGEX_SET.matches(string).into_iter().next() {
1 << m
if m == 1 && string.len() >= 19 && string.parse::<i64>().is_err() {
// the digits matched the integer pattern but overflow i64, so fall back to Utf8
1 << 8
} else {
1 << m
}
} else {
1 << 8 // Utf8
}
Expand Down Expand Up @@ -1819,6 +1824,8 @@ mod tests {
infer_field_schema("2021-12-19T13:12:30.123456789"),
DataType::Timestamp(TimeUnit::Nanosecond, None)
);
assert_eq!(infer_field_schema("-9223372036854775809"), DataType::Utf8);
assert_eq!(infer_field_schema("9223372036854775808"), DataType::Utf8);
}

#[test]
Expand Down

0 comments on commit 4389cf9

Please sign in to comment.