pola-rs · ritchie46 · May 2, 2024 · May 1, 2024
@@ -13,8 +13,8 @@ use sqlparser::ast::ExactNumberInfo;
 use sqlparser::ast::{
     ArrayAgg, ArrayElemTypeDef, BinaryOperator as SQLBinaryOperator, BinaryOperator, CastFormat,
     DataType as SQLDataType, DateTimeField, Expr as SQLExpr, Function as SQLFunction, Ident,
-    JoinConstraint, OrderByExpr, Query as Subquery, SelectItem, TimezoneInfo, TrimWhereField,
-    UnaryOperator, Value as SQLValue,
+    JoinConstraint, ObjectName, OrderByExpr, Query as Subquery, SelectItem, TimezoneInfo,
+    TrimWhereField, UnaryOperator, Value as SQLValue,
 };
 use sqlparser::dialect::GenericDialect;
 use sqlparser::parser::{Parser, ParserOptions};
@@ -24,41 +24,53 @@ use crate::SQLContext;
 
 pub(crate) fn map_sql_polars_datatype(data_type: &SQLDataType) -> PolarsResult<DataType> {
     Ok(match data_type {
+        // ---------------------------------
+        // array/list
+        // ---------------------------------
         SQLDataType::Array(ArrayElemTypeDef::AngleBracket(inner_type))
         | SQLDataType::Array(ArrayElemTypeDef::SquareBracket(inner_type)) => {
             DataType::List(Box::new(map_sql_polars_datatype(inner_type)?))
         },
-        #[cfg(feature = "dtype-decimal")]
-        SQLDataType::Dec(info) | SQLDataType::Decimal(info) | SQLDataType::Numeric(info) => {
-            match *info {
-                ExactNumberInfo::PrecisionAndScale(p, s) => {
-                    DataType::Decimal(Some(p as usize), Some(s as usize))
-                },
-                ExactNumberInfo::Precision(p) => DataType::Decimal(Some(p as usize), Some(0)),
-                ExactNumberInfo::None => DataType::Decimal(Some(38), Some(9)),
-            }
-        },
-        SQLDataType::BigInt(_) => DataType::Int64,
-        SQLDataType::Boolean => DataType::Boolean,
+
+        // ---------------------------------
+        // binary
+        // ---------------------------------
         SQLDataType::Bytea
         | SQLDataType::Bytes(_)
         | SQLDataType::Binary(_)
         | SQLDataType::Blob(_)
         | SQLDataType::Varbinary(_) => DataType::Binary,
-        SQLDataType::Char(_)
-        | SQLDataType::CharVarying(_)
-        | SQLDataType::Character(_)
-        | SQLDataType::CharacterVarying(_)
-        | SQLDataType::Clob(_)
-        | SQLDataType::String(_)
-        | SQLDataType::Text
-        | SQLDataType::Uuid
-        | SQLDataType::Varchar(_) => DataType::String,
-        SQLDataType::Date => DataType::Date,
-        SQLDataType::Double
-        | SQLDataType::DoublePrecision
-        | SQLDataType::Float8
-        | SQLDataType::Float64 => DataType::Float64,
+
+        // ---------------------------------
+        // boolean
+        // ---------------------------------
+        SQLDataType::Boolean | SQLDataType::Bool => DataType::Boolean,
+
+        // ---------------------------------
+        // signed integer
+        // ---------------------------------
+        SQLDataType::Int(_) | SQLDataType::Integer(_) => DataType::Int32,
+        SQLDataType::Int2(_) | SQLDataType::SmallInt(_) => DataType::Int16,
+        SQLDataType::Int4(_) | SQLDataType::MediumInt(_) => DataType::Int32,
+        SQLDataType::Int8(_) | SQLDataType::BigInt(_) => DataType::Int64,
+        SQLDataType::TinyInt(_) => DataType::Int8,
+
+        // ---------------------------------
+        // unsigned integer: the following do not map to PostgreSQL types/syntax, but
+        // are enabled for wider compatibility (e.g. "CAST(col AS BIGINT UNSIGNED)").
+        // ---------------------------------
+        SQLDataType::UnsignedInt(_) | SQLDataType::UnsignedInteger(_) => DataType::UInt32,
+        SQLDataType::UnsignedInt2(_) | SQLDataType::UnsignedSmallInt(_) => DataType::UInt16,
+        SQLDataType::UnsignedInt4(_) | SQLDataType::UnsignedMediumInt(_) => DataType::UInt32,
+        SQLDataType::UnsignedInt8(_) | SQLDataType::UnsignedBigInt(_) => DataType::UInt64,
+        SQLDataType::UnsignedTinyInt(_) => DataType::UInt8, // see also: "custom" types below
+
+        // ---------------------------------
+        // float
+        // ---------------------------------
+        SQLDataType::Double | SQLDataType::DoublePrecision | SQLDataType::Float8 => {
+            DataType::Float64
+        },
         SQLDataType::Float(n_bytes) => match n_bytes {
             Some(n) if (1u64..=24u64).contains(n) => DataType::Float32,
             Some(n) if (25u64..=53u64).contains(n) => DataType::Float64,
@@ -68,12 +80,26 @@ pub(crate) fn map_sql_polars_datatype(data_type: &SQLDataType) -> PolarsResult<D
             None => DataType::Float64,
         },
         SQLDataType::Float4 | SQLDataType::Real => DataType::Float32,
-        SQLDataType::Int(_) | SQLDataType::Integer(_) => DataType::Int32,
-        SQLDataType::Int2(_) => DataType::Int16,
-        SQLDataType::Int4(_) => DataType::Int32,
-        SQLDataType::Int8(_) => DataType::Int64,
+
+        // ---------------------------------
+        // decimal
+        // ---------------------------------
+        #[cfg(feature = "dtype-decimal")]
+        SQLDataType::Dec(info) | SQLDataType::Decimal(info) | SQLDataType::Numeric(info) => {
+            match *info {
+                ExactNumberInfo::PrecisionAndScale(p, s) => {
+                    DataType::Decimal(Some(p as usize), Some(s as usize))
+                },
+                ExactNumberInfo::Precision(p) => DataType::Decimal(Some(p as usize), Some(0)),
+                ExactNumberInfo::None => DataType::Decimal(Some(38), Some(9)),
+            }
+        },
+
+        // ---------------------------------
+        // temporal
+        // ---------------------------------
+        SQLDataType::Date => DataType::Date,
         SQLDataType::Interval => DataType::Duration(TimeUnit::Microseconds),
-        SQLDataType::SmallInt(_) => DataType::Int16,
         SQLDataType::Time(_, tz) => match tz {
             TimezoneInfo::None => DataType::Time,
             _ => {
@@ -97,16 +123,41 @@ pub(crate) fn map_sql_polars_datatype(data_type: &SQLDataType) -> PolarsResult<D
                 },
             }
         },
-        SQLDataType::TinyInt(_) => DataType::Int8,
-        SQLDataType::UnsignedBigInt(_) => DataType::UInt64,
-        SQLDataType::UnsignedInt(_) | SQLDataType::UnsignedInteger(_) => DataType::UInt32,
-        SQLDataType::UnsignedInt2(_) => DataType::UInt16,
-        SQLDataType::UnsignedInt4(_) => DataType::UInt32,
-        SQLDataType::UnsignedInt8(_) => DataType::UInt64,
-        SQLDataType::UnsignedSmallInt(_) => DataType::UInt16,
-        SQLDataType::UnsignedTinyInt(_) => DataType::UInt8,
 
-        _ => polars_bail!(ComputeError: "SQL datatype {:?} is not yet supported", data_type),
+        // ---------------------------------
+        // string
+        // ---------------------------------
+        SQLDataType::Char(_)
+        | SQLDataType::CharVarying(_)
+        | SQLDataType::Character(_)
+        | SQLDataType::CharacterVarying(_)
+        | SQLDataType::Clob(_)
+        | SQLDataType::String(_)
+        | SQLDataType::Text
+        | SQLDataType::Uuid
+        | SQLDataType::Varchar(_) => DataType::String,
+
+        // ---------------------------------
+        // custom
+        // ---------------------------------
+        SQLDataType::Custom(ObjectName(idents), _) => match idents.as_slice() {
+            [Ident { value, .. }] => match value.to_lowercase().as_str() {
+                // these integer types are not supported by the PostgreSQL core distribution,
+                // but they ARE available via `pguint` (https://github.com/petere/pguint), an
+                // extension maintained by one of the PostgreSQL core developers.
+                "uint1" => DataType::UInt8,
+                "uint2" => DataType::UInt16,
+                "uint4" | "uint" => DataType::UInt32,
+                "uint8" => DataType::UInt64,
+                // `pguint` also provides a 1-byte (8-bit) integer type alias
+                "int1" => DataType::Int8,
+                _ => {
+                    polars_bail!(ComputeError: "SQL datatype {:?} is not currently supported", value)
+                },
+            },
+            _ => polars_bail!(ComputeError: "SQL datatype {:?} is not currently supported", idents),
+        },
+        _ => polars_bail!(ComputeError: "SQL datatype {:?} is not currently supported", data_type),
     })
 }
 
@@ -500,7 +551,7 @@ impl SQLExprVisitor<'_> {
             return Ok(expr.str().json_decode(None, None));
         }
         let polars_type = map_sql_polars_datatype(data_type)?;
-        Ok(expr.cast(polars_type))
+        Ok(expr.strict_cast(polars_type))
     }
 
     /// Visit a SQL literal.

@@ -1,8 +1,11 @@
 from __future__ import annotations
 
+from typing import Any
+
 import pytest
 
 import polars as pl
+import polars.selectors as cs
 from polars.exceptions import ComputeError
 from polars.testing import assert_frame_equal
 
@@ -14,6 +17,7 @@ def test_cast() -> None:
             "b": [1.1, 2.2, 3.3, 4.4, 5.5],
             "c": ["a", "b", "c", "d", "e"],
             "d": [True, False, True, False, True],
+            "e": [-1, 0, None, 1, 2],
         }
     )
     # test various dtype casts, using standard ("CAST <col> AS <dtype>")
@@ -25,41 +29,116 @@ def test_cast() -> None:
               -- float
               CAST(a AS DOUBLE PRECISION) AS a_f64,
               a::real AS a_f32,
+              b::float(24) AS b_f32,
+              b::float(25) AS b_f64,
+              e::float8 AS e_f64,
+              e::float4 AS e_f32,
+
               -- integer
               CAST(b AS TINYINT) AS b_i8,
               CAST(b AS SMALLINT) AS b_i16,
               b::bigint AS b_i64,
               d::tinyint AS d_i8,
+              a::int1 AS a_i8,
+              a::int2 AS a_i16,
+              a::int4 AS a_i32,
+              a::int8 AS a_i64,
+
+              -- unsigned integer
+              CAST(a AS TINYINT UNSIGNED) AS a_u8,
+              d::uint1 AS d_u8,
+              a::uint2 AS a_u16,
+              b::uint4 AS b_u32,
+              b::uint8 AS b_u64,
+              CAST(a AS BIGINT UNSIGNED) AS a_u64,
+
               -- string/binary
               CAST(a AS CHAR) AS a_char,
               CAST(b AS VARCHAR) AS b_varchar,
               c::blob AS c_blob,
               c::bytes AS c_bytes,
               c::VARBINARY AS c_varbinary,
               CAST(d AS CHARACTER VARYING) AS d_charvar,
+
+              -- boolean
+              e::bool AS e_bool,
+              e::boolean AS e_boolean
             FROM df
             """
         )
     assert res.schema == {
         "a_f64": pl.Float64,
         "a_f32": pl.Float32,
+        "b_f32": pl.Float32,
+        "b_f64": pl.Float64,
+        "e_f64": pl.Float64,
+        "e_f32": pl.Float32,
         "b_i8": pl.Int8,
         "b_i16": pl.Int16,
         "b_i64": pl.Int64,
         "d_i8": pl.Int8,
+        "a_i8": pl.Int8,
+        "a_i16": pl.Int16,
+        "a_i32": pl.Int32,
+        "a_i64": pl.Int64,
+        "a_u8": pl.UInt8,
+        "d_u8": pl.UInt8,
+        "a_u16": pl.UInt16,
+        "b_u32": pl.UInt32,
+        "b_u64": pl.UInt64,
+        "a_u64": pl.UInt64,
         "a_char": pl.String,
         "b_varchar": pl.String,
         "c_blob": pl.Binary,
         "c_bytes": pl.Binary,
         "c_varbinary": pl.Binary,
         "d_charvar": pl.String,
+        "e_bool": pl.Boolean,
+        "e_boolean": pl.Boolean,
     }
-    assert res.rows() == [
-        (1.0, 1.0, 1, 1, 1, 1, "1", "1.1", b"a", b"a", b"a", "true"),
-        (2.0, 2.0, 2, 2, 2, 0, "2", "2.2", b"b", b"b", b"b", "false"),
-        (3.0, 3.0, 3, 3, 3, 1, "3", "3.3", b"c", b"c", b"c", "true"),
-        (4.0, 4.0, 4, 4, 4, 0, "4", "4.4", b"d", b"d", b"d", "false"),
-        (5.0, 5.0, 5, 5, 5, 1, "5", "5.5", b"e", b"e", b"e", "true"),
+    assert res.select(cs.by_dtype(pl.Float32)).rows() == pytest.approx(
+        [
+            (1.0, 1.100000023841858, -1.0),
+            (2.0, 2.200000047683716, 0.0),
+            (3.0, 3.299999952316284, None),
+            (4.0, 4.400000095367432, 1.0),
+            (5.0, 5.5, 2.0),
+        ]
+    )
+    assert res.select(cs.by_dtype(pl.Float64)).rows() == [
+        (1.0, 1.1, -1.0),
+        (2.0, 2.2, 0.0),
+        (3.0, 3.3, None),
+        (4.0, 4.4, 1.0),
+        (5.0, 5.5, 2.0),
+    ]
+    assert res.select(cs.integer()).rows() == [
+        (1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1),
+        (2, 2, 2, 0, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2),
+        (3, 3, 3, 1, 3, 3, 3, 3, 3, 1, 3, 3, 3, 3),
+        (4, 4, 4, 0, 4, 4, 4, 4, 4, 0, 4, 4, 4, 4),
+        (5, 5, 5, 1, 5, 5, 5, 5, 5, 1, 5, 5, 5, 5),
+    ]
+    assert res.select(cs.string()).rows() == [
+        ("1", "1.1", "true"),
+        ("2", "2.2", "false"),
+        ("3", "3.3", "true"),
+        ("4", "4.4", "false"),
+        ("5", "5.5", "true"),
+    ]
+    assert res.select(cs.binary()).rows() == [
+        (b"a", b"a", b"a"),
+        (b"b", b"b", b"b"),
+        (b"c", b"c", b"c"),
+        (b"d", b"d", b"d"),
+        (b"e", b"e", b"e"),
+    ]
+    assert res.select(cs.boolean()).rows() == [
+        (True, True),
+        (False, False),
+        (None, None),
+        (True, True),
+        (True, True),
     ]
 
     with pytest.raises(ComputeError, match="unsupported use of FORMAT in CAST"):
@@ -68,6 +147,24 @@ def test_cast() -> None:
         )
 
 
+@pytest.mark.parametrize(
+    ("values", "cast_op", "error"),
+    [
+        ([1.0, -1.0], "values::uint8", "conversion from `f64` to `u64` failed"),
+        ([10, 0, -1], "values::uint4", "conversion from `i64` to `u32` failed"),
+        ([int(1e8)], "values::int1", "conversion from `i64` to `i8` failed"),
+        (["a", "b"], "values::date", "conversion from `str` to `date` failed"),
+        (["a", "b"], "values::time", "conversion from `str` to `time` failed"),
+        (["a", "b"], "values::int4", "conversion from `str` to `i32` failed"),
+    ],
+)
+def test_cast_errors(values: Any, cast_op: str, error: str) -> None:
+    df = pl.DataFrame({"values": values})
+
+    with pytest.raises(ComputeError, match=error):
+        df.sql(f"SELECT {cast_op} FROM df")
+
+
 def test_cast_json() -> None:
     df = pl.DataFrame({"txt": ['{"a":[1,2,3],"b":["x","y","z"],"c":5.0}']})