Skip to content

Commit

Permalink
init orc-cast-date
Browse files Browse the repository at this point in the history
Signed-off-by: sinkinben <sinkinben@outlook.com>
  • Loading branch information
sinkinben committed Aug 23, 2022
1 parent 341cfea commit c293a11
Show file tree
Hide file tree
Showing 2 changed files with 75 additions and 0 deletions.
41 changes: 41 additions & 0 deletions integration_tests/src/main/python/orc_cast_date_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
# Copyright (c) 2020-2022, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import pytest

from asserts import assert_gpu_and_cpu_are_equal_collect, assert_gpu_and_cpu_error
from data_gen import *
from pyspark.sql.types import *
from spark_session import with_cpu_session


@pytest.mark.parametrize('to_type', ['timestamp', 'string'])
def test_casting_from_date(spark_tmp_path, to_type):
orc_path = spark_tmp_path + '/orc_casting_from_date'

# This case is failed
# data_gen = [('date_column', DateGen(end=date(1582, 1, 1)))]
# length = 10

# This case is fine
data_gen = [('date_column', DateGen(start=date(1582,10,15)))]
length = 2048

with_cpu_session(
func=lambda spark: gen_df(spark, data_gen, length=length).write.orc(orc_path),
)

assert_gpu_and_cpu_are_equal_collect(
func=lambda spark: spark.read.schema("date_column {}".format(to_type)).orc(orc_path),
)
34 changes: 34 additions & 0 deletions sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuOrcScan.scala
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.Expression
import org.apache.spark.sql.catalyst.util.CaseInsensitiveMap
import org.apache.spark.sql.catalyst.util.RebaseDateTime
import org.apache.spark.sql.connector.read.{InputPartition, PartitionReader, PartitionReaderFactory}
import org.apache.spark.sql.execution.QueryExecutionException
import org.apache.spark.sql.execution.datasources.{PartitionedFile, PartitioningAwareFileIndex}
Expand Down Expand Up @@ -200,6 +201,10 @@ object GpuOrcScan extends Arm {
if (fromDt == toDt) {
return col
}
println("-----------")
println(fromDt.toString)
println(toDt.toString)
println("-----------")
(fromDt, toDt) match {
// integral to integral
case (DType.BOOL8 | DType.INT8 | DType.INT16 | DType.INT32 | DType.INT64,
Expand All @@ -211,6 +216,28 @@ object GpuOrcScan extends Arm {
} else {
downCastAnyInteger(col, toDt)
}

// 'date' type in ORC is actually 'DType.TIMESTAMP_DAYS'
// date -> timestamp
// 'col' is in Proleptic Gregorian calendar, we need to cast it into Julian calendar
case (DType.TIMESTAMP_DAYS, DType.TIMESTAMP_MICROSECONDS) =>
println(col.max().toString())
val gregorianToJulian = RebaseDateTime.rebaseGregorianToJulianDays(col.max().getInt)
val julianToGregorian = RebaseDateTime.rebaseJulianToGregorianDays(col.max().getInt)
printf("gregorianToJulian: %d \n", gregorianToJulian)
printf("julianToGregorian: %d \n", julianToGregorian)
withResource(col.castTo(DType.INT64)) { longVec =>
println(longVec.max().toString())
withResource(Scalar.fromLong(24 * 60 * 60 * 1000 * 1000L)) { toMicroSeconds =>
withResource(longVec.mul(toMicroSeconds)) { microSeconds =>
microSeconds.castTo(DType.TIMESTAMP_MICROSECONDS)
}
}
}
// date -> string
case (DType.TIMESTAMP_DAYS, DType.STRING) =>
col.asStrings()

// TODO more types, tracked in https://github.com/NVIDIA/spark-rapids/issues/5895
case (f, t) =>
throw new QueryExecutionException(s"Unsupported type casting: $f -> $t")
Expand Down Expand Up @@ -239,6 +266,13 @@ object GpuOrcScan extends Arm {
}
case VARCHAR =>
to.getCategory == STRING

case DATE =>
to.getCategory match {
case STRING | TIMESTAMP => true
case _ => false
}

// TODO more types, tracked in https://github.com/NVIDIA/spark-rapids/issues/5895
case _ =>
false
Expand Down

0 comments on commit c293a11

Please sign in to comment.