diff --git a/python/docs/source/migration_guide/pyspark_3.2_to_3.3.rst b/python/docs/source/migration_guide/pyspark_3.2_to_3.3.rst
index f2701d4fb7216..d81008d63cbe9 100644
--- a/python/docs/source/migration_guide/pyspark_3.2_to_3.3.rst
+++ b/python/docs/source/migration_guide/pyspark_3.2_to_3.3.rst
@@ -23,3 +23,4 @@ Upgrading from PySpark 3.2 to 3.3
 * In Spark 3.3, the ``pyspark.pandas.sql`` method follows [the standard Python string formatter](https://docs.python.org/3/library/string.html#format-string-syntax). To restore the previous behavior, set ``PYSPARK_PANDAS_SQL_LEGACY`` environment variable to ``1``.
 * In Spark 3.3, the ``drop`` method of pandas API on Spark DataFrame supports dropping rows by ``index``, and sets dropping by index instead of column by default.
 * In Spark 3.3, PySpark upgrades Pandas version, the new minimum required version changes from 0.23.2 to 1.0.5.
+* In Spark 3.3, the ``repr`` return values of SQL DataTypes have been changed to yield an object with the same value when passed to ``eval``.
diff --git a/python/pyspark/ml/functions.py b/python/pyspark/ml/functions.py
index aecafb3d494df..9725d4b033bd8 100644
--- a/python/pyspark/ml/functions.py
+++ b/python/pyspark/ml/functions.py
@@ -58,11 +58,11 @@ def vector_to_array(col: Column, dtype: str = "float64") -> Column:
     [Row(vec=[1.0, 2.0, 3.0], oldVec=[10.0, 20.0, 30.0]),
     Row(vec=[2.0, 0.0, 3.0], oldVec=[20.0, 0.0, 30.0])]
     >>> df1.schema.fields
-    [StructField(vec,ArrayType(DoubleType,false),false),
-    StructField(oldVec,ArrayType(DoubleType,false),false)]
+    [StructField('vec', ArrayType(DoubleType(), False), False),
+    StructField('oldVec', ArrayType(DoubleType(), False), False)]
     >>> df2.schema.fields
-    [StructField(vec,ArrayType(FloatType,false),false),
-    StructField(oldVec,ArrayType(FloatType,false),false)]
+    [StructField('vec', ArrayType(FloatType(), False), False),
+    StructField('oldVec', ArrayType(FloatType(), False), False)]
     """
     sc = SparkContext._active_spark_context
     assert sc is not None and sc._jvm is not None
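
The ml/functions.py doctest churn above is purely cosmetic fallout from the new eval-able reprs. As a quick illustration of the round-trip property this patch introduces (an untested sketch, not part of the patch):

    >>> from pyspark.sql.types import ArrayType, DoubleType, StructField
    >>> field = StructField('vec', ArrayType(DoubleType(), False), False)
    >>> field
    StructField('vec', ArrayType(DoubleType(), False), False)
    >>> eval(repr(field)) == field
    True
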
diff --git a/python/pyspark/pandas/extensions.py b/python/pyspark/pandas/extensions.py
index 69f742541a599..eeb02f06a85e9 100644
--- a/python/pyspark/pandas/extensions.py
+++ b/python/pyspark/pandas/extensions.py
@@ -109,7 +109,7 @@ def __init__(self, pandas_on_spark_obj):
         ...
         Traceback (most recent call last):
             ...
-        ValueError: Cannot call DatetimeMethods on type StringType
+        ValueError: Cannot call DatetimeMethods on type StringType()

     Note: This function is not meant to be used directly - instead, use
     register_dataframe_accessor, register_series_accessor, or register_index_accessor.
@@ -169,7 +169,7 @@ def register_dataframe_accessor(name: str) -> Callable[[Type[T]], Type[T]]:
         ...
         Traceback (most recent call last):
             ...
-        ValueError: Cannot call DatetimeMethods on type StringType
+        ValueError: Cannot call DatetimeMethods on type StringType()

     Examples
     --------
@@ -250,7 +250,7 @@ def __init__(self, pandas_on_spark_obj):
         ...
         Traceback (most recent call last):
             ...
-        ValueError: Cannot call DatetimeMethods on type StringType
+        ValueError: Cannot call DatetimeMethods on type StringType()

     Examples
     --------
@@ -322,7 +322,7 @@ def __init__(self, pandas_on_spark_obj):
         ...
         Traceback (most recent call last):
             ...
-        ValueError: Cannot call DatetimeMethods on type StringType
+        ValueError: Cannot call DatetimeMethods on type StringType()

     Examples
     --------
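
The error-message updates in extensions.py follow mechanically: the ValueError interpolates the Spark type with %s, and since DataType defines no separate __str__, str() falls back to the new __repr__. A minimal sketch:

    >>> from pyspark.sql.types import StringType
    >>> "Cannot call DatetimeMethods on type %s" % StringType()
    'Cannot call DatetimeMethods on type StringType()'
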
diff --git a/python/pyspark/pandas/internal.py b/python/pyspark/pandas/internal.py
index ffc86ba4c6134..f79f0ada73a8d 100644
--- a/python/pyspark/pandas/internal.py
+++ b/python/pyspark/pandas/internal.py
@@ -206,7 +206,7 @@ def __eq__(self, other: Any) -> bool:
         )

     def __repr__(self) -> str:
-        return "InternalField(dtype={dtype},struct_field={struct_field})".format(
+        return "InternalField(dtype={dtype}, struct_field={struct_field})".format(
             dtype=self.dtype, struct_field=self.struct_field
         )

@@ -293,13 +293,13 @@ class InternalFrame:
     >>> internal.index_names
     [None]
     >>> internal.data_fields  # doctest: +NORMALIZE_WHITESPACE
-    [InternalField(dtype=int64,struct_field=StructField(A,LongType,false)),
-     InternalField(dtype=int64,struct_field=StructField(B,LongType,false)),
-     InternalField(dtype=int64,struct_field=StructField(C,LongType,false)),
-     InternalField(dtype=int64,struct_field=StructField(D,LongType,false)),
-     InternalField(dtype=int64,struct_field=StructField(E,LongType,false))]
+    [InternalField(dtype=int64, struct_field=StructField('A', LongType(), False)),
+     InternalField(dtype=int64, struct_field=StructField('B', LongType(), False)),
+     InternalField(dtype=int64, struct_field=StructField('C', LongType(), False)),
+     InternalField(dtype=int64, struct_field=StructField('D', LongType(), False)),
+     InternalField(dtype=int64, struct_field=StructField('E', LongType(), False))]
     >>> internal.index_fields
-    [InternalField(dtype=int64,struct_field=StructField(__index_level_0__,LongType,false))]
+    [InternalField(dtype=int64, struct_field=StructField('__index_level_0__', LongType(), False))]
     >>> internal.to_internal_spark_frame.show()  # doctest: +NORMALIZE_WHITESPACE
     +-----------------+---+---+---+---+---+
     |__index_level_0__|  A|  B|  C|  D|  E|
@@ -355,13 +355,13 @@ class InternalFrame:
     ['A', 'B', 'C', 'D', 'E']
     >>> internal.index_names
     [('A',)]
-    >>> internal.data_fields
-    [InternalField(dtype=int64,struct_field=StructField(B,LongType,false)),
-     InternalField(dtype=int64,struct_field=StructField(C,LongType,false)),
-     InternalField(dtype=int64,struct_field=StructField(D,LongType,false)),
-     InternalField(dtype=int64,struct_field=StructField(E,LongType,false))]
+    >>> internal.data_fields  # doctest: +NORMALIZE_WHITESPACE
+    [InternalField(dtype=int64, struct_field=StructField('B', LongType(), False)),
+     InternalField(dtype=int64, struct_field=StructField('C', LongType(), False)),
+     InternalField(dtype=int64, struct_field=StructField('D', LongType(), False)),
+     InternalField(dtype=int64, struct_field=StructField('E', LongType(), False))]
     >>> internal.index_fields
-    [InternalField(dtype=int64,struct_field=StructField(A,LongType,false))]
+    [InternalField(dtype=int64, struct_field=StructField('A', LongType(), False))]
     >>> internal.to_internal_spark_frame.show()  # doctest: +NORMALIZE_WHITESPACE
     +---+---+---+---+---+
     |  A|  B|  C|  D|  E|
@@ -419,13 +419,13 @@ class InternalFrame:
     >>> internal.index_names
     [None, ('A',)]
     >>> internal.data_fields  # doctest: +NORMALIZE_WHITESPACE
-    [InternalField(dtype=int64,struct_field=StructField(B,LongType,false)),
-     InternalField(dtype=int64,struct_field=StructField(C,LongType,false)),
-     InternalField(dtype=int64,struct_field=StructField(D,LongType,false)),
-     InternalField(dtype=int64,struct_field=StructField(E,LongType,false))]
+    [InternalField(dtype=int64, struct_field=StructField('B', LongType(), False)),
+     InternalField(dtype=int64, struct_field=StructField('C', LongType(), False)),
+     InternalField(dtype=int64, struct_field=StructField('D', LongType(), False)),
+     InternalField(dtype=int64, struct_field=StructField('E', LongType(), False))]
     >>> internal.index_fields  # doctest: +NORMALIZE_WHITESPACE
-    [InternalField(dtype=int64,struct_field=StructField(__index_level_0__,LongType,false)),
-     InternalField(dtype=int64,struct_field=StructField(A,LongType,false))]
+    [InternalField(dtype=int64, struct_field=StructField('__index_level_0__', LongType(), False)),
+     InternalField(dtype=int64, struct_field=StructField('A', LongType(), False))]
     >>> internal.to_internal_spark_frame.show()  # doctest: +NORMALIZE_WHITESPACE
     +-----------------+---+---+---+---+---+
     |__index_level_0__|  A|  B|  C|  D|  E|
@@ -508,9 +508,9 @@ class InternalFrame:
     >>> internal.index_names
     [('A',)]
     >>> internal.data_fields
-    [InternalField(dtype=int64,struct_field=StructField(B,LongType,false))]
+    [InternalField(dtype=int64, struct_field=StructField('B', LongType(), False))]
     >>> internal.index_fields
-    [InternalField(dtype=int64,struct_field=StructField(A,LongType,false))]
+    [InternalField(dtype=int64, struct_field=StructField('A', LongType(), False))]
     >>> internal.to_internal_spark_frame.show()  # doctest: +NORMALIZE_WHITESPACE
     +---+---+
     |  A|  B|
@@ -596,9 +596,12 @@ def __init__(
     [('row_index_a',), ('row_index_b',), ('a', 'x')]

     >>> internal.index_fields  # doctest: +NORMALIZE_WHITESPACE
-    [InternalField(dtype=object,struct_field=StructField(__index_level_0__,StringType,false)),
-     InternalField(dtype=object,struct_field=StructField(__index_level_1__,StringType,false)),
-     InternalField(dtype=int64,struct_field=StructField((a, x),LongType,false))]
+    [InternalField(dtype=object,
+                   struct_field=StructField('__index_level_0__', StringType(), False)),
+     InternalField(dtype=object,
+                   struct_field=StructField('__index_level_1__', StringType(), False)),
+     InternalField(dtype=int64,
+                   struct_field=StructField('(a, x)', LongType(), False))]

     >>> internal.column_labels
     [('a', 'y'), ('b', 'z')]
@@ -607,8 +610,8 @@ def __init__(
     [Column<'(a, y)'>, Column<'(b, z)'>]

     >>> internal.data_fields  # doctest: +NORMALIZE_WHITESPACE
-    [InternalField(dtype=int64,struct_field=StructField((a, y),LongType,false)),
-     InternalField(dtype=int64,struct_field=StructField((b, z),LongType,false))]
+    [InternalField(dtype=int64, struct_field=StructField('(a, y)', LongType(), False)),
+     InternalField(dtype=int64, struct_field=StructField('(b, z)', LongType(), False))]

     >>> internal.column_label_names
     [('column_labels_a',), ('column_labels_b',)]
@@ -1505,13 +1508,14 @@ def prepare_pandas_frame(
     2  30  c  1
     >>> index_columns
     ['__index_level_0__']
-    >>> index_fields
-    [InternalField(dtype=int64,struct_field=StructField(__index_level_0__,LongType,false))]
+    >>> index_fields  # doctest: +NORMALIZE_WHITESPACE
+    [InternalField(dtype=int64, struct_field=StructField('__index_level_0__',
+        LongType(), False))]
     >>> data_columns
     ['(x, a)', '(y, b)']
     >>> data_fields  # doctest: +NORMALIZE_WHITESPACE
-    [InternalField(dtype=object,struct_field=StructField((x, a),StringType,false)),
-     InternalField(dtype=category,struct_field=StructField((y, b),ByteType,false))]
+    [InternalField(dtype=object, struct_field=StructField('(x, a)', StringType(), False)),
+     InternalField(dtype=category, struct_field=StructField('(y, b)', ByteType(), False))]

     >>> import datetime
     >>> pdf = pd.DataFrame({
@@ -1521,9 +1525,11 @@ def prepare_pandas_frame(
     >>> _, _, _, _, data_fields = (
     ...     InternalFrame.prepare_pandas_frame(pdf, prefer_timestamp_ntz=True)
     ... )
-    >>> data_fields
-    [InternalField(dtype=datetime64[ns],struct_field=StructField(dt,TimestampNTZType,false)),
-     InternalField(dtype=object,struct_field=StructField(dt_obj,TimestampNTZType,false))]
+    >>> data_fields  # doctest: +NORMALIZE_WHITESPACE
+    [InternalField(dtype=datetime64[ns],
+        struct_field=StructField('dt', TimestampNTZType(), False)),
+     InternalField(dtype=object,
+        struct_field=StructField('dt_obj', TimestampNTZType(), False))]

     >>> pdf = pd.DataFrame({
     ...     "td": [datetime.timedelta(0)], "td_obj": [datetime.timedelta(0)]
@@ -1533,8 +1539,10 @@ def prepare_pandas_frame(
     ...     InternalFrame.prepare_pandas_frame(pdf)
     ... )
     >>> data_fields  # doctest: +NORMALIZE_WHITESPACE
-    [InternalField(dtype=timedelta64[ns],struct_field=StructField(td,DayTimeIntervalType(0,3),false)),
-     InternalField(dtype=object,struct_field=StructField(td_obj,DayTimeIntervalType(0,3),false))]
+    [InternalField(dtype=timedelta64[ns],
+        struct_field=StructField('td', DayTimeIntervalType(0, 3), False)),
+     InternalField(dtype=object,
+        struct_field=StructField('td_obj', DayTimeIntervalType(0, 3), False))]
     """

     pdf = pdf.copy()
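
Many internal.py doctests also gain # doctest: +NORMALIZE_WHITESPACE because the new reprs are longer and must wrap. The InternalField repr itself only picks up a space after the comma; a hedged sketch (InternalField is a private pandas-on-Spark helper, and the constructor call below is an assumption about its signature):

    >>> import numpy as np
    >>> from pyspark.sql.types import LongType, StructField
    >>> from pyspark.pandas.internal import InternalField
    >>> InternalField(np.dtype('int64'), StructField('A', LongType(), False))
    InternalField(dtype=int64, struct_field=StructField('A', LongType(), False))
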
diff --git a/python/pyspark/pandas/spark/utils.py b/python/pyspark/pandas/spark/utils.py
index 0940a5e508071..9b8b5bb7542ab 100644
--- a/python/pyspark/pandas/spark/utils.py
+++ b/python/pyspark/pandas/spark/utils.py
@@ -52,7 +52,7 @@ def as_nullable_spark_type(dt: DataType) -> DataType:
     >>> as_nullable_spark_type(StructType([
     ...     StructField("A", IntegerType(), True),
     ...     StructField("B", FloatType(), False)]))  # doctest: +NORMALIZE_WHITESPACE
-    StructType(List(StructField(A,IntegerType,true),StructField(B,FloatType,true)))
+    StructType([StructField('A', IntegerType(), True), StructField('B', FloatType(), True)])

     >>> as_nullable_spark_type(StructType([
     ...     StructField("A",
@@ -62,9 +62,12 @@ def as_nullable_spark_type(dt: DataType) -> DataType:
     ...             ArrayType(IntegerType(), False), False), False),
     ...             StructField('b', StringType(), True)])),
     ...     StructField("B", FloatType(), False)]))  # doctest: +NORMALIZE_WHITESPACE
-    StructType(List(StructField(A,StructType(List(StructField(a,MapType(IntegerType,ArrayType\
-(IntegerType,true),true),true),StructField(b,StringType,true))),true),\
-StructField(B,FloatType,true)))
+    StructType([StructField('A',
+        StructType([StructField('a',
+            MapType(IntegerType(),
+                ArrayType(IntegerType(), True), True), True),
+            StructField('b', StringType(), True)]), True),
+        StructField('B', FloatType(), True)])
     """
     if isinstance(dt, StructType):
         new_fields = []
@@ -132,7 +135,8 @@ def force_decimal_precision_scale(
     >>> force_decimal_precision_scale(StructType([
     ...     StructField("A", DecimalType(10, 0), True),
     ...     StructField("B", DecimalType(14, 7), False)]))  # doctest: +NORMALIZE_WHITESPACE
-    StructType(List(StructField(A,DecimalType(38,18),true),StructField(B,DecimalType(38,18),false)))
+    StructType([StructField('A', DecimalType(38,18), True),
+        StructField('B', DecimalType(38,18), False)])

     >>> force_decimal_precision_scale(StructType([
     ...     StructField("A",
@@ -143,9 +147,12 @@ def force_decimal_precision_scale(
     ...         StructField('b', StringType(), True)])),
     ...     StructField("B", DecimalType(30, 15), False)]),
     ...     precision=30, scale=15)  # doctest: +NORMALIZE_WHITESPACE
-    StructType(List(StructField(A,StructType(List(StructField(a,MapType(DecimalType(30,15),\
-ArrayType(DecimalType(30,15),false),false),false),StructField(b,StringType,true))),true),\
-StructField(B,DecimalType(30,15),false)))
+    StructType([StructField('A',
+        StructType([StructField('a',
+            MapType(DecimalType(30,15),
+                ArrayType(DecimalType(30,15), False), False), False),
+            StructField('b', StringType(), True)]), True),
+        StructField('B', DecimalType(30,15), False)])
     """
     if isinstance(dt, StructType):
         new_fields = []
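
Both helpers in spark/utils.py rebuild pure DataType trees, so only their expected doctest output changes. Condensed from the doctest above (a sketch, assuming the same imports as the module's docstring):

    >>> from pyspark.sql.types import FloatType, StructField, StructType
    >>> from pyspark.pandas.spark.utils import as_nullable_spark_type
    >>> as_nullable_spark_type(StructType([StructField('B', FloatType(), False)]))
    StructType([StructField('B', FloatType(), True)])
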
diff --git a/python/pyspark/pandas/tests/test_groupby.py b/python/pyspark/pandas/tests/test_groupby.py
index 661526b160050..ec17e0dba2799 100644
--- a/python/pyspark/pandas/tests/test_groupby.py
+++ b/python/pyspark/pandas/tests/test_groupby.py
@@ -2227,7 +2227,7 @@ def udf(col) -> int:
         with self.assertRaisesRegex(
             TypeError,
             "Expected the return type of this function to be of Series type, "
-            "but found type ScalarType\\[LongType\\]",
+            "but found type ScalarType\\[LongType\\(\\)\\]",
         ):
             psdf.groupby("a").transform(udf)

diff --git a/python/pyspark/pandas/tests/test_series.py b/python/pyspark/pandas/tests/test_series.py
index 4cfd7c63e312d..fc78bcf4cd436 100644
--- a/python/pyspark/pandas/tests/test_series.py
+++ b/python/pyspark/pandas/tests/test_series.py
@@ -2985,7 +2985,7 @@ def udf(col) -> ps.Series[int]:
         with self.assertRaisesRegex(
             ValueError,
             r"Expected the return type of this function to be of scalar type, "
-            r"but found type SeriesType\[LongType\]",
+            r"but found type SeriesType\[LongType\(\)\]",
         ):
             psser.apply(udf)

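Both test fixes above exist because '(' and ')' are regex metacharacters: once the message renders 'LongType()', an unescaped pattern would treat the parentheses as an empty group. re.escape produces the same escaping automatically; a sketch:

    >>> import re
    >>> re.escape('SeriesType[LongType()]')
    'SeriesType\\[LongType\\(\\)\\]'
    >>> bool(re.search(re.escape('SeriesType[LongType()]'),
    ...                'but found type SeriesType[LongType()]'))
    True
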
diff --git a/python/pyspark/pandas/typedef/typehints.py b/python/pyspark/pandas/typedef/typehints.py
index 695ed31af6f42..8a32a14b64e72 100644
--- a/python/pyspark/pandas/typedef/typehints.py
+++ b/python/pyspark/pandas/typedef/typehints.py
@@ -319,17 +319,17 @@ def pandas_on_spark_type(tpe: Union[str, type, Dtype]) -> Tuple[Dtype, types.Dat
     Examples
     --------
     >>> pandas_on_spark_type(int)
-    (dtype('int64'), LongType)
+    (dtype('int64'), LongType())
     >>> pandas_on_spark_type(str)
-    (dtype('<U'), StringType)
+    (dtype('<U'), StringType())
     >>> pandas_on_spark_type(datetime.date)
-    (dtype('O'), DateType)
+    (dtype('O'), DateType())
     >>> pandas_on_spark_type(datetime.datetime)
-    (dtype('<M8[ns]'), TimestampType)
+    (dtype('<M8[ns]'), TimestampType())
     >>> pandas_on_spark_type(datetime.timedelta)
-    (dtype('<m8[ns]'), DayTimeIntervalType(0,3))
+    (dtype('<m8[ns]'), DayTimeIntervalType(0, 3))
     >>> pandas_on_spark_type(List[bool])
-    (dtype('O'), ArrayType(BooleanType,true))
+    (dtype('O'), ArrayType(BooleanType(), True))
     """
     try:
         dtype = pandas_dtype(tpe)
@@ -381,7 +381,7 @@ def infer_return_type(f: Callable) -> Union[SeriesType, DataFrameType, ScalarTyp
     >>> inferred.dtype
     dtype('int64')
     >>> inferred.spark_type
-    LongType
+    LongType()

     >>> def func() -> ps.Series[int]:
     ...     pass
@@ -389,7 +389,7 @@ def infer_return_type(f: Callable) -> Union[SeriesType, DataFrameType, ScalarTyp
     >>> inferred.dtype
     dtype('int64')
     >>> inferred.spark_type
-    LongType
+    LongType()

     >>> def func() -> ps.DataFrame[np.float, str]:
     ...     pass
@@ -397,7 +397,7 @@ def infer_return_type(f: Callable) -> Union[SeriesType, DataFrameType, ScalarTyp
     >>> inferred.dtypes
     [dtype('float64'), dtype('<U')]
     >>> inferred.spark_type
-    StructType(List(StructField(c0,DoubleType,true),StructField(c1,StringType,true)))
+    StructType([StructField('c0', DoubleType(), True), StructField('c1', StringType(), True)])

     >>> def func() -> ps.DataFrame[np.float]:
     ...     pass
@@ -405,7 +405,7 @@ def infer_return_type(f: Callable) -> Union[SeriesType, DataFrameType, ScalarTyp
     >>> inferred.dtypes
     [dtype('float64')]
     >>> inferred.spark_type
-    StructType(List(StructField(c0,DoubleType,true)))
+    StructType([StructField('c0', DoubleType(), True)])

     >>> def func() -> 'int':
     ...     pass
@@ -413,7 +413,7 @@ def infer_return_type(f: Callable) -> Union[SeriesType, DataFrameType, ScalarTyp
     >>> inferred.dtype
     dtype('int64')
     >>> inferred.spark_type
-    LongType
+    LongType()

     >>> def func() -> 'ps.Series[int]':
     ...     pass
@@ -421,7 +421,7 @@ def infer_return_type(f: Callable) -> Union[SeriesType, DataFrameType, ScalarTyp
     >>> inferred.dtype
     dtype('int64')
     >>> inferred.spark_type
-    LongType
+    LongType()

     >>> def func() -> 'ps.DataFrame[np.float, str]':
     ...     pass
@@ -429,7 +429,7 @@ def infer_return_type(f: Callable) -> Union[SeriesType, DataFrameType, ScalarTyp
     >>> inferred.dtypes
     [dtype('float64'), dtype('<U')]
     >>> inferred.spark_type
-    StructType(List(StructField(c0,DoubleType,true),StructField(c1,StringType,true)))
+    StructType([StructField('c0', DoubleType(), True), StructField('c1', StringType(), True)])

     >>> def func() -> 'ps.DataFrame[np.float]':
     ...     pass
@@ -437,7 +437,7 @@ def infer_return_type(f: Callable) -> Union[SeriesType, DataFrameType, ScalarTyp
     >>> inferred.dtypes
     [dtype('float64')]
     >>> inferred.spark_type
-    StructType(List(StructField(c0,DoubleType,true)))
+    StructType([StructField('c0', DoubleType(), True)])

     >>> def func() -> ps.DataFrame['a': np.float, 'b': int]:
     ...     pass
@@ -445,7 +445,7 @@ def infer_return_type(f: Callable) -> Union[SeriesType, DataFrameType, ScalarTyp
     >>> inferred.dtypes
     [dtype('float64'), dtype('int64')]
     >>> inferred.spark_type
-    StructType(List(StructField(a,DoubleType,true),StructField(b,LongType,true)))
+    StructType([StructField('a', DoubleType(), True), StructField('b', LongType(), True)])

     >>> def func() -> "ps.DataFrame['a': np.float, 'b': int]":
     ...     pass
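
The two annotation spellings above behave identically; the distinction worth noting is between unnamed element types (which yield synthetic c0, c1 column names) and the 'name': type slice form (which keeps the given names). A condensed comparison, assuming infer_return_type is importable from pyspark.pandas.typedef as in these doctests:

    >>> import numpy as np
    >>> import pyspark.pandas as ps
    >>> from pyspark.pandas.typedef import infer_return_type
    >>> def unnamed() -> ps.DataFrame[np.float, str]: pass
    >>> def named() -> ps.DataFrame['a': np.float, 'b': int]: pass
    >>> [field.name for field in infer_return_type(unnamed).spark_type]
    ['c0', 'c1']
    >>> [field.name for field in infer_return_type(named).spark_type]
    ['a', 'b']
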
@@ -453,7 +453,7 @@ def infer_return_type(f: Callable) -> Union[SeriesType, DataFrameType, ScalarTyp
     >>> inferred.dtypes
     [dtype('float64'), dtype('int64')]
     >>> inferred.spark_type
-    StructType(List(StructField(a,DoubleType,true),StructField(b,LongType,true)))
+    StructType([StructField('a', DoubleType(), True), StructField('b', LongType(), True)])

     >>> pdf = pd.DataFrame({"a": [1, 2, 3], "b": [3, 4, 5]})
     >>> def func() -> ps.DataFrame[pdf.dtypes]:
@@ -462,7 +462,7 @@ def infer_return_type(f: Callable) -> Union[SeriesType, DataFrameType, ScalarTyp
     >>> inferred.dtypes
     [dtype('int64'), dtype('int64')]
     >>> inferred.spark_type
-    StructType(List(StructField(c0,LongType,true),StructField(c1,LongType,true)))
+    StructType([StructField('c0', LongType(), True), StructField('c1', LongType(), True)])

     >>> pdf = pd.DataFrame({"a": [1, 2, 3], "b": [3, 4, 5]})
     >>> def func() -> ps.DataFrame[zip(pdf.columns, pdf.dtypes)]:
@@ -471,7 +471,7 @@ def infer_return_type(f: Callable) -> Union[SeriesType, DataFrameType, ScalarTyp
     >>> inferred.dtypes
     [dtype('int64'), dtype('int64')]
     >>> inferred.spark_type
-    StructType(List(StructField(a,LongType,true),StructField(b,LongType,true)))
+    StructType([StructField('a', LongType(), True), StructField('b', LongType(), True)])

     >>> pdf = pd.DataFrame({("x", "a"): [1, 2, 3], ("y", "b"): [3, 4, 5]})
     >>> def func() -> ps.DataFrame[zip(pdf.columns, pdf.dtypes)]:
@@ -480,7 +480,7 @@ def infer_return_type(f: Callable) -> Union[SeriesType, DataFrameType, ScalarTyp
     >>> inferred.dtypes
     [dtype('int64'), dtype('int64')]
     >>> inferred.spark_type
-    StructType(List(StructField((x, a),LongType,true),StructField((y, b),LongType,true)))
+    StructType([StructField('(x, a)', LongType(), True), StructField('(y, b)', LongType(), True)])

     >>> pdf = pd.DataFrame({"a": [1, 2, 3], "b": pd.Categorical([3, 4, 5])})
     >>> def func() -> ps.DataFrame[pdf.dtypes]:
@@ -489,7 +489,7 @@ def infer_return_type(f: Callable) -> Union[SeriesType, DataFrameType, ScalarTyp
     >>> inferred.dtypes
     [dtype('int64'), CategoricalDtype(categories=[3, 4, 5], ordered=False)]
     >>> inferred.spark_type
-    StructType(List(StructField(c0,LongType,true),StructField(c1,LongType,true)))
+    StructType([StructField('c0', LongType(), True), StructField('c1', LongType(), True)])

     >>> def func() -> ps.DataFrame[zip(pdf.columns, pdf.dtypes)]:
     ...     pass
@@ -497,7 +497,7 @@ def infer_return_type(f: Callable) -> Union[SeriesType, DataFrameType, ScalarTyp
     >>> inferred.dtypes
     [dtype('int64'), CategoricalDtype(categories=[3, 4, 5], ordered=False)]
     >>> inferred.spark_type
-    StructType(List(StructField(a,LongType,true),StructField(b,LongType,true)))
+    StructType([StructField('a', LongType(), True), StructField('b', LongType(), True)])

     >>> def func() -> ps.Series[pdf.b.dtype]:
     ...     pass
@@ -505,7 +505,7 @@ def infer_return_type(f: Callable) -> Union[SeriesType, DataFrameType, ScalarTyp
     >>> inferred.dtype
     CategoricalDtype(categories=[3, 4, 5], ordered=False)
     >>> inferred.spark_type
-    LongType
+    LongType()

     >>> def func() -> ps.DataFrame[int, [int, int]]:
     ...     pass
@@ -515,7 +515,7 @@ def infer_return_type(f: Callable) -> Union[SeriesType, DataFrameType, ScalarTyp
     >>> inferred.spark_type.simpleString()
     'struct<__index_level_0__:bigint,c0:bigint,c1:bigint>'
     >>> inferred.index_fields
-    [InternalField(dtype=int64,struct_field=StructField(__index_level_0__,LongType,true))]
+    [InternalField(dtype=int64, struct_field=StructField('__index_level_0__', LongType(), True))]

     >>> def func() -> ps.DataFrame[pdf.index.dtype, pdf.dtypes]:
     ...     pass
@@ -525,7 +525,7 @@ def infer_return_type(f: Callable) -> Union[SeriesType, DataFrameType, ScalarTyp
     >>> inferred.spark_type.simpleString()
     'struct<__index_level_0__:bigint,c0:bigint,c1:bigint>'
     >>> inferred.index_fields
-    [InternalField(dtype=int64,struct_field=StructField(__index_level_0__,LongType,true))]
+    [InternalField(dtype=int64, struct_field=StructField('__index_level_0__', LongType(), True))]

     >>> def func() -> ps.DataFrame[
     ...     ("index", CategoricalDtype(categories=[3, 4, 5], ordered=False)),
@@ -537,7 +537,7 @@ def infer_return_type(f: Callable) -> Union[SeriesType, DataFrameType, ScalarTyp
     >>> inferred.spark_type.simpleString()
     'struct<index:bigint,id:bigint,A:bigint>'
     >>> inferred.index_fields
-    [InternalField(dtype=category,struct_field=StructField(index,LongType,true))]
+    [InternalField(dtype=category, struct_field=StructField('index', LongType(), True))]

     >>> def func() -> ps.DataFrame[
     ...     (pdf.index.name, pdf.index.dtype), zip(pdf.columns, pdf.dtypes)]:
@@ -548,7 +548,7 @@ def infer_return_type(f: Callable) -> Union[SeriesType, DataFrameType, ScalarTyp
     >>> inferred.spark_type.simpleString()
     'struct<__index_level_0__:bigint,a:bigint,b:bigint>'
     >>> inferred.index_fields
-    [InternalField(dtype=int64,struct_field=StructField(__index_level_0__,LongType,true))]
+    [InternalField(dtype=int64, struct_field=StructField('__index_level_0__', LongType(), True))]
     """
     # We should re-import to make sure the class 'SeriesType' is not treated as a class
     # within this module locally. See Series.__class_getitem__ which imports this class
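
DayTimeIntervalType is the only parameterized repr touched here (the comma now gets a space), and it still satisfies the eval round-trip that motivates the patch. A sketch:

    >>> import datetime
    >>> from pyspark.sql.types import DayTimeIntervalType
    >>> from pyspark.pandas.typedef.typehints import pandas_on_spark_type
    >>> _, spark_type = pandas_on_spark_type(datetime.timedelta)
    >>> spark_type
    DayTimeIntervalType(0, 3)
    >>> eval(repr(spark_type)) == spark_type
    True
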
diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py
index c5de9fb79571f..ea30e5bc88839 100644
--- a/python/pyspark/sql/dataframe.py
+++ b/python/pyspark/sql/dataframe.py
@@ -364,7 +364,8 @@ def schema(self) -> StructType:
         Examples
         --------
         >>> df.schema
-        StructType(List(StructField(age,IntegerType,true),StructField(name,StringType,true)))
+        StructType([StructField('age', IntegerType(), True),
+                    StructField('name', StringType(), True)])
         """
         if self._schema is None:
             try:
diff --git a/python/pyspark/sql/tests/test_dataframe.py b/python/pyspark/sql/tests/test_dataframe.py
index 5f5e88fd46deb..be5e1d9a6e5dc 100644
--- a/python/pyspark/sql/tests/test_dataframe.py
+++ b/python/pyspark/sql/tests/test_dataframe.py
@@ -627,7 +627,7 @@ def test_toDF_with_schema_string(self):
         # field types mismatch will cause exception at runtime.
         self.assertRaisesRegex(
             Exception,
-            "FloatType can not accept",
+            "FloatType\\(\\) can not accept",
             lambda: rdd.toDF("key: float, value: string").collect(),
         )

diff --git a/python/pyspark/sql/tests/test_types.py b/python/pyspark/sql/tests/test_types.py
index 9ae6c3a63457e..d9ad2344ac5db 100644
--- a/python/pyspark/sql/tests/test_types.py
+++ b/python/pyspark/sql/tests/test_types.py
@@ -949,6 +949,29 @@ def assertCollectSuccess(typecode, value):
                 a = array.array(t)
                 self.spark.createDataFrame([Row(myarray=a)]).collect()

+    def test_repr(self):
+        instances = [
+            NullType(),
+            StringType(),
+            BinaryType(),
+            BooleanType(),
+            DateType(),
+            TimestampType(),
+            DecimalType(),
+            DoubleType(),
+            FloatType(),
+            ByteType(),
+            IntegerType(),
+            LongType(),
+            ShortType(),
+            ArrayType(StringType()),
+            MapType(StringType(), IntegerType()),
+            StructField("f1", StringType(), True),
+            StructType([StructField("f1", StringType(), True)]),
+        ]
+        for instance in instances:
+            self.assertEqual(eval(repr(instance)), instance)
+
     def test_daytime_interval_type_constructor(self):
         # SPARK-37277: Test constructors in day time interval.
         self.assertEqual(DayTimeIntervalType().simpleString(), "interval day to second")
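
The new test_repr asserts the round-trip directly; note that eval needs the type names in scope, which the test module gets from its pyspark.sql.types imports. Outside such a module, the namespace can be passed explicitly (a sketch):

    >>> from pyspark.sql import types
    >>> t = types.MapType(types.StringType(), types.IntegerType())
    >>> eval(repr(t), vars(types)) == t
    True
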
diff --git a/python/pyspark/sql/types.py b/python/pyspark/sql/types.py
index 41db22b054049..23e54eb8889d9 100644
--- a/python/pyspark/sql/types.py
+++ b/python/pyspark/sql/types.py
@@ -80,7 +80,7 @@ class DataType:
     """Base class for data types."""

     def __repr__(self) -> str:
-        return self.__class__.__name__
+        return self.__class__.__name__ + "()"

     def __hash__(self) -> int:
         return hash(str(self))
@@ -364,7 +364,7 @@ def _str_repr(self) -> str:
     jsonValue = _str_repr

     def __repr__(self) -> str:
-        return "%s(%d,%d)" % (type(self).__name__, self.startField, self.endField)
+        return "%s(%d, %d)" % (type(self).__name__, self.startField, self.endField)

     def needConversion(self) -> bool:
         return True
@@ -415,7 +415,7 @@ def simpleString(self) -> str:
         return "array<%s>" % self.elementType.simpleString()

     def __repr__(self) -> str:
-        return "ArrayType(%s,%s)" % (self.elementType, str(self.containsNull).lower())
+        return "ArrayType(%s, %s)" % (self.elementType, str(self.containsNull))

     def jsonValue(self) -> Dict[str, Any]:
         return {
@@ -485,11 +485,7 @@ def simpleString(self) -> str:
         return "map<%s,%s>" % (self.keyType.simpleString(), self.valueType.simpleString())

     def __repr__(self) -> str:
-        return "MapType(%s,%s,%s)" % (
-            self.keyType,
-            self.valueType,
-            str(self.valueContainsNull).lower(),
-        )
+        return "MapType(%s, %s, %s)" % (self.keyType, self.valueType, str(self.valueContainsNull))

     def jsonValue(self) -> Dict[str, Any]:
         return {
@@ -570,7 +566,7 @@ def simpleString(self) -> str:
         return "%s:%s" % (self.name, self.dataType.simpleString())

     def __repr__(self) -> str:
-        return "StructField(%s,%s,%s)" % (self.name, self.dataType, str(self.nullable).lower())
+        return "StructField('%s', %s, %s)" % (self.name, self.dataType, str(self.nullable))

     def jsonValue(self) -> Dict[str, Any]:
         return {
@@ -616,9 +612,9 @@ class StructType(DataType):
     --------
     >>> struct1 = StructType([StructField("f1", StringType(), True)])
     >>> struct1["f1"]
-    StructField(f1,StringType,true)
+    StructField('f1', StringType(), True)
     >>> struct1[0]
-    StructField(f1,StringType,true)
+    StructField('f1', StringType(), True)

     >>> struct1 = StructType([StructField("f1", StringType(), True)])
     >>> struct2 = StructType([StructField("f1", StringType(), True)])
@@ -753,7 +749,7 @@ def simpleString(self) -> str:
         return "struct<%s>" % (",".join(f.simpleString() for f in self))

     def __repr__(self) -> str:
-        return "StructType(List(%s))" % ",".join(str(field) for field in self)
+        return "StructType([%s])" % ", ".join(str(field) for field in self)

     def jsonValue(self) -> Dict[str, Any]:
         return {"type": self.typeName(), "fields": [f.jsonValue() for f in self]}
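
Taken together, the __repr__ changes compose recursively: a nested schema now prints as the exact constructor expression instead of the old StructType(List(...)) rendering. For contrast (a sketch; the Spark 3.2 form is the one shown in the removed lines above):

    >>> from pyspark.sql.types import IntegerType, StringType, StructField, StructType
    >>> schema = StructType([StructField('age', IntegerType(), True),
    ...                      StructField('name', StringType(), True)])
    >>> schema  # Spark 3.2 printed: StructType(List(StructField(age,IntegerType,true),...))
    StructType([StructField('age', IntegerType(), True), StructField('name', StringType(), True)])
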
@@ -979,17 +975,17 @@ def _parse_datatype_string(s: str) -> DataType:
     Examples
     --------
     >>> _parse_datatype_string("int ")
-    IntegerType
+    IntegerType()
     >>> _parse_datatype_string("INT ")
-    IntegerType
+    IntegerType()
     >>> _parse_datatype_string("a: byte, b: decimal( 16 , 8 ) ")
-    StructType(List(StructField(a,ByteType,true),StructField(b,DecimalType(16,8),true)))
+    StructType([StructField('a', ByteType(), True), StructField('b', DecimalType(16,8), True)])
     >>> _parse_datatype_string("a DOUBLE, b STRING")
-    StructType(List(StructField(a,DoubleType,true),StructField(b,StringType,true)))
+    StructType([StructField('a', DoubleType(), True), StructField('b', StringType(), True)])
     >>> _parse_datatype_string("a: array< short>")
-    StructType(List(StructField(a,ArrayType(ShortType,true),true)))
+    StructType([StructField('a', ArrayType(ShortType(), True), True)])
     >>> _parse_datatype_string(" map<string , string > ")
-    MapType(StringType,StringType,true)
+    MapType(StringType(), StringType(), True)

     >>> # Error cases
     >>> _parse_datatype_string("blabla") # doctest: +IGNORE_EXCEPTION_DETAIL