From 177240ade0e280f2c9345131d00abaf598a87318 Mon Sep 17 00:00:00 2001
From: flynn
Date: Mon, 18 Oct 2021 22:18:14 -0400
Subject: [PATCH 01/15] make sql type reprs eval-able

---
 python/pyspark/sql/tests/test_types.py | 23 +++++++++++++++++++++++
 python/pyspark/sql/types.py            | 18 +++++++++---------
 2 files changed, 32 insertions(+), 9 deletions(-)

diff --git a/python/pyspark/sql/tests/test_types.py b/python/pyspark/sql/tests/test_types.py
index 1dbddf7c30896..38736ab1c6cba 100644
--- a/python/pyspark/sql/tests/test_types.py
+++ b/python/pyspark/sql/tests/test_types.py
@@ -764,6 +764,29 @@ def assertCollectSuccess(typecode, value):
                 a = array.array(t)
                 self.spark.createDataFrame([Row(myarray=a)]).collect()
 
+    def test_repr(self):
+        instances = [
+            NullType(),
+            StringType(),
+            BinaryType(),
+            BooleanType(),
+            DateType(),
+            TimestampType(),
+            DecimalType(),
+            DoubleType(),
+            FloatType(),
+            ByteType(),
+            IntegerType(),
+            LongType(),
+            ShortType(),
+            ArrayType(StringType()),
+            MapType(StringType(), IntegerType()),
+            StructField("f1", StringType(), True),
+            StructType([StructField("f1", StringType(), True)]),
+        ]
+        for instance in instances:
+            self.assertEqual(eval(repr(instance)), instance)
+
 
 class DataTypeTests(unittest.TestCase):
     # regression test for SPARK-6055
diff --git a/python/pyspark/sql/types.py b/python/pyspark/sql/types.py
index 69ec96e14fd9b..b443b0468ac4e 100644
--- a/python/pyspark/sql/types.py
+++ b/python/pyspark/sql/types.py
@@ -60,7 +60,7 @@ class DataType(object):
     """Base class for data types."""
 
     def __repr__(self) -> str:
-        return self.__class__.__name__
+        return self.__class__.__name__ + "()"
 
     def __hash__(self) -> int:
         return hash(str(self))
@@ -336,8 +336,8 @@ def simpleString(self) -> str:
         return 'array<%s>' % self.elementType.simpleString()
 
     def __repr__(self) -> str:
-        return "ArrayType(%s,%s)" % (self.elementType,
-                                     str(self.containsNull).lower())
+        return "ArrayType(%s, %s)" % (self.elementType,
+                                     str(self.containsNull))
 
     def jsonValue(self) -> Dict[str, Any]:
         return {"type": self.typeName(),
@@ -404,8 +404,8 @@ def simpleString(self) -> str:
         return 'map<%s,%s>' % (self.keyType.simpleString(), self.valueType.simpleString())
 
     def __repr__(self) -> str:
-        return "MapType(%s,%s,%s)" % (self.keyType, self.valueType,
-                                      str(self.valueContainsNull).lower())
+        return "MapType(%s, %s, %s)" % (self.keyType, self.valueType,
+                                      str(self.valueContainsNull))
 
     def jsonValue(self) -> Dict[str, Any]:
         return {"type": self.typeName(),
@@ -478,8 +478,8 @@ def simpleString(self) -> str:
         return '%s:%s' % (self.name, self.dataType.simpleString())
 
     def __repr__(self) -> str:
-        return "StructField(%s,%s,%s)" % (self.name, self.dataType,
-                                          str(self.nullable).lower())
+        return "StructField('%s', %s, %s)" % (self.name, self.dataType,
+                                          str(self.nullable))
 
     def jsonValue(self) -> Dict[str, Any]:
         return {"name": self.name,
@@ -656,8 +656,8 @@ def simpleString(self) -> str:
         return 'struct<%s>' % (','.join(f.simpleString() for f in self))
 
     def __repr__(self) -> str:
-        return ("StructType(List(%s))" %
-                ",".join(str(field) for field in self))
+        return ("StructType([%s])" %
+                ", ".join(str(field) for field in self))
 
     def jsonValue(self) -> Dict[str, Any]:
         return {"type": self.typeName(),

From d473f3198ec24c3b3710ea6f20af8ed5887e3433 Mon Sep 17 00:00:00 2001
From: flynn
Date: Tue, 19 Oct 2021 12:06:52 -0400
Subject: [PATCH 02/15] fix indenting

---
 python/pyspark/sql/types.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/python/pyspark/sql/types.py b/python/pyspark/sql/types.py
index b443b0468ac4e..c2f09db076625 100644
--- a/python/pyspark/sql/types.py
+++ b/python/pyspark/sql/types.py
@@ -337,7 +337,7 @@ def simpleString(self) -> str:
 
     def __repr__(self) -> str:
         return "ArrayType(%s, %s)" % (self.elementType,
-                                     str(self.containsNull))
+                                      str(self.containsNull))
 
     def jsonValue(self) -> Dict[str, Any]:
         return {"type": self.typeName(),
@@ -405,7 +405,7 @@ def simpleString(self) -> str:
 
     def __repr__(self) -> str:
         return "MapType(%s, %s, %s)" % (self.keyType, self.valueType,
-                                      str(self.valueContainsNull))
+                                        str(self.valueContainsNull))
 
     def jsonValue(self) -> Dict[str, Any]:
         return {"type": self.typeName(),
@@ -479,7 +479,7 @@ def simpleString(self) -> str:
 
     def __repr__(self) -> str:
         return "StructField('%s', %s, %s)" % (self.name, self.dataType,
-                                          str(self.nullable))
+                                              str(self.nullable))
 
     def jsonValue(self) -> Dict[str, Any]:
         return {"name": self.name,

From d7109515fd6837a89998b3b2ec3570670fe73958 Mon Sep 17 00:00:00 2001
From: flynn
Date: Tue, 19 Oct 2021 13:37:20 -0400
Subject: [PATCH 03/15] fix dataframe test

---
 python/pyspark/sql/tests/test_dataframe.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/pyspark/sql/tests/test_dataframe.py b/python/pyspark/sql/tests/test_dataframe.py
index 6b3a6a23f2f17..1ab0d92c0dc7e 100644
--- a/python/pyspark/sql/tests/test_dataframe.py
+++ b/python/pyspark/sql/tests/test_dataframe.py
@@ -498,7 +498,7 @@ def test_toDF_with_schema_string(self):
                           lambda: rdd.toDF("key: int").collect())
 
         # field types mismatch will cause exception at runtime.
-        self.assertRaisesRegex(Exception, "FloatType can not accept",
+        self.assertRaisesRegex(Exception, r"FloatType\(\) can not accept",
                                lambda: rdd.toDF("key: float, value: string").collect())
 
         # flat schema values will be wrapped into row.

From 622739f6f0170be68879dfd77bc6654411149cc9 Mon Sep 17 00:00:00 2001
From: flynn
Date: Tue, 19 Oct 2021 15:42:14 -0400
Subject: [PATCH 04/15] fix doctests

---
 python/pyspark/sql/types.py | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/python/pyspark/sql/types.py b/python/pyspark/sql/types.py
index c2f09db076625..fbaa871a65858 100644
--- a/python/pyspark/sql/types.py
+++ b/python/pyspark/sql/types.py
@@ -521,9 +521,9 @@ class StructType(DataType):
     --------
     >>> struct1 = StructType([StructField("f1", StringType(), True)])
     >>> struct1["f1"]
-    StructField(f1,StringType,true)
+    StructField('f1', StringType(), True)
     >>> struct1[0]
-    StructField(f1,StringType,true)
+    StructField('f1', StringType(), True)
 
     >>> struct1 = StructType([StructField("f1", StringType(), True)])
     >>> struct2 = StructType([StructField("f1", StringType(), True)])
@@ -862,17 +862,17 @@ def _parse_datatype_string(s: str) -> DataType:
     Examples
    --------
     >>> _parse_datatype_string("int ")
-    IntegerType
+    IntegerType()
     >>> _parse_datatype_string("INT ")
-    IntegerType
+    IntegerType()
     >>> _parse_datatype_string("a: byte, b: decimal(  16 , 8   ) ")
-    StructType(List(StructField(a,ByteType,true),StructField(b,DecimalType(16,8),true)))
+    StructType([StructField('a', ByteType(), True), StructField('b', DecimalType(16,8), True)])
     >>> _parse_datatype_string("a DOUBLE, b STRING")
-    StructType(List(StructField(a,DoubleType,true),StructField(b,StringType,true)))
+    StructType([StructField('a', DoubleType(), True), StructField('b', StringType(), True)])
     >>> _parse_datatype_string("a: array< short>")
-    StructType(List(StructField(a,ArrayType(ShortType,true),true)))
+    StructType([StructField('a', ArrayType(ShortType(), True), True)])
     >>> _parse_datatype_string(" map<string , string > ")
-    MapType(StringType,StringType,true)
+    MapType(StringType(), StringType(), True)
 
     >>> # Error cases
     >>> _parse_datatype_string("blabla") # doctest: +IGNORE_EXCEPTION_DETAIL

From 392d7511df4200d73b7c769602fc42fb6d1dcaf6 Mon Sep 17 00:00:00 2001
From: flynn
Date: Tue, 19 Oct 2021 16:51:39 -0400
Subject: [PATCH 05/15] fix more doctests

---
 python/pyspark/ml/functions.py              | 8 ++++----
 python/pyspark/pandas/extensions.py         | 8 ++++----
 python/pyspark/pandas/tests/test_groupby.py | 2 +-
 python/pyspark/sql/dataframe.py             | 2 +-
 python/pyspark/sql/tests/test_dataframe.py  | 2 +-
 5 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/python/pyspark/ml/functions.py b/python/pyspark/ml/functions.py
index 1eadbd694210e..fda00a11df7dc 100644
--- a/python/pyspark/ml/functions.py
+++ b/python/pyspark/ml/functions.py
@@ -58,11 +58,11 @@ def vector_to_array(col, dtype="float64"):
     [Row(vec=[1.0, 2.0, 3.0], oldVec=[10.0, 20.0, 30.0]),
      Row(vec=[2.0, 0.0, 3.0], oldVec=[20.0, 0.0, 30.0])]
     >>> df1.schema.fields
-    [StructField(vec,ArrayType(DoubleType,false),false),
-    StructField(oldVec,ArrayType(DoubleType,false),false)]
+    [StructField('vec', ArrayType(DoubleType(), False), False),
+    StructField('oldVec', ArrayType(DoubleType(), False), False)]
     >>> df2.schema.fields
-    [StructField(vec,ArrayType(FloatType,false),false),
-    StructField(oldVec,ArrayType(FloatType,false),false)]
+    [StructField('vec', ArrayType(FloatType(), False), False),
+    StructField('oldVec', ArrayType(FloatType(), False), False)]
     """
     sc = SparkContext._active_spark_context
     return Column(
diff --git a/python/pyspark/pandas/extensions.py b/python/pyspark/pandas/extensions.py
index 69f742541a599..eeb02f06a85e9 100644
--- a/python/pyspark/pandas/extensions.py
+++ b/python/pyspark/pandas/extensions.py
@@ -109,7 +109,7 @@ def __init__(self, pandas_on_spark_obj):
         ...
         Traceback (most recent call last):
             ...
-        ValueError: Cannot call DatetimeMethods on type StringType
+        ValueError: Cannot call DatetimeMethods on type StringType()
 
     Note: This function is not meant to be used directly - instead, use
    register_dataframe_accessor, register_series_accessor, or register_index_accessor.
@@ -169,7 +169,7 @@ def register_dataframe_accessor(name: str) -> Callable[[Type[T]], Type[T]]:
         ...
         Traceback (most recent call last):
             ...
-        ValueError: Cannot call DatetimeMethods on type StringType
+        ValueError: Cannot call DatetimeMethods on type StringType()
 
     Examples
     --------
@@ -250,7 +250,7 @@ def __init__(self, pandas_on_spark_obj):
         ...
         Traceback (most recent call last):
             ...
-        ValueError: Cannot call DatetimeMethods on type StringType
+        ValueError: Cannot call DatetimeMethods on type StringType()
 
     Examples
     --------
@@ -322,7 +322,7 @@ def __init__(self, pandas_on_spark_obj):
         ...
         Traceback (most recent call last):
             ...
-        ValueError: Cannot call DatetimeMethods on type StringType
+        ValueError: Cannot call DatetimeMethods on type StringType()
 
     Examples
     --------
diff --git a/python/pyspark/pandas/tests/test_groupby.py b/python/pyspark/pandas/tests/test_groupby.py
index 4e1c0d0674f27..dda7afa060204 100644
--- a/python/pyspark/pandas/tests/test_groupby.py
+++ b/python/pyspark/pandas/tests/test_groupby.py
@@ -2225,7 +2225,7 @@ def udf(col) -> int:
         with self.assertRaisesRegex(
             TypeError,
             "Expected the return type of this function to be of Series type, "
-            "but found type ScalarType\\[LongType\\]",
+            "but found type ScalarType\\[LongType\\(\\)\\]",
         ):
             psdf.groupby("a").transform(udf)
 
diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py
index 90311c47ea104..a14eda7c11ee8 100644
--- a/python/pyspark/sql/dataframe.py
+++ b/python/pyspark/sql/dataframe.py
@@ -306,7 +306,7 @@ def schema(self) -> StructType:
         Examples
         --------
         >>> df.schema
-        StructType(List(StructField(age,IntegerType,true),StructField(name,StringType,true)))
+        StructType([StructField('age', IntegerType(), True), StructField('name', StringType(), True)])
         """
         if self._schema is None:
             try:
diff --git a/python/pyspark/sql/tests/test_dataframe.py b/python/pyspark/sql/tests/test_dataframe.py
index 1ab0d92c0dc7e..4061ca3235492 100644
--- a/python/pyspark/sql/tests/test_dataframe.py
+++ b/python/pyspark/sql/tests/test_dataframe.py
@@ -498,7 +498,7 @@ def test_toDF_with_schema_string(self):
                           lambda: rdd.toDF("key: int").collect())
 
         # field types mismatch will cause exception at runtime.
-        self.assertRaisesRegex(Exception, r"FloatType\(\) can not accept",
+        self.assertRaisesRegex(Exception, "FloatType\\(\\) can not accept",
                                lambda: rdd.toDF("key: float, value: string").collect())
 
         # flat schema values will be wrapped into row.
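
With patches 01-05 the repr change itself is complete; everything that follows is doctest, lint, and formatting fallout. The round trip the new test_repr asserts can be tried directly from an interpreter. A minimal sketch in the spirit of that test (assumes a PySpark build with the patches above applied; no SparkSession is needed, since these are plain type objects):

    >>> from pyspark.sql.types import (
    ...     ArrayType, IntegerType, MapType, StringType, StructField, StructType
    ... )
    >>> nested = StructType([
    ...     StructField("tags", ArrayType(StringType()), True),
    ...     StructField("counts", MapType(StringType(), IntegerType()), True),
    ... ])
    >>> repr(nested)  # quoted field names, Python booleans, trailing ()
    "StructType([StructField('tags', ArrayType(StringType(), True), True), StructField('counts', MapType(StringType(), IntegerType(), True), True)])"
    >>> eval(repr(nested)) == nested
    True

The eval works because every name the repr emits is a public pyspark.sql.types class, which is exactly the contract the series establishes.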
From 34e7a449438d376248d65885f817195a163406b0 Mon Sep 17 00:00:00 2001
From: flynn
Date: Tue, 19 Oct 2021 17:38:29 -0400
Subject: [PATCH 06/15] fix lint err

---
 python/pyspark/sql/dataframe.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py
index a14eda7c11ee8..cba5c4855a1ae 100644
--- a/python/pyspark/sql/dataframe.py
+++ b/python/pyspark/sql/dataframe.py
@@ -306,7 +306,8 @@ def schema(self) -> StructType:
         Examples
         --------
         >>> df.schema
-        StructType([StructField('age', IntegerType(), True), StructField('name', StringType(), True)])
+        StructType([StructField('age', IntegerType(), True),
+        StructField('name', StringType(), True)])
         """
         if self._schema is None:
             try:

From b99254aaa35d64426b8677964bd69f05c97468ae Mon Sep 17 00:00:00 2001
From: flynn
Date: Tue, 19 Oct 2021 18:03:40 -0400
Subject: [PATCH 07/15] fix more doctests

---
 python/pyspark/pandas/internal.py          | 58 +++++++++++-----------
 python/pyspark/pandas/tests/test_series.py |  2 +-
 2 files changed, 30 insertions(+), 30 deletions(-)

diff --git a/python/pyspark/pandas/internal.py b/python/pyspark/pandas/internal.py
index 5cb21a78726c1..bcfb77a3bc4d2 100644
--- a/python/pyspark/pandas/internal.py
+++ b/python/pyspark/pandas/internal.py
@@ -293,13 +293,13 @@ class InternalFrame(object):
     >>> internal.index_names
     [None]
     >>> internal.data_fields  # doctest: +NORMALIZE_WHITESPACE
-    [InternalField(dtype=int64,struct_field=StructField(A,LongType,false)),
-     InternalField(dtype=int64,struct_field=StructField(B,LongType,false)),
-     InternalField(dtype=int64,struct_field=StructField(C,LongType,false)),
-     InternalField(dtype=int64,struct_field=StructField(D,LongType,false)),
-     InternalField(dtype=int64,struct_field=StructField(E,LongType,false))]
+    [InternalField(dtype=int64,struct_field=StructField('A', LongType(), False)),
+     InternalField(dtype=int64,struct_field=StructField('B', LongType(), False)),
+     InternalField(dtype=int64,struct_field=StructField('C', LongType(), False)),
+     InternalField(dtype=int64,struct_field=StructField('D', LongType(), False)),
+     InternalField(dtype=int64,struct_field=StructField('E', LongType(), False))]
     >>> internal.index_fields
-    [InternalField(dtype=int64,struct_field=StructField(__index_level_0__,LongType,false))]
+    [InternalField(dtype=int64,struct_field=StructField('__index_level_0__', LongType(), False))]
     >>> internal.to_internal_spark_frame.show()  # doctest: +NORMALIZE_WHITESPACE
     +-----------------+---+---+---+---+---+
     |__index_level_0__|  A|  B|  C|  D|  E|
     +-----------------+---+---+---+---+---+
@@ -356,12 +356,12 @@ class InternalFrame(object):
     >>> internal.index_names
     [('A',)]
     >>> internal.data_fields
-    [InternalField(dtype=int64,struct_field=StructField(B,LongType,false)),
-     InternalField(dtype=int64,struct_field=StructField(C,LongType,false)),
-     InternalField(dtype=int64,struct_field=StructField(D,LongType,false)),
-     InternalField(dtype=int64,struct_field=StructField(E,LongType,false))]
+    [InternalField(dtype=int64,struct_field=StructField('B', LongType(), False)),
+     InternalField(dtype=int64,struct_field=StructField('C', LongType(), False)),
+     InternalField(dtype=int64,struct_field=StructField('D', LongType(), False)),
+     InternalField(dtype=int64,struct_field=StructField('E', LongType(), False))]
     >>> internal.index_fields
-    [InternalField(dtype=int64,struct_field=StructField(A,LongType,false))]
+    [InternalField(dtype=int64,struct_field=StructField('A', LongType(), False))]
     >>> internal.to_internal_spark_frame.show()  # doctest: +NORMALIZE_WHITESPACE
     +---+---+---+---+---+
     |  A|  B|  C|  D|  E|
     +---+---+---+---+---+
@@ -419,13 +419,13 @@ class InternalFrame(object):
     >>> internal.index_names
     [None, ('A',)]
     >>> internal.data_fields  # doctest: +NORMALIZE_WHITESPACE
-    [InternalField(dtype=int64,struct_field=StructField(B,LongType,false)),
-     InternalField(dtype=int64,struct_field=StructField(C,LongType,false)),
-     InternalField(dtype=int64,struct_field=StructField(D,LongType,false)),
-     InternalField(dtype=int64,struct_field=StructField(E,LongType,false))]
+    [InternalField(dtype=int64,struct_field=StructField('B', LongType(), False)),
+     InternalField(dtype=int64,struct_field=StructField('C', LongType(), False)),
+     InternalField(dtype=int64,struct_field=StructField('D', LongType(), False)),
+     InternalField(dtype=int64,struct_field=StructField('E', LongType(), False))]
     >>> internal.index_fields  # doctest: +NORMALIZE_WHITESPACE
-    [InternalField(dtype=int64,struct_field=StructField(__index_level_0__,LongType,false)),
-     InternalField(dtype=int64,struct_field=StructField(A,LongType,false))]
+    [InternalField(dtype=int64,struct_field=StructField('__index_level_0__', LongType(), False)),
+     InternalField(dtype=int64,struct_field=StructField('A', LongType(), False))]
     >>> internal.to_internal_spark_frame.show()  # doctest: +NORMALIZE_WHITESPACE
     +-----------------+---+---+---+---+---+
     |__index_level_0__|  A|  B|  C|  D|  E|
     +-----------------+---+---+---+---+---+
@@ -508,9 +508,9 @@ class InternalFrame(object):
     >>> internal.index_names
     [('A',)]
     >>> internal.data_fields
-    [InternalField(dtype=int64,struct_field=StructField(B,LongType,false))]
+    [InternalField(dtype=int64,struct_field=StructField('B', LongType(), False))]
     >>> internal.index_fields
-    [InternalField(dtype=int64,struct_field=StructField(A,LongType,false))]
+    [InternalField(dtype=int64,struct_field=StructField('A', LongType(), False))]
     >>> internal.to_internal_spark_frame.show()  # doctest: +NORMALIZE_WHITESPACE
     +---+---+
     |  A|  B|
     +---+---+
@@ -596,9 +596,9 @@ def __init__(
     [('row_index_a',), ('row_index_b',), ('a', 'x')]
 
     >>> internal.index_fields  # doctest: +NORMALIZE_WHITESPACE
-    [InternalField(dtype=object,struct_field=StructField(__index_level_0__,StringType,false)),
-     InternalField(dtype=object,struct_field=StructField(__index_level_1__,StringType,false)),
-     InternalField(dtype=int64,struct_field=StructField((a, x),LongType,false))]
+    [InternalField(dtype=object,struct_field=StructField('__index_level_0__', StringType(), False)),
+     InternalField(dtype=object,struct_field=StructField('__index_level_1__', StringType(), False)),
+     InternalField(dtype=int64,struct_field=StructField('(a, x)', LongType(), False))]
 
     >>> internal.column_labels
     [('a', 'y'), ('b', 'z')]
@@ -607,8 +607,8 @@ def __init__(
     [Column<'(a, y)'>, Column<'(b, z)'>]
 
     >>> internal.data_fields  # doctest: +NORMALIZE_WHITESPACE
-    [InternalField(dtype=int64,struct_field=StructField((a, y),LongType,false)),
-     InternalField(dtype=int64,struct_field=StructField((b, z),LongType,false))]
+    [InternalField(dtype=int64,struct_field=StructField('(a, y)', LongType(), False)),
+     InternalField(dtype=int64,struct_field=StructField('(b, z)', LongType(), False))]
 
     >>> internal.column_label_names
     [('column_labels_a',), ('column_labels_b',)]
@@ -1506,12 +1506,12 @@ def prepare_pandas_frame(
     >>> index_columns
     ['__index_level_0__']
     >>> index_fields
-    [InternalField(dtype=int64,struct_field=StructField(__index_level_0__,LongType,false))]
+    [InternalField(dtype=int64,struct_field=StructField('__index_level_0__', LongType(), False))]
     >>> data_columns
     ['(x, a)', '(y, b)']
     >>> data_fields  # doctest: +NORMALIZE_WHITESPACE
-    [InternalField(dtype=object,struct_field=StructField((x, a),StringType,false)),
-     InternalField(dtype=category,struct_field=StructField((y, b),ByteType,false))]
+    [InternalField(dtype=object,struct_field=StructField('(x, a)', StringType(), False)),
+     InternalField(dtype=category,struct_field=StructField('(y, b)', ByteType(), False))]
 
     >>> import datetime
     >>> pdf = pd.DataFrame({
     ...     "dt": [datetime.datetime(1970, 1, 1)], "dt_obj": [datetime.datetime(1970, 1, 1)]
     ... })
     >>> _, _, _, _, data_fields = (
     ...     InternalFrame.prepare_pandas_frame(pdf, prefer_timestamp_ntz=True)
     ... )
     >>> data_fields
-    [InternalField(dtype=datetime64[ns],struct_field=StructField(dt,TimestampNTZType,false)),
-     InternalField(dtype=object,struct_field=StructField(dt_obj,TimestampNTZType,false))]
+    [InternalField(dtype=datetime64[ns],struct_field=StructField('dt', TimestampNTZType(), False)),
+     InternalField(dtype=object,struct_field=StructField('dt_obj', TimestampNTZType(), False))]
     """
     pdf = pdf.copy()
diff --git a/python/pyspark/pandas/tests/test_series.py b/python/pyspark/pandas/tests/test_series.py
index 72677d18e4b88..e2429306b77a9 100644
--- a/python/pyspark/pandas/tests/test_series.py
+++ b/python/pyspark/pandas/tests/test_series.py
@@ -3010,7 +3010,7 @@ def udf(col) -> ps.Series[int]:
         with self.assertRaisesRegex(
             ValueError,
             r"Expected the return type of this function to be of scalar type, "
-            r"but found type SeriesType\[LongType\]",
+            r"but found type SeriesType\[LongType\(\)\]",
         ):
             psser.apply(udf)

From f6f3cfb786aba3a8be35f31d89d0004df26bfa93 Mon Sep 17 00:00:00 2001
From: flynn
Date: Tue, 19 Oct 2021 19:18:41 -0400
Subject: [PATCH 08/15] doctest fixes

---
 python/pyspark/pandas/internal.py    | 20 +++++++++++++------
 python/pyspark/pandas/spark/utils.py | 29 ++++++++++++++++++++--------
 2 files changed, 35 insertions(+), 14 deletions(-)

diff --git a/python/pyspark/pandas/internal.py b/python/pyspark/pandas/internal.py
index bcfb77a3bc4d2..d5df0da2c785e 100644
--- a/python/pyspark/pandas/internal.py
+++ b/python/pyspark/pandas/internal.py
@@ -596,8 +596,12 @@ def __init__(
     [('row_index_a',), ('row_index_b',), ('a', 'x')]
 
     >>> internal.index_fields  # doctest: +NORMALIZE_WHITESPACE
-    [InternalField(dtype=object,struct_field=StructField('__index_level_0__', StringType(), False)),
-     InternalField(dtype=object,struct_field=StructField('__index_level_1__', StringType(), False)),
+    [InternalField(
+        dtype=object,struct_field=StructField('__index_level_0__', StringType(), False)
+    ),
+     InternalField(
+        dtype=object,struct_field=StructField('__index_level_1__', StringType(), False)
+    ),
      InternalField(dtype=int64,struct_field=StructField('(a, x)', LongType(), False))]
 
     >>> internal.column_labels
@@ -1505,8 +1509,10 @@ def prepare_pandas_frame(
     2      30  c      1
     >>> index_columns
     ['__index_level_0__']
-    >>> index_fields
-    [InternalField(dtype=int64,struct_field=StructField('__index_level_0__', LongType(), False))]
+    >>> index_fields  # doctest: +NORMALIZE_WHITESPACE
+    [InternalField(
+        dtype=int64,struct_field=StructField('__index_level_0__', LongType(), False)
+    )]
     >>> data_columns
     ['(x, a)', '(y, b)']
     >>> data_fields  # doctest: +NORMALIZE_WHITESPACE
@@ -1521,8 +1527,10 @@ def prepare_pandas_frame(
     ... })
     >>> _, _, _, _, data_fields = (
     ...     InternalFrame.prepare_pandas_frame(pdf, prefer_timestamp_ntz=True)
     ... )
-    >>> data_fields
-    [InternalField(dtype=datetime64[ns],struct_field=StructField('dt', TimestampNTZType(), False)),
+    >>> data_fields  # doctest: +NORMALIZE_WHITESPACE
+    [InternalField(
+        dtype=datetime64[ns],struct_field=StructField('dt', TimestampNTZType(), False)
+    ),
     InternalField(dtype=object,struct_field=StructField('dt_obj', TimestampNTZType(), False))]
     """
     pdf = pdf.copy()
diff --git a/python/pyspark/pandas/spark/utils.py b/python/pyspark/pandas/spark/utils.py
index 0940a5e508071..3c37cdacad919 100644
--- a/python/pyspark/pandas/spark/utils.py
+++ b/python/pyspark/pandas/spark/utils.py
@@ -52,7 +52,7 @@ def as_nullable_spark_type(dt: DataType) -> DataType:
     >>> as_nullable_spark_type(StructType([
    ...     StructField("A", IntegerType(), True),
     ...     StructField("B", FloatType(), False)]))  # doctest: +NORMALIZE_WHITESPACE
-    StructType(List(StructField(A,IntegerType,true),StructField(B,FloatType,true)))
+    StructType([StructField('A', IntegerType(), True), StructField('B', FloatType(), True)])
 
     >>> as_nullable_spark_type(StructType([
     ...     StructField("A",
     ...         StructType([
     ...             StructField('a',
     ...                 MapType(IntegerType(),
     ...                 ArrayType(IntegerType(), False), False), False),
     ...             StructField('b', StringType(), True)])),
     ...     StructField("B", FloatType(), False)]))  # doctest: +NORMALIZE_WHITESPACE
-    StructType(List(StructField(A,StructType(List(StructField(a,MapType(IntegerType,ArrayType\
-(IntegerType,true),true),true),StructField(b,StringType,true))),true),\
-StructField(B,FloatType,true)))
+    StructType([
+        StructField('A', StructType([
+            StructField('a', MapType(IntegerType(), ArrayType(IntegerType(), True), True), True),
+            StructField('b', StringType(), True)
+        ]), True),
+        StructField('B', FloatType(), True)
+    ])
     """
     if isinstance(dt, StructType):
         new_fields = []
@@ -132,7 +136,9 @@ def force_decimal_precision_scale(
     >>> force_decimal_precision_scale(StructType([
     ...     StructField("A", DecimalType(10, 0), True),
     ...     StructField("B", DecimalType(14, 7), False)]))  # doctest: +NORMALIZE_WHITESPACE
-    StructType(List(StructField(A,DecimalType(38,18),true),StructField(B,DecimalType(38,18),false)))
+    StructType([
+        StructField('A', DecimalType(38,18), True), StructField('B', DecimalType(38,18), False)
+    ])
 
     >>> force_decimal_precision_scale(StructType([
     ...     StructField("A",
     ...         StructType([
     ...             StructField('a',
     ...                 MapType(DecimalType(10, 0),
     ...                 ArrayType(DecimalType(10, 0), False), False), False),
     ...             StructField('b', StringType(), True)])),
     ...     StructField("B", DecimalType(30, 15), False)]),
     ...     precision=30, scale=15)  # doctest: +NORMALIZE_WHITESPACE
-    StructType(List(StructField(A,StructType(List(StructField(a,MapType(DecimalType(30,15),\
-ArrayType(DecimalType(30,15),false),false),false),StructField(b,StringType,true))),true),\
-StructField(B,DecimalType(30,15),false)))
+    StructType([
+        StructField('A', StructType([
+            StructField(
+                'a', MapType(DecimalType(30,15), ArrayType(DecimalType(30,15), False), False
+            ),
+            False
+        ),
+        StructField('b', StringType(), True)]), True),
+        StructField('B', DecimalType(30,15), False)
+    ])
     """
     if isinstance(dt, StructType):
         new_fields = []

From aece8fa9fcade285d19af91a403d7d6e60a3c579 Mon Sep 17 00:00:00 2001
From: flynn
Date: Wed, 20 Oct 2021 21:32:18 -0400
Subject: [PATCH 09/15] fix doctest output

---
 python/pyspark/pandas/internal.py | 27 +++++++++++++--------------
 1 file changed, 13 insertions(+), 14 deletions(-)

diff --git a/python/pyspark/pandas/internal.py b/python/pyspark/pandas/internal.py
index d5df0da2c785e..9398a2b290795 100644
--- a/python/pyspark/pandas/internal.py
+++ b/python/pyspark/pandas/internal.py
@@ -596,13 +596,12 @@ def __init__(
     [('row_index_a',), ('row_index_b',), ('a', 'x')]
 
     >>> internal.index_fields  # doctest: +NORMALIZE_WHITESPACE
-    [InternalField(
-        dtype=object,struct_field=StructField('__index_level_0__', StringType(), False)
-    ),
-     InternalField(
-        dtype=object,struct_field=StructField('__index_level_1__', StringType(), False)
-    ),
-     InternalField(dtype=int64,struct_field=StructField('(a, x)', LongType(), False))]
+    [InternalField(dtype=object,struct_field=StructField('__index_level_0__',
+    StringType(), False)),
+    InternalField(dtype=object,struct_field=StructField('__index_level_1__',
+    StringType(), False)),
+    InternalField(dtype=int64,struct_field=StructField('(a, x)',
+    LongType(), False))]
 
     >>> internal.column_labels
     [('a', 'y'), ('b', 'z')]
@@ -1509,9 +1508,8 @@ def prepare_pandas_frame(
     >>> index_columns
     ['__index_level_0__']
     >>> index_fields  # doctest: +NORMALIZE_WHITESPACE
-    [InternalField(
-        dtype=int64,struct_field=StructField('__index_level_0__', LongType(), False)
-    )]
+    [InternalField(dtype=int64,struct_field=StructField('__index_level_0__',
+    LongType(), False))]
     >>> data_columns
     ['(x, a)', '(y, b)']
     >>> data_fields  # doctest: +NORMALIZE_WHITESPACE
@@ -1526,10 +1524,11 @@ def prepare_pandas_frame(
     ...     InternalFrame.prepare_pandas_frame(pdf, prefer_timestamp_ntz=True)
     ... )
     >>> data_fields  # doctest: +NORMALIZE_WHITESPACE
-    [InternalField(
-        dtype=datetime64[ns],struct_field=StructField('dt', TimestampNTZType(), False)
-    ),
-    InternalField(dtype=object,struct_field=StructField('dt_obj', TimestampNTZType(), False))]
+    [InternalField(dtype=datetime64[ns],struct_field=StructField('dt',
+    TimestampNTZType(), False)),
+    InternalField(dtype=object,struct_field=StructField('dt_obj',
+    TimestampNTZType(), False))]
+
     """
     pdf = pdf.copy()

From a02d86f39b8c166cd399a45088c5a0780a580de6 Mon Sep 17 00:00:00 2001
From: flynn
Date: Wed, 20 Oct 2021 22:39:59 -0400
Subject: [PATCH 10/15] fix pandas doctests

---
 python/pyspark/pandas/spark/utils.py | 40 ++++++++++++++--------------
 1 file changed, 20 insertions(+), 20 deletions(-)

diff --git a/python/pyspark/pandas/spark/utils.py b/python/pyspark/pandas/spark/utils.py
index 3c37cdacad919..fcae47380bd27 100644
--- a/python/pyspark/pandas/spark/utils.py
+++ b/python/pyspark/pandas/spark/utils.py
@@ -62,13 +62,15 @@ def as_nullable_spark_type(dt: DataType) -> DataType:
     ...                 ArrayType(IntegerType(), False), False), False),
     ...             StructField('b', StringType(), True)])),
     ...     StructField("B", FloatType(), False)]))  # doctest: +NORMALIZE_WHITESPACE
-    StructType([
-        StructField('A', StructType([
-            StructField('a', MapType(IntegerType(), ArrayType(IntegerType(), True), True), True),
-            StructField('b', StringType(), True)
-        ]), True),
-        StructField('B', FloatType(), True)
-    ])
+    StructType([StructField('A',
+    StructType([StructField('a',
+    MapType(IntegerType(),
+    ArrayType(IntegerType(),
+    True),
+    True),
+    True),
+    StructField('b', StringType(), True)]), True),
+    StructField('B', FloatType(), True)])
     """
     if isinstance(dt, StructType):
         new_fields = []
@@ -136,9 +138,8 @@ def force_decimal_precision_scale(
     >>> force_decimal_precision_scale(StructType([
     ...     StructField("A", DecimalType(10, 0), True),
     ...     StructField("B", DecimalType(14, 7), False)]))  # doctest: +NORMALIZE_WHITESPACE
-    StructType([
-        StructField('A', DecimalType(38,18), True), StructField('B', DecimalType(38,18), False)
-    ])
+    StructType([StructField('A', DecimalType(38,18), True),
+    StructField('B', DecimalType(38,18), False)])
 
     >>> force_decimal_precision_scale(StructType([
     ...     StructField("A",
     ...         StructType([
     ...             StructField('a',
     ...                 MapType(DecimalType(10, 0),
     ...                 ArrayType(DecimalType(10, 0), False), False), False),
     ...             StructField('b', StringType(), True)])),
     ...     StructField("B", DecimalType(30, 15), False)]),
     ...     precision=30, scale=15)  # doctest: +NORMALIZE_WHITESPACE
-    StructType([
-        StructField('A', StructType([
-            StructField(
-                'a', MapType(DecimalType(30,15), ArrayType(DecimalType(30,15), False), False
-            ),
-            False
-        ),
-        StructField('b', StringType(), True)]), True),
-        StructField('B', DecimalType(30,15), False)
-    ])
+    StructType([StructField('A',
+    StructType([StructField('a',
+    MapType(DecimalType(30,15),
+    ArrayType(DecimalType(30,15),
+    False),
+    False),
+    False),
+    StructField('b', StringType(), True)]), True),
+    StructField('B', DecimalType(30,15), False)])
     """
     if isinstance(dt, StructType):
         new_fields = []

From 4615139d32a1e329f24af4026c8a894452828ba6 Mon Sep 17 00:00:00 2001
From: flynn
Date: Thu, 21 Oct 2021 08:48:36 -0400
Subject: [PATCH 11/15] fix typehints docstrings

---
 python/pyspark/pandas/typedef/typehints.py | 50 +++++++++++-----------
 1 file changed, 25 insertions(+), 25 deletions(-)

diff --git a/python/pyspark/pandas/typedef/typehints.py b/python/pyspark/pandas/typedef/typehints.py
index 14a105672933e..c693f8d796708 100644
--- a/python/pyspark/pandas/typedef/typehints.py
+++ b/python/pyspark/pandas/typedef/typehints.py
@@ -323,15 +323,15 @@ def pandas_on_spark_type(tpe: Union[str, type, Dtype]) -> Tuple[Dtype, types.Dat
     Examples
     --------
     >>> pandas_on_spark_type(int)
-    (dtype('int64'), LongType)
+    (dtype('int64'), LongType())
     >>> pandas_on_spark_type(str)
-    (dtype('<U'), StringType)
+    (dtype('<U'), StringType())
     >>> pandas_on_spark_type(datetime.date)
-    (dtype('O'), DateType)
+    (dtype('O'), DateType())
     >>> pandas_on_spark_type(datetime.datetime)
-    (dtype('<M8[ns]'), TimestampType)
+    (dtype('<M8[ns]'), TimestampType())
     >>> pandas_on_spark_type(List[bool])
-    (dtype('O'), ArrayType(BooleanType,true))
+    (dtype('O'), ArrayType(BooleanType(), True))
     """
     try:
         dtype = pandas_dtype(tpe)
@@ -383,7 +383,7 @@ def infer_return_type(f: Callable) -> Union[SeriesType, DataFrameType, ScalarTyp
     >>> inferred.dtype
     dtype('int64')
     >>> inferred.spark_type
-    LongType
+    LongType()
 
     >>> def func() -> ps.Series[int]:
     ...     pass
@@ -391,7 +391,7 @@ def infer_return_type(f: Callable) -> Union[SeriesType, DataFrameType, ScalarTyp
     >>> inferred.dtype
     dtype('int64')
     >>> inferred.spark_type
-    LongType
+    LongType()
 
     >>> def func() -> ps.DataFrame[np.float, str]:
     ...     pass
@@ -399,7 +399,7 @@ def infer_return_type(f: Callable) -> Union[SeriesType, DataFrameType, ScalarTyp
     >>> inferred.dtypes
     [dtype('float64'), dtype('<U26')]
     >>> inferred.spark_type
-    StructType(List(StructField(c0,DoubleType,true),StructField(c1,StringType,true)))
+    StructType([StructField('c0', DoubleType(), True), StructField('c1', StringType(), True)])
 
     >>> def func() -> ps.DataFrame[np.float]:
     ...     pass
@@ -407,7 +407,7 @@ def infer_return_type(f: Callable) -> Union[SeriesType, DataFrameType, ScalarTyp
     >>> inferred.dtypes
     [dtype('float64')]
     >>> inferred.spark_type
-    StructType(List(StructField(c0,DoubleType,true)))
+    StructType([StructField('c0', DoubleType(), True)])
 
     >>> def func() -> 'int':
     ...     pass
@@ -415,7 +415,7 @@ def infer_return_type(f: Callable) -> Union[SeriesType, DataFrameType, ScalarTyp
     >>> inferred.dtype
     dtype('int64')
     >>> inferred.spark_type
-    LongType
+    LongType()
 
     >>> def func() -> 'ps.Series[int]':
     ...     pass
@@ -423,7 +423,7 @@ def infer_return_type(f: Callable) -> Union[SeriesType, DataFrameType, ScalarTyp
     >>> inferred.dtype
     dtype('int64')
     >>> inferred.spark_type
-    LongType
+    LongType()
 
     >>> def func() -> 'ps.DataFrame[np.float, str]':
     ...     pass
@@ -431,7 +431,7 @@ def infer_return_type(f: Callable) -> Union[SeriesType, DataFrameType, ScalarTyp
     >>> inferred.dtypes
     [dtype('float64'), dtype('<U26')]
     >>> inferred.spark_type
-    StructType(List(StructField(c0,DoubleType,true),StructField(c1,StringType,true)))
+    StructType([StructField('c0', DoubleType(), True), StructField('c1', StringType(), True)])
 
     >>> def func() -> 'ps.DataFrame[np.float]':
     ...     pass
@@ -439,7 +439,7 @@ def infer_return_type(f: Callable) -> Union[SeriesType, DataFrameType, ScalarTyp
     >>> inferred.dtypes
     [dtype('float64')]
     >>> inferred.spark_type
-    StructType(List(StructField(c0,DoubleType,true)))
+    StructType([StructField('c0', DoubleType(), True)])
 
     >>> def func() -> ps.DataFrame['a': np.float, 'b': int]:
     ...     pass
@@ -447,7 +447,7 @@ def infer_return_type(f: Callable) -> Union[SeriesType, DataFrameType, ScalarTyp
     >>> inferred.dtypes
     [dtype('float64'), dtype('int64')]
     >>> inferred.spark_type
-    StructType(List(StructField(a,DoubleType,true),StructField(b,LongType,true)))
+    StructType([StructField('a', DoubleType(), True), StructField('b', LongType(), True)])
 
     >>> def func() -> "ps.DataFrame['a': np.float, 'b': int]":
     ...     pass
@@ -455,7 +455,7 @@ def infer_return_type(f: Callable) -> Union[SeriesType, DataFrameType, ScalarTyp
     >>> inferred.dtypes
     [dtype('float64'), dtype('int64')]
     >>> inferred.spark_type
-    StructType(List(StructField(a,DoubleType,true),StructField(b,LongType,true)))
+    StructType([StructField('a', DoubleType(), True), StructField('b', LongType(), True)])
 
     >>> pdf = pd.DataFrame({"a": [1, 2, 3], "b": [3, 4, 5]})
     >>> def func() -> ps.DataFrame[pdf.dtypes]:
     ...     pass
@@ -464,7 +464,7 @@ def infer_return_type(f: Callable) -> Union[SeriesType, DataFrameType, ScalarTyp
     >>> inferred.dtypes
     [dtype('int64'), dtype('int64')]
     >>> inferred.spark_type
-    StructType(List(StructField(c0,LongType,true),StructField(c1,LongType,true)))
+    StructType([StructField('c0', LongType(), True), StructField('c1', LongType(), True)])
 
     >>> pdf = pd.DataFrame({"a": [1, 2, 3], "b": [3, 4, 5]})
     >>> def func() -> ps.DataFrame[zip(pdf.columns, pdf.dtypes)]:
     ...     pass
@@ -473,7 +473,7 @@ def infer_return_type(f: Callable) -> Union[SeriesType, DataFrameType, ScalarTyp
     >>> inferred.dtypes
     [dtype('int64'), dtype('int64')]
     >>> inferred.spark_type
-    StructType(List(StructField(a,LongType,true),StructField(b,LongType,true)))
+    StructType([StructField('a', LongType(), True), StructField('b', LongType(), True)])
 
     >>> pdf = pd.DataFrame({("x", "a"): [1, 2, 3], ("y", "b"): [3, 4, 5]})
     >>> def func() -> ps.DataFrame[zip(pdf.columns, pdf.dtypes)]:
     ...     pass
@@ -482,7 +482,7 @@ def infer_return_type(f: Callable) -> Union[SeriesType, DataFrameType, ScalarTyp
     >>> inferred.dtypes
     [dtype('int64'), dtype('int64')]
     >>> inferred.spark_type
-    StructType(List(StructField((x, a),LongType,true),StructField((y, b),LongType,true)))
+    StructType([StructField('(x, a)', LongType(), True), StructField('(y, b)', LongType(), True)])
 
     >>> pdf = pd.DataFrame({"a": [1, 2, 3], "b": pd.Categorical([3, 4, 5])})
     >>> def func() -> ps.DataFrame[pdf.dtypes]:
     ...     pass
@@ -491,7 +491,7 @@ def infer_return_type(f: Callable) -> Union[SeriesType, DataFrameType, ScalarTyp
     >>> inferred.dtypes
     [dtype('int64'), CategoricalDtype(categories=[3, 4, 5], ordered=False)]
     >>> inferred.spark_type
-    StructType(List(StructField(c0,LongType,true),StructField(c1,LongType,true)))
+    StructType([StructField('c0', LongType(), True), StructField('c1', LongType(), True)])
 
     >>> def func() -> ps.DataFrame[zip(pdf.columns, pdf.dtypes)]:
     ...     pass
@@ -499,7 +499,7 @@ def infer_return_type(f: Callable) -> Union[SeriesType, DataFrameType, ScalarTyp
     >>> inferred.dtypes
     [dtype('int64'), CategoricalDtype(categories=[3, 4, 5], ordered=False)]
     >>> inferred.spark_type
-    StructType(List(StructField(a,LongType,true),StructField(b,LongType,true)))
+    StructType([StructField('a', LongType(), True), StructField('b', LongType(), True)])
 
     >>> def func() -> ps.Series[pdf.b.dtype]:
     ...     pass
@@ -507,7 +507,7 @@ def infer_return_type(f: Callable) -> Union[SeriesType, DataFrameType, ScalarTyp
     >>> inferred.dtype
     CategoricalDtype(categories=[3, 4, 5], ordered=False)
     >>> inferred.spark_type
-    LongType
+    LongType()
 
     >>> def func() -> ps.DataFrame[int, [int, int]]:
     ...     pass
@@ -517,7 +517,7 @@ def infer_return_type(f: Callable) -> Union[SeriesType, DataFrameType, ScalarTyp
     >>> inferred.spark_type.simpleString()
     'struct<__index_level_0__:bigint,c0:bigint,c1:bigint>'
     >>> inferred.index_fields
-    [InternalField(dtype=int64,struct_field=StructField(__index_level_0__,LongType,true))]
+    [InternalField(dtype=int64,struct_field=StructField('__index_level_0__', LongType(), True))]
 
     >>> def func() -> ps.DataFrame[pdf.index.dtype, pdf.dtypes]:
     ...     pass
@@ -527,7 +527,7 @@ def infer_return_type(f: Callable) -> Union[SeriesType, DataFrameType, ScalarTyp
     >>> inferred.spark_type.simpleString()
     'struct<__index_level_0__:bigint,c0:bigint,c1:bigint>'
     >>> inferred.index_fields
-    [InternalField(dtype=int64,struct_field=StructField(__index_level_0__,LongType,true))]
+    [InternalField(dtype=int64,struct_field=StructField('__index_level_0__', LongType(), True))]
 
     >>> def func() -> ps.DataFrame[
     ...     ("index", CategoricalDtype(categories=[3, 4, 5], ordered=False)),
     ...     [("id", str), ("A", int)]]:
     ...     pass
@@ -539,7 +539,7 @@ def infer_return_type(f: Callable) -> Union[SeriesType, DataFrameType, ScalarTyp
     >>> inferred.dtypes
     [CategoricalDtype(categories=[3, 4, 5], ordered=False), dtype('<U'), dtype('int64')]
     >>> inferred.spark_type.simpleString()
     'struct<index:bigint,id:string,A:bigint>'
     >>> inferred.index_fields
-    [InternalField(dtype=category,struct_field=StructField(index,LongType,true))]
+    [InternalField(dtype=category,struct_field=StructField('index', LongType(), True))]
 
     >>> def func() -> ps.DataFrame[
     ...     (pdf.index.name, pdf.index.dtype), zip(pdf.columns, pdf.dtypes)]:
     ...     pass
@@ -550,7 +550,7 @@ def infer_return_type(f: Callable) -> Union[SeriesType, DataFrameType, ScalarTyp
     >>> inferred.spark_type.simpleString()
     'struct<__index_level_0__:bigint,a:bigint,b:bigint>'
     >>> inferred.index_fields
-    [InternalField(dtype=int64,struct_field=StructField(__index_level_0__,LongType,true))]
+    [InternalField(dtype=int64,struct_field=StructField('__index_level_0__', LongType(), True))]
     """
     # We should re-import to make sure the class 'SeriesType' is not treated as a class
     # within this module locally. See Series.__class_getitem__ which imports this class

From 58ddcd082600f0f9ab3c0570611c03678107647b Mon Sep 17 00:00:00 2001
From: flynn
Date: Wed, 22 Dec 2021 12:37:53 -0500
Subject: [PATCH 12/15] black fmt

---
 python/pyspark/sql/types.py | 12 ++++--------
 1 file changed, 4 insertions(+), 8 deletions(-)

diff --git a/python/pyspark/sql/types.py b/python/pyspark/sql/types.py
index 322eb8bfcff22..3cd151b798999 100644
--- a/python/pyspark/sql/types.py
+++ b/python/pyspark/sql/types.py
@@ -415,8 +415,7 @@ def simpleString(self) -> str:
         return "array<%s>" % self.elementType.simpleString()
 
     def __repr__(self) -> str:
-        return "ArrayType(%s, %s)" % (self.elementType,
-                                      str(self.containsNull))
+        return "ArrayType(%s, %s)" % (self.elementType, str(self.containsNull))
 
     def jsonValue(self) -> Dict[str, Any]:
         return {
@@ -486,8 +485,7 @@ def simpleString(self) -> str:
         return "map<%s,%s>" % (self.keyType.simpleString(), self.valueType.simpleString())
 
     def __repr__(self) -> str:
-        return "MapType(%s, %s, %s)" % (self.keyType, self.valueType,
-                                        str(self.valueContainsNull))
+        return "MapType(%s, %s, %s)" % (self.keyType, self.valueType, str(self.valueContainsNull))
 
     def jsonValue(self) -> Dict[str, Any]:
         return {
@@ -568,8 +566,7 @@ def simpleString(self) -> str:
         return "%s:%s" % (self.name, self.dataType.simpleString())
 
     def __repr__(self) -> str:
-        return "StructField('%s', %s, %s)" % (self.name, self.dataType,
-                                              str(self.nullable))
+        return "StructField('%s', %s, %s)" % (self.name, self.dataType, str(self.nullable))
 
     def jsonValue(self) -> Dict[str, Any]:
         return {
@@ -752,8 +749,7 @@ def simpleString(self) -> str:
         return "struct<%s>" % (",".join(f.simpleString() for f in self))
 
     def __repr__(self) -> str:
-        return ("StructType([%s])" %
-                ", ".join(str(field) for field in self))
+        return "StructType([%s])" % ", ".join(str(field) for field in self)
 
     def jsonValue(self) -> Dict[str, Any]:
         return {"type": self.typeName(), "fields": [f.jsonValue() for f in self]}

From 5cde0579fbf1f0937f15e4bb703dbafa6e97e095 Mon Sep 17 00:00:00 2001
From: Flynn
Date: Tue, 22 Mar 2022 01:02:42 -0400
Subject: [PATCH 13/15] cleanup doctests

---
 python/pyspark/pandas/internal.py    | 30 ++++++++++++++--------------
 python/pyspark/pandas/spark/utils.py | 26 ++++++++++--------------
 2 files changed, 25 insertions(+), 31 deletions(-)

diff --git a/python/pyspark/pandas/internal.py b/python/pyspark/pandas/internal.py
index 8cee4aae8fe23..c0dab6f2313d5 100644
--- a/python/pyspark/pandas/internal.py
+++ b/python/pyspark/pandas/internal.py
@@ -355,7 +355,7 @@ class InternalFrame:
     ['A', 'B', 'C', 'D', 'E']
     >>> internal.index_names
     [('A',)]
-    >>> internal.data_fields
+    >>> internal.data_fields  # doctest: +NORMALIZE_WHITESPACE
     [InternalField(dtype=int64, struct_field=StructField('B', LongType(), False)),
      InternalField(dtype=int64, struct_field=StructField('C', LongType(), False)),
      InternalField(dtype=int64, struct_field=StructField('D', LongType(), False)),
     InternalField(dtype=int64, struct_field=StructField('E', LongType(), False))]
@@ -596,12 +596,12 @@ def __init__(
     [('row_index_a',), ('row_index_b',), ('a', 'x')]
 
     >>> internal.index_fields  # doctest: +NORMALIZE_WHITESPACE
-    [InternalField(dtype=object, struct_field=StructField('__index_level_0__',
-    StringType(), False)),
-    InternalField(dtype=object, struct_field=StructField('__index_level_1__',
-    StringType(), False)),
-    InternalField(dtype=int64, struct_field=StructField('(a, x)',
-    LongType(), False))]
+    [InternalField(dtype=object,
+    struct_field=StructField('__index_level_0__', StringType(), False)),
+    InternalField(dtype=object,
+    struct_field=StructField('__index_level_1__', StringType(), False)),
+    InternalField(dtype=int64,
+    struct_field=StructField('(a, x)', LongType(), False))]
 
     >>> internal.column_labels
     [('a', 'y'), ('b', 'z')]
@@ -1526,10 +1526,10 @@ def prepare_pandas_frame(
     ...     InternalFrame.prepare_pandas_frame(pdf, prefer_timestamp_ntz=True)
     ... )
     >>> data_fields  # doctest: +NORMALIZE_WHITESPACE
-    [InternalField(dtype=datetime64[ns], struct_field=StructField('dt',
-    TimestampNTZType(), False)),
-    InternalField(dtype=object, struct_field=StructField('dt_obj',
-    TimestampNTZType(), False))]
+    [InternalField(dtype=datetime64[ns],
+    struct_field=StructField('dt', TimestampNTZType(), False)),
+    InternalField(dtype=object,
+    struct_field=StructField('dt_obj', TimestampNTZType(), False))]
 
     >>> pdf = pd.DataFrame({
     ...     "td": [datetime.timedelta(0)], "td_obj": [datetime.timedelta(0)]
     ... })
     >>> _, _, _, _, data_fields = (
     ...     InternalFrame.prepare_pandas_frame(pdf)
     ... )
     >>> data_fields  # doctest: +NORMALIZE_WHITESPACE
-    [InternalField(dtype=timedelta64[ns], struct_field=StructField('td',
-    DayTimeIntervalType(0, 3), False)),
-    InternalField(dtype=object, struct_field=StructField('td_obj',
-    DayTimeIntervalType(0, 3), False))]
+    [InternalField(dtype=timedelta64[ns],
+    struct_field=StructField('td', DayTimeIntervalType(0, 3), False)),
+    InternalField(dtype=object,
+    struct_field=StructField('td_obj', DayTimeIntervalType(0, 3), False))]
     """
     pdf = pdf.copy()
diff --git a/python/pyspark/pandas/spark/utils.py b/python/pyspark/pandas/spark/utils.py
index fcae47380bd27..9b8b5bb7542ab 100644
--- a/python/pyspark/pandas/spark/utils.py
+++ b/python/pyspark/pandas/spark/utils.py
@@ -63,14 +63,11 @@ def as_nullable_spark_type(dt: DataType) -> DataType:
     ...             StructField('b', StringType(), True)])),
StructField("B", FloatType(), False)])) # doctest: +NORMALIZE_WHITESPACE StructType([StructField('A', - StructType([StructField('a', - MapType(IntegerType(), - ArrayType(IntegerType(), - True), - True), - True), - StructField('b', StringType(), True)]), True), - StructField('B', FloatType(), True)]) + StructType([StructField('a', + MapType(IntegerType(), + ArrayType(IntegerType(), True), True), True), + StructField('b', StringType(), True)]), True), + StructField('B', FloatType(), True)]) """ if isinstance(dt, StructType): new_fields = [] @@ -151,14 +148,11 @@ def force_decimal_precision_scale( ... StructField("B", DecimalType(30, 15), False)]), ... precision=30, scale=15) # doctest: +NORMALIZE_WHITESPACE StructType([StructField('A', - StructType([StructField('a', - MapType(DecimalType(30,15), - ArrayType(DecimalType(30,15), - False), - False), - False), - StructField('b', StringType(), True)]), True), - StructField('B', DecimalType(30,15), False)]) + StructType([StructField('a', + MapType(DecimalType(30,15), + ArrayType(DecimalType(30,15), False), False), False), + StructField('b', StringType(), True)]), True), + StructField('B', DecimalType(30,15), False)]) """ if isinstance(dt, StructType): new_fields = [] From f6b495d378a129dd4d535035936e58384549481f Mon Sep 17 00:00:00 2001 From: flynn Date: Tue, 22 Mar 2022 10:27:19 -0400 Subject: [PATCH 14/15] format --- python/pyspark/shuffle.py | 1 - 1 file changed, 1 deletion(-) diff --git a/python/pyspark/shuffle.py b/python/pyspark/shuffle.py index 410519ddca96a..0709d2de25a67 100644 --- a/python/pyspark/shuffle.py +++ b/python/pyspark/shuffle.py @@ -52,7 +52,6 @@ def get_used_memory(): info = process.get_memory_info() return info.rss >> 20 - except ImportError: def get_used_memory(): From 9baa5b7cdf35794fb4f49fe3d535309f89bcfde1 Mon Sep 17 00:00:00 2001 From: flynn Date: Tue, 22 Mar 2022 13:59:33 -0400 Subject: [PATCH 15/15] add note in migration guide --- python/docs/source/migration_guide/pyspark_3.2_to_3.3.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/python/docs/source/migration_guide/pyspark_3.2_to_3.3.rst b/python/docs/source/migration_guide/pyspark_3.2_to_3.3.rst index f2701d4fb7216..d81008d63cbe9 100644 --- a/python/docs/source/migration_guide/pyspark_3.2_to_3.3.rst +++ b/python/docs/source/migration_guide/pyspark_3.2_to_3.3.rst @@ -23,3 +23,4 @@ Upgrading from PySpark 3.2 to 3.3 * In Spark 3.3, the ``pyspark.pandas.sql`` method follows [the standard Python string formatter](https://docs.python.org/3/library/string.html#format-string-syntax). To restore the previous behavior, set ``PYSPARK_PANDAS_SQL_LEGACY`` environment variable to ``1``. * In Spark 3.3, the ``drop`` method of pandas API on Spark DataFrame supports dropping rows by ``index``, and sets dropping by index instead of column by default. * In Spark 3.3, PySpark upgrades Pandas version, the new minimum required version changes from 0.23.2 to 1.0.5. +* In Spark 3.3, the ``repr`` return values of SQL DataTypes have been changed to yield an object with the same value when passed to ``eval``.