From 177240ade0e280f2c9345131d00abaf598a87318 Mon Sep 17 00:00:00 2001
From: flynn
Date: Mon, 18 Oct 2021 22:18:14 -0400
Subject: [PATCH 01/15] make sql type reprs eval-able

---
 python/pyspark/sql/tests/test_types.py | 23 +++++++++++++++++++++++
 python/pyspark/sql/types.py            | 18 +++++++++---------
 2 files changed, 32 insertions(+), 9 deletions(-)

diff --git a/python/pyspark/sql/tests/test_types.py b/python/pyspark/sql/tests/test_types.py
index 1dbddf7c30896..38736ab1c6cba 100644
--- a/python/pyspark/sql/tests/test_types.py
+++ b/python/pyspark/sql/tests/test_types.py
@@ -764,6 +764,29 @@ def assertCollectSuccess(typecode, value):
                 a = array.array(t)
                 self.spark.createDataFrame([Row(myarray=a)]).collect()
 
+    def test_repr(self):
+        instances = [
+            NullType(),
+            StringType(),
+            BinaryType(),
+            BooleanType(),
+            DateType(),
+            TimestampType(),
+            DecimalType(),
+            DoubleType(),
+            FloatType(),
+            ByteType(),
+            IntegerType(),
+            LongType(),
+            ShortType(),
+            ArrayType(StringType()),
+            MapType(StringType(), IntegerType()),
+            StructField("f1", StringType(), True),
+            StructType([StructField("f1", StringType(), True)]),
+        ]
+        for instance in instances:
+            self.assertEqual(eval(repr(instance)), instance)
+
 
 class DataTypeTests(unittest.TestCase):
     # regression test for SPARK-6055
diff --git a/python/pyspark/sql/types.py b/python/pyspark/sql/types.py
index 69ec96e14fd9b..b443b0468ac4e 100644
--- a/python/pyspark/sql/types.py
+++ b/python/pyspark/sql/types.py
@@ -60,7 +60,7 @@ class DataType(object):
     """Base class for data types."""
 
     def __repr__(self) -> str:
-        return self.__class__.__name__
+        return self.__class__.__name__ + "()"
 
     def __hash__(self) -> int:
         return hash(str(self))
@@ -336,8 +336,8 @@ def simpleString(self) -> str:
         return 'array<%s>' % self.elementType.simpleString()
 
     def __repr__(self) -> str:
-        return "ArrayType(%s,%s)" % (self.elementType,
-                                     str(self.containsNull).lower())
+        return "ArrayType(%s, %s)" % (self.elementType,
+                                     str(self.containsNull))
 
     def jsonValue(self) -> Dict[str, Any]:
         return {"type": self.typeName(),
@@ -404,8 +404,8 @@ def simpleString(self) -> str:
         return 'map<%s,%s>' % (self.keyType.simpleString(), self.valueType.simpleString())
 
     def __repr__(self) -> str:
-        return "MapType(%s,%s,%s)" % (self.keyType, self.valueType,
-                                      str(self.valueContainsNull).lower())
+        return "MapType(%s, %s, %s)" % (self.keyType, self.valueType,
+                                      str(self.valueContainsNull))
 
     def jsonValue(self) -> Dict[str, Any]:
         return {"type": self.typeName(),
@@ -478,8 +478,8 @@ def simpleString(self) -> str:
         return '%s:%s' % (self.name, self.dataType.simpleString())
 
     def __repr__(self) -> str:
-        return "StructField(%s,%s,%s)" % (self.name, self.dataType,
-                                          str(self.nullable).lower())
+        return "StructField('%s', %s, %s)" % (self.name, self.dataType,
+                                          str(self.nullable))
 
     def jsonValue(self) -> Dict[str, Any]:
         return {"name": self.name,
@@ -656,8 +656,8 @@ def simpleString(self) -> str:
         return 'struct<%s>' % (','.join(f.simpleString() for f in self))
 
     def __repr__(self) -> str:
-        return ("StructType(List(%s))" %
-                ",".join(str(field) for field in self))
+        return ("StructType([%s])" %
+                ", ".join(str(field) for field in self))
 
     def jsonValue(self) -> Dict[str, Any]:
         return {"type": self.typeName(),

From d473f3198ec24c3b3710ea6f20af8ed5887e3433 Mon Sep 17 00:00:00 2001
From: flynn
Date: Tue, 19 Oct 2021 12:06:52 -0400
Subject: [PATCH 02/15] fix indenting

---
 python/pyspark/sql/types.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/python/pyspark/sql/types.py b/python/pyspark/sql/types.py
index b443b0468ac4e..c2f09db076625 100644
--- a/python/pyspark/sql/types.py
+++ b/python/pyspark/sql/types.py
@@ -337,7 +337,7 @@ def simpleString(self) -> str:
 
     def __repr__(self) -> str:
         return "ArrayType(%s, %s)" % (self.elementType,
-                                     str(self.containsNull))
+                                      str(self.containsNull))
 
     def jsonValue(self) -> Dict[str, Any]:
         return {"type": self.typeName(),
@@ -405,7 +405,7 @@ def simpleString(self) -> str:
 
     def __repr__(self) -> str:
         return "MapType(%s, %s, %s)" % (self.keyType, self.valueType,
-                                      str(self.valueContainsNull))
+                                        str(self.valueContainsNull))
 
     def jsonValue(self) -> Dict[str, Any]:
         return {"type": self.typeName(),
@@ -479,7 +479,7 @@ def simpleString(self) -> str:
 
     def __repr__(self) -> str:
         return "StructField('%s', %s, %s)" % (self.name, self.dataType,
-                                          str(self.nullable))
+                                              str(self.nullable))
 
     def jsonValue(self) -> Dict[str, Any]:
         return {"name": self.name,

From d7109515fd6837a89998b3b2ec3570670fe73958 Mon Sep 17 00:00:00 2001
From: flynn
Date: Tue, 19 Oct 2021 13:37:20 -0400
Subject: [PATCH 03/15] fix dataframe test

---
 python/pyspark/sql/tests/test_dataframe.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/pyspark/sql/tests/test_dataframe.py b/python/pyspark/sql/tests/test_dataframe.py
index 6b3a6a23f2f17..1ab0d92c0dc7e 100644
--- a/python/pyspark/sql/tests/test_dataframe.py
+++ b/python/pyspark/sql/tests/test_dataframe.py
@@ -498,7 +498,7 @@ def test_toDF_with_schema_string(self):
                           lambda: rdd.toDF("key: int").collect())
 
         # field types mismatch will cause exception at runtime.
-        self.assertRaisesRegex(Exception, "FloatType can not accept",
+        self.assertRaisesRegex(Exception, r"FloatType\(\) can not accept",
                                lambda: rdd.toDF("key: float, value: string").collect())
 
         # flat schema values will be wrapped into row.

From 622739f6f0170be68879dfd77bc6654411149cc9 Mon Sep 17 00:00:00 2001
From: flynn
Date: Tue, 19 Oct 2021 15:42:14 -0400
Subject: [PATCH 04/15] fix doctests

---
 python/pyspark/sql/types.py | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/python/pyspark/sql/types.py b/python/pyspark/sql/types.py
index c2f09db076625..fbaa871a65858 100644
--- a/python/pyspark/sql/types.py
+++ b/python/pyspark/sql/types.py
@@ -521,9 +521,9 @@ class StructType(DataType):
     --------
     >>> struct1 = StructType([StructField("f1", StringType(), True)])
     >>> struct1["f1"]
-    StructField(f1,StringType,true)
+    StructField('f1', StringType(), True)
     >>> struct1[0]
-    StructField(f1,StringType,true)
+    StructField('f1', StringType(), True)
 
     >>> struct1 = StructType([StructField("f1", StringType(), True)])
     >>> struct2 = StructType([StructField("f1", StringType(), True)])
@@ -862,17 +862,17 @@ def _parse_datatype_string(s: str) -> DataType:
     Examples
    --------
     >>> _parse_datatype_string("int ")
-    IntegerType
+    IntegerType()
     >>> _parse_datatype_string("INT ")
-    IntegerType
+    IntegerType()
     >>> _parse_datatype_string("a: byte, b: decimal(  16 , 8   ) ")
-    StructType(List(StructField(a,ByteType,true),StructField(b,DecimalType(16,8),true)))
+    StructType([StructField('a', ByteType(), True), StructField('b', DecimalType(16,8), True)])
     >>> _parse_datatype_string("a DOUBLE, b STRING")
-    StructType(List(StructField(a,DoubleType,true),StructField(b,StringType,true)))
+    StructType([StructField('a', DoubleType(), True), StructField('b', StringType(), True)])
     >>> _parse_datatype_string("a: array< short>")
-    StructType(List(StructField(a,ArrayType(ShortType,true),true)))
+    StructType([StructField('a', ArrayType(ShortType(), True), True)])
     >>> _parse_datatype_string(" map<string , string > ")
-    MapType(StringType,StringType,true)
+    MapType(StringType(), StringType(), True)
 
     >>> # Error cases
     >>> _parse_datatype_string("blabla") # doctest: +IGNORE_EXCEPTION_DETAIL

From 392d7511df4200d73b7c769602fc42fb6d1dcaf6 Mon Sep 17 00:00:00 2001
From: flynn
Date: Tue, 19 Oct 2021 16:51:39 -0400
Subject: [PATCH 05/15] fix more doctests

---
 python/pyspark/ml/functions.py              | 8 ++++----
 python/pyspark/pandas/extensions.py         | 8 ++++----
 python/pyspark/pandas/tests/test_groupby.py | 2 +-
 python/pyspark/sql/dataframe.py             | 2 +-
 python/pyspark/sql/tests/test_dataframe.py  | 2 +-
 5 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/python/pyspark/ml/functions.py b/python/pyspark/ml/functions.py
index 1eadbd694210e..fda00a11df7dc 100644
--- a/python/pyspark/ml/functions.py
+++ b/python/pyspark/ml/functions.py
@@ -58,11 +58,11 @@ def vector_to_array(col, dtype="float64"):
     [Row(vec=[1.0, 2.0, 3.0], oldVec=[10.0, 20.0, 30.0]),
      Row(vec=[2.0, 0.0, 3.0], oldVec=[20.0, 0.0, 30.0])]
     >>> df1.schema.fields
-    [StructField(vec,ArrayType(DoubleType,false),false),
-    StructField(oldVec,ArrayType(DoubleType,false),false)]
+    [StructField('vec', ArrayType(DoubleType(), False), False),
+    StructField('oldVec', ArrayType(DoubleType(), False), False)]
     >>> df2.schema.fields
-    [StructField(vec,ArrayType(FloatType,false),false),
-    StructField(oldVec,ArrayType(FloatType,false),false)]
+    [StructField('vec', ArrayType(FloatType(), False), False),
+    StructField('oldVec', ArrayType(FloatType(), False), False)]
     """
     sc = SparkContext._active_spark_context
     return Column(
diff --git a/python/pyspark/pandas/extensions.py b/python/pyspark/pandas/extensions.py
index 69f742541a599..eeb02f06a85e9 100644
--- a/python/pyspark/pandas/extensions.py
+++ b/python/pyspark/pandas/extensions.py
@@ -109,7 +109,7 @@ def __init__(self, pandas_on_spark_obj):
         ...
         Traceback (most recent call last):
             ...
-        ValueError: Cannot call DatetimeMethods on type StringType
+        ValueError: Cannot call DatetimeMethods on type StringType()
 
     Note: This function is not meant to be used directly - instead, use
    register_dataframe_accessor, register_series_accessor, or register_index_accessor.
@@ -169,7 +169,7 @@ def register_dataframe_accessor(name: str) -> Callable[[Type[T]], Type[T]]:
         ...
         Traceback (most recent call last):
             ...
-        ValueError: Cannot call DatetimeMethods on type StringType
+        ValueError: Cannot call DatetimeMethods on type StringType()
 
     Examples
     --------
@@ -250,7 +250,7 @@ def __init__(self, pandas_on_spark_obj):
         ...
         Traceback (most recent call last):
             ...
-        ValueError: Cannot call DatetimeMethods on type StringType
+        ValueError: Cannot call DatetimeMethods on type StringType()
 
     Examples
     --------
@@ -322,7 +322,7 @@ def __init__(self, pandas_on_spark_obj):
         ...
         Traceback (most recent call last):
             ...
-        ValueError: Cannot call DatetimeMethods on type StringType
+        ValueError: Cannot call DatetimeMethods on type StringType()
 
     Examples
     --------
diff --git a/python/pyspark/pandas/tests/test_groupby.py b/python/pyspark/pandas/tests/test_groupby.py
index 4e1c0d0674f27..dda7afa060204 100644
--- a/python/pyspark/pandas/tests/test_groupby.py
+++ b/python/pyspark/pandas/tests/test_groupby.py
@@ -2225,7 +2225,7 @@ def udf(col) -> int:
         with self.assertRaisesRegex(
             TypeError,
             "Expected the return type of this function to be of Series type, "
-            "but found type ScalarType\\[LongType\\]",
+            "but found type ScalarType\\[LongType\\(\\)\\]",
         ):
             psdf.groupby("a").transform(udf)
 
diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py
index 90311c47ea104..a14eda7c11ee8 100644
--- a/python/pyspark/sql/dataframe.py
+++ b/python/pyspark/sql/dataframe.py
@@ -306,7 +306,7 @@ def schema(self) -> StructType:
         Examples
         --------
         >>> df.schema
-        StructType(List(StructField(age,IntegerType,true),StructField(name,StringType,true)))
+        StructType([StructField('age', IntegerType(), True), StructField('name', StringType(), True)])
         """
         if self._schema is None:
             try:
diff --git a/python/pyspark/sql/tests/test_dataframe.py b/python/pyspark/sql/tests/test_dataframe.py
index 1ab0d92c0dc7e..4061ca3235492 100644
--- a/python/pyspark/sql/tests/test_dataframe.py
+++ b/python/pyspark/sql/tests/test_dataframe.py
@@ -498,7 +498,7 @@ def test_toDF_with_schema_string(self):
                           lambda: rdd.toDF("key: int").collect())
 
         # field types mismatch will cause exception at runtime.
-        self.assertRaisesRegex(Exception, r"FloatType\(\) can not accept",
+        self.assertRaisesRegex(Exception, "FloatType\\(\\) can not accept",
                                lambda: rdd.toDF("key: float, value: string").collect())
 
         # flat schema values will be wrapped into row.
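
With patches 01-05 the repr change itself is complete; everything that follows is doctest, lint, and formatting fallout. The round trip the new test_repr asserts can be tried directly from an interpreter. A minimal sketch in the spirit of that test (assumes a PySpark build with the patches above applied; no SparkSession is needed, since these are plain type objects):

    >>> from pyspark.sql.types import (
    ...     ArrayType, IntegerType, MapType, StringType, StructField, StructType
    ... )
    >>> nested = StructType([
    ...     StructField("tags", ArrayType(StringType()), True),
    ...     StructField("counts", MapType(StringType(), IntegerType()), True),
    ... ])
    >>> repr(nested)  # quoted field names, Python booleans, trailing ()
    "StructType([StructField('tags', ArrayType(StringType(), True), True), StructField('counts', MapType(StringType(), IntegerType(), True), True)])"
    >>> eval(repr(nested)) == nested
    True

The eval works because every name the repr emits is a public pyspark.sql.types class, which is exactly the contract the series establishes.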
From 34e7a449438d376248d65885f817195a163406b0 Mon Sep 17 00:00:00 2001
From: flynn
Date: Tue, 19 Oct 2021 17:38:29 -0400
Subject: [PATCH 06/15] fix lint err

---
 python/pyspark/sql/dataframe.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/python/pyspark/sql/dataframe.py b/python/pyspark/sql/dataframe.py
index a14eda7c11ee8..cba5c4855a1ae 100644
--- a/python/pyspark/sql/dataframe.py
+++ b/python/pyspark/sql/dataframe.py
@@ -306,7 +306,8 @@ def schema(self) -> StructType:
         Examples
         --------
         >>> df.schema
-        StructType([StructField('age', IntegerType(), True), StructField('name', StringType(), True)])
+        StructType([StructField('age', IntegerType(), True),
+        StructField('name', StringType(), True)])
         """
         if self._schema is None:
             try:

From b99254aaa35d64426b8677964bd69f05c97468ae Mon Sep 17 00:00:00 2001
From: flynn
Date: Tue, 19 Oct 2021 18:03:40 -0400
Subject: [PATCH 07/15] fix more doctests

---
 python/pyspark/pandas/internal.py          | 58 +++++++++++-----------
 python/pyspark/pandas/tests/test_series.py |  2 +-
 2 files changed, 30 insertions(+), 30 deletions(-)

diff --git a/python/pyspark/pandas/internal.py b/python/pyspark/pandas/internal.py
index 5cb21a78726c1..bcfb77a3bc4d2 100644
--- a/python/pyspark/pandas/internal.py
+++ b/python/pyspark/pandas/internal.py
@@ -293,13 +293,13 @@ class InternalFrame(object):
     >>> internal.index_names
     [None]
     >>> internal.data_fields  # doctest: +NORMALIZE_WHITESPACE
-    [InternalField(dtype=int64,struct_field=StructField(A,LongType,false)),
-     InternalField(dtype=int64,struct_field=StructField(B,LongType,false)),
-     InternalField(dtype=int64,struct_field=StructField(C,LongType,false)),
-     InternalField(dtype=int64,struct_field=StructField(D,LongType,false)),
-     InternalField(dtype=int64,struct_field=StructField(E,LongType,false))]
+    [InternalField(dtype=int64,struct_field=StructField('A', LongType(), False)),
+     InternalField(dtype=int64,struct_field=StructField('B', LongType(), False)),
+     InternalField(dtype=int64,struct_field=StructField('C', LongType(), False)),
+     InternalField(dtype=int64,struct_field=StructField('D', LongType(), False)),
+     InternalField(dtype=int64,struct_field=StructField('E', LongType(), False))]
     >>> internal.index_fields
-    [InternalField(dtype=int64,struct_field=StructField(__index_level_0__,LongType,false))]
+    [InternalField(dtype=int64,struct_field=StructField('__index_level_0__', LongType(), False))]
     >>> internal.to_internal_spark_frame.show()  # doctest: +NORMALIZE_WHITESPACE
     +-----------------+---+---+---+---+---+
     |__index_level_0__|  A|  B|  C|  D|  E|
     +-----------------+---+---+---+---+---+
@@ -356,12 +356,12 @@ class InternalFrame(object):
     >>> internal.index_names
     [('A',)]
     >>> internal.data_fields
-    [InternalField(dtype=int64,struct_field=StructField(B,LongType,false)),
-     InternalField(dtype=int64,struct_field=StructField(C,LongType,false)),
-     InternalField(dtype=int64,struct_field=StructField(D,LongType,false)),
-     InternalField(dtype=int64,struct_field=StructField(E,LongType,false))]
+    [InternalField(dtype=int64,struct_field=StructField('B', LongType(), False)),
+     InternalField(dtype=int64,struct_field=StructField('C', LongType(), False)),
+     InternalField(dtype=int64,struct_field=StructField('D', LongType(), False)),
+     InternalField(dtype=int64,struct_field=StructField('E', LongType(), False))]
     >>> internal.index_fields
-    [InternalField(dtype=int64,struct_field=StructField(A,LongType,false))]
+    [InternalField(dtype=int64,struct_field=StructField('A', LongType(), False))]
     >>> internal.to_internal_spark_frame.show()  # doctest: +NORMALIZE_WHITESPACE
     +---+---+---+---+---+
     |  A|  B|  C|  D|  E|
     +---+---+---+---+---+
@@ -419,13 +419,13 @@ class InternalFrame(object):
     >>> internal.index_names
     [None, ('A',)]
     >>> internal.data_fields  # doctest: +NORMALIZE_WHITESPACE
-    [InternalField(dtype=int64,struct_field=StructField(B,LongType,false)),
-     InternalField(dtype=int64,struct_field=StructField(C,LongType,false)),
-     InternalField(dtype=int64,struct_field=StructField(D,LongType,false)),
-     InternalField(dtype=int64,struct_field=StructField(E,LongType,false))]
+    [InternalField(dtype=int64,struct_field=StructField('B', LongType(), False)),
+     InternalField(dtype=int64,struct_field=StructField('C', LongType(), False)),
+     InternalField(dtype=int64,struct_field=StructField('D', LongType(), False)),
+     InternalField(dtype=int64,struct_field=StructField('E', LongType(), False))]
     >>> internal.index_fields  # doctest: +NORMALIZE_WHITESPACE
-    [InternalField(dtype=int64,struct_field=StructField(__index_level_0__,LongType,false)),
-     InternalField(dtype=int64,struct_field=StructField(A,LongType,false))]
+    [InternalField(dtype=int64,struct_field=StructField('__index_level_0__', LongType(), False)),
+     InternalField(dtype=int64,struct_field=StructField('A', LongType(), False))]
     >>> internal.to_internal_spark_frame.show()  # doctest: +NORMALIZE_WHITESPACE
     +-----------------+---+---+---+---+---+
     |__index_level_0__|  A|  B|  C|  D|  E|
     +-----------------+---+---+---+---+---+
@@ -508,9 +508,9 @@ class InternalFrame(object):
     >>> internal.index_names
     [('A',)]
     >>> internal.data_fields
-    [InternalField(dtype=int64,struct_field=StructField(B,LongType,false))]
+    [InternalField(dtype=int64,struct_field=StructField('B', LongType(), False))]
     >>> internal.index_fields
-    [InternalField(dtype=int64,struct_field=StructField(A,LongType,false))]
+    [InternalField(dtype=int64,struct_field=StructField('A', LongType(), False))]
     >>> internal.to_internal_spark_frame.show()  # doctest: +NORMALIZE_WHITESPACE
     +---+---+
     |  A|  B|
     +---+---+
@@ -596,9 +596,9 @@ def __init__(
     [('row_index_a',), ('row_index_b',), ('a', 'x')]
 
     >>> internal.index_fields  # doctest: +NORMALIZE_WHITESPACE
-    [InternalField(dtype=object,struct_field=StructField(__index_level_0__,StringType,false)),
-     InternalField(dtype=object,struct_field=StructField(__index_level_1__,StringType,false)),
-     InternalField(dtype=int64,struct_field=StructField((a, x),LongType,false))]
+    [InternalField(dtype=object,struct_field=StructField('__index_level_0__', StringType(), False)),
+     InternalField(dtype=object,struct_field=StructField('__index_level_1__', StringType(), False)),
+     InternalField(dtype=int64,struct_field=StructField('(a, x)', LongType(), False))]
 
     >>> internal.column_labels
     [('a', 'y'), ('b', 'z')]
@@ -607,8 +607,8 @@ def __init__(
     [Column<'(a, y)'>, Column<'(b, z)'>]
 
     >>> internal.data_fields  # doctest: +NORMALIZE_WHITESPACE
-    [InternalField(dtype=int64,struct_field=StructField((a, y),LongType,false)),
-     InternalField(dtype=int64,struct_field=StructField((b, z),LongType,false))]
+    [InternalField(dtype=int64,struct_field=StructField('(a, y)', LongType(), False)),
+     InternalField(dtype=int64,struct_field=StructField('(b, z)', LongType(), False))]
 
     >>> internal.column_label_names
     [('column_labels_a',), ('column_labels_b',)]
@@ -1506,12 +1506,12 @@ def prepare_pandas_frame(
     >>> index_columns
     ['__index_level_0__']
     >>> index_fields
-    [InternalField(dtype=int64,struct_field=StructField(__index_level_0__,LongType,false))]
+    [InternalField(dtype=int64,struct_field=StructField('__index_level_0__', LongType(), False))]
     >>> data_columns
     ['(x, a)', '(y, b)']
     >>> data_fields  # doctest: +NORMALIZE_WHITESPACE
-    [InternalField(dtype=object,struct_field=StructField((x, a),StringType,false)),
-     InternalField(dtype=category,struct_field=StructField((y, b),ByteType,false))]
+    [InternalField(dtype=object,struct_field=StructField('(x, a)', StringType(), False)),
+     InternalField(dtype=category,struct_field=StructField('(y, b)', ByteType(), False))]
 
     >>> import datetime
     >>> pdf = pd.DataFrame({
     ...     "dt": [datetime.datetime(1970, 1, 1)], "dt_obj": [datetime.datetime(1970, 1, 1)]
     ... })
     >>> _, _, _, _, data_fields = (
     ...     InternalFrame.prepare_pandas_frame(pdf, prefer_timestamp_ntz=True)
     ... )
     >>> data_fields
-    [InternalField(dtype=datetime64[ns],struct_field=StructField(dt,TimestampNTZType,false)),
-     InternalField(dtype=object,struct_field=StructField(dt_obj,TimestampNTZType,false))]
+    [InternalField(dtype=datetime64[ns],struct_field=StructField('dt', TimestampNTZType(), False)),
+     InternalField(dtype=object,struct_field=StructField('dt_obj', TimestampNTZType(), False))]
     """
     pdf = pdf.copy()
diff --git a/python/pyspark/pandas/tests/test_series.py b/python/pyspark/pandas/tests/test_series.py
index 72677d18e4b88..e2429306b77a9 100644
--- a/python/pyspark/pandas/tests/test_series.py
+++ b/python/pyspark/pandas/tests/test_series.py
@@ -3010,7 +3010,7 @@ def udf(col) -> ps.Series[int]:
         with self.assertRaisesRegex(
             ValueError,
             r"Expected the return type of this function to be of scalar type, "
-            r"but found type SeriesType\[LongType\]",
+            r"but found type SeriesType\[LongType\(\)\]",
         ):
             psser.apply(udf)

From f6f3cfb786aba3a8be35f31d89d0004df26bfa93 Mon Sep 17 00:00:00 2001
From: flynn
Date: Tue, 19 Oct 2021 19:18:41 -0400
Subject: [PATCH 08/15] doctest fixes

---
 python/pyspark/pandas/internal.py    | 20 +++++++++++++------
 python/pyspark/pandas/spark/utils.py | 29 ++++++++++++++++++++--------
 2 files changed, 35 insertions(+), 14 deletions(-)

diff --git a/python/pyspark/pandas/internal.py b/python/pyspark/pandas/internal.py
index bcfb77a3bc4d2..d5df0da2c785e 100644
--- a/python/pyspark/pandas/internal.py
+++ b/python/pyspark/pandas/internal.py
@@ -596,8 +596,12 @@ def __init__(
     [('row_index_a',), ('row_index_b',), ('a', 'x')]
 
     >>> internal.index_fields  # doctest: +NORMALIZE_WHITESPACE
-    [InternalField(dtype=object,struct_field=StructField('__index_level_0__', StringType(), False)),
-     InternalField(dtype=object,struct_field=StructField('__index_level_1__', StringType(), False)),
+    [InternalField(
+        dtype=object,struct_field=StructField('__index_level_0__', StringType(), False)
+    ),
+     InternalField(
+        dtype=object,struct_field=StructField('__index_level_1__', StringType(), False)
+    ),
      InternalField(dtype=int64,struct_field=StructField('(a, x)', LongType(), False))]
 
     >>> internal.column_labels
@@ -1505,8 +1509,10 @@ def prepare_pandas_frame(
     2      30  c      1
     >>> index_columns
     ['__index_level_0__']
-    >>> index_fields
-    [InternalField(dtype=int64,struct_field=StructField('__index_level_0__', LongType(), False))]
+    >>> index_fields  # doctest: +NORMALIZE_WHITESPACE
+    [InternalField(
+        dtype=int64,struct_field=StructField('__index_level_0__', LongType(), False)
+    )]
     >>> data_columns
     ['(x, a)', '(y, b)']
     >>> data_fields  # doctest: +NORMALIZE_WHITESPACE
@@ -1521,8 +1527,10 @@ def prepare_pandas_frame(
     ... })
     >>> _, _, _, _, data_fields = (
     ...     InternalFrame.prepare_pandas_frame(pdf, prefer_timestamp_ntz=True)
     ... )
-    >>> data_fields
-    [InternalField(dtype=datetime64[ns],struct_field=StructField('dt', TimestampNTZType(), False)),
+    >>> data_fields  # doctest: +NORMALIZE_WHITESPACE
+    [InternalField(
+        dtype=datetime64[ns],struct_field=StructField('dt', TimestampNTZType(), False)
+    ),
     InternalField(dtype=object,struct_field=StructField('dt_obj', TimestampNTZType(), False))]
     """
     pdf = pdf.copy()
diff --git a/python/pyspark/pandas/spark/utils.py b/python/pyspark/pandas/spark/utils.py
index 0940a5e508071..3c37cdacad919 100644
--- a/python/pyspark/pandas/spark/utils.py
+++ b/python/pyspark/pandas/spark/utils.py
@@ -52,7 +52,7 @@ def as_nullable_spark_type(dt: DataType) -> DataType:
     >>> as_nullable_spark_type(StructType([
    ...     StructField("A", IntegerType(), True),
     ...     StructField("B", FloatType(), False)]))  # doctest: +NORMALIZE_WHITESPACE
-    StructType(List(StructField(A,IntegerType,true),StructField(B,FloatType,true)))
+    StructType([StructField('A', IntegerType(), True), StructField('B', FloatType(), True)])
 
     >>> as_nullable_spark_type(StructType([
     ...     StructField("A",
     ...         StructType([
     ...             StructField('a',
     ...                 MapType(IntegerType(),
     ...                 ArrayType(IntegerType(), False), False), False),
     ...             StructField('b', StringType(), True)])),
     ...     StructField("B", FloatType(), False)]))  # doctest: +NORMALIZE_WHITESPACE
-    StructType(List(StructField(A,StructType(List(StructField(a,MapType(IntegerType,ArrayType\
-(IntegerType,true),true),true),StructField(b,StringType,true))),true),\
-StructField(B,FloatType,true)))
+    StructType([
+        StructField('A', StructType([
+            StructField('a', MapType(IntegerType(), ArrayType(IntegerType(), True), True), True),
+            StructField('b', StringType(), True)
+        ]), True),
+        StructField('B', FloatType(), True)
+    ])
     """
     if isinstance(dt, StructType):
         new_fields = []
@@ -132,7 +136,9 @@ def force_decimal_precision_scale(
     >>> force_decimal_precision_scale(StructType([
     ...     StructField("A", DecimalType(10, 0), True),
     ...     StructField("B", DecimalType(14, 7), False)]))  # doctest: +NORMALIZE_WHITESPACE
-    StructType(List(StructField(A,DecimalType(38,18),true),StructField(B,DecimalType(38,18),false)))
+    StructType([
+        StructField('A', DecimalType(38,18), True), StructField('B', DecimalType(38,18), False)
+    ])
 
     >>> force_decimal_precision_scale(StructType([
     ...     StructField("A",
     ...         StructType([
     ...             StructField('a',
     ...                 MapType(DecimalType(10, 0),
     ...                 ArrayType(DecimalType(10, 0), False), False), False),
     ...             StructField('b', StringType(), True)])),
     ...     StructField("B", DecimalType(30, 15), False)]),
     ...     precision=30, scale=15)  # doctest: +NORMALIZE_WHITESPACE
-    StructType(List(StructField(A,StructType(List(StructField(a,MapType(DecimalType(30,15),\
-ArrayType(DecimalType(30,15),false),false),false),StructField(b,StringType,true))),true),\
-StructField(B,DecimalType(30,15),false)))
+    StructType([
+        StructField('A', StructType([
+            StructField(
+                'a', MapType(DecimalType(30,15), ArrayType(DecimalType(30,15), False), False
+            ),
+            False
+        ),
+        StructField('b', StringType(), True)]), True),
+        StructField('B', DecimalType(30,15), False)
+    ])
     """
     if isinstance(dt, StructType):
         new_fields = []

From aece8fa9fcade285d19af91a403d7d6e60a3c579 Mon Sep 17 00:00:00 2001
From: flynn
Date: Wed, 20 Oct 2021 21:32:18 -0400
Subject: [PATCH 09/15] fix doctest output

---
 python/pyspark/pandas/internal.py | 27 +++++++++++++--------------
 1 file changed, 13 insertions(+), 14 deletions(-)

diff --git a/python/pyspark/pandas/internal.py b/python/pyspark/pandas/internal.py
index d5df0da2c785e..9398a2b290795 100644
--- a/python/pyspark/pandas/internal.py
+++ b/python/pyspark/pandas/internal.py
@@ -596,13 +596,12 @@ def __init__(
     [('row_index_a',), ('row_index_b',), ('a', 'x')]
 
     >>> internal.index_fields  # doctest: +NORMALIZE_WHITESPACE
-    [InternalField(
-        dtype=object,struct_field=StructField('__index_level_0__', StringType(), False)
-    ),
-     InternalField(
-        dtype=object,struct_field=StructField('__index_level_1__', StringType(), False)
-    ),
-     InternalField(dtype=int64,struct_field=StructField('(a, x)', LongType(), False))]
+    [InternalField(dtype=object,struct_field=StructField('__index_level_0__',
+    StringType(), False)),
+    InternalField(dtype=object,struct_field=StructField('__index_level_1__',
+    StringType(), False)),
+    InternalField(dtype=int64,struct_field=StructField('(a, x)',
+    LongType(), False))]
 
     >>> internal.column_labels
     [('a', 'y'), ('b', 'z')]
@@ -1509,9 +1508,8 @@ def prepare_pandas_frame(
     >>> index_columns
     ['__index_level_0__']
     >>> index_fields  # doctest: +NORMALIZE_WHITESPACE
-    [InternalField(
-        dtype=int64,struct_field=StructField('__index_level_0__', LongType(), False)
-    )]
+    [InternalField(dtype=int64,struct_field=StructField('__index_level_0__',
+    LongType(), False))]
     >>> data_columns
     ['(x, a)', '(y, b)']
     >>> data_fields  # doctest: +NORMALIZE_WHITESPACE
@@ -1526,10 +1524,11 @@ def prepare_pandas_frame(
     ...     InternalFrame.prepare_pandas_frame(pdf, prefer_timestamp_ntz=True)
     ... )
     >>> data_fields  # doctest: +NORMALIZE_WHITESPACE
-    [InternalField(
-        dtype=datetime64[ns],struct_field=StructField('dt', TimestampNTZType(), False)
-    ),
-    InternalField(dtype=object,struct_field=StructField('dt_obj', TimestampNTZType(), False))]
+    [InternalField(dtype=datetime64[ns],struct_field=StructField('dt',
+    TimestampNTZType(), False)),
+    InternalField(dtype=object,struct_field=StructField('dt_obj',
+    TimestampNTZType(), False))]
+
     """
     pdf = pdf.copy()

From a02d86f39b8c166cd399a45088c5a0780a580de6 Mon Sep 17 00:00:00 2001
From: flynn
Date: Wed, 20 Oct 2021 22:39:59 -0400
Subject: [PATCH 10/15] fix pandas doctests

---
 python/pyspark/pandas/spark/utils.py | 40 ++++++++++++++--------------
 1 file changed, 20 insertions(+), 20 deletions(-)

diff --git a/python/pyspark/pandas/spark/utils.py b/python/pyspark/pandas/spark/utils.py
index 3c37cdacad919..fcae47380bd27 100644
--- a/python/pyspark/pandas/spark/utils.py
+++ b/python/pyspark/pandas/spark/utils.py
@@ -62,13 +62,15 @@ def as_nullable_spark_type(dt: DataType) -> DataType:
     ...                 ArrayType(IntegerType(), False), False), False),
     ...             StructField('b', StringType(), True)])),
     ...     StructField("B", FloatType(), False)]))  # doctest: +NORMALIZE_WHITESPACE
-    StructType([
-        StructField('A', StructType([
-            StructField('a', MapType(IntegerType(), ArrayType(IntegerType(), True), True), True),
-            StructField('b', StringType(), True)
-        ]), True),
-        StructField('B', FloatType(), True)
-    ])
+    StructType([StructField('A',
+    StructType([StructField('a',
+    MapType(IntegerType(),
+    ArrayType(IntegerType(),
+    True),
+    True),
+    True),
+    StructField('b', StringType(), True)]), True),
+    StructField('B', FloatType(), True)])
     """
     if isinstance(dt, StructType):
         new_fields = []
@@ -136,9 +138,8 @@ def force_decimal_precision_scale(
     >>> force_decimal_precision_scale(StructType([
     ...     StructField("A", DecimalType(10, 0), True),
     ...     StructField("B", DecimalType(14, 7), False)]))  # doctest: +NORMALIZE_WHITESPACE
-    StructType([
-        StructField('A', DecimalType(38,18), True), StructField('B', DecimalType(38,18), False)
-    ])
+    StructType([StructField('A', DecimalType(38,18), True),
+    StructField('B', DecimalType(38,18), False)])
 
     >>> force_decimal_precision_scale(StructType([
     ...     StructField("A",
     ...         StructType([
     ...             StructField('a',
     ...                 MapType(DecimalType(10, 0),
     ...                 ArrayType(DecimalType(10, 0), False), False), False),
     ...             StructField('b', StringType(), True)])),
     ...     StructField("B", DecimalType(30, 15), False)]),
     ...     precision=30, scale=15)  # doctest: +NORMALIZE_WHITESPACE
-    StructType([
-        StructField('A', StructType([
-            StructField(
-                'a', MapType(DecimalType(30,15), ArrayType(DecimalType(30,15), False), False
-            ),
-            False
-        ),
-        StructField('b', StringType(), True)]), True),
-        StructField('B', DecimalType(30,15), False)
-    ])
+    StructType([StructField('A',
+    StructType([StructField('a',
+    MapType(DecimalType(30,15),
+    ArrayType(DecimalType(30,15),
+    False),
+    False),
+    False),
+    StructField('b', StringType(), True)]), True),
+    StructField('B', DecimalType(30,15), False)])
     """
     if isinstance(dt, StructType):
         new_fields = []

From 4615139d32a1e329f24af4026c8a894452828ba6 Mon Sep 17 00:00:00 2001
From: flynn
Date: Thu, 21 Oct 2021 08:48:36 -0400
Subject: [PATCH 11/15] fix typehints docstrings

---
 python/pyspark/pandas/typedef/typehints.py | 50 +++++++++++-----------
 1 file changed, 25 insertions(+), 25 deletions(-)

diff --git a/python/pyspark/pandas/typedef/typehints.py b/python/pyspark/pandas/typedef/typehints.py
index 14a105672933e..c693f8d796708 100644
--- a/python/pyspark/pandas/typedef/typehints.py
+++ b/python/pyspark/pandas/typedef/typehints.py
@@ -323,15 +323,15 @@ def pandas_on_spark_type(tpe: Union[str, type, Dtype]) -> Tuple[Dtype, types.Dat
     Examples
     --------
     >>> pandas_on_spark_type(int)
-    (dtype('int64'), LongType)
+    (dtype('int64'), LongType())
     >>> pandas_on_spark_type(str)
-    (dtype('<U'), StringType)
+    (dtype('<U'), StringType())
     >>> pandas_on_spark_type(datetime.date)
-    (dtype('O'), DateType)
+    (dtype('O'), DateType())
     >>> pandas_on_spark_type(datetime.datetime)
-    (dtype('<M8[ns]'), TimestampType)
+    (dtype('<M8[ns]'), TimestampType())
     >>> pandas_on_spark_type(List[bool])
-    (dtype('O'), ArrayType(BooleanType,true))
+    (dtype('O'), ArrayType(BooleanType(), True))
     """
     try:
         dtype = pandas_dtype(tpe)
@@ -383,7 +383,7 @@ def infer_return_type(f: Callable) -> Union[SeriesType, DataFrameType, ScalarTyp
     >>> inferred.dtype
     dtype('int64')
     >>> inferred.spark_type
-    LongType
+    LongType()
 
     >>> def func() -> ps.Series[int]:
     ...     pass
@@ -391,7 +391,7 @@ def infer_return_type(f: Callable) -> Union[SeriesType, DataFrameType, ScalarTyp
     >>> inferred.dtype
     dtype('int64')
     >>> inferred.spark_type
-    LongType
+    LongType()
 
     >>> def func() -> ps.DataFrame[np.float, str]:
     ...     pass
@@ -399,7 +399,7 @@ def infer_return_type(f: Callable) -> Union[SeriesType, DataFrameType, ScalarTyp
     >>> inferred.dtypes
     [dtype('float64'), dtype('<U26')]
     >>> inferred.spark_type
-    StructType(List(StructField(c0,DoubleType,true),StructField(c1,StringType,true)))
+    StructType([StructField('c0', DoubleType(), True), StructField('c1', StringType(), True)])
 
     >>> def func() -> ps.DataFrame[np.float]:
     ...     pass
@@ -407,7 +407,7 @@ def infer_return_type(f: Callable) -> Union[SeriesType, DataFrameType, ScalarTyp
     >>> inferred.dtypes
     [dtype('float64')]
     >>> inferred.spark_type
-    StructType(List(StructField(c0,DoubleType,true)))
+    StructType([StructField('c0', DoubleType(), True)])
 
     >>> def func() -> 'int':
     ...     pass
@@ -415,7 +415,7 @@ def infer_return_type(f: Callable) -> Union[SeriesType, DataFrameType, ScalarTyp
     >>> inferred.dtype
     dtype('int64')
     >>> inferred.spark_type
-    LongType
+    LongType()
 
     >>> def func() -> 'ps.Series[int]':
     ...     pass
@@ -423,7 +423,7 @@ def infer_return_type(f: Callable) -> Union[SeriesType, DataFrameType, ScalarTyp
     >>> inferred.dtype
     dtype('int64')
     >>> inferred.spark_type
-    LongType
+    LongType()
 
     >>> def func() -> 'ps.DataFrame[np.float, str]':
     ...     pass
@@ -431,7 +431,7 @@ def infer_return_type(f: Callable) -> Union[SeriesType, DataFrameType, ScalarTyp
     >>> inferred.dtypes
     [dtype('float64'), dtype('<U26')]
     >>> inferred.spark_type
-    StructType(List(StructField(c0,DoubleType,true),StructField(c1,StringType,true)))
+    StructType([StructField('c0', DoubleType(), True), StructField('c1', StringType(), True)])
 
     >>> def func() -> 'ps.DataFrame[np.float]':
     ...     pass
@@ -439,7 +439,7 @@ def infer_return_type(f: Callable) -> Union[SeriesType, DataFrameType, ScalarTyp
     >>> inferred.dtypes
     [dtype('float64')]
     >>> inferred.spark_type
-    StructType(List(StructField(c0,DoubleType,true)))
+    StructType([StructField('c0', DoubleType(), True)])
 
     >>> def func() -> ps.DataFrame['a': np.float, 'b': int]:
     ...     pass
@@ -447,7 +447,7 @@ def infer_return_type(f: Callable) -> Union[SeriesType, DataFrameType, ScalarTyp
     >>> inferred.dtypes
     [dtype('float64'), dtype('int64')]
     >>> inferred.spark_type
-    StructType(List(StructField(a,DoubleType,true),StructField(b,LongType,true)))
+    StructType([StructField('a', DoubleType(), True), StructField('b', LongType(), True)])
 
     >>> def func() -> "ps.DataFrame['a': np.float, 'b': int]":
     ...     pass
@@ -455,7 +455,7 @@ def infer_return_type(f: Callable) -> Union[SeriesType, DataFrameType, ScalarTyp
     >>> inferred.dtypes
     [dtype('float64'), dtype('int64')]
     >>> inferred.spark_type
-    StructType(List(StructField(a,DoubleType,true),StructField(b,LongType,true)))
+    StructType([StructField('a', DoubleType(), True), StructField('b', LongType(), True)])
 
     >>> pdf = pd.DataFrame({"a": [1, 2, 3], "b": [3, 4, 5]})
     >>> def func() -> ps.DataFrame[pdf.dtypes]:
     ...     pass
@@ -464,7 +464,7 @@ def infer_return_type(f: Callable) -> Union[SeriesType, DataFrameType, ScalarTyp
     >>> inferred.dtypes
     [dtype('int64'), dtype('int64')]
     >>> inferred.spark_type
-    StructType(List(StructField(c0,LongType,true),StructField(c1,LongType,true)))
+    StructType([StructField('c0', LongType(), True), StructField('c1', LongType(), True)])
 
     >>> pdf = pd.DataFrame({"a": [1, 2, 3], "b": [3, 4, 5]})
     >>> def func() -> ps.DataFrame[zip(pdf.columns, pdf.dtypes)]:
     ...     pass
@@ -473,7 +473,7 @@ def infer_return_type(f: Callable) -> Union[SeriesType, DataFrameType, ScalarTyp
     >>> inferred.dtypes
     [dtype('int64'), dtype('int64')]
     >>> inferred.spark_type
-    StructType(List(StructField(a,LongType,true),StructField(b,LongType,true)))
+    StructType([StructField('a', LongType(), True), StructField('b', LongType(), True)])
 
     >>> pdf = pd.DataFrame({("x", "a"): [1, 2, 3], ("y", "b"): [3, 4, 5]})
     >>> def func() -> ps.DataFrame[zip(pdf.columns, pdf.dtypes)]:
     ...     pass
@@ -482,7 +482,7 @@ def infer_return_type(f: Callable) -> Union[SeriesType, DataFrameType, ScalarTyp
     >>> inferred.dtypes
     [dtype('int64'), dtype('int64')]
     >>> inferred.spark_type
-    StructType(List(StructField((x, a),LongType,true),StructField((y, b),LongType,true)))
+    StructType([StructField('(x, a)', LongType(), True), StructField('(y, b)', LongType(), True)])
 
     >>> pdf = pd.DataFrame({"a": [1, 2, 3], "b": pd.Categorical([3, 4, 5])})
     >>> def func() -> ps.DataFrame[pdf.dtypes]:
     ...     pass
@@ -491,7 +491,7 @@ def infer_return_type(f: Callable) -> Union[SeriesType, DataFrameType, ScalarTyp
     >>> inferred.dtypes
     [dtype('int64'), CategoricalDtype(categories=[3, 4, 5], ordered=False)]
     >>> inferred.spark_type
-    StructType(List(StructField(c0,LongType,true),StructField(c1,LongType,true)))
+    StructType([StructField('c0', LongType(), True), StructField('c1', LongType(), True)])
 
     >>> def func() -> ps.DataFrame[zip(pdf.columns, pdf.dtypes)]:
     ...     pass
@@ -499,7 +499,7 @@ def infer_return_type(f: Callable) -> Union[SeriesType, DataFrameType, ScalarTyp
     >>> inferred.dtypes
     [dtype('int64'), CategoricalDtype(categories=[3, 4, 5], ordered=False)]
     >>> inferred.spark_type
-    StructType(List(StructField(a,LongType,true),StructField(b,LongType,true)))
+    StructType([StructField('a', LongType(), True), StructField('b', LongType(), True)])
 
     >>> def func() -> ps.Series[pdf.b.dtype]:
     ...     pass
@@ -507,7 +507,7 @@ def infer_return_type(f: Callable) -> Union[SeriesType, DataFrameType, ScalarTyp
     >>> inferred.dtype
     CategoricalDtype(categories=[3, 4, 5], ordered=False)
     >>> inferred.spark_type
-    LongType
+    LongType()
 
     >>> def func() -> ps.DataFrame[int, [int, int]]:
     ...     pass
@@ -517,7 +517,7 @@ def infer_return_type(f: Callable) -> Union[SeriesType, DataFrameType, ScalarTyp
     >>> inferred.spark_type.simpleString()
     'struct<__index_level_0__:bigint,c0:bigint,c1:bigint>'
     >>> inferred.index_fields
-    [InternalField(dtype=int64,struct_field=StructField(__index_level_0__,LongType,true))]
+    [InternalField(dtype=int64,struct_field=StructField('__index_level_0__', LongType(), True))]
 
     >>> def func() -> ps.DataFrame[pdf.index.dtype, pdf.dtypes]:
     ...     pass
@@ -527,7 +527,7 @@ def infer_return_type(f: Callable) -> Union[SeriesType, DataFrameType, ScalarTyp
     >>> inferred.spark_type.simpleString()
     'struct<__index_level_0__:bigint,c0:bigint,c1:bigint>'
     >>> inferred.index_fields
-    [InternalField(dtype=int64,struct_field=StructField(__index_level_0__,LongType,true))]
+    [InternalField(dtype=int64,struct_field=StructField('__index_level_0__', LongType(), True))]
 
     >>> def func() -> ps.DataFrame[
     ...     ("index", CategoricalDtype(categories=[3, 4, 5], ordered=False)),
     ...     [("id", str), ("A", int)]]:
     ...     pass
@@ -539,7 +539,7 @@ def infer_return_type(f: Callable) -> Union[SeriesType, DataFrameType, ScalarTyp
     >>> inferred.dtypes
     [CategoricalDtype(categories=[3, 4, 5], ordered=False), dtype('<U'), dtype('int64')]
     >>> inferred.spark_type.simpleString()
     'struct<index:bigint,id:string,A:bigint>'
     >>> inferred.index_fields
-    [InternalField(dtype=category,struct_field=StructField(index,LongType,true))]
+    [InternalField(dtype=category,struct_field=StructField('index', LongType(), True))]
 
     >>> def func() -> ps.DataFrame[
     ...     (pdf.index.name, pdf.index.dtype), zip(pdf.columns, pdf.dtypes)]:
     ...     pass
@@ -550,7 +550,7 @@ def infer_return_type(f: Callable) -> Union[SeriesType, DataFrameType, ScalarTyp
     >>> inferred.spark_type.simpleString()
     'struct<__index_level_0__:bigint,a:bigint,b:bigint>'
     >>> inferred.index_fields
-    [InternalField(dtype=int64,struct_field=StructField(__index_level_0__,LongType,true))]
+    [InternalField(dtype=int64,struct_field=StructField('__index_level_0__', LongType(), True))]
     """
     # We should re-import to make sure the class 'SeriesType' is not treated as a class
     # within this module locally. See Series.__class_getitem__ which imports this class

From 58ddcd082600f0f9ab3c0570611c03678107647b Mon Sep 17 00:00:00 2001
From: flynn
Date: Wed, 22 Dec 2021 12:37:53 -0500
Subject: [PATCH 12/15] black fmt

---
 python/pyspark/sql/types.py | 12 ++++--------
 1 file changed, 4 insertions(+), 8 deletions(-)

diff --git a/python/pyspark/sql/types.py b/python/pyspark/sql/types.py
index 322eb8bfcff22..3cd151b798999 100644
--- a/python/pyspark/sql/types.py
+++ b/python/pyspark/sql/types.py
@@ -415,8 +415,7 @@ def simpleString(self) -> str:
         return "array<%s>" % self.elementType.simpleString()
 
     def __repr__(self) -> str:
-        return "ArrayType(%s, %s)" % (self.elementType,
-                                      str(self.containsNull))
+        return "ArrayType(%s, %s)" % (self.elementType, str(self.containsNull))
 
     def jsonValue(self) -> Dict[str, Any]:
         return {
@@ -486,8 +485,7 @@ def simpleString(self) -> str:
         return "map<%s,%s>" % (self.keyType.simpleString(), self.valueType.simpleString())
 
     def __repr__(self) -> str:
-        return "MapType(%s, %s, %s)" % (self.keyType, self.valueType,
-                                        str(self.valueContainsNull))
+        return "MapType(%s, %s, %s)" % (self.keyType, self.valueType, str(self.valueContainsNull))
 
     def jsonValue(self) -> Dict[str, Any]:
         return {
@@ -568,8 +566,7 @@ def simpleString(self) -> str:
         return "%s:%s" % (self.name, self.dataType.simpleString())
 
     def __repr__(self) -> str:
-        return "StructField('%s', %s, %s)" % (self.name, self.dataType,
-                                              str(self.nullable))
+        return "StructField('%s', %s, %s)" % (self.name, self.dataType, str(self.nullable))
 
     def jsonValue(self) -> Dict[str, Any]:
         return {
@@ -752,8 +749,7 @@ def simpleString(self) -> str:
         return "struct<%s>" % (",".join(f.simpleString() for f in self))
 
     def __repr__(self) -> str:
-        return ("StructType([%s])" %
-                ", ".join(str(field) for field in self))
+        return "StructType([%s])" % ", ".join(str(field) for field in self)
 
     def jsonValue(self) -> Dict[str, Any]:
         return {"type": self.typeName(), "fields": [f.jsonValue() for f in self]}

From 5cde0579fbf1f0937f15e4bb703dbafa6e97e095 Mon Sep 17 00:00:00 2001
From: Flynn
Date: Tue, 22 Mar 2022 01:02:42 -0400
Subject: [PATCH 13/15] cleanup doctests

---
 python/pyspark/pandas/internal.py    | 30 ++++++++++++++--------------
 python/pyspark/pandas/spark/utils.py | 26 ++++++++++--------------
 2 files changed, 25 insertions(+), 31 deletions(-)

diff --git a/python/pyspark/pandas/internal.py b/python/pyspark/pandas/internal.py
index 8cee4aae8fe23..c0dab6f2313d5 100644
--- a/python/pyspark/pandas/internal.py
+++ b/python/pyspark/pandas/internal.py
@@ -355,7 +355,7 @@ class InternalFrame:
     ['A', 'B', 'C', 'D', 'E']
     >>> internal.index_names
     [('A',)]
-    >>> internal.data_fields
+    >>> internal.data_fields  # doctest: +NORMALIZE_WHITESPACE
     [InternalField(dtype=int64, struct_field=StructField('B', LongType(), False)),
      InternalField(dtype=int64, struct_field=StructField('C', LongType(), False)),
      InternalField(dtype=int64, struct_field=StructField('D', LongType(), False)),
     InternalField(dtype=int64, struct_field=StructField('E', LongType(), False))]
@@ -596,12 +596,12 @@ def __init__(
     [('row_index_a',), ('row_index_b',), ('a', 'x')]
 
     >>> internal.index_fields  # doctest: +NORMALIZE_WHITESPACE
-    [InternalField(dtype=object, struct_field=StructField('__index_level_0__',
-    StringType(), False)),
-    InternalField(dtype=object, struct_field=StructField('__index_level_1__',
-    StringType(), False)),
-    InternalField(dtype=int64, struct_field=StructField('(a, x)',
-    LongType(), False))]
+    [InternalField(dtype=object,
+    struct_field=StructField('__index_level_0__', StringType(), False)),
+    InternalField(dtype=object,
+    struct_field=StructField('__index_level_1__', StringType(), False)),
+    InternalField(dtype=int64,
+    struct_field=StructField('(a, x)', LongType(), False))]
 
     >>> internal.column_labels
     [('a', 'y'), ('b', 'z')]
@@ -1526,10 +1526,10 @@ def prepare_pandas_frame(
     ...     InternalFrame.prepare_pandas_frame(pdf, prefer_timestamp_ntz=True)
     ... )
     >>> data_fields  # doctest: +NORMALIZE_WHITESPACE
-    [InternalField(dtype=datetime64[ns], struct_field=StructField('dt',
-    TimestampNTZType(), False)),
-    InternalField(dtype=object, struct_field=StructField('dt_obj',
-    TimestampNTZType(), False))]
+    [InternalField(dtype=datetime64[ns],
+    struct_field=StructField('dt', TimestampNTZType(), False)),
+    InternalField(dtype=object,
+    struct_field=StructField('dt_obj', TimestampNTZType(), False))]
 
     >>> pdf = pd.DataFrame({
     ...     "td": [datetime.timedelta(0)], "td_obj": [datetime.timedelta(0)]
     ... })
     >>> _, _, _, _, data_fields = (
     ...     InternalFrame.prepare_pandas_frame(pdf)
     ... )
     >>> data_fields  # doctest: +NORMALIZE_WHITESPACE
-    [InternalField(dtype=timedelta64[ns], struct_field=StructField('td',
-    DayTimeIntervalType(0, 3), False)),
-    InternalField(dtype=object, struct_field=StructField('td_obj',
-    DayTimeIntervalType(0, 3), False))]
+    [InternalField(dtype=timedelta64[ns],
+    struct_field=StructField('td', DayTimeIntervalType(0, 3), False)),
+    InternalField(dtype=object,
+    struct_field=StructField('td_obj', DayTimeIntervalType(0, 3), False))]
     """
     pdf = pdf.copy()
diff --git a/python/pyspark/pandas/spark/utils.py b/python/pyspark/pandas/spark/utils.py
index fcae47380bd27..9b8b5bb7542ab 100644
--- a/python/pyspark/pandas/spark/utils.py
+++ b/python/pyspark/pandas/spark/utils.py
@@ -63,14 +63,11 @@ def as_nullable_spark_type(dt: DataType) -> DataType:
     ...             StructField('b', StringType(), True)])),
StructField("B", FloatType(), False)])) # doctest: +NORMALIZE_WHITESPACE StructType([StructField('A', - StructType([StructField('a', - MapType(IntegerType(), - ArrayType(IntegerType(), - True), - True), - True), - StructField('b', StringType(), True)]), True), - StructField('B', FloatType(), True)]) + StructType([StructField('a', + MapType(IntegerType(), + ArrayType(IntegerType(), True), True), True), + StructField('b', StringType(), True)]), True), + StructField('B', FloatType(), True)]) """ if isinstance(dt, StructType): new_fields = [] @@ -151,14 +148,11 @@ def force_decimal_precision_scale( ... StructField("B", DecimalType(30, 15), False)]), ... precision=30, scale=15) # doctest: +NORMALIZE_WHITESPACE StructType([StructField('A', - StructType([StructField('a', - MapType(DecimalType(30,15), - ArrayType(DecimalType(30,15), - False), - False), - False), - StructField('b', StringType(), True)]), True), - StructField('B', DecimalType(30,15), False)]) + StructType([StructField('a', + MapType(DecimalType(30,15), + ArrayType(DecimalType(30,15), False), False), False), + StructField('b', StringType(), True)]), True), + StructField('B', DecimalType(30,15), False)]) """ if isinstance(dt, StructType): new_fields = [] From f6b495d378a129dd4d535035936e58384549481f Mon Sep 17 00:00:00 2001 From: flynn Date: Tue, 22 Mar 2022 10:27:19 -0400 Subject: [PATCH 14/15] format --- python/pyspark/shuffle.py | 1 - 1 file changed, 1 deletion(-) diff --git a/python/pyspark/shuffle.py b/python/pyspark/shuffle.py index 410519ddca96a..0709d2de25a67 100644 --- a/python/pyspark/shuffle.py +++ b/python/pyspark/shuffle.py @@ -52,7 +52,6 @@ def get_used_memory(): info = process.get_memory_info() return info.rss >> 20 - except ImportError: def get_used_memory(): From 9baa5b7cdf35794fb4f49fe3d535309f89bcfde1 Mon Sep 17 00:00:00 2001 From: flynn Date: Tue, 22 Mar 2022 13:59:33 -0400 Subject: [PATCH 15/15] add note in migration guide --- python/docs/source/migration_guide/pyspark_3.2_to_3.3.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/python/docs/source/migration_guide/pyspark_3.2_to_3.3.rst b/python/docs/source/migration_guide/pyspark_3.2_to_3.3.rst index f2701d4fb7216..d81008d63cbe9 100644 --- a/python/docs/source/migration_guide/pyspark_3.2_to_3.3.rst +++ b/python/docs/source/migration_guide/pyspark_3.2_to_3.3.rst @@ -23,3 +23,4 @@ Upgrading from PySpark 3.2 to 3.3 * In Spark 3.3, the ``pyspark.pandas.sql`` method follows [the standard Python string formatter](https://docs.python.org/3/library/string.html#format-string-syntax). To restore the previous behavior, set ``PYSPARK_PANDAS_SQL_LEGACY`` environment variable to ``1``. * In Spark 3.3, the ``drop`` method of pandas API on Spark DataFrame supports dropping rows by ``index``, and sets dropping by index instead of column by default. * In Spark 3.3, PySpark upgrades Pandas version, the new minimum required version changes from 0.23.2 to 1.0.5. +* In Spark 3.3, the ``repr`` return values of SQL DataTypes have been changed to yield an object with the same value when passed to ``eval``.