From b02527178d38323a239f651506ca609fd963f454 Mon Sep 17 00:00:00 2001
From: yangjie01
Date: Thu, 26 Nov 2020 22:28:41 +0800
Subject: [PATCH 1/7] fix SPARK-33566

---
 .../spark/sql/catalyst/csv/CSVOptions.scala   |  8 ++++++-
 .../apache/spark/sql/DataFrameReader.scala    | 21 +++++++++++++++++++
 .../unescaped-quotes-unescaped-delimiter.csv  |  3 +++
 .../execution/datasources/csv/CSVSuite.scala  | 15 +++++++++++++
 4 files changed, 46 insertions(+), 1 deletion(-)
 create mode 100644 sql/core/src/test/resources/test-data/unescaped-quotes-unescaped-delimiter.csv

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVOptions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVOptions.scala
index f2191fcf35f1a..ec405994eadef 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVOptions.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVOptions.scala
@@ -213,6 +213,12 @@ class CSVOptions(
   }
 
   val lineSeparatorInWrite: Option[String] = lineSeparator
 
+  /**
+   * The handling method to be used when unescaped quotes are found in the input.
+   */
+  val unescapedQuoteHandling: UnescapedQuoteHandling = UnescapedQuoteHandling.valueOf(parameters
+    .getOrElse("unescapedQuoteHandling", "STOP_AT_DELIMITER").toUpperCase(Locale.ROOT))
+
   def asWriterSettings: CsvWriterSettings = {
     val writerSettings = new CsvWriterSettings()
     val format = writerSettings.getFormat
@@ -258,7 +264,7 @@ class CSVOptions(
     settings.setNullValue(nullValue)
     settings.setEmptyValue(emptyValueInRead)
     settings.setMaxCharsPerColumn(maxCharsPerColumn)
-    settings.setUnescapedQuoteHandling(UnescapedQuoteHandling.STOP_AT_DELIMITER)
+    settings.setUnescapedQuoteHandling(unescapedQuoteHandling)
     settings.setLineSeparatorDetectionEnabled(lineSeparatorInRead.isEmpty && multiLine)
     lineSeparatorInRead.foreach { _ =>
       settings.setNormalizeLineEndingsWithinQuotes(!multiLine)
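A minimal, self-contained sketch of what the new CSVOptions field does with the option value. The object name and options map below are illustrative, not part of the patch; it assumes com.univocity:univocity-parsers (Spark's CSV backend) on the classpath, and it only mirrors the quote-escape default ('\\') from Spark's settings rather than the full configuration. Note that Enum.valueOf throws IllegalArgumentException for names outside the five enum constants, so a mistyped option value fails as soon as CSVOptions is constructed.

    // Illustrative sketch (not part of the patch): mirrors how CSVOptions turns the
    // case-insensitive option string into univocity's enum and applies it.
    import java.util.Locale
    import com.univocity.parsers.csv.{CsvParser, CsvParserSettings, UnescapedQuoteHandling}

    object UnescapedQuoteHandlingSketch {
      def main(args: Array[String]): Unit = {
        // Stand-in for the reader options map; any casing works.
        val parameters = Map("unescapedQuoteHandling" -> "stop_at_closing_quote")

        // Same lookup as the new CSVOptions field: STOP_AT_DELIMITER by default,
        // uppercased before Enum.valueOf (which throws on unknown names).
        val handling = UnescapedQuoteHandling.valueOf(
          parameters.getOrElse("unescapedQuoteHandling", "STOP_AT_DELIMITER")
            .toUpperCase(Locale.ROOT))

        val settings = new CsvParserSettings()
        // Spark's CSV source defaults the quote escape to backslash, so a bare ""
        // inside a quoted field counts as an unescaped quote; mirror that here.
        settings.getFormat.setQuoteEscape('\\')
        settings.setUnescapedQuoteHandling(handling)

        // Parse one fixture row; per the test added in this patch, the unescaped
        // quote no longer cuts the field at the delimiter under STOP_AT_CLOSING_QUOTE.
        val row = new CsvParser(settings).parseLine("\"a,\"\"b,c\",\"xyz\"")
        println(row.mkString("[", ", ", "]"))
      }
    }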
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala
index b26bc6441b6cf..8f96f0b882424 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala
@@ -727,6 +727,27 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging {
    * a record can have.</li>
    * <li>`maxCharsPerColumn` (default `-1`): defines the maximum number of characters allowed
    * for any given value being read. By default, it is -1 meaning unlimited length</li>
+   * <li>`unescapedQuoteHandling` (default `STOP_AT_DELIMITER`): defines how the CsvParser
+   * will handle values with unescaped quotes.
+   *   <ul>
+   *     <li>`STOP_AT_CLOSING_QUOTE`: If unescaped quotes are found in the input, accumulate
+   *     the quote character and proceed parsing the value as a quoted value, until a closing
+   *     quote is found.</li>
+   *     <li>`BACK_TO_DELIMITER`: If unescaped quotes are found in the input, consider the value
+   *     as an unquoted value. This will make the parser accumulate all characters of the current
+   *     parsed value until the delimiter is found. If no delimiter is found in the value, the
+   *     parser will continue accumulating characters from the input until a delimiter or line
+   *     ending is found.</li>
+   *     <li>`STOP_AT_DELIMITER`: If unescaped quotes are found in the input, consider the value
+   *     as an unquoted value. This will make the parser accumulate all characters until the
+   *     delimiter or a line ending is found in the input.</li>
+   *     <li>`SKIP_VALUE`: If unescaped quotes are found in the input, the content parsed
+   *     for the given value will be skipped and the value set in nullValue will be produced
+   *     instead.</li>
+   *     <li>`RAISE_ERROR`: If unescaped quotes are found in the input, a TextParsingException
+   *     will be thrown.</li>
+   *   </ul>
+   * </li>
    * <li>`mode` (default `PERMISSIVE`): allows a mode for dealing with corrupt records
    * during parsing. It supports the following case-insensitive modes. Note that Spark tries
    * to parse only required columns in CSV under column pruning. Therefore, corrupt records
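As a usage illustration for the new batch reader option (the session setup and file path are illustrative, not part of the patch):

    // Usage sketch: reading a CSV that mixes unescaped quotes with unescaped
    // delimiters, anchoring field boundaries to the closing quote.
    import org.apache.spark.sql.SparkSession

    object UnescapedQuoteReadExample {
      def main(args: Array[String]): Unit = {
        val spark = SparkSession.builder()
          .master("local[*]")
          .appName("unescapedQuoteHandling example")
          .getOrCreate()

        val df = spark.read
          .option("header", "true")
          .option("unescapedQuoteHandling", "STOP_AT_CLOSING_QUOTE")
          .csv("/tmp/unescaped-quotes-unescaped-delimiter.csv") // illustrative path

        df.show(truncate = false)
        spark.stop()
      }
    }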
diff --git a/sql/core/src/test/resources/test-data/unescaped-quotes-unescaped-delimiter.csv b/sql/core/src/test/resources/test-data/unescaped-quotes-unescaped-delimiter.csv
new file mode 100644
index 0000000000000..a1d91b6d27a79
--- /dev/null
+++ b/sql/core/src/test/resources/test-data/unescaped-quotes-unescaped-delimiter.csv
@@ -0,0 +1,3 @@
+c1,c2
+"a,""b,c","xyz"
+"a,b,c","x""yz"
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
index a236814fdcdcd..97c0fe11c17ad 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala
@@ -75,6 +75,8 @@ abstract class CSVSuite
   private val valueMalformedFile = "test-data/value-malformed.csv"
   private val badAfterGoodFile = "test-data/bad_after_good.csv"
   private val malformedRowFile = "test-data/malformedRow.csv"
+  private val unescapedQuotesAndUnescapedDelimiterFile =
+    "test-data/unescaped-quotes-unescaped-delimiter.csv"
 
   /** Verifies data and schema. */
   private def verifyCars(
@@ -2428,6 +2430,19 @@ abstract class CSVSuite
       assert(readback.collect sameElements Array(Row("0"), Row("1"), Row("2")))
     }
   }
+
+  test("SPARK-33566: configure UnescapedQuoteHandling to parse " +
+    "unescaped quotes and unescaped delimiters correctly") {
+    // Without setting unescapedQuoteHandling to STOP_AT_CLOSING_QUOTE,
+    // the result would be Row(""""a,""b""", """c""""), Row("""a,b,c""", """"x""yz"""")
+    val result = spark.read
+      .option("inferSchema", "true")
+      .option("header", "true")
+      .option("unescapedQuoteHandling", "STOP_AT_CLOSING_QUOTE")
+      .csv(testFile(unescapedQuotesAndUnescapedDelimiterFile)).collect()
+    val expectedResults = Array(Row("""a,""b,c""", "xyz"), Row("""a,b,c""", """x""yz"""))
+    assert(result.sameElements(expectedResults))
+  }
 }
 
 class CSVv1Suite extends CSVSuite {
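To make the test's comment concrete, here is a sketch that reads the same fixture twice, once with the default handling and once with the new option. The expected contents noted in the comments are taken from the test comment and expected rows above, not independently re-derived; the path assumes the repository layout.

    // Sketch contrasting default parsing with the new option on the fixture above.
    import org.apache.spark.sql.SparkSession

    object CompareUnescapedQuoteHandling {
      def main(args: Array[String]): Unit = {
        val spark = SparkSession.builder().master("local[*]").appName("compare").getOrCreate()
        val path = "sql/core/src/test/resources/test-data/unescaped-quotes-unescaped-delimiter.csv"

        // Default STOP_AT_DELIMITER: per the test comment, "a,""b,c" is cut at the
        // delimiter once the unescaped quote is seen.
        spark.read.option("header", "true").csv(path).show(truncate = false)

        // STOP_AT_CLOSING_QUOTE: the field is accumulated up to the closing quote,
        // yielding a,""b,c and xyz as in the test's expected rows.
        spark.read
          .option("header", "true")
          .option("unescapedQuoteHandling", "STOP_AT_CLOSING_QUOTE")
          .csv(path)
          .show(truncate = false)

        spark.stop()
      }
    }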
From 1770c565aa573e6f32e404b4f775f2c12edcae2e Mon Sep 17 00:00:00 2001
From: yangjie01
Date: Fri, 27 Nov 2020 10:52:36 +0800
Subject: [PATCH 2/7] add comments to DataStreamReader.scala, readwriter.py and
 streaming.py

---
 python/pyspark/sql/readwriter.py              | 21 +++++++++++++++++++
 python/pyspark/sql/streaming.py               | 21 +++++++++++++++++++
 .../sql/streaming/DataStreamReader.scala      | 21 +++++++++++++++++++
 3 files changed, 63 insertions(+)

diff --git a/python/pyspark/sql/readwriter.py b/python/pyspark/sql/readwriter.py
index bb31e6a3e09f8..b492198f2959c 100644
--- a/python/pyspark/sql/readwriter.py
+++ b/python/pyspark/sql/readwriter.py
@@ -259,6 +259,27 @@ def json(self, path, schema=None, primitivesAsString=None, prefersDecimal=None,
         allows accepting quoting of all character using backslash quoting mechanism. If None is
         set, it uses the default value, ``false``.
+    unescapedQuoteHandling : str, optional
+        defines how the CsvParser will handle values with unescaped quotes. If None is
+        set, it uses the default value, ``STOP_AT_DELIMITER``.
+
+        * ``STOP_AT_CLOSING_QUOTE``: If unescaped quotes are found in the input, accumulate \
+          the quote character and proceed parsing the value as a quoted value, until a closing \
+          quote is found.
+        * ``BACK_TO_DELIMITER``: If unescaped quotes are found in the input, consider the value \
+          as an unquoted value. This will make the parser accumulate all characters of the current \
+          parsed value until the delimiter is found. If no delimiter is found in the value, the \
+          parser will continue accumulating characters from the input until a delimiter or line \
+          ending is found.
+        * ``STOP_AT_DELIMITER``: If unescaped quotes are found in the input, consider the value \
+          as an unquoted value. This will make the parser accumulate all characters until the \
+          delimiter or a line ending is found in the input.
+        * ``SKIP_VALUE``: If unescaped quotes are found in the input, the content parsed \
+          for the given value will be skipped and the value set in nullValue will be produced \
+          instead.
+        * ``RAISE_ERROR``: If unescaped quotes are found in the input, a TextParsingException \
+          will be thrown.
+
     mode : str, optional
         allows a mode for dealing with corrupt records during parsing. If None is
         set, it uses the default value, ``PERMISSIVE``.
diff --git a/python/pyspark/sql/streaming.py b/python/pyspark/sql/streaming.py
index e7b2fa16d620a..63fcbd52fd366 100644
--- a/python/pyspark/sql/streaming.py
+++ b/python/pyspark/sql/streaming.py
@@ -851,6 +851,27 @@ def csv(self, path, schema=None, sep=None, encoding=None, quote=None, escape=Non
     maxMalformedLogPerPartition : str or int, optional
         this parameter is no longer used since Spark 2.2.0. If specified, it is ignored.
+    unescapedQuoteHandling : str, optional
+        defines how the CsvParser will handle values with unescaped quotes. If None is
+        set, it uses the default value, ``STOP_AT_DELIMITER``.
+
+        * ``STOP_AT_CLOSING_QUOTE``: If unescaped quotes are found in the input, accumulate \
+          the quote character and proceed parsing the value as a quoted value, until a closing \
+          quote is found.
+        * ``BACK_TO_DELIMITER``: If unescaped quotes are found in the input, consider the value \
+          as an unquoted value. This will make the parser accumulate all characters of the current \
+          parsed value until the delimiter is found. If no delimiter is found in the value, the \
+          parser will continue accumulating characters from the input until a delimiter or line \
+          ending is found.
+        * ``STOP_AT_DELIMITER``: If unescaped quotes are found in the input, consider the value \
+          as an unquoted value. This will make the parser accumulate all characters until the \
+          delimiter or a line ending is found in the input.
+        * ``SKIP_VALUE``: If unescaped quotes are found in the input, the content parsed \
+          for the given value will be skipped and the value set in nullValue will be produced \
+          instead.
+        * ``RAISE_ERROR``: If unescaped quotes are found in the input, a TextParsingException \
+          will be thrown.
+
     mode : str, optional
         allows a mode for dealing with corrupt records during parsing. If None is
         set, it uses the default value, ``PERMISSIVE``.
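For completeness, a sketch of the strictest setting documented above. Whether the TextParsingException aborts the query or is absorbed depends on the reader's parse mode, which is not re-verified here; FAILFAST is used so any malformed record surfaces on collect. Path and object name are illustrative.

    // Sketch: opting into RAISE_ERROR to fail fast on unescaped quotes instead
    // of re-interpreting the value.
    import scala.util.Try
    import org.apache.spark.sql.SparkSession

    object RaiseErrorOnUnescapedQuotes {
      def main(args: Array[String]): Unit = {
        val spark = SparkSession.builder().master("local[*]").appName("raise-error").getOrCreate()

        val attempt = Try {
          spark.read
            .option("header", "true")
            .option("mode", "FAILFAST")
            .option("unescapedQuoteHandling", "RAISE_ERROR")
            .csv("/tmp/unescaped-quotes-unescaped-delimiter.csv") // illustrative path
            .collect()
        }
        println(s"parse succeeded: ${attempt.isSuccess}")
        spark.stop()
      }
    }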
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamReader.scala b/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamReader.scala
index 9bc4acd49a980..7f4ef8be562fb 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamReader.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/streaming/DataStreamReader.scala
@@ -396,6 +396,27 @@ final class DataStreamReader private[sql](sparkSession: SparkSession) extends Lo
    * a record can have.</li>
    * <li>`maxCharsPerColumn` (default `-1`): defines the maximum number of characters allowed
    * for any given value being read. By default, it is -1 meaning unlimited length</li>
+   * <li>`unescapedQuoteHandling` (default `STOP_AT_DELIMITER`): defines how the CsvParser
+   * will handle values with unescaped quotes.
+   *   <ul>
+   *     <li>`STOP_AT_CLOSING_QUOTE`: If unescaped quotes are found in the input, accumulate
+   *     the quote character and proceed parsing the value as a quoted value, until a closing
+   *     quote is found.</li>
+   *     <li>`BACK_TO_DELIMITER`: If unescaped quotes are found in the input, consider the value
+   *     as an unquoted value. This will make the parser accumulate all characters of the current
+   *     parsed value until the delimiter is found. If no delimiter is found in the value, the
+   *     parser will continue accumulating characters from the input until a delimiter or line
+   *     ending is found.</li>
+   *     <li>`STOP_AT_DELIMITER`: If unescaped quotes are found in the input, consider the value
+   *     as an unquoted value. This will make the parser accumulate all characters until the
+   *     delimiter or a line ending is found in the input.</li>
+   *     <li>`SKIP_VALUE`: If unescaped quotes are found in the input, the content parsed
+   *     for the given value will be skipped and the value set in nullValue will be produced
+   *     instead.</li>
+   *     <li>`RAISE_ERROR`: If unescaped quotes are found in the input, a TextParsingException
+   *     will be thrown.</li>
+   *   </ul>
+   * </li>
    * <li>`mode` (default `PERMISSIVE`): allows a mode for dealing with corrupt records
    * during parsing. It supports the following case-insensitive modes.
    *
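Finally, a streaming usage sketch for the DataStreamReader option documented above. Streaming CSV sources require an explicit schema; the schema, directory, and sink wiring here are illustrative, not part of the patch.

    // Streaming usage sketch: DataStreamReader accepts the same option.
    import org.apache.spark.sql.SparkSession
    import org.apache.spark.sql.types.{StringType, StructField, StructType}

    object StreamingUnescapedQuoteExample {
      def main(args: Array[String]): Unit = {
        val spark = SparkSession.builder()
          .master("local[*]")
          .appName("streaming unescapedQuoteHandling example")
          .getOrCreate()

        // Matches the two-column fixture used by the batch test.
        val schema = StructType(Seq(
          StructField("c1", StringType),
          StructField("c2", StringType)))

        val df = spark.readStream
          .schema(schema)
          .option("header", "true")
          .option("unescapedQuoteHandling", "STOP_AT_CLOSING_QUOTE")
          .csv("/tmp/csv-input-dir") // directory watched for new files

        val query = df.writeStream
          .format("console")
          .option("truncate", "false")
          .start()

        query.awaitTermination()
      }
    }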