♻️ Refactor header option parser to use the standard library instead …

…of a custom RegEx (#75)
Kludex · Feb 3, 2024 · 20f0ef6 · 20f0ef6
1 parent d3d16da
commit 20f0ef6
Showing 1 changed file with 26 additions and 24 deletions.
diff --git a/multipart/multipart.py b/multipart/multipart.py
@@ -9,6 +9,8 @@
 import tempfile
 from io import BytesIO
 from numbers import Number
+from email.message import Message
+from typing import Dict, Union, Tuple
 
 # Unique missing object.
 _missing = object()
@@ -76,44 +78,44 @@
 QUOTE = b'"'[0]
 
 
-def parse_options_header(value):
+def parse_options_header(value: Union[str, bytes]) -> Tuple[bytes, Dict[bytes, bytes]]:
     """
     Parses a Content-Type header into a value in the following format:
         (content_type, {parameters})
     """
+    # Uses email.message.Message to parse the header as described in PEP 594.
+    # Ref: https://peps.python.org/pep-0594/#cgi
     if not value:
         return (b'', {})
 
-    # If we are passed a string, we assume that it conforms to WSGI and does
-    # not contain any code point that's not in latin-1.
-    if isinstance(value, str):            # pragma: no cover
-        value = value.encode('latin-1')
+    # If we are passed bytes, we assume that it conforms to WSGI, encoding in latin-1.
+    if isinstance(value, bytes):  # pragma: no cover
+        value = value.decode('latin-1')
+
+    # For types
+    assert isinstance(value, str), 'Value should be a string by now'
 
     # If we have no options, return the string as-is.
-    if b';' not in value:
-        return (value.lower().strip(), {})
+    if ";" not in value:
+        return (value.lower().strip().encode('latin-1'), {})
 
     # Split at the first semicolon, to get our value and then options.
-    ctype, rest = value.split(b';', 1)
+    # ctype, rest = value.split(b';', 1)
+    message = Message()
+    message['content-type'] = value
+    params = message.get_params()
+    # If there were no parameters, this would have already returned above
+    assert params, 'At least the content type value should be present'
+    ctype = params.pop(0)[0].encode('latin-1')
     options = {}
-
-    # Parse the options.
-    for match in OPTION_RE.finditer(rest):
-        key = match.group(1).lower()
-        value = match.group(2)
-        if value[0] == QUOTE and value[-1] == QUOTE:
-            # Unquote the value.
-            value = value[1:-1]
-            value = value.replace(b'\\\\', b'\\').replace(b'\\"', b'"')
-
+    for param in params:
+        key, value = param
         # If the value is a filename, we need to fix a bug on IE6 that sends
         # the full file path instead of the filename.
-        if key == b'filename':
-            if value[1:3] == b':\\' or value[:2] == b'\\\\':
-                value = value.split(b'\\')[-1]
-
-        options[key] = value
-
+        if key == 'filename':
+            if value[1:3] == ':\\' or value[:2] == '\\\\':
+                value = value.split('\\')[-1]
+        options[key.encode('latin-1')] = value.encode('latin-1')
     return ctype, options