From bd5f92437173aae77cb128a1ebb8bf58effd13b5 Mon Sep 17 00:00:00 2001
From: Sam Bull <git@sambull.org>
Date: Fri, 6 Oct 2023 20:32:34 +0100
Subject: [PATCH] Allow lax response parsing on Py parser (#7663)

---
 CHANGES/7663.feature      |   1 +
 Makefile                  |   2 +-
 aiohttp/http_parser.py    |  86 ++++++++++++----
 tests/test_http_parser.py | 202 ++++++++++++++++++++++++++------------
 4 files changed, 205 insertions(+), 86 deletions(-)
 create mode 100644 CHANGES/7663.feature

diff --git a/CHANGES/7663.feature b/CHANGES/7663.feature
new file mode 100644
index 00000000000..509a7ad7e2a
--- /dev/null
+++ b/CHANGES/7663.feature
@@ -0,0 +1 @@
+Updated Python parser to comply with latest HTTP specs and allow lax response parsing -- by :user:`Dreamorcerer`
diff --git a/Makefile b/Makefile
index 103de1ec745..9bfea9122bc 100644
--- a/Makefile
+++ b/Makefile
@@ -58,7 +58,7 @@ aiohttp/_find_header.c: $(call to-hash,aiohttp/hdrs.py ./tools/gen.py)
 
 # _find_headers generator creates _headers.pyi as well
 aiohttp/%.c: aiohttp/%.pyx $(call to-hash,$(CYS)) aiohttp/_find_header.c
-	cython -3 -o $@ $< -I aiohttp
+	cython -3 -o $@ $< -I aiohttp -Werror
 
 vendor/llhttp/node_modules: vendor/llhttp/package.json
 	cd vendor/llhttp; npm install
diff --git a/aiohttp/http_parser.py b/aiohttp/http_parser.py
index 6d4261e337a..24be6a28bdd 100644
--- a/aiohttp/http_parser.py
+++ b/aiohttp/http_parser.py
@@ -5,9 +5,12 @@
 from contextlib import suppress
 from enum import IntEnum
 from typing import (
+    Any,
+    ClassVar,
     Final,
     Generic,
     List,
+    Literal,
     NamedTuple,
     Optional,
     Pattern,
@@ -24,7 +27,7 @@
 from . import hdrs
 from .base_protocol import BaseProtocol
 from .compression_utils import HAS_BROTLI, BrotliDecompressor, ZLibDecompressor
-from .helpers import NO_EXTENSIONS, BaseTimerContext
+from .helpers import DEBUG, NO_EXTENSIONS, BaseTimerContext
 from .http_exceptions import (
     BadHttpMessage,
     BadStatusLine,
@@ -48,6 +51,8 @@
     "RawResponseMessage",
 )
 
+_SEP = Literal[b"\r\n", b"\n"]
+
 ASCIISET: Final[Set[str]] = set(string.printable)
 
 # See https://www.rfc-editor.org/rfc/rfc9110.html#name-overview
@@ -60,6 +65,7 @@
 METHRE: Final[Pattern[str]] = re.compile(r"[!#$%&'*+\-.^_`|~0-9A-Za-z]+")
 VERSRE: Final[Pattern[str]] = re.compile(r"HTTP/(\d).(\d)")
 HDRRE: Final[Pattern[bytes]] = re.compile(rb"[\x00-\x1F\x7F()<>@,;:\[\]={} \t\"\\]")
+HEXDIGIT = re.compile(rb"[0-9a-fA-F]+")
 
 
 class RawRequestMessage(NamedTuple):
@@ -206,6 +212,8 @@ def parse_headers(
 
 
 class HttpParser(abc.ABC, Generic[_MsgT]):
+    lax: ClassVar[bool] = False
+
     def __init__(
         self,
         protocol: BaseProtocol,
@@ -266,7 +274,7 @@ def feed_eof(self) -> Optional[_MsgT]:
     def feed_data(
         self,
         data: bytes,
-        SEP: bytes = b"\r\n",
+        SEP: _SEP = b"\r\n",
         EMPTY: bytes = b"",
         CONTENT_LENGTH: istr = hdrs.CONTENT_LENGTH,
         METH_CONNECT: str = hdrs.METH_CONNECT,
@@ -288,13 +296,16 @@ def feed_data(
                 pos = data.find(SEP, start_pos)
                 # consume \r\n
                 if pos == start_pos and not self._lines:
-                    start_pos = pos + 2
+                    start_pos = pos + len(SEP)
                     continue
 
                 if pos >= start_pos:
                     # line found
-                    self._lines.append(data[start_pos:pos])
-                    start_pos = pos + 2
+                    line = data[start_pos:pos]
+                    if SEP == b"\n":  # For lax response parsing
+                        line = line.rstrip(b"\r")
+                    self._lines.append(line)
+                    start_pos = pos + len(SEP)
 
                     # \r\n\r\n found
                     if self._lines[-1] == EMPTY:
@@ -311,7 +322,7 @@ def get_content_length() -> Optional[int]:
 
                             # Shouldn't allow +/- or other number formats.
                             # https://www.rfc-editor.org/rfc/rfc9110#section-8.6-2
-                            if not length_hdr.strip(" \t").isdigit():
+                            if not length_hdr.strip(" \t").isdecimal():
                                 raise InvalidHeader(CONTENT_LENGTH)
 
                             return int(length_hdr)
@@ -348,6 +359,7 @@ def get_content_length() -> Optional[int]:
                                 readall=self.readall,
                                 response_with_body=self.response_with_body,
                                 auto_decompress=self._auto_decompress,
+                                lax=self.lax,
                             )
                             if not payload_parser.done:
                                 self._payload_parser = payload_parser
@@ -366,6 +378,7 @@ def get_content_length() -> Optional[int]:
                                 compression=msg.compression,
                                 readall=True,
                                 auto_decompress=self._auto_decompress,
+                                lax=self.lax,
                             )
                         else:
                             if (
@@ -389,6 +402,7 @@ def get_content_length() -> Optional[int]:
                                     readall=True,
                                     response_with_body=self.response_with_body,
                                     auto_decompress=self._auto_decompress,
+                                    lax=self.lax,
                                 )
                                 if not payload_parser.done:
                                     self._payload_parser = payload_parser
@@ -411,7 +425,7 @@ def get_content_length() -> Optional[int]:
                 assert not self._lines
                 assert self._payload_parser is not None
                 try:
-                    eof, data = self._payload_parser.feed_data(data[start_pos:])
+                    eof, data = self._payload_parser.feed_data(data[start_pos:], SEP)
                 except BaseException as exc:
                     if self.payload_exception is not None:
                         self._payload_parser.payload.set_exception(
@@ -456,12 +470,21 @@ def parse_headers(
 
         # https://www.rfc-editor.org/rfc/rfc9110.html#section-5.5-6
         # https://www.rfc-editor.org/rfc/rfc9110.html#name-collected-abnf
-        singletons = (hdrs.CONTENT_LENGTH, hdrs.CONTENT_LOCATION, hdrs.CONTENT_RANGE,
-                      hdrs.CONTENT_TYPE, hdrs.ETAG, hdrs.HOST, hdrs.MAX_FORWARDS,
-                      hdrs.SERVER, hdrs.TRANSFER_ENCODING, hdrs.USER_AGENT)
+        singletons = (
+            hdrs.CONTENT_LENGTH,
+            hdrs.CONTENT_LOCATION,
+            hdrs.CONTENT_RANGE,
+            hdrs.CONTENT_TYPE,
+            hdrs.ETAG,
+            hdrs.HOST,
+            hdrs.MAX_FORWARDS,
+            hdrs.SERVER,
+            hdrs.TRANSFER_ENCODING,
+            hdrs.USER_AGENT,
+        )
         bad_hdr = next((h for h in singletons if len(headers.getall(h, ())) > 1), None)
         if bad_hdr is not None:
-            raise BadHttpMessage("Duplicate '{}' header found.".format(bad_hdr))
+            raise BadHttpMessage(f"Duplicate '{bad_hdr}' header found.")
 
         # keep-alive
         conn = headers.get(hdrs.CONNECTION)
@@ -597,6 +620,20 @@ class HttpResponseParser(HttpParser[RawResponseMessage]):
     Returns RawResponseMessage.
     """
 
+    # Lax mode should only be enabled on response parser.
+    lax = not DEBUG
+
+    def feed_data(
+        self,
+        data: bytes,
+        SEP: Optional[_SEP] = None,
+        *args: Any,
+        **kwargs: Any,
+    ) -> Tuple[List[Tuple[RawResponseMessage, StreamReader]], bool, bytes]:
+        if SEP is None:
+            SEP = b"\r\n" if DEBUG else b"\n"
+        return super().feed_data(data, SEP, *args, **kwargs)
+
     def parse_message(self, lines: List[bytes]) -> RawResponseMessage:
         line = lines[0].decode("utf-8", "surrogateescape")
         try:
@@ -621,7 +658,7 @@ def parse_message(self, lines: List[bytes]) -> RawResponseMessage:
         version_o = HttpVersion(int(match.group(1)), int(match.group(2)))
 
         # The status code is a three-digit number
-        if len(status) != 3 or not status.isdigit():
+        if len(status) != 3 or not status.isdecimal():
             raise BadStatusLine(line)
         status_i = int(status)
 
@@ -663,6 +700,7 @@ def __init__(
         readall: bool = False,
         response_with_body: bool = True,
         auto_decompress: bool = True,
+        lax: bool = False,
     ) -> None:
         self._length = 0
         self._type = ParseState.PARSE_NONE
@@ -670,6 +708,7 @@ def __init__(
         self._chunk_size = 0
         self._chunk_tail = b""
         self._auto_decompress = auto_decompress
+        self._lax = lax
         self.done = False
 
         # payload decompression wrapper
@@ -721,7 +760,7 @@ def feed_eof(self) -> None:
             )
 
     def feed_data(
-        self, chunk: bytes, SEP: bytes = b"\r\n", CHUNK_EXT: bytes = b";"
+        self, chunk: bytes, SEP: _SEP = b"\r\n", CHUNK_EXT: bytes = b";"
     ) -> Tuple[bool, bytes]:
         # Read specified amount of bytes
         if self._type == ParseState.PARSE_LENGTH:
@@ -757,7 +796,10 @@ def feed_data(
                         else:
                             size_b = chunk[:pos]
 
-                        if not size_b.isdigit():
+                        if self._lax:  # Allow whitespace in lax mode.
+                            size_b = size_b.strip()
+
+                        if not re.fullmatch(HEXDIGIT, size_b):
                             exc = TransferEncodingError(
                                 chunk[:pos].decode("ascii", "surrogateescape")
                             )
@@ -765,9 +807,11 @@ def feed_data(
                             raise exc
                         size = int(bytes(size_b), 16)
 
-                        chunk = chunk[pos + 2 :]
+                        chunk = chunk[pos + len(SEP) :]
                         if size == 0:  # eof marker
                             self._chunk = ChunkState.PARSE_MAYBE_TRAILERS
+                            if self._lax and chunk.startswith(b"\r"):
+                                chunk = chunk[1:]
                         else:
                             self._chunk = ChunkState.PARSE_CHUNKED_CHUNK
                             self._chunk_size = size
@@ -789,13 +833,15 @@ def feed_data(
                         self._chunk_size = 0
                         self.payload.feed_data(chunk[:required], required)
                         chunk = chunk[required:]
+                        if self._lax and chunk.startswith(b"\r"):
+                            chunk = chunk[1:]
                         self._chunk = ChunkState.PARSE_CHUNKED_CHUNK_EOF
                         self.payload.end_http_chunk_receiving()
 
                 # toss the CRLF at the end of the chunk
                 if self._chunk == ChunkState.PARSE_CHUNKED_CHUNK_EOF:
-                    if chunk[:2] == SEP:
-                        chunk = chunk[2:]
+                    if chunk[: len(SEP)] == SEP:
+                        chunk = chunk[len(SEP) :]
                         self._chunk = ChunkState.PARSE_CHUNKED_SIZE
                     else:
                         self._chunk_tail = chunk
@@ -805,11 +851,11 @@ def feed_data(
                 # we should get another \r\n otherwise
                 # trailers needs to be skiped until \r\n\r\n
                 if self._chunk == ChunkState.PARSE_MAYBE_TRAILERS:
-                    head = chunk[:2]
+                    head = chunk[: len(SEP)]
                     if head == SEP:
                         # end of stream
                         self.payload.feed_eof()
-                        return True, chunk[2:]
+                        return True, chunk[len(SEP) :]
                     # Both CR and LF, or only LF may not be received yet. It is
                     # expected that CRLF or LF will be shown at the very first
                     # byte next time, otherwise trailers should come. The last
@@ -827,7 +873,7 @@ def feed_data(
                 if self._chunk == ChunkState.PARSE_TRAILERS:
                     pos = chunk.find(SEP)
                     if pos >= 0:
-                        chunk = chunk[pos + 2 :]
+                        chunk = chunk[pos + len(SEP) :]
                         self._chunk = ChunkState.PARSE_MAYBE_TRAILERS
                     else:
                         self._chunk_tail = chunk
diff --git a/tests/test_http_parser.py b/tests/test_http_parser.py
index 89d936211da..bfdd10389c8 100644
--- a/tests/test_http_parser.py
+++ b/tests/test_http_parser.py
@@ -161,6 +161,83 @@ def test_invalid_linebreak(loop: Any, protocol: Any, request: Any) -> None:
         parser.feed_data(text)
 
 
+def test_cve_2023_37276(parser: Any) -> None:
+    text = b"""POST / HTTP/1.1\r\nHost: localhost:8080\r\nX-Abc: \rxTransfer-Encoding: chunked\r\n\r\n"""
+    with pytest.raises(http_exceptions.BadHttpMessage):
+        parser.feed_data(text)
+
+
+@pytest.mark.parametrize(
+    "hdr",
+    (
+        "Content-Length: -5",  # https://www.rfc-editor.org/rfc/rfc9110.html#name-content-length
+        "Content-Length: +256",
+        "Foo: abc\rdef",  # https://www.rfc-editor.org/rfc/rfc9110.html#section-5.5-5
+        "Bar: abc\ndef",
+        "Baz: abc\x00def",
+        "Foo : bar",  # https://www.rfc-editor.org/rfc/rfc9112.html#section-5.1-2
+        "Foo\t: bar",
+    ),
+)
+def test_bad_headers(parser: Any, hdr: str) -> None:
+    text = f"POST / HTTP/1.1\r\n{hdr}\r\n\r\n".encode()
+    with pytest.raises(http_exceptions.BadHttpMessage):
+        parser.feed_data(text)
+
+
+def test_content_length_transfer_encoding(parser: Any) -> None:
+    text = (
+        b"GET / HTTP/1.1\r\nHost: a\r\nContent-Length: 5\r\nTransfer-Encoding: a\r\n\r\n"
+        + b"apple\r\n"
+    )
+    with pytest.raises(http_exceptions.BadHttpMessage):
+        parser.feed_data(text)
+
+
+def test_bad_chunked_py(loop: Any, protocol: Any) -> None:
+    """Test that invalid chunked encoding doesn't allow content-length to be used."""
+    parser = HttpRequestParserPy(
+        protocol,
+        loop,
+        2**16,
+        max_line_size=8190,
+        max_field_size=8190,
+    )
+    text = (
+        b"GET / HTTP/1.1\r\nHost: a\r\nTransfer-Encoding: chunked\r\n\r\n0_2e\r\n\r\n"
+        + b"GET / HTTP/1.1\r\nHost: a\r\nContent-Length: 5\r\n\r\n0\r\n\r\n"
+    )
+    messages, upgrade, tail = parser.feed_data(text)
+    assert isinstance(messages[0][1].exception(), http_exceptions.TransferEncodingError)
+
+
+@pytest.mark.skipif(
+    "HttpRequestParserC" not in dir(aiohttp.http_parser),
+    reason="C based HTTP parser not available",
+)
+def test_bad_chunked_c(loop: Any, protocol: Any) -> None:
+    """C parser behaves differently. Maybe we should align them later."""
+    parser = HttpRequestParserC(
+        protocol,
+        loop,
+        2**16,
+        max_line_size=8190,
+        max_field_size=8190,
+    )
+    text = (
+        b"GET / HTTP/1.1\r\nHost: a\r\nTransfer-Encoding: chunked\r\n\r\n0_2e\r\n\r\n"
+        + b"GET / HTTP/1.1\r\nHost: a\r\nContent-Length: 5\r\n\r\n0\r\n\r\n"
+    )
+    with pytest.raises(http_exceptions.BadHttpMessage):
+        parser.feed_data(text)
+
+
+def test_whitespace_before_header(parser: Any) -> None:
+    text = b"GET / HTTP/1.1\r\n\tContent-Length: 1\r\n\r\nX"
+    with pytest.raises(http_exceptions.BadHttpMessage):
+        parser.feed_data(text)
+
+
 def test_parse_headers_longline(parser: Any) -> None:
     invalid_unicode_byte = b"\xd9"
     header_name = b"Test" + invalid_unicode_byte + b"Header" + b"A" * 8192
@@ -475,71 +552,6 @@ def test_invalid_name(parser: Any) -> None:
         parser.feed_data(text)
 
 
-def test_cve_2023_37276(parser: Any) -> None:
-    text = b"""POST / HTTP/1.1\r\nHost: localhost:8080\r\nX-Abc: \rxTransfer-Encoding: chunked\r\n\r\n"""
-    with pytest.raises(http_exceptions.BadHttpMessage):
-        parser.feed_data(text)
-
-
-@pytest.mark.parametrize(
-    "hdr",
-    (
-        "Content-Length: -5",  # https://www.rfc-editor.org/rfc/rfc9110.html#name-content-length
-        "Content-Length: +256",
-        "Foo: abc\rdef",  # https://www.rfc-editor.org/rfc/rfc9110.html#section-5.5-5
-        "Bar: abc\ndef",
-        "Baz: abc\x00def",
-        "Foo : bar",  # https://www.rfc-editor.org/rfc/rfc9112.html#section-5.1-2
-        "Foo\t: bar",
-    )
-)
-def test_bad_headers(parser: Any, hdr: str) -> None:
-    text = "POST / HTTP/1.1\r\n{}\r\n\r\n".format(hdr).encode()
-    with pytest.raises(http_exceptions.InvalidHeader):
-        parser.feed_data(text)
-
-
-def test_bad_chunked_py(loop: Any, protocol: Any) -> None:
-    """Test that invalid chunked encoding doesn't allow content-length to be used."""
-    parser = HttpRequestParserPy(
-        protocol,
-        loop,
-        2**16,
-        max_line_size=8190,
-        max_field_size=8190,
-    )
-    text = (b"GET / HTTP/1.1\r\nHost: a\r\nTransfer-Encoding: chunked\r\n\r\n0_2e\r\n\r\n"
-            + b"GET / HTTP/1.1\r\nHost: a\r\nContent-Length: 5\r\n\r\n0\r\n\r\n")
-    messages, upgrade, tail = parser.feed_data(text)
-    assert isinstance(messages[0][1].exception(), http_exceptions.TransferEncodingError)
-
-
-@pytest.mark.skipif(
-    "HttpRequestParserC" not in dir(aiohttp.http_parser),
-    reason="C based HTTP parser not available",
-)
-def test_bad_chunked_c(loop: Any, protocol: Any) -> None:
-    """C parser behaves differently. Maybe we should align them later."""
-    payload = b"GET1 /test HTTP/1.1\r\n\r\n"
-    parser = HttpRequestParserC(
-        protocol,
-        loop,
-        2**16,
-        max_line_size=8190,
-        max_field_size=8190,
-    )
-    text = (b"GET / HTTP/1.1\r\nHost: a\r\nTransfer-Encoding: chunked\r\n\r\n0_2e\r\n\r\n"
-            + b"GET / HTTP/1.1\r\nHost: a\r\nContent-Length: 5\r\n\r\n0\r\n\r\n")
-    with pytest.raises(http_exceptions.BadHttpMessage):
-        parser.feed_data(text)
-
-
-def test_whitespace_before_header(parser: Any) -> None:
-    text = b"GET / HTTP/1.1\r\n\tContent-Length: 1\r\n\r\nX"
-    with pytest.raises(http_exceptions.BadHttpMessage):
-        parser.feed_data(text)
-
-
 @pytest.mark.parametrize("size", [40960, 8191])
 def test_max_header_field_size(parser: Any, size: Any) -> None:
     name = b"t" * size
@@ -825,6 +837,66 @@ def test_http_response_parser_strict_headers(response: Any) -> None:
         response.feed_data(b"HTTP/1.1 200 test\r\nFoo: abc\x01def\r\n\r\n")
 
 
+def test_http_response_parser_bad_crlf(response: Any) -> None:
+    """Still a lot of dodgy servers sending bad requests like this."""
+    messages, upgrade, tail = response.feed_data(
+        b"HTTP/1.0 200 OK\nFoo: abc\nBar: def\n\nBODY\n"
+    )
+    msg = messages[0][0]
+
+    assert msg.headers["Foo"] == "abc"
+    assert msg.headers["Bar"] == "def"
+
+
+async def test_http_response_parser_bad_chunked_lax(response: Any) -> None:
+    text = (
+        b"HTTP/1.1 200 OK\r\nTransfer-Encoding: chunked\r\n\r\n5 \r\nabcde\r\n0\r\n\r\n"
+    )
+    messages, upgrade, tail = response.feed_data(text)
+
+    assert await messages[0][1].read(5) == b"abcde"
+
+
+@pytest.mark.dev_mode
+async def test_http_response_parser_bad_chunked_strict_py(
+    loop: Any, protocol: Any
+) -> None:
+    response = HttpResponseParserPy(
+        protocol,
+        loop,
+        2**16,
+        max_line_size=8190,
+        max_field_size=8190,
+    )
+    text = (
+        b"HTTP/1.1 200 OK\r\nTransfer-Encoding: chunked\r\n\r\n5 \r\nabcde\r\n0\r\n\r\n"
+    )
+    messages, upgrade, tail = response.feed_data(text)
+    assert isinstance(messages[0][1].exception(), http_exceptions.TransferEncodingError)
+
+
+@pytest.mark.dev_mode
+@pytest.mark.skipif(
+    "HttpRequestParserC" not in dir(aiohttp.http_parser),
+    reason="C based HTTP parser not available",
+)
+async def test_http_response_parser_bad_chunked_strict_c(
+    loop: Any, protocol: Any
+) -> None:
+    response = HttpResponseParserC(
+        protocol,
+        loop,
+        2**16,
+        max_line_size=8190,
+        max_field_size=8190,
+    )
+    text = (
+        b"HTTP/1.1 200 OK\r\nTransfer-Encoding: chunked\r\n\r\n5 \r\nabcde\r\n0\r\n\r\n"
+    )
+    with pytest.raises(http_exceptions.BadHttpMessage):
+        response.feed_data(text)
+
+
 def test_http_response_parser_bad(response: Any) -> None:
     with pytest.raises(http_exceptions.BadHttpMessage):
         response.feed_data(b"HTT/1\r\n\r\n")