diff --git a/CHANGES/7663.feature b/CHANGES/7663.feature new file mode 100644 index 00000000000..509a7ad7e2a --- /dev/null +++ b/CHANGES/7663.feature @@ -0,0 +1 @@ +Updated Python parser to comply with latest HTTP specs and allow lax response parsing -- by :user:`Dreamorcerer` diff --git a/Makefile b/Makefile index 6d23911d6b0..cdeb0ad0ed9 100644 --- a/Makefile +++ b/Makefile @@ -58,7 +58,7 @@ aiohttp/_find_header.c: $(call to-hash,aiohttp/hdrs.py ./tools/gen.py) # _find_headers generator creates _headers.pyi as well aiohttp/%.c: aiohttp/%.pyx $(call to-hash,$(CYS)) aiohttp/_find_header.c - cython -3 -o $@ $< -I aiohttp + cython -3 -o $@ $< -I aiohttp -Werror vendor/llhttp/node_modules: vendor/llhttp/package.json cd vendor/llhttp; npm install diff --git a/aiohttp/http_parser.py b/aiohttp/http_parser.py index ff1df13d982..21b0038eb55 100644 --- a/aiohttp/http_parser.py +++ b/aiohttp/http_parser.py @@ -6,9 +6,11 @@ from enum import IntEnum from typing import ( Any, + ClassVar, Final, Generic, List, + Literal, NamedTuple, Optional, Pattern, @@ -25,7 +27,7 @@ from . import hdrs from .base_protocol import BaseProtocol from .compression_utils import HAS_BROTLI, BrotliDecompressor, ZLibDecompressor -from .helpers import NO_EXTENSIONS, BaseTimerContext +from .helpers import DEBUG, NO_EXTENSIONS, BaseTimerContext from .http_exceptions import ( BadHttpMessage, BadStatusLine, @@ -49,6 +51,8 @@ "RawResponseMessage", ) +_SEP = Literal[b"\r\n", b"\n"] + ASCIISET: Final[Set[str]] = set(string.printable) # See https://www.rfc-editor.org/rfc/rfc9110.html#name-overview @@ -61,6 +65,7 @@ METHRE: Final[Pattern[str]] = re.compile(r"[!#$%&'*+\-.^_`|~0-9A-Za-z]+") VERSRE: Final[Pattern[str]] = re.compile(r"HTTP/(\d).(\d)") HDRRE: Final[Pattern[bytes]] = re.compile(rb"[\x00-\x1F\x7F()<>@,;:\[\]={} \t\"\\]") +HEXDIGIT = re.compile(rb"[0-9a-fA-F]+") class RawRequestMessage(NamedTuple): @@ -210,6 +215,8 @@ def parse_headers( class HttpParser(abc.ABC, Generic[_MsgT]): + lax: ClassVar[bool] = False + def __init__( self, protocol: Optional[BaseProtocol] = None, @@ -272,7 +279,7 @@ def feed_eof(self) -> Optional[_MsgT]: def feed_data( self, data: bytes, - SEP: bytes = b"\r\n", + SEP: _SEP = b"\r\n", EMPTY: bytes = b"", CONTENT_LENGTH: istr = hdrs.CONTENT_LENGTH, METH_CONNECT: str = hdrs.METH_CONNECT, @@ -296,13 +303,16 @@ def feed_data( pos = data.find(SEP, start_pos) # consume \r\n if pos == start_pos and not self._lines: - start_pos = pos + 2 + start_pos = pos + len(SEP) continue if pos >= start_pos: # line found - self._lines.append(data[start_pos:pos]) - start_pos = pos + 2 + line = data[start_pos:pos] + if SEP == b"\n": # For lax response parsing + line = line.rstrip(b"\r") + self._lines.append(line) + start_pos = pos + len(SEP) # \r\n\r\n found if self._lines[-1] == EMPTY: @@ -319,7 +329,7 @@ def get_content_length() -> Optional[int]: # Shouldn't allow +/- or other number formats. # https://www.rfc-editor.org/rfc/rfc9110#section-8.6-2 - if not length_hdr.strip(" \t").isdigit(): + if not length_hdr.strip(" \t").isdecimal(): raise InvalidHeader(CONTENT_LENGTH) return int(length_hdr) @@ -356,6 +366,7 @@ def get_content_length() -> Optional[int]: readall=self.readall, response_with_body=self.response_with_body, auto_decompress=self._auto_decompress, + lax=self.lax, ) if not payload_parser.done: self._payload_parser = payload_parser @@ -374,6 +385,7 @@ def get_content_length() -> Optional[int]: compression=msg.compression, readall=True, auto_decompress=self._auto_decompress, + lax=self.lax, ) else: if ( @@ -397,6 +409,7 @@ def get_content_length() -> Optional[int]: readall=True, response_with_body=self.response_with_body, auto_decompress=self._auto_decompress, + lax=self.lax, ) if not payload_parser.done: self._payload_parser = payload_parser @@ -419,7 +432,7 @@ def get_content_length() -> Optional[int]: assert not self._lines assert self._payload_parser is not None try: - eof, data = self._payload_parser.feed_data(data[start_pos:]) + eof, data = self._payload_parser.feed_data(data[start_pos:], SEP) except BaseException as exc: if self.payload_exception is not None: self._payload_parser.payload.set_exception( @@ -614,6 +627,20 @@ class HttpResponseParser(HttpParser[RawResponseMessage]): Returns RawResponseMessage. """ + # Lax mode should only be enabled on response parser. + lax = not DEBUG + + def feed_data( + self, + data: bytes, + SEP: Optional[_SEP] = None, + *args: Any, + **kwargs: Any, + ) -> Tuple[List[Tuple[RawResponseMessage, StreamReader]], bool, bytes]: + if SEP is None: + SEP = b"\r\n" if DEBUG else b"\n" + return super().feed_data(data, SEP, *args, **kwargs) + def parse_message(self, lines: List[bytes]) -> RawResponseMessage: line = lines[0].decode("utf-8", "surrogateescape") try: @@ -638,7 +665,7 @@ def parse_message(self, lines: List[bytes]) -> RawResponseMessage: version_o = HttpVersion(int(match.group(1)), int(match.group(2))) # The status code is a three-digit number - if len(status) != 3 or not status.isdigit(): + if len(status) != 3 or not status.isdecimal(): raise BadStatusLine(line) status_i = int(status) @@ -680,6 +707,7 @@ def __init__( readall: bool = False, response_with_body: bool = True, auto_decompress: bool = True, + lax: bool = False, ) -> None: self._length = 0 self._type = ParseState.PARSE_NONE @@ -687,6 +715,7 @@ def __init__( self._chunk_size = 0 self._chunk_tail = b"" self._auto_decompress = auto_decompress + self._lax = lax self.done = False # payload decompression wrapper @@ -738,7 +767,7 @@ def feed_eof(self) -> None: ) def feed_data( - self, chunk: bytes, SEP: bytes = b"\r\n", CHUNK_EXT: bytes = b";" + self, chunk: bytes, SEP: _SEP = b"\r\n", CHUNK_EXT: bytes = b";" ) -> Tuple[bool, bytes]: # Read specified amount of bytes if self._type == ParseState.PARSE_LENGTH: @@ -775,7 +804,10 @@ def feed_data( else: size_b = chunk[:pos] - if not size_b.isdigit(): + if self._lax: # Allow whitespace in lax mode. + size_b = size_b.strip() + + if not re.fullmatch(HEXDIGIT, size_b): exc = TransferEncodingError( chunk[:pos].decode("ascii", "surrogateescape") ) @@ -783,9 +815,11 @@ def feed_data( raise exc size = int(bytes(size_b), 16) - chunk = chunk[pos + 2 :] + chunk = chunk[pos + len(SEP) :] if size == 0: # eof marker self._chunk = ChunkState.PARSE_MAYBE_TRAILERS + if self._lax and chunk.startswith(b"\r"): + chunk = chunk[1:] else: self._chunk = ChunkState.PARSE_CHUNKED_CHUNK self._chunk_size = size @@ -807,13 +841,15 @@ def feed_data( self._chunk_size = 0 self.payload.feed_data(chunk[:required], required) chunk = chunk[required:] + if self._lax and chunk.startswith(b"\r"): + chunk = chunk[1:] self._chunk = ChunkState.PARSE_CHUNKED_CHUNK_EOF self.payload.end_http_chunk_receiving() # toss the CRLF at the end of the chunk if self._chunk == ChunkState.PARSE_CHUNKED_CHUNK_EOF: - if chunk[:2] == SEP: - chunk = chunk[2:] + if chunk[: len(SEP)] == SEP: + chunk = chunk[len(SEP) :] self._chunk = ChunkState.PARSE_CHUNKED_SIZE else: self._chunk_tail = chunk @@ -823,11 +859,11 @@ def feed_data( # we should get another \r\n otherwise # trailers needs to be skiped until \r\n\r\n if self._chunk == ChunkState.PARSE_MAYBE_TRAILERS: - head = chunk[:2] + head = chunk[: len(SEP)] if head == SEP: # end of stream self.payload.feed_eof() - return True, chunk[2:] + return True, chunk[len(SEP) :] # Both CR and LF, or only LF may not be received yet. It is # expected that CRLF or LF will be shown at the very first # byte next time, otherwise trailers should come. The last @@ -845,7 +881,7 @@ def feed_data( if self._chunk == ChunkState.PARSE_TRAILERS: pos = chunk.find(SEP) if pos >= 0: - chunk = chunk[pos + 2 :] + chunk = chunk[pos + len(SEP) :] self._chunk = ChunkState.PARSE_MAYBE_TRAILERS else: self._chunk_tail = chunk diff --git a/tests/test_http_parser.py b/tests/test_http_parser.py index 1965f2ab328..6d5cc6502a1 100644 --- a/tests/test_http_parser.py +++ b/tests/test_http_parser.py @@ -162,6 +162,83 @@ def test_invalid_linebreak(loop: Any, protocol: Any, request: Any) -> None: parser.feed_data(text) +def test_cve_2023_37276(parser: Any) -> None: + text = b"""POST / HTTP/1.1\r\nHost: localhost:8080\r\nX-Abc: \rxTransfer-Encoding: chunked\r\n\r\n""" + with pytest.raises(http_exceptions.BadHttpMessage): + parser.feed_data(text) + + +@pytest.mark.parametrize( + "hdr", + ( + "Content-Length: -5", # https://www.rfc-editor.org/rfc/rfc9110.html#name-content-length + "Content-Length: +256", + "Foo: abc\rdef", # https://www.rfc-editor.org/rfc/rfc9110.html#section-5.5-5 + "Bar: abc\ndef", + "Baz: abc\x00def", + "Foo : bar", # https://www.rfc-editor.org/rfc/rfc9112.html#section-5.1-2 + "Foo\t: bar", + ), +) +def test_bad_headers(parser: Any, hdr: str) -> None: + text = f"POST / HTTP/1.1\r\n{hdr}\r\n\r\n".encode() + with pytest.raises(http_exceptions.BadHttpMessage): + parser.feed_data(text) + + +def test_content_length_transfer_encoding(parser: Any) -> None: + text = ( + b"GET / HTTP/1.1\r\nHost: a\r\nContent-Length: 5\r\nTransfer-Encoding: a\r\n\r\n" + + b"apple\r\n" + ) + with pytest.raises(http_exceptions.BadHttpMessage): + parser.feed_data(text) + + +def test_bad_chunked_py(loop: Any, protocol: Any) -> None: + """Test that invalid chunked encoding doesn't allow content-length to be used.""" + parser = HttpRequestParserPy( + protocol, + loop, + 2**16, + max_line_size=8190, + max_field_size=8190, + ) + text = ( + b"GET / HTTP/1.1\r\nHost: a\r\nTransfer-Encoding: chunked\r\n\r\n0_2e\r\n\r\n" + + b"GET / HTTP/1.1\r\nHost: a\r\nContent-Length: 5\r\n\r\n0\r\n\r\n" + ) + messages, upgrade, tail = parser.feed_data(text) + assert isinstance(messages[0][1].exception(), http_exceptions.TransferEncodingError) + + +@pytest.mark.skipif( + "HttpRequestParserC" not in dir(aiohttp.http_parser), + reason="C based HTTP parser not available", +) +def test_bad_chunked_c(loop: Any, protocol: Any) -> None: + """C parser behaves differently. Maybe we should align them later.""" + parser = HttpRequestParserC( + protocol, + loop, + 2**16, + max_line_size=8190, + max_field_size=8190, + ) + text = ( + b"GET / HTTP/1.1\r\nHost: a\r\nTransfer-Encoding: chunked\r\n\r\n0_2e\r\n\r\n" + + b"GET / HTTP/1.1\r\nHost: a\r\nContent-Length: 5\r\n\r\n0\r\n\r\n" + ) + with pytest.raises(http_exceptions.BadHttpMessage): + parser.feed_data(text) + + +def test_whitespace_before_header(parser: Any) -> None: + text = b"GET / HTTP/1.1\r\n\tContent-Length: 1\r\n\r\nX" + with pytest.raises(http_exceptions.BadHttpMessage): + parser.feed_data(text) + + def test_parse_headers_longline(parser: Any) -> None: invalid_unicode_byte = b"\xd9" header_name = b"Test" + invalid_unicode_byte + b"Header" + b"A" * 8192 @@ -476,74 +553,6 @@ def test_invalid_name(parser) -> None: parser.feed_data(text) -def test_cve_2023_37276(parser: Any) -> None: - text = b"""POST / HTTP/1.1\r\nHost: localhost:8080\r\nX-Abc: \rxTransfer-Encoding: chunked\r\n\r\n""" - with pytest.raises(http_exceptions.BadHttpMessage): - parser.feed_data(text) - - -@pytest.mark.parametrize( - "hdr", - ( - "Content-Length: -5", # https://www.rfc-editor.org/rfc/rfc9110.html#name-content-length - "Content-Length: +256", - "Foo: abc\rdef", # https://www.rfc-editor.org/rfc/rfc9110.html#section-5.5-5 - "Bar: abc\ndef", - "Baz: abc\x00def", - "Foo : bar", # https://www.rfc-editor.org/rfc/rfc9112.html#section-5.1-2 - "Foo\t: bar", - ), -) -def test_bad_headers(parser: Any, hdr: str) -> None: - text = f"POST / HTTP/1.1\r\n{hdr}\r\n\r\n".encode() - with pytest.raises(http_exceptions.InvalidHeader): - parser.feed_data(text) - - -def test_bad_chunked_py(loop: Any, protocol: Any) -> None: - """Test that invalid chunked encoding doesn't allow content-length to be used.""" - parser = HttpRequestParserPy( - protocol, - loop, - 2**16, - max_line_size=8190, - max_field_size=8190, - ) - text = ( - b"GET / HTTP/1.1\r\nHost: a\r\nTransfer-Encoding: chunked\r\n\r\n0_2e\r\n\r\n" - + b"GET / HTTP/1.1\r\nHost: a\r\nContent-Length: 5\r\n\r\n0\r\n\r\n" - ) - messages, upgrade, tail = parser.feed_data(text) - assert isinstance(messages[0][1].exception(), http_exceptions.TransferEncodingError) - - -@pytest.mark.skipif( - "HttpRequestParserC" not in dir(aiohttp.http_parser), - reason="C based HTTP parser not available", -) -def test_bad_chunked_c(loop: Any, protocol: Any) -> None: - """C parser behaves differently. Maybe we should align them later.""" - parser = HttpRequestParserC( - protocol, - loop, - 2**16, - max_line_size=8190, - max_field_size=8190, - ) - text = ( - b"GET / HTTP/1.1\r\nHost: a\r\nTransfer-Encoding: chunked\r\n\r\n0_2e\r\n\r\n" - + b"GET / HTTP/1.1\r\nHost: a\r\nContent-Length: 5\r\n\r\n0\r\n\r\n" - ) - with pytest.raises(http_exceptions.BadHttpMessage): - parser.feed_data(text) - - -def test_whitespace_before_header(parser: Any) -> None: - text = b"GET / HTTP/1.1\r\n\tContent-Length: 1\r\n\r\nX" - with pytest.raises(http_exceptions.BadHttpMessage): - parser.feed_data(text) - - @pytest.mark.parametrize("size", [40960, 8191]) def test_max_header_field_size(parser, size) -> None: name = b"t" * size @@ -827,6 +836,62 @@ def test_http_response_parser_strict_headers(response) -> None: response.feed_data(b"HTTP/1.1 200 test\r\nFoo: abc\x01def\r\n\r\n") +def test_http_response_parser_bad_crlf(response) -> None: + """Still a lot of dodgy servers sending bad requests like this.""" + messages, upgrade, tail = response.feed_data( + b"HTTP/1.0 200 OK\nFoo: abc\nBar: def\n\nBODY\n" + ) + msg = messages[0][0] + + assert msg.headers["Foo"] == "abc" + assert msg.headers["Bar"] == "def" + + +async def test_http_response_parser_bad_chunked_lax(response) -> None: + text = ( + b"HTTP/1.1 200 OK\r\nTransfer-Encoding: chunked\r\n\r\n5 \r\nabcde\r\n0\r\n\r\n" + ) + messages, upgrade, tail = response.feed_data(text) + + assert await messages[0][1].read(5) == b"abcde" + + +@pytest.mark.dev_mode +async def test_http_response_parser_bad_chunked_strict_py(loop, protocol) -> None: + response = HttpResponseParserPy( + protocol, + loop, + 2**16, + max_line_size=8190, + max_field_size=8190, + ) + text = ( + b"HTTP/1.1 200 OK\r\nTransfer-Encoding: chunked\r\n\r\n5 \r\nabcde\r\n0\r\n\r\n" + ) + messages, upgrade, tail = response.feed_data(text) + assert isinstance(messages[0][1].exception(), http_exceptions.TransferEncodingError) + + +@pytest.mark.dev_mode +@pytest.mark.skipif( + "HttpRequestParserC" not in dir(aiohttp.http_parser), + reason="C based HTTP parser not available", +) +async def test_http_response_parser_bad_chunked_strict_c(loop, protocol) -> None: + response = HttpResponseParserC( + protocol, + loop, + 2**16, + max_line_size=8190, + max_field_size=8190, + ) + text = ( + b"HTTP/1.1 200 OK\r\nTransfer-Encoding: chunked\r\n\r\n5 \r\nabcde\r\n0\r\n\r\n" + ) + with pytest.raises(http_exceptions.BadHttpMessage): + response.feed_data(text) + + def test_http_response_parser_bad(response) -> None: with pytest.raises(http_exceptions.BadHttpMessage): response.feed_data(b"HTT/1\r\n\r\n")