import abc import asyncio import collections import re import string import zlib from enum import IntEnum from typing import Any, List, Optional, Tuple, Type, Union from multidict import CIMultiDict, CIMultiDictProxy, istr from yarl import URL from . import hdrs from .base_protocol import BaseProtocol from .helpers import NO_EXTENSIONS, BaseTimerContext from .http_exceptions import ( BadStatusLine, ContentEncodingError, ContentLengthError, InvalidHeader, LineTooLong, TransferEncodingError, ) from .http_writer import HttpVersion, HttpVersion10 from .log import internal_logger from .streams import EMPTY_PAYLOAD, StreamReader from .typedefs import RawHeaders try: import brotli HAS_BROTLI = True except ImportError: # pragma: no cover HAS_BROTLI = False __all__ = ( "HeadersParser", "HttpParser", "HttpRequestParser", "HttpResponseParser", "RawRequestMessage", "RawResponseMessage", ) ASCIISET = set(string.printable) # See https://tools.ietf.org/html/rfc7230#section-3.1.1 # and https://tools.ietf.org/html/rfc7230#appendix-B # # method = token # tchar = "!" / "#" / "$" / "%" / "&" / "'" / "*" / "+" / "-" / "." / # "^" / "_" / "`" / "|" / "~" / DIGIT / ALPHA # token = 1*tchar METHRE = re.compile(r"[!#$%&'*+\-.^_`|~0-9A-Za-z]+") VERSRE = re.compile(r"HTTP/(\d+).(\d+)") HDRRE = re.compile(rb"[\x00-\x1F\x7F()<>@,;:\[\]={} \t\\\\\"]") RawRequestMessage = collections.namedtuple( "RawRequestMessage", [ "method", "path", "version", "headers", "raw_headers", "should_close", "compression", "upgrade", "chunked", "url", ], ) RawResponseMessage = collections.namedtuple( "RawResponseMessage", [ "version", "code", "reason", "headers", "raw_headers", "should_close", "compression", "upgrade", "chunked", ], ) class ParseState(IntEnum): PARSE_NONE = 0 PARSE_LENGTH = 1 PARSE_CHUNKED = 2 PARSE_UNTIL_EOF = 3 class ChunkState(IntEnum): PARSE_CHUNKED_SIZE = 0 PARSE_CHUNKED_CHUNK = 1 PARSE_CHUNKED_CHUNK_EOF = 2 PARSE_MAYBE_TRAILERS = 3 PARSE_TRAILERS = 4 class HeadersParser: def __init__( self, max_line_size: int = 8190, max_headers: int = 32768, max_field_size: int = 8190, ) -> None: self.max_line_size = max_line_size self.max_headers = max_headers self.max_field_size = max_field_size def parse_headers( self, lines: List[bytes] ) -> Tuple["CIMultiDictProxy[str]", RawHeaders]: headers = CIMultiDict() # type: CIMultiDict[str] raw_headers = [] lines_idx = 1 line = lines[1] line_count = len(lines) while line: # Parse initial header name : value pair. try: bname, bvalue = line.split(b":", 1) except ValueError: raise InvalidHeader(line) from None bname = bname.strip(b" \t") bvalue = bvalue.lstrip() if HDRRE.search(bname): raise InvalidHeader(bname) if len(bname) > self.max_field_size: raise LineTooLong( "request header name {}".format( bname.decode("utf8", "xmlcharrefreplace") ), str(self.max_field_size), str(len(bname)), ) header_length = len(bvalue) # next line lines_idx += 1 line = lines[lines_idx] # consume continuation lines continuation = line and line[0] in (32, 9) # (' ', '\t') if continuation: bvalue_lst = [bvalue] while continuation: header_length += len(line) if header_length > self.max_field_size: raise LineTooLong( "request header field {}".format( bname.decode("utf8", "xmlcharrefreplace") ), str(self.max_field_size), str(header_length), ) bvalue_lst.append(line) # next line lines_idx += 1 if lines_idx < line_count: line = lines[lines_idx] if line: continuation = line[0] in (32, 9) # (' ', '\t') else: line = b"" break bvalue = b"".join(bvalue_lst) else: if header_length > self.max_field_size: raise LineTooLong( "request header field {}".format( bname.decode("utf8", "xmlcharrefreplace") ), str(self.max_field_size), str(header_length), ) bvalue = bvalue.strip() name = bname.decode("utf-8", "surrogateescape") value = bvalue.decode("utf-8", "surrogateescape") headers.add(name, value) raw_headers.append((bname, bvalue)) return (CIMultiDictProxy(headers), tuple(raw_headers)) class HttpParser(abc.ABC): def __init__( self, protocol: Optional[BaseProtocol] = None, loop: Optional[asyncio.AbstractEventLoop] = None, limit: int = 2 ** 16, max_line_size: int = 8190, max_headers: int = 32768, max_field_size: int = 8190, timer: Optional[BaseTimerContext] = None, code: Optional[int] = None, method: Optional[str] = None, readall: bool = False, payload_exception: Optional[Type[BaseException]] = None, response_with_body: bool = True, read_until_eof: bool = False, auto_decompress: bool = True, ) -> None: self.protocol = protocol self.loop = loop self.max_line_size = max_line_size self.max_headers = max_headers self.max_field_size = max_field_size self.timer = timer self.code = code self.method = method self.readall = readall self.payload_exception = payload_exception self.response_with_body = response_with_body self.read_until_eof = read_until_eof self._lines = [] # type: List[bytes] self._tail = b"" self._upgraded = False self._payload = None self._payload_parser = None # type: Optional[HttpPayloadParser] self._auto_decompress = auto_decompress self._limit = limit self._headers_parser = HeadersParser(max_line_size, max_headers, max_field_size) @abc.abstractmethod def parse_message(self, lines: List[bytes]) -> Any: pass def feed_eof(self) -> Any: if self._payload_parser is not None: self._payload_parser.feed_eof() self._payload_parser = None else: # try to extract partial message if self._tail: self._lines.append(self._tail) if self._lines: if self._lines[-1] != "\r\n": self._lines.append(b"") try: return self.parse_message(self._lines) except Exception: return None def feed_data( self, data: bytes, SEP: bytes = b"\r\n", EMPTY: bytes = b"", CONTENT_LENGTH: istr = hdrs.CONTENT_LENGTH, METH_CONNECT: str = hdrs.METH_CONNECT, SEC_WEBSOCKET_KEY1: istr = hdrs.SEC_WEBSOCKET_KEY1, ) -> Tuple[List[Any], bool, bytes]: messages = [] if self._tail: data, self._tail = self._tail + data, b"" data_len = len(data) start_pos = 0 loop = self.loop while start_pos < data_len: # read HTTP message (request/response line + headers), \r\n\r\n # and split by lines if self._payload_parser is None and not self._upgraded: pos = data.find(SEP, start_pos) # consume \r\n if pos == start_pos and not self._lines: start_pos = pos + 2 continue if pos >= start_pos: # line found self._lines.append(data[start_pos:pos]) start_pos = pos + 2 # \r\n\r\n found if self._lines[-1] == EMPTY: try: msg = self.parse_message(self._lines) finally: self._lines.clear() # payload length length = msg.headers.get(CONTENT_LENGTH) if length is not None: try: length = int(length) except ValueError: raise InvalidHeader(CONTENT_LENGTH) if length < 0: raise InvalidHeader(CONTENT_LENGTH) # do not support old websocket spec if SEC_WEBSOCKET_KEY1 in msg.headers: raise InvalidHeader(SEC_WEBSOCKET_KEY1) self._upgraded = msg.upgrade method = getattr(msg, "method", self.method) assert self.protocol is not None # calculate payload if ( (length is not None and length > 0) or msg.chunked and not msg.upgrade ): payload = StreamReader( self.protocol, timer=self.timer, loop=loop, limit=self._limit, ) payload_parser = HttpPayloadParser( payload, length=length, chunked=msg.chunked, method=method, compression=msg.compression, code=self.code, readall=self.readall, response_with_body=self.response_with_body, auto_decompress=self._auto_decompress, ) if not payload_parser.done: self._payload_parser = payload_parser elif method == METH_CONNECT: payload = StreamReader( self.protocol, timer=self.timer, loop=loop, limit=self._limit, ) self._upgraded = True self._payload_parser = HttpPayloadParser( payload, method=msg.method, compression=msg.compression, readall=True, auto_decompress=self._auto_decompress, ) else: if ( getattr(msg, "code", 100) >= 199 and length is None and self.read_until_eof ): payload = StreamReader( self.protocol, timer=self.timer, loop=loop, limit=self._limit, ) payload_parser = HttpPayloadParser( payload, length=length, chunked=msg.chunked, method=method, compression=msg.compression, code=self.code, readall=True, response_with_body=self.response_with_body, auto_decompress=self._auto_decompress, ) if not payload_parser.done: self._payload_parser = payload_parser else: payload = EMPTY_PAYLOAD # type: ignore messages.append((msg, payload)) else: self._tail = data[start_pos:] data = EMPTY break # no parser, just store elif self._payload_parser is None and self._upgraded: assert not self._lines break # feed payload elif data and start_pos < data_len: assert not self._lines assert self._payload_parser is not None try: eof, data = self._payload_parser.feed_data(data[start_pos:]) except BaseException as exc: if self.payload_exception is not None: self._payload_parser.payload.set_exception( self.payload_exception(str(exc)) ) else: self._payload_parser.payload.set_exception(exc) eof = True data = b"" if eof: start_pos = 0 data_len = len(data) self._payload_parser = None continue else: break if data and start_pos < data_len: data = data[start_pos:] else: data = EMPTY return messages, self._upgraded, data def parse_headers( self, lines: List[bytes] ) -> Tuple[ "CIMultiDictProxy[str]", RawHeaders, Optional[bool], Optional[str], bool, bool ]: """Parses RFC 5322 headers from a stream. Line continuations are supported. Returns list of header name and value pairs. Header name is in upper case. """ headers, raw_headers = self._headers_parser.parse_headers(lines) close_conn = None encoding = None upgrade = False chunked = False # keep-alive conn = headers.get(hdrs.CONNECTION) if conn: v = conn.lower() if v == "close": close_conn = True elif v == "keep-alive": close_conn = False elif v == "upgrade": upgrade = True # encoding enc = headers.get(hdrs.CONTENT_ENCODING) if enc: enc = enc.lower() if enc in ("gzip", "deflate", "br"): encoding = enc # chunking te = headers.get(hdrs.TRANSFER_ENCODING) if te and "chunked" in te.lower(): chunked = True return (headers, raw_headers, close_conn, encoding, upgrade, chunked) def set_upgraded(self, val: bool) -> None: """Set connection upgraded (to websocket) mode. :param bool val: new state. """ self._upgraded = val class HttpRequestParser(HttpParser): """Read request status line. Exception .http_exceptions.BadStatusLine could be raised in case of any errors in status line. Returns RawRequestMessage. """ def parse_message(self, lines: List[bytes]) -> Any: # request line line = lines[0].decode("utf-8", "surrogateescape") try: method, path, version = line.split(None, 2) except ValueError: raise BadStatusLine(line) from None if len(path) > self.max_line_size: raise LineTooLong( "Status line is too long", str(self.max_line_size), str(len(path)) ) path_part, _hash_separator, url_fragment = path.partition("#") path_part, _question_mark_separator, qs_part = path_part.partition("?") # method if not METHRE.match(method): raise BadStatusLine(method) # version try: if version.startswith("HTTP/"): n1, n2 = version[5:].split(".", 1) version_o = HttpVersion(int(n1), int(n2)) else: raise BadStatusLine(version) except Exception: raise BadStatusLine(version) # read headers ( headers, raw_headers, close, compression, upgrade, chunked, ) = self.parse_headers(lines) if close is None: # then the headers weren't set in the request if version_o <= HttpVersion10: # HTTP 1.0 must asks to not close close = True else: # HTTP 1.1 must ask to close. close = False return RawRequestMessage( method, path, version_o, headers, raw_headers, close, compression, upgrade, chunked, # NOTE: `yarl.URL.build()` is used to mimic what the Cython-based # NOTE: parser does, otherwise it results into the same # NOTE: HTTP Request-Line input producing different # NOTE: `yarl.URL()` objects URL.build( path=path_part, query_string=qs_part, fragment=url_fragment, encoded=True, ), ) class HttpResponseParser(HttpParser): """Read response status line and headers. BadStatusLine could be raised in case of any errors in status line. Returns RawResponseMessage""" def parse_message(self, lines: List[bytes]) -> Any: line = lines[0].decode("utf-8", "surrogateescape") try: version, status = line.split(None, 1) except ValueError: raise BadStatusLine(line) from None try: status, reason = status.split(None, 1) except ValueError: reason = "" if len(reason) > self.max_line_size: raise LineTooLong( "Status line is too long", str(self.max_line_size), str(len(reason)) ) # version match = VERSRE.match(version) if match is None: raise BadStatusLine(line) version_o = HttpVersion(int(match.group(1)), int(match.group(2))) # The status code is a three-digit number try: status_i = int(status) except ValueError: raise BadStatusLine(line) from None if status_i > 999: raise BadStatusLine(line) # read headers ( headers, raw_headers, close, compression, upgrade, chunked, ) = self.parse_headers(lines) if close is None: close = version_o <= HttpVersion10 return RawResponseMessage( version_o, status_i, reason.strip(), headers, raw_headers, close, compression, upgrade, chunked, ) class HttpPayloadParser: def __init__( self, payload: StreamReader, length: Optional[int] = None, chunked: bool = False, compression: Optional[str] = None, code: Optional[int] = None, method: Optional[str] = None, readall: bool = False, response_with_body: bool = True, auto_decompress: bool = True, ) -> None: self._length = 0 self._type = ParseState.PARSE_NONE self._chunk = ChunkState.PARSE_CHUNKED_SIZE self._chunk_size = 0 self._chunk_tail = b"" self._auto_decompress = auto_decompress self.done = False # payload decompression wrapper if response_with_body and compression and self._auto_decompress: real_payload = DeflateBuffer( payload, compression ) # type: Union[StreamReader, DeflateBuffer] else: real_payload = payload # payload parser if not response_with_body: # don't parse payload if it's not expected to be received self._type = ParseState.PARSE_NONE real_payload.feed_eof() self.done = True elif chunked: self._type = ParseState.PARSE_CHUNKED elif length is not None: self._type = ParseState.PARSE_LENGTH self._length = length if self._length == 0: real_payload.feed_eof() self.done = True else: if readall and code != 204: self._type = ParseState.PARSE_UNTIL_EOF elif method in ("PUT", "POST"): internal_logger.warning( # pragma: no cover "Content-Length or Transfer-Encoding header is required" ) self._type = ParseState.PARSE_NONE real_payload.feed_eof() self.done = True self.payload = real_payload def feed_eof(self) -> None: if self._type == ParseState.PARSE_UNTIL_EOF: self.payload.feed_eof() elif self._type == ParseState.PARSE_LENGTH: raise ContentLengthError( "Not enough data for satisfy content length header." ) elif self._type == ParseState.PARSE_CHUNKED: raise TransferEncodingError( "Not enough data for satisfy transfer length header." ) def feed_data( self, chunk: bytes, SEP: bytes = b"\r\n", CHUNK_EXT: bytes = b";" ) -> Tuple[bool, bytes]: # Read specified amount of bytes if self._type == ParseState.PARSE_LENGTH: required = self._length chunk_len = len(chunk) if required >= chunk_len: self._length = required - chunk_len self.payload.feed_data(chunk, chunk_len) if self._length == 0: self.payload.feed_eof() return True, b"" else: self._length = 0 self.payload.feed_data(chunk[:required], required) self.payload.feed_eof() return True, chunk[required:] # Chunked transfer encoding parser elif self._type == ParseState.PARSE_CHUNKED: if self._chunk_tail: chunk = self._chunk_tail + chunk self._chunk_tail = b"" while chunk: # read next chunk size if self._chunk == ChunkState.PARSE_CHUNKED_SIZE: pos = chunk.find(SEP) if pos >= 0: i = chunk.find(CHUNK_EXT, 0, pos) if i >= 0: size_b = chunk[:i] # strip chunk-extensions else: size_b = chunk[:pos] try: size = int(bytes(size_b), 16) except ValueError: exc = TransferEncodingError( chunk[:pos].decode("ascii", "surrogateescape") ) self.payload.set_exception(exc) raise exc from None chunk = chunk[pos + 2 :] if size == 0: # eof marker self._chunk = ChunkState.PARSE_MAYBE_TRAILERS else: self._chunk = ChunkState.PARSE_CHUNKED_CHUNK self._chunk_size = size self.payload.begin_http_chunk_receiving() else: self._chunk_tail = chunk return False, b"" # read chunk and feed buffer if self._chunk == ChunkState.PARSE_CHUNKED_CHUNK: required = self._chunk_size chunk_len = len(chunk) if required > chunk_len: self._chunk_size = required - chunk_len self.payload.feed_data(chunk, chunk_len) return False, b"" else: self._chunk_size = 0 self.payload.feed_data(chunk[:required], required) chunk = chunk[required:] self._chunk = ChunkState.PARSE_CHUNKED_CHUNK_EOF self.payload.end_http_chunk_receiving() # toss the CRLF at the end of the chunk if self._chunk == ChunkState.PARSE_CHUNKED_CHUNK_EOF: if chunk[:2] == SEP: chunk = chunk[2:] self._chunk = ChunkState.PARSE_CHUNKED_SIZE else: self._chunk_tail = chunk return False, b"" # if stream does not contain trailer, after 0\r\n # we should get another \r\n otherwise # trailers needs to be skiped until \r\n\r\n if self._chunk == ChunkState.PARSE_MAYBE_TRAILERS: head = chunk[:2] if head == SEP: # end of stream self.payload.feed_eof() return True, chunk[2:] # Both CR and LF, or only LF may not be received yet. It is # expected that CRLF or LF will be shown at the very first # byte next time, otherwise trailers should come. The last # CRLF which marks the end of response might not be # contained in the same TCP segment which delivered the # size indicator. if not head: return False, b"" if head == SEP[:1]: self._chunk_tail = head return False, b"" self._chunk = ChunkState.PARSE_TRAILERS # read and discard trailer up to the CRLF terminator if self._chunk == ChunkState.PARSE_TRAILERS: pos = chunk.find(SEP) if pos >= 0: chunk = chunk[pos + 2 :] self._chunk = ChunkState.PARSE_MAYBE_TRAILERS else: self._chunk_tail = chunk return False, b"" # Read all bytes until eof elif self._type == ParseState.PARSE_UNTIL_EOF: self.payload.feed_data(chunk, len(chunk)) return False, b"" class DeflateBuffer: """DeflateStream decompress stream and feed data into specified stream.""" def __init__(self, out: StreamReader, encoding: Optional[str]) -> None: self.out = out self.size = 0 self.encoding = encoding self._started_decoding = False if encoding == "br": if not HAS_BROTLI: # pragma: no cover raise ContentEncodingError( "Can not decode content-encoding: brotli (br). " "Please install `brotlipy`" ) self.decompressor = brotli.Decompressor() else: zlib_mode = 16 + zlib.MAX_WBITS if encoding == "gzip" else zlib.MAX_WBITS self.decompressor = zlib.decompressobj(wbits=zlib_mode) def set_exception(self, exc: BaseException) -> None: self.out.set_exception(exc) def feed_data(self, chunk: bytes, size: int) -> None: if not size: return self.size += size # RFC1950 # bits 0..3 = CM = 0b1000 = 8 = "deflate" # bits 4..7 = CINFO = 1..7 = windows size. if ( not self._started_decoding and self.encoding == "deflate" and chunk[0] & 0xF != 8 ): # Change the decoder to decompress incorrectly compressed data # Actually we should issue a warning about non-RFC-compliant data. self.decompressor = zlib.decompressobj(wbits=-zlib.MAX_WBITS) try: chunk = self.decompressor.decompress(chunk) except Exception: raise ContentEncodingError( "Can not decode content-encoding: %s" % self.encoding ) self._started_decoding = True if chunk: self.out.feed_data(chunk, len(chunk)) def feed_eof(self) -> None: chunk = self.decompressor.flush() if chunk or self.size > 0: self.out.feed_data(chunk, len(chunk)) if self.encoding == "deflate" and not self.decompressor.eof: raise ContentEncodingError("deflate") self.out.feed_eof() def begin_http_chunk_receiving(self) -> None: self.out.begin_http_chunk_receiving() def end_http_chunk_receiving(self) -> None: self.out.end_http_chunk_receiving() HttpRequestParserPy = HttpRequestParser HttpResponseParserPy = HttpResponseParser RawRequestMessagePy = RawRequestMessage RawResponseMessagePy = RawResponseMessage try: if not NO_EXTENSIONS: from ._http_parser import ( # type: ignore HttpRequestParser, HttpResponseParser, RawRequestMessage, RawResponseMessage, ) HttpRequestParserC = HttpRequestParser HttpResponseParserC = HttpResponseParser RawRequestMessageC = RawRequestMessage RawResponseMessageC = RawResponseMessage except ImportError: # pragma: no cover pass