Drop the Content-Encoding header when building Scrapy response headers from a
Zyte API response, and add a test for it. The unfiltered header list remains
available through zyte_api_response["httpResponseHeaders"].

diff --git a/scrapy_zyte_api/responses.py b/scrapy_zyte_api/responses.py
index a51d3c21..f0fc4835 100644
--- a/scrapy_zyte_api/responses.py
+++ b/scrapy_zyte_api/responses.py
@@ -8,6 +8,14 @@
 
 
 class ZyteAPIMixin:
+
+    REMOVE_HEADERS = {
+        # Zyte API already decompresses the HTTP Response Body. Scrapy's
+        # HttpCompressionMiddleware will error out when it attempts to
+        # decompress an already decompressed body based on this header.
+        "content-encoding"
+    }
+
     def __init__(self, *args, zyte_api_response: Dict = None, **kwargs):
         super().__init__(*args, **kwargs)
         self._zyte_api_response = zyte_api_response
@@ -27,11 +35,15 @@ def zyte_api_response(self) -> Optional[Dict]:
         """
         return self._zyte_api_response
 
-    @staticmethod
-    def _prepare_headers(init_headers: Optional[List[Dict[str, str]]]):
+    @classmethod
+    def _prepare_headers(cls, init_headers: Optional[List[Dict[str, str]]]):
         if not init_headers:
             return None
-        return {h["name"]: h["value"] for h in init_headers}
+        return {
+            h["name"]: h["value"]
+            for h in init_headers
+            if h["name"].lower() not in cls.REMOVE_HEADERS
+        }
 
 
 class ZyteAPITextResponse(ZyteAPIMixin, TextResponse):
diff --git a/tests/test_responses.py b/tests/test_responses.py
index a24bac93..6eb949e2 100644
--- a/tests/test_responses.py
+++ b/tests/test_responses.py
@@ -135,3 +135,32 @@ def test_non_utf8_response():
     response = ZyteAPITextResponse.from_api_response(sample_zyte_api_response)
     assert response.text == content
     assert response.encoding == "utf-8"
+
+
+@pytest.mark.parametrize(
+    "api_response,cls",
+    [
+        (api_response_browser, ZyteAPITextResponse),
+        (api_response_body, ZyteAPIResponse),
+    ],
+)
+def test_response_headers_removal(api_response, cls):
+    """Headers like 'Content-Encoding' should be removed later in the response
+    instance returned to Scrapy.
+
+    However, it should still be present inside 'zyte_api_response.headers'.
+    """
+    additional_headers = [
+        {"name": "Content-Encoding", "value": "gzip"},
+        {"name": "X-Some-Other-Value", "value": "123"},
+    ]
+    raw_response = api_response()
+    raw_response["httpResponseHeaders"] = additional_headers
+
+    response = cls.from_api_response(raw_response)
+
+    assert response.headers == {b"X-Some-Other-Value": [b"123"]}
+    assert (
+        response.zyte_api_response["httpResponseHeaders"]
+        == raw_response["httpResponseHeaders"]
+    )