Source code for kingfisher_scrapy.downloadhandlers

# https://docs.scrapy.org/en/latest/topics/download-handlers.html
import asyncio
import logging

from curl_cffi import requests
from curl_cffi.const import CurlIpResolve, CurlOpt
from curl_cffi.requests.exceptions import RequestException, Timeout
from scrapy.exceptions import DownloadFailedError, DownloadTimeoutError
from scrapy.http import Headers
from scrapy.responsetypes import responsetypes

logger = logging.getLogger(__name__)



[docs]
class CurlImpersonateDownloadHandler:
    """
    A download handler that uses ``curl_cffi`` to impersonate a browser's TLS/JA3 fingerprint.

    Some sites use an anti-bot service (like Cloudflare) that rejects Scrapy's default Twisted client by its TLS/JA3
    fingerprint. ``curl_cffi`` reproduces a real browser's fingerprint.

    To use it for a spider, override the ``https`` (and/or ``http``) handler in the spider's ``custom_settings``:

    .. code-block:: python

       custom_settings = {
           "DOWNLOAD_HANDLERS": {
               "https": "kingfisher_scrapy.downloadhandlers.CurlImpersonateDownloadHandler",
           },
       }

    And optionally:

    -  Set the ``CURL_IMPERSONATE`` setting to a `browser profile <https://github.com/lexiforest/curl_cffi#sessions>`__.
       Choose a specific version (like ``"chrome146"``) for a consistent fingerprint across ``curl_cffi`` upgrades.
    -  Set the ``CURL_IP_VERSION`` setting to ``"4"`` or ``"6"`` for a consistent version across requests. If unset,
       ``curl_cffi`` chooses.
    """

    lazy = True

    IP_RESOLVE = {"4": CurlIpResolve.V4, "6": CurlIpResolve.V6}

    def __init__(self, settings):
        self.impersonate = settings.get("CURL_IMPERSONATE") or "chrome"
        self.ip_resolve = self.IP_RESOLVE.get(settings.get("CURL_IP_VERSION"))


[docs]
    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler.settings)



[docs]
    async def download_request(self, request):
        return await asyncio.to_thread(self._download, request)



[docs]
    async def close(self):
        pass


    def _download(self, request):
        kwargs = {
            "method": request.method,
            "url": request.url,
            # curl_cffi expects a plain dict of strings. Join multi-valued headers like Scrapy's HTTP/1.1 handler.
            "headers": {key.decode(): b", ".join(values).decode() for key, values in request.headers.items()},
            "data": request.body,
            "impersonate": self.impersonate,
            # Let Scrapy's RedirectMiddleware handle redirects.
            "allow_redirects": False,
        }
        # Scrapy sets the download_timeout meta from the DOWNLOAD_TIMEOUT setting.
        if timeout := request.meta.get("download_timeout"):
            kwargs["timeout"] = timeout
        # curl_cffi must ignore the http_proxy and https_proxy environment variables, unless HTTPPROXY_ENABLED is True.
        proxy = request.meta.get("proxy") or ""
        kwargs["proxies"] = {"http": proxy, "https": proxy}
        # Force the IP version, so that the request uses, e.g., the same version that solved a Cloudflare challenge.
        if self.ip_resolve is not None:
            kwargs["curl_options"] = {CurlOpt.IPRESOLVE: self.ip_resolve}

        try:
            response = requests.request(**kwargs)
        # Translate curl_cffi exceptions to Scrapy exceptions.
        except Timeout as exception:  # Timeout is a subclass of RequestException.
            raise DownloadTimeoutError(str(exception)) from exception
        except RequestException as exception:
            raise DownloadFailedError(str(exception)) from exception

        # curl_cffi already decompressed the body, so drop Content-Encoding (and the now-incorrect Content-Length), to
        # stop Scrapy's HttpCompressionMiddleware from trying to decompress it again (raising BadGzipFile).
        headers = Headers(
            [
                (name, value)
                for name, value in response.headers.multi_items()
                if name.lower() not in ("content-encoding", "content-length")
            ]
        )
        response_class = responsetypes.from_args(headers=headers, url=response.url, body=response.content)
        return response_class(
            url=response.url,
            status=response.status_code,
            headers=headers,
            body=response.content,
            request=request,
        )