# Source code for kingfisher_scrapy.downloadhandlers
# https://docs.scrapy.org/en/latest/topics/download-handlers.html
import asyncio
import logging
from curl_cffi import requests
from curl_cffi.const import CurlIpResolve, CurlOpt
from curl_cffi.requests.exceptions import RequestException, Timeout
from scrapy.exceptions import DownloadFailedError, DownloadTimeoutError
from scrapy.http import Headers
from scrapy.responsetypes import responsetypes
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
class CurlImpersonateDownloadHandler:
    """
    A download handler that uses ``curl_cffi`` to impersonate a browser's TLS/JA3 fingerprint.

    Some sites use an anti-bot service (like Cloudflare) that rejects Scrapy's default Twisted client by its TLS/JA3
    fingerprint. ``curl_cffi`` reproduces a real browser's fingerprint.

    To use it for a spider, override the ``https`` (and/or ``http``) handler in the spider's ``custom_settings``:

    .. code-block:: python

       custom_settings = {
           "DOWNLOAD_HANDLERS": {
               "https": "kingfisher_scrapy.downloadhandlers.CurlImpersonateDownloadHandler",
           },
       }

    And optionally:

    - Set the ``CURL_IMPERSONATE`` setting to a `browser profile <https://github.com/lexiforest/curl_cffi#sessions>`__.
      Choose a specific version (like ``"chrome146"``) for a consistent fingerprint across ``curl_cffi`` upgrades.
    - Set the ``CURL_IP_VERSION`` setting to ``"4"`` or ``"6"`` (the integers ``4`` and ``6`` are also accepted) for
      a consistent version across requests. If unset, ``curl_cffi`` chooses. An unrecognized value is ignored, with
      a warning.
    """

    # Download handler attribute read by Scrapy (NOTE(review): presumably defers instantiation until the first
    # request for the scheme -- confirm against the download handlers documentation).
    lazy = True

    # Maps the normalized CURL_IP_VERSION setting to the matching libcurl IPRESOLVE constant.
    IP_RESOLVE = {"4": CurlIpResolve.V4, "6": CurlIpResolve.V6}

    # Response headers that describe the encoded body, and are wrong once curl_cffi has decoded it.
    _DROPPED_RESPONSE_HEADERS = frozenset({"content-encoding", "content-length"})

    def __init__(self, settings):
        """
        Read the handler's configuration.

        :param settings: the crawler's settings; ``CURL_IMPERSONATE`` and ``CURL_IP_VERSION`` are read
        """
        # Fall back to the generic "chrome" profile if the setting is unset or empty.
        self.impersonate = settings.get("CURL_IMPERSONATE") or "chrome"

        # A setting defined in Python (e.g. in custom_settings) may be the integer 4 or 6 rather than a string.
        # Normalize to a string before the lookup, so that the integer isn't silently ignored.
        ip_version = settings.get("CURL_IP_VERSION")
        key = None if ip_version is None else str(ip_version).strip()
        self.ip_resolve = self.IP_RESOLVE.get(key)
        if key and self.ip_resolve is None:
            logger.warning("Ignoring unrecognized CURL_IP_VERSION %r (expected 4 or 6)", ip_version)

    @classmethod
    def from_crawler(cls, crawler):
        """Build the handler from the crawler's settings."""
        return cls(crawler.settings)

    async def download_request(self, request):
        """
        Download the request without blocking the event loop.

        The ``curl_cffi`` call is synchronous, so it is run in a worker thread.

        :param request: the Scrapy request to download
        :returns: the Scrapy response built by :meth:`_download`
        :raises DownloadTimeoutError: if ``curl_cffi`` times out
        :raises DownloadFailedError: if ``curl_cffi`` fails for any other reason
        """
        return await asyncio.to_thread(self._download, request)

    async def close(self):
        """Do nothing: the handler holds no session or connection between requests."""

    def _download(self, request):
        """
        Perform the request with ``curl_cffi`` and convert the result to a Scrapy response.

        :param request: the Scrapy request to download
        :returns: a Scrapy response, whose class is chosen from the response's headers, URL and body
        :raises DownloadTimeoutError: if ``curl_cffi`` times out
        :raises DownloadFailedError: if ``curl_cffi`` fails for any other reason
        """
        kwargs = {
            "method": request.method,
            "url": request.url,
            # curl_cffi expects a plain dict of strings. Join multi-valued headers like Scrapy's HTTP/1.1 handler.
            "headers": {key.decode(): b", ".join(values).decode() for key, values in request.headers.items()},
            "data": request.body,
            "impersonate": self.impersonate,
            # Let Scrapy's RedirectMiddleware handle redirects.
            "allow_redirects": False,
        }

        # Scrapy sets the download_timeout meta from the DOWNLOAD_TIMEOUT setting.
        if timeout := request.meta.get("download_timeout"):
            kwargs["timeout"] = timeout

        # curl_cffi must ignore the http_proxy and https_proxy environment variables, unless HTTPPROXY_ENABLED is
        # True. Always pass the proxies explicitly: empty strings if the request has no proxy meta.
        proxy = request.meta.get("proxy") or ""
        kwargs["proxies"] = {"http": proxy, "https": proxy}

        # Force the IP version, so that the request uses, e.g., the same version that solved a Cloudflare challenge.
        if self.ip_resolve is not None:
            kwargs["curl_options"] = {CurlOpt.IPRESOLVE: self.ip_resolve}

        try:
            response = requests.request(**kwargs)
        # Translate curl_cffi exceptions to Scrapy exceptions.
        except Timeout as exception:  # Timeout is a subclass of RequestException.
            raise DownloadTimeoutError(str(exception)) from exception
        except RequestException as exception:
            raise DownloadFailedError(str(exception)) from exception

        # curl_cffi already decompressed the body, so drop Content-Encoding (and the now-incorrect Content-Length), to
        # stop Scrapy's HttpCompressionMiddleware from trying to decompress it again (raising BadGzipFile).
        headers = Headers(
            [
                (name, value)
                for name, value in response.headers.multi_items()
                if name.lower() not in self._DROPPED_RESPONSE_HEADERS
            ]
        )

        body = response.content
        response_class = responsetypes.from_args(headers=headers, url=response.url, body=body)
        return response_class(
            url=response.url,
            status=response.status_code,
            headers=headers,
            body=body,
            request=request,
        )