import codecs
import datetime
import scrapy
from kingfisher_scrapy.exceptions import IncoherentConfigurationError, SpiderArgumentError
from kingfisher_scrapy.items import File, FileItem
from kingfisher_scrapy.util import add_path_components, add_query_string
[docs]
class BaseSpider(scrapy.Spider):
"""
Base class for all spiders.
With respect to the data's source:
- If the source can support ``from_date`` and ``until_date`` spider arguments:
- Set a ``date_format`` class attribute to "date", "datetime", "year" or "year-month" (default "date").
- Set a ``default_from_date`` class attribute to a date ("YYYY-MM-DD"), datetime ("YYYY-MM-DDTHH:MM:SS"),
year ("YYYY") or year-month ("YYYY-MM").
- If the source stopped publishing, set a ``default_until_date`` class attribute to a date or datetime.
- If the spider requires date parameters to be set, add a ``date_required = True`` class attribute, and set the
``date_format`` and ``default_from_date`` class attributes as above.
- If the spider needs to parse the JSON response in its ``parse`` method, set ``dont_truncate = True``.
Otherwise, the :class:`~kingfisher_scrapy.extensions.pluck.Pluck` extension can fail when using the
``KINGFISHER_PLUCK_MAX_BYTES`` setting.
.. tip::
If ``date_required`` is ``True``, or if either the ``from_date`` or ``until_date`` spider arguments are set,
then ``from_date`` defaults to the ``default_from_date`` class attribute, and ``until_date`` defaults to the
``get_default_until_date()`` return value (which is the current time, by default).
With respect to the data's format:
- If the data is not encoded using UTF-8, set an ``encoding`` class attribute to its encoding.
- If the data is concatenated JSON, add a ``concatenated_json = True`` class attribute.
- If the data is line-delimited JSON, add a ``line_delimited = True`` class attribute.
- If the data can be invalid JSON, add a ``validate_json = True`` class attribute.
- If the data embeds OCDS data within other objects or arrays, set a ``root_path`` class attribute to the path to
the OCDS data, e.g. ``'releasePackage'`` or ``'results.item'``.
- If the data is in CSV or XLSX format, add a ``unflatten = True`` class attribute to convert it to JSON using
Flatten Tool's ``unflatten`` function. To pass arguments to ``unflatten``, set a ``unflatten_args`` dict.
- If the data source uses OCDS 1.0, add an ``ocds_version = '1.0'`` class attribute. This is used for the
:ref:`Kingfisher Process<kingfisher-process>` extension.
With respect to support for Kingfisher Collect's features:
- If the spider doesn't work with the ``pluck`` command, set a ``skip_pluck`` class attribute to the reason.
"""
VALID_DATE_FORMATS = {"date": "%Y-%m-%d", "datetime": "%Y-%m-%dT%H:%M:%S", "year": "%Y", "year-month": "%Y-%m"}
# Regarding the data source.
date_format = "date"
date_required = False
dont_truncate = False
# Regarding the data format.
encoding = "utf-8"
concatenated_json = False
line_delimited = False
validate_json = False
root_path = ""
resize_package = False
unflatten = False
unflatten_args = {}
ocds_version = "1.1"
# Regarding the access method.
max_attempts = 5
retry_http_codes = []
# Not to be overridden by sub-classes.
available_steps = {"compile", "check"}
def __init__(
self,
sample=None,
path=None,
from_date=None,
until_date=None,
crawl_time=None,
note=None,
keep_collection_open=None,
steps=None,
compile_releases=None,
table_name=None,
force_version=None,
ignore_version=None,
package_pointer=None,
release_pointer=None,
truncate=None,
*args,
**kwargs,
):
"""
:param sample: the number of items to download (``'true'`` means ``1``; ``'false'`` and ``None`` mean no limit)
:param path: path components to append to the URLs yielded by the ``start`` method (see :ref:`filter`)
:param from_date: the date from which to download data (see :ref:`spider-arguments`)
:param until_date: the date until which to download data (see :ref:`spider-arguments`)
:param crawl_time: override the crawl's start time (see :ref:`increment`)
:param note: a note to add to the collection in Kingfisher Process
:param keep_collection_open: whether to close the collection in Kingfisher Process when the crawl is finished
:param steps: a comma-separated list of steps to run in Kingfisher Process (``'compile'`` and/or ``'check'``)
:param compile_releases: whether to create compiled releases from individual releases when using the
:class:`~kingfisher_scrapy.extensions.database_store.DatabaseStore` extension
:param table_name: override the crawl's table name in the database (see :ref:`database_store`)
:param force_version: version to use instead of the version of the first package,
if ``compile_releases`` is ``'true'``
:param ignore_version: do not raise an error if the versions are inconsistent across packages to merge,
if ``compile_releases`` is ``'true'``
:param package_pointer: the JSON Pointer to the value in the package (see the :ref:`pluck` command)
:param release_pointer: the JSON Pointer to the value in the release (see the :ref:`pluck` command)
:param truncate: the number of characters to which the value is truncated (see the :ref:`pluck` command)
"""
super().__init__(*args, **kwargs)
if self.concatenated_json and self.line_delimited:
raise IncoherentConfigurationError("concatenated_json = True is incompatible with line_delimited = True.")
# https://docs.scrapy.org/en/latest/topics/spiders.html#spider-arguments
# Related to filtering data from the source.
if sample == "true":
self.sample = 1
elif sample == "false":
self.sample = None
else:
self.sample = sample
self.from_date = from_date
self.until_date = until_date
# Related to incremental crawls (whether KingfisherProcessAPI2 data_version or DatabaseStore directory).
self.crawl_time = crawl_time
# KingfisherProcessAPI2 extension.
self.kingfisher_process_note = note
self.kingfisher_process_keep_collection_open = keep_collection_open == "true"
if steps is None:
self.kingfisher_process_steps = {"compile"}
else:
self.kingfisher_process_steps = set(steps.split(",")) & self.available_steps
# DatabaseStore extension.
self.database_store_compile_releases = compile_releases == "true"
self.database_store_table_name = table_name
self.database_store_force_version = force_version
self.database_store_ignore_version = ignore_version == "true"
# Pluck pipeline.
self.pluck_package_pointer = package_pointer
self.pluck_release_pointer = release_pointer
self.pluck_truncate = int(truncate) if truncate else None
self.pluck = bool(package_pointer or release_pointer)
self.query_string_parameters = {}
for key, value in kwargs.items():
if key.startswith("qs:"):
self.query_string_parameters[key[3:]] = value
self.date_format = self.VALID_DATE_FORMATS[self.date_format]
if hasattr(self, "start"):
if path:
self.start = add_path_components(self.start, path)
if self.query_string_parameters:
self.start = add_query_string(self.start, self.query_string_parameters)
self.filter_arguments = {
"from_date": from_date,
"until_date": until_date,
"path": path,
}
self.filter_arguments.update(kwargs)
spider_arguments = {
"sample": sample,
"note": note,
"from_date": from_date,
"until_date": until_date,
"crawl_time": crawl_time,
"keep_collection_open": keep_collection_open,
"package_pointer": package_pointer,
"release_pointer": release_pointer,
"truncate": truncate,
"compile_releases": compile_releases,
}
spider_arguments.update(kwargs)
self.logger.info("Spider arguments: %r", spider_arguments)
# Scrapy calls this method to merge the spider's custom_settings into the project's settings.
[docs]
@classmethod
def update_settings(cls, settings):
if cls.custom_settings is None:
cls.custom_settings = {}
if not cls.custom_settings.get("HTTPPROXY_ENABLED"):
cls.custom_settings["HTTPPROXY_ENABLED"] = cls.name in settings.getlist("PROXY_SPIDERS")
super().update_settings(settings)
[docs]
@classmethod
def from_crawler(cls, crawler, *args, **kwargs):
spider = super().from_crawler(crawler, *args, **kwargs)
if spider.pluck_package_pointer and spider.pluck_release_pointer:
raise SpiderArgumentError("You cannot specify both package_pointer and release_pointer spider arguments.")
if spider.sample:
crawler.settings.set("CONCURRENT_REQUESTS", 1, priority="spider")
try:
spider.sample = int(spider.sample)
except ValueError:
raise SpiderArgumentError(
f"spider argument `sample`: invalid integer value: {spider.sample!r}"
) from None
if spider.crawl_time:
try:
spider.crawl_time = datetime.datetime.strptime(spider.crawl_time, "%Y-%m-%dT%H:%M:%S")
except ValueError as e:
raise SpiderArgumentError(f"spider argument `crawl_time`: invalid date value: {e}") from None
if spider.from_date or spider.until_date or spider.date_required:
if not spider.from_date:
spider.from_date = spider.default_from_date
try:
if isinstance(spider.from_date, str):
spider.from_date = spider.parse_date_argument(spider.from_date)
except ValueError as e:
raise SpiderArgumentError(f"spider argument `from_date`: invalid date value: {e}") from None
if not spider.until_date:
spider.until_date = cls.get_default_until_date(spider)
try:
if isinstance(spider.until_date, str):
spider.until_date = spider.parse_date_argument(spider.until_date)
except ValueError as e:
raise SpiderArgumentError(f"spider argument `until_date`: invalid date value: {e}") from None
# DatabaseStore-related logic.
if crawler.settings["DATABASE_URL"] and not spider.crawl_time:
raise SpiderArgumentError(
"spider argument `crawl_time`: can't be blank if `DATABASE_URL` is set"
) from None
return spider
[docs]
def parse_date_argument(self, date):
"""Return the date argument as a datetime object."""
return datetime.datetime.strptime(date, self.date_format).replace(tzinfo=datetime.timezone.utc)
[docs]
def is_http_success(self, response):
"""Return whether the response's status is a 2xx code."""
# All 2xx codes are successful.
# https://tools.ietf.org/html/rfc7231#section-6.3
return 200 <= response.status < 300
[docs]
def is_http_error_expected(self, response):
"""Return whether the response's status is expected to be a non-2xx code."""
return False
[docs]
def is_http_retryable(self, response):
"""
Return whether the response's status is retryable.
Set the ``retry_http_codes`` class attribute to a list of status codes to retry.
"""
return response.status in self.retry_http_codes
[docs]
def get_start_time(self, date_format):
"""Return the formatted start time of the crawl."""
date = self.crawl_time or self.crawler.stats.get_value("start_time")
return date.strftime(date_format)
[docs]
def get_retry_wait_time(self, response):
"""Return the number of seconds to wait before retrying a URL."""
return int(response.headers.get("Retry-After", 30))
[docs]
def build_request(self, url, formatter, **kwargs):
"""
Return a Scrapy request, with a file name added to the request's ``meta`` attribute. If the file name doesn't
have a ``.json``, ``.csv``, ``.xlsx``, ``.rar`` or ``.zip`` extension, it adds a ``.json`` extension.
If the last component of a URL's path is unique, use it as the file name. For example:
>>> from kingfisher_scrapy.base_spiders import BaseSpider
>>> from kingfisher_scrapy.util import components
>>> url = 'https://example.com/package.json'
>>> formatter = components(-1)
>>> BaseSpider(name='my_spider').build_request(url, formatter=formatter).meta
{'file_name': 'package.json'}
To use a query string parameter as the file name:
>>> from kingfisher_scrapy.util import parameters
>>> url = 'https://example.com/packages?page=1&per_page=100'
>>> formatter = parameters('page')
>>> BaseSpider(name='my_spider').build_request(url, formatter=formatter).meta
{'file_name': 'page-1.json'}
To use a URL path component *and* a query string parameter as the file name:
>>> from kingfisher_scrapy.util import join
>>> url = 'https://example.com/packages?page=1&per_page=100'
>>> formatter = join(components(-1), parameters('page'))
>>> BaseSpider(name='my_spider').build_request(url, formatter=formatter).meta
{'file_name': 'packages-page-1.json'}
:param str url: the URL to request
:param formatter: a function that accepts a URL and returns a file name
:returns: a Scrapy request
:rtype: scrapy.Request
"""
meta = {}
if formatter is None:
if not kwargs["meta"]["file_name"]:
raise AssertionError("build_request() must be passed a file_name or a formatter")
else:
meta["file_name"] = formatter(url)
# Other extensions are related to the Unflatten pipeline and CompressedFileSpider base class.
if not meta["file_name"].endswith((".json", ".csv", ".xlsx", ".rar", ".zip")):
meta["file_name"] += ".json"
if "meta" in kwargs:
meta.update(kwargs.pop("meta"))
return scrapy.Request(url, meta=meta, **kwargs)
[docs]
def build_file_from_response(self, response, /, *, data_type, **kwargs):
"""
Return a File item to yield, based on the response to a request.
If the response body starts with a byte-order mark, it is removed.
"""
kwargs.setdefault("file_name", response.request.meta["file_name"])
kwargs.setdefault("url", response.request.url)
if "data" not in kwargs:
body = response.body
# https://tools.ietf.org/html/rfc7159#section-8.1
if body.startswith(codecs.BOM_UTF8): # noqa: FURB188 # bytes instances don't have a removeprefix method.
body = body[len(codecs.BOM_UTF8) :]
kwargs["data"] = body
return self.build_file(data_type=data_type, **kwargs)
[docs]
def build_file(self, *, file_name=None, url=None, data_type=None, data=None):
"""Return a File item to yield."""
return File(
file_name=file_name,
url=url,
data_type=data_type,
data=data,
)
[docs]
def build_file_item(self, number, data, item):
"""Return a FileItem item to yield."""
return FileItem(
file_name=item.file_name,
url=item.url,
data_type=item.data_type,
data=data,
number=number,
)
[docs]
def log_error_from_response(self, response, *, level="error", status=None, message=""):
"""Log an error message, based on the response to a request."""
getattr(self.logger, level)(
"status=%d message=%r request=%s file_name=%s",
status or response.status,
message,
response.request,
response.request.meta.get("file_name", ""),
)
[docs]
@classmethod
def get_default_until_date(cls, spider):
"""Return the ``default_until_date`` class attribute if truthy. Otherwise, return the current time."""
if getattr(spider, "default_until_date", None):
return spider.default_until_date
return datetime.datetime.now(tz=datetime.timezone.utc)