# Source code for kingfisher_scrapy.spiders.armenia

import scrapy

from kingfisher_scrapy.base_spiders import LinksSpider
from kingfisher_scrapy.util import get_parameter_value, parameters, replace_parameters

# Length of one day, in milliseconds.
MILLISECONDS_PER_DAY = 24 * 60 * 60 * 1000
# Largest exponent used by the exponential search: 2**10 = 1024 days.
EXPONENT_LIMIT = 10
# Width (in milliseconds) at which the binary search stops: 45 minutes, which is one day halved 5 times.
THRESHOLD = 45 * 60 * 1000



# [docs]
class Armenia(LinksSpider):
    """
    Domain
      Armenian e-Procurement System (ARMEPS)
    Caveats
      The API paginates results using an ``offset`` query string parameter, which is a timestamp. If a timestamp causes
      an error, the spider will try to find the nearest timestamp within the following 1024 days that succeeds.
    """

    name = "armenia"

    # SimpleSpider
    data_type = "release_package"

    # LinksSpider
    formatter = staticmethod(parameters("offset"))
    next_pointer = "/next_page/uri"

    # Request the first page, which has no `offset` query string parameter (hence the hardcoded file name).
    # `handle_httpstatus_all` is set so that error responses are passed to the callback, instead of being dropped.
    async def start(self):
        yield scrapy.Request(
            "https://armeps.am/ocds/release", meta={"file_name": "offset-0.json", "handle_httpstatus_all": True}
        )

    # LinksSpider
    def parse(self, response):
        # If the request was successful, parse the response as usual.
        if self.is_http_success(response):
            yield self.build_file_from_response(response, data_type=self.data_type)

            # Use `dont_filter` in case the search for a successful timestamp used the same offset.
            # Use `dont_retry` and `handle_httpstatus_all` since errors are expected.
            yield self.next_link(response, dont_filter=True, meta={"dont_retry": True, "handle_httpstatus_all": True})
        # Otherwise, log the error, then (1) pick a date range and (2) do a binary search within it.
        # This approach assumes that, if two offsets error, then intervening offsets error, too.
        else:
            self.log_error_from_response(response)

            # If the error occurs on the first request, we have no starting offset.
            if get_parameter_value(response.request.url, "offset"):
                yield from self.parse_date_range(response)

    # Exponential search (https://en.wikipedia.org/wiki/Exponential_search). We can do an elaborate alternative
    # (https://www.slac.stanford.edu/cgi-bin/getdoc/slac-pub-1679.pdf), but we keep it simpler for now.
    #
    # This method is called directly by `parse` for the first failing offset, and then as the callback of the requests
    # it builds itself. The state of the search is carried in the request's `meta`:
    #
    # - `first`: the offset that originally failed
    # - `prev`: the offset of the previous (failed) request
    # - `exponent`: the exponent used to calculate the offset of the current request
    def parse_date_range(self, response):
        offset = int(get_parameter_value(response.request.url, "offset"))

        # The crawl's start time, in milliseconds, is the upper limit of the search. The assumption is that Scrapy
        # records `start_time` in UTC, so that time zones can be ignored.
        # NOTE(review): `datetime.timestamp()` interprets a naive datetime as local time. Confirm that `start_time` is
        # timezone-aware (or that the process runs in UTC).
        start_time = int(self.crawler.stats.get_value("start_time").timestamp() * 1000)
        # We use the first offset to calculate the new offset, and in log messages.
        first_offset = response.request.meta.get("first", offset)
        # The exponent for the exponential search. It is 0 on the direct call from `parse` (no `exponent` in `meta`).
        exponent = response.request.meta.get("exponent", -1) + 1

        # If this offset succeeded, do a binary search from the previous offset to this offset.
        if self.is_http_success(response):
            yield from self.parse_binary_search(response, response.request.meta["prev"], offset)
        # If this offset failed and reached a limit, stop.
        elif offset >= start_time or exponent > EXPONENT_LIMIT:
            self.logger.info(f"No offset found after {first_offset:,} within {2**EXPONENT_LIMIT} days.")  # noqa: G004
            self.log_error_from_response(response)
        # Otherwise, continue.
        else:
            # Jump 1, 2, 4, ... days ahead of the first offset, without going past the crawl's start time.
            new_offset = min(first_offset + MILLISECONDS_PER_DAY * 2**exponent, start_time)
            yield self._build_request(
                replace_parameters(response.request.url, offset=new_offset),
                self.parse_date_range,
                {"prev": offset, "exponent": exponent, "first": first_offset},
            )

    # We use one of the alternative binary search methods (https://en.wikipedia.org/wiki/Binary_search_algorithm),
    # because we only know if an offset succeeds, not whether an offset is greater than the target value.
    #
    # This method is called directly by `parse_date_range` with `minimum` and `maximum` (the last failed offset and
    # the first successful offset), and then as the callback of the requests it builds itself, in which case the
    # bounds are read from the request's `meta` and narrowed according to whether the response succeeded.
    def parse_binary_search(self, response, minimum=None, maximum=None):
        offset = int(get_parameter_value(response.request.url, "offset"))

        first_offset = response.request.meta["first"]

        # Compare to `None` explicitly, so that a bound of 0 is still treated as provided (0 is falsy, and would
        # otherwise fall through to the `meta` lookups below, which would raise a `KeyError`).
        if minimum is not None and maximum is not None:
            self.logger.info(f"Starting binary search for {first_offset:,} within [{minimum:,}, {maximum:,}]")  # noqa: G004
        # This offset succeeded: it becomes the new upper bound.
        elif self.is_http_success(response):
            minimum = response.request.meta["minimum"]
            maximum = offset
        # This offset failed: the target is after it.
        else:
            minimum = offset + 1
            maximum = response.request.meta["maximum"]

        # If the search succeeded, parse the response as usual. We use a threshold, because getting the exact
        # millisecond requires 27 requests.
        if minimum + THRESHOLD >= maximum:
            self.logger.info(f"New offset found after {first_offset:,} at {maximum:,}!")  # noqa: G004
            if offset == maximum:
                # If the last request used the offset, we can reuse its response.
                yield from self.parse(response)
            else:
                yield self._build_request(replace_parameters(response.request.url, offset=maximum), self.parse, {})
        else:
            # Otherwise, request the midpoint, and carry the bounds to the next iteration.
            yield self._build_request(
                replace_parameters(response.request.url, offset=(minimum + maximum) // 2),
                self.parse_binary_search,
                {"minimum": minimum, "maximum": maximum, "first": first_offset},
            )

    # Build a request for the searches above. Note that the `meta` dict is modified in place: `dont_retry` and
    # `handle_httpstatus_all` are set, since errors are expected. `dont_filter` is set, since a search can request an
    # offset more than once.
    def _build_request(self, url, callback, meta):
        meta["dont_retry"] = True
        meta["handle_httpstatus_all"] = True
        # We need to set `formatter` in case we want to re-use the response to build a file.
        return self.build_request(url, formatter=parameters("offset"), dont_filter=True, meta=meta, callback=callback)