# Source code for kingfisher_scrapy.spiders.ukraine

import scrapy

from kingfisher_scrapy.base_spiders import SimpleSpider
from kingfisher_scrapy.util import (
    BROWSER_USER_AGENT,
    append_path_components,
    components,
    replace_parameters,
)



[docs]
class Ukraine(SimpleSpider):
    """
    Domain
      ProZorro OpenProcurement API
    Spider arguments
      from_date
        Download only data from this time onward (YYYY-MM-DDThh:mm:ss format). Defaults to '2016-01-01T00:00:00'.
    API documentation
      https://prozorro-api-docs.readthedocs.io/uk/latest/tendering/index.html
    """

    name = "ukraine"
    custom_settings = {
        "USER_AGENT": BROWSER_USER_AGENT,  # to avoid HTTP 412
    }

    # BaseSpider
    date_format = "datetime"
    default_from_date = "2016-01-01T00:00:00"
    date_required = True

    # SimpleSpider
    data_type = "release_package"

    async def start(self):
        """
        Yield the initial request to the tenders list endpoint.

        If ``from_date`` is set, it is formatted with ``date_format`` and sent as the ``offset`` query parameter.
        """
        # Only the tenders endpoint is crawled. A contracts endpoint
        # (https://public.api.openprocurement.org/api/0/contracts) also exists, but what it returns is already part
        # of the tenders data. Should the two ever need to be joined, match the contract's tender_id field against
        # the tender's id field.
        tenders_url = "https://public-api.prozorro.gov.ua/api/2.5/tenders"
        if self.from_date:
            offset = self.from_date.strftime(self.date_format)
            tenders_url = f"{tenders_url}?offset={offset}"
        yield scrapy.Request(tenders_url, callback=self.parse_list)

    def parse_list(self, response):
        """
        Handle one page of the tenders list.

        Yield one request per listed tender (built from the current request's URL without its ``offset`` parameter,
        plus the tender's ``id`` and ``?opt_schema=ocds``), then a request for the page named by ``next_page.uri``.
        """
        payload = response.json()

        for tender in payload["data"]:
            # Drop the list's offset parameter, then address the individual tender by its id.
            list_url = replace_parameters(response.request.url, offset=None)
            tender_url = append_path_components(list_url, tender["id"])
            yield self.build_request(tender_url + "?opt_schema=ocds", formatter=components(-2))

        next_page_url = payload["next_page"]["uri"]
        yield scrapy.Request(next_page_url, callback=self.parse_list)