Source code for kingfisher_scrapy.spiders.rwanda_bulk

import datetime

import scrapy

from kingfisher_scrapy.base_spiders import CompressedFileSpider
from kingfisher_scrapy.util import MAX_DOWNLOAD_TIMEOUT, parameters



[docs]
class RwandaBulk(CompressedFileSpider):
    """
    Domain
      Rwanda Public Procurement Authority (RPPA)
    Spider arguments
      from_date
        Download only data from this month onward (YYYY-MM format).
        If ``until_date`` is provided, defaults to '2013-12'.
      until_date
        Download only data until this month (YYYY-MM format).
        If ``from_date`` is provided, defaults to the current month.
    Bulk download documentation
      https://ocds.umucyo.gov.rw/OpenData
    """

    name = "rwanda_bulk"
    custom_settings = {"DOWNLOAD_TIMEOUT": MAX_DOWNLOAD_TIMEOUT}

    # BaseSpider
    date_format = "year-month"
    default_from_date = "2013-12"
    skip_pluck = "Already covered (see code for details)"  # rwanda_api

    # SimpleSpider
    data_type = "release_package"

    async def start(self):
        """Request the index of available datasets; ``parse_list`` handles the response."""
        url = "https://ocds.umucyo.gov.rw/opendata/api/v1/ui/data_set/available_datasets"
        yield scrapy.Request(url, callback=self.parse_list)

    def parse_list(self, response):
        """
        Yield one download request per monthly JSON archive listed in the index.

        The index maps each year to a list of entries. Only entries ending in ``-json.zip`` are requested;
        the first two characters of such an entry are read as the month number. When both ``from_date`` and
        ``until_date`` are set, archives whose month falls outside that range (inclusive) are skipped.

        The response looks like:
        {
            "status": 200,
            "returnCode": 7,
            "message": "Available datasets successfully retrieved",
            "datasets": {
                "2025": [
                    "01-January-csv.zip",
                    "01-January-json.zip",
                    "01-January-xlsx.zip",
                    "02-February-csv.zip",
                    "02-February-json.zip",
                    "02-February-xlsx.zip",
                    ...,
                    "flattened",
                    "json"
                ],
                ...
            }
        }
        """
        for year, filenames in response.json()["datasets"].items():
            for item in filenames:
                # Ignore the CSV and XLSX archives, and the bare "flattened" and "json" entries.
                if not item.endswith("-json.zip"):
                    continue

                if self.from_date and self.until_date:
                    # Represent the archive by the first day of its month, in UTC.
                    month_start = datetime.datetime(int(year), int(item[:2]), 1, tzinfo=datetime.timezone.utc)
                    if self.from_date > month_start or month_start > self.until_date:
                        continue

                yield self.build_request(
                    f"https://ocds.umucyo.gov.rw/opendata/api/v1/ui/data_set/download?year={year}&month_file={item}",
                    formatter=parameters("year", "month_file"),
                )