# Source code for kingfisher_scrapy.spiders.paraguay_hacienda

import datetime

import orjson
import scrapy

from kingfisher_scrapy.base_spiders import BaseSpider
from kingfisher_scrapy.exceptions import AccessTokenError, MissingEnvVarError
from kingfisher_scrapy.util import components, date_range_by_year



# [docs]
class ParaguayHacienda(BaseSpider):
    """
    Domain
      Ministerio de Hacienda
    Caveats
      This dataset was last updated by the publisher in 2018.
    Spider arguments
      from_date
        Download only data from this year onward (YYYY format). Defaults to '2011'.
      until_date
        Download only data until this year (YYYY format). Defaults to '2018'.
    Environment variables
      KINGFISHER_PARAGUAY_HACIENDA_REQUEST_TOKEN
        To get an API account and request token go to https://datos.hacienda.gov.py/aplicaciones/new.
      KINGFISHER_PARAGUAY_HACIENDA_CLIENT_SECRET
        Your client secret generated.
    Swagger API documentation
      https://datos.hacienda.gov.py/odmh-api-v1/api-docs/
    """

    name = "paraguay_hacienda"
    custom_settings = {
        "CONCURRENT_REQUESTS": 1,
        "DOWNLOADER_MIDDLEWARES": {
            "kingfisher_scrapy.downloadermiddlewares.ParaguayAuthMiddleware": 543,
        },
    }

    # BaseSpider
    date_format = "year"
    default_from_date = "2011"
    default_until_date = "2018"
    date_required = True
    dont_truncate = True

    # ParaguayAuthMiddleware
    access_token = None
    access_token_scheduled_at = None
    # The maximum age is less than the API's limit, since we don't precisely control Scrapy's scheduler.
    access_token_maximum_age = 14 * 60
    access_token_request_failed = False
    # Class-level default only: from_crawler() rebinds a fresh list on each spider instance.
    requests_backlog = []

    # Local
    max_access_token_attempts = 5
    url_prefix = "https://datos.hacienda.gov.py:443/odmh-api-v1/rest/api/v1/"
    # Identifiers already requested. A set, because it is only used for membership tests (de-duplication).
    # Class-level default only: from_crawler() rebinds a fresh set on each spider instance.
    release_ids = set()

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        """
        Create the spider, give it its own mutable state, and read the API credentials from the crawler's settings.

        :param crawler: the crawler whose settings provide the request token and client secret
        :returns: the configured spider
        :raises MissingEnvVarError: if the request token or the client secret is not set
        """
        spider = super().from_crawler(crawler, *args, **kwargs)

        # Mutable class attributes are shared by every instance. Rebind them on the instance, so that state from
        # one spider (e.g. in another crawl in the same process) can't leak into this one.
        spider.requests_backlog = []
        spider.release_ids = set()

        spider.request_token = crawler.settings.get("KINGFISHER_PARAGUAY_HACIENDA_REQUEST_TOKEN")
        spider.client_secret = crawler.settings.get("KINGFISHER_PARAGUAY_HACIENDA_CLIENT_SECRET")
        if spider.request_token is None or spider.client_secret is None:
            raise MissingEnvVarError(
                "KINGFISHER_PARAGUAY_HACIENDA_REQUEST_TOKEN and/or "
                "KINGFISHER_PARAGUAY_HACIENDA_CLIENT_SECRET is not set."
            )

        return spider

    async def start(self):
        """
        Yield a request for the first page of the paginated list of identifiers, for each year in the date range.
        """
        # Paraguay Hacienda has a service that returns all the ids that we need to get the release packages,
        # so we first iterate over this list that is paginated.
        for year in date_range_by_year(self.from_date.year, self.until_date.year):
            yield scrapy.Request(
                f"{self.url_prefix}pagos/cdp?page=1&by_anho={year}",
                meta={"meta": True, "first": True, "year": year},
                # Send duplicate requests when the token expired and in the continuation of requests_backlog saved.
                dont_filter=True,
            )

    def parse(self, response):
        """
        Handle a listing page or a release package, according to the request's ``meta`` flags.

        - ``first`` is true: yield requests for the remaining listing pages (2 to ``totalPages``) of the same year.
        - ``meta`` is true: yield one release package request per ``idLlamado`` not seen before.
        - ``meta`` is false: yield the response as a ``release_package`` file.

        :param response: the response to a request built by :meth:`start` or by this method
        """
        data = response.json()
        request_meta = response.request.meta

        # If is the first URL, we need to iterate over all the pages to get all the process ids to query.
        if request_meta["first"]:
            year = request_meta["year"]
            total = data["meta"]["totalPages"]
            for page in range(2, total + 1):
                yield scrapy.Request(
                    f"{self.url_prefix}pagos/cdp?page={page}&by_anho={year}",
                    meta={"meta": True, "first": False, "year": year},
                    dont_filter=True,
                )

        # If it is a meta request, then it is a page that has the process ids to query.
        if request_meta["meta"]:
            package_url_prefix = f"{self.url_prefix}ocds/release-package/"

            # Now that we have the ids, iterate over them, skipping empty and already-requested ones, and make
            # the final requests for the release packages.
            for row in data["results"]:
                # Assumes idLlamado is a JSON scalar (hashable), as it is interpolated into the URL.
                release_id = row["idLlamado"]
                if release_id and release_id not in self.release_ids:
                    self.release_ids.add(release_id)
                    yield self.build_request(
                        f"{package_url_prefix}{release_id}",
                        formatter=components(-1),
                        meta={
                            "meta": False,
                            "first": False,
                        },
                        dont_filter=True,
                    )
        else:
            yield self.build_file_from_response(response, data_type="release_package")

    def build_access_token_request(self, body=None, attempt=0):
        """
        Build a high-priority POST request for a new access token, and record when it was scheduled.

        :param body: the request body; defaults to a JSON object containing the client secret
        :param attempt: the number of attempts already made; the request's ``meta`` carries ``attempt + 1``
        :returns: the request, whose callback is :meth:`parse_access_token`
        """
        self.logger.info("Requesting access token, attempt %s of %s", attempt + 1, self.max_access_token_attempts)

        if body is None:
            body = orjson.dumps({"clientSecret": self.client_secret})

        self.access_token_scheduled_at = datetime.datetime.now()

        return scrapy.Request(
            f"{self.url_prefix}auth/token",
            method="POST",
            headers={"Authorization": self.request_token, "Content-Type": "application/json"},
            body=body,
            # handle_httpstatus_all lets parse_access_token see (and report) unsuccessful responses.
            meta={"attempt": attempt + 1, "auth": False, "handle_httpstatus_all": True},
            callback=self.parse_access_token,
            dont_filter=True,
            priority=1000,
        )

    def parse_access_token(self, response):
        """
        Store the access token from the response and yield the backlogged requests, or retry the token request.

        :param response: the response to a request built by :meth:`build_access_token_request`
        :raises AccessTokenError: if the response is unsuccessful, or if the maximum number of attempts is reached
                                  without receiving a token
        """
        if not self.is_http_success(response):
            self.access_token_request_failed = True
            raise AccessTokenError(f"Authentication failed. Status code: {response.status}")

        token = response.json().get("accessToken")
        if token:
            # The token is a credential, so its value is deliberately not written to the log.
            self.logger.info("New access token received")
            self.access_token = f"Bearer {token}"
            # Continue scraping where it stopped after getting the token. The backlog is re-checked on each
            # iteration, so requests added while this generator is being consumed are also yielded.
            while self.requests_backlog:
                yield self.requests_backlog.pop(0)
            return

        attempt = response.request.meta["attempt"]
        # Use >= rather than ==, so that retries stop even if the counter starts above the limit.
        if attempt >= self.max_access_token_attempts:
            self.access_token_request_failed = True
            raise AccessTokenError("Max attempts to get an access token reached.")

        yield self.build_access_token_request(response.request.body, attempt=attempt)