Source code for kingfisher_scrapy.spiders.paraguay_hacienda

import datetime

import orjson
import scrapy

from kingfisher_scrapy.base_spiders import BaseSpider
from kingfisher_scrapy.exceptions import AccessTokenError, MissingEnvVarError
from kingfisher_scrapy.util import components, date_range_by_year


class ParaguayHacienda(BaseSpider):
    """
    Domain
      Ministerio de Hacienda
    Caveats
      This dataset was last updated by the publisher in 2018.
    Spider arguments
      from_date
        Download only data from this year onward (YYYY format). Defaults to '2011'.
      until_date
        Download only data until this year (YYYY format). Defaults to '2018'.
    Environment variables
      KINGFISHER_PARAGUAY_HACIENDA_REQUEST_TOKEN
        To get an API account and request token go to https://datos.hacienda.gov.py/aplicaciones/new.
      KINGFISHER_PARAGUAY_HACIENDA_CLIENT_SECRET
        Your client secret generated.
    Swagger API documentation
      https://datos.hacienda.gov.py/odmh-api-v1/api-docs/
    """

    name = "paraguay_hacienda"
    custom_settings = {
        "CONCURRENT_REQUESTS": 1,
        "DOWNLOADER_MIDDLEWARES": {
            "kingfisher_scrapy.downloadermiddlewares.ParaguayAuthMiddleware": 543,
        },
    }

    # BaseSpider
    date_format = "year"
    default_from_date = "2011"
    default_until_date = "2018"
    date_required = True
    dont_truncate = True

    # ParaguayAuthMiddleware
    access_token = None
    access_token_scheduled_at = None
    # The maximum age is less than the API's limit, since we don't precisely control Scrapy's scheduler.
    access_token_maximum_age = 14 * 60
    access_token_request_failed = False
    # Class-level fallback only: from_crawler() gives every spider its own list, so that requests queued by one
    # spider instance are never replayed by another.
    requests_backlog = []

    # Local
    max_access_token_attempts = 5
    url_prefix = "https://datos.hacienda.gov.py:443/odmh-api-v1/rest/api/v1/"
    # Class-level fallback only: from_crawler() gives every spider its own set. A set is used because this
    # attribute is only used for membership tests and insertions.
    release_ids = set()

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        """
        Create the spider, give it its own mutable state and read the API credentials from the crawler settings.

        :raises MissingEnvVarError: if the request token or the client secret setting is ``None``
        """
        spider = super().from_crawler(crawler, *args, **kwargs)

        # Shadow the class-level containers with per-instance ones, so that state doesn't leak between spiders
        # created in the same process.
        spider.requests_backlog = []
        spider.release_ids = set()

        spider.request_token = crawler.settings.get("KINGFISHER_PARAGUAY_HACIENDA_REQUEST_TOKEN")
        spider.client_secret = crawler.settings.get("KINGFISHER_PARAGUAY_HACIENDA_CLIENT_SECRET")
        if spider.request_token is None or spider.client_secret is None:
            raise MissingEnvVarError(
                "KINGFISHER_PARAGUAY_HACIENDA_REQUEST_TOKEN and/or "
                "KINGFISHER_PARAGUAY_HACIENDA_CLIENT_SECRET is not set."
            )

        return spider

    async def start(self):
        """
        Yield a request for the first page of the paginated list of process IDs, for each year in the date range.
        """
        # Paraguay Hacienda has a service that returns all the IDs that we need to get the release packages,
        # so we first iterate over this list, which is paginated.
        for year in date_range_by_year(self.from_date.year, self.until_date.year):
            yield scrapy.Request(
                f"{self.url_prefix}pagos/cdp?page=1&by_anho={year}",
                meta={"meta": True, "first": True, "year": year},
                # Send duplicate requests when the token expired and in the continuation of requests_backlog saved.
                dont_filter=True,
            )

    def parse(self, response):
        """
        Handle both kinds of responses, which are told apart by the request's ``meta`` flags.

        -  ``meta`` is truthy: the response is a page of the list of process IDs. One release package request is
           yielded for each ID that wasn't seen before. If ``first`` is also truthy, requests for the remaining
           pages of the same year are yielded.
        -  ``meta`` is falsy: the response is a release package, which is yielded as a file.
        """
        package_url_prefix = f"{self.url_prefix}ocds/release-package/"
        meta = response.request.meta
        data = response.json()

        # If it is the first URL, we need to iterate over all the pages to get all the process IDs to query.
        if meta["first"]:
            year = meta["year"]
            total = data["meta"]["totalPages"]
            for page in range(2, total + 1):
                yield scrapy.Request(
                    f"{self.url_prefix}pagos/cdp?page={page}&by_anho={year}",
                    meta={"meta": True, "first": False, "year": year},
                    dont_filter=True,
                )

        # If it is a meta request, it is a page that has the process IDs to query.
        if meta["meta"]:
            # Now that we have the IDs, we iterate over them, skipping empty and already-seen ones, and make the
            # final requests for the release packages.
            for row in data["results"]:
                release_id = row["idLlamado"]
                if release_id and release_id not in self.release_ids:
                    self.release_ids.add(release_id)
                    yield self.build_request(
                        f"{package_url_prefix}{release_id}",
                        formatter=components(-1),
                        meta={
                            "meta": False,
                            "first": False,
                        },
                        dont_filter=True,
                    )
        else:
            yield self.build_file_from_response(response, data_type="release_package")

    def build_access_token_request(self, body=None, attempt=0):
        """
        Return a POST request for a new access token, and record when it was built.

        :param body: the request body; defaults to a JSON object containing the client secret
        :param attempt: the number of attempts made so far; the request's ``attempt`` meta is this value plus one
        """
        self.logger.info("Requesting access token, attempt %s of %s", attempt + 1, self.max_access_token_attempts)

        if body is None:
            body = orjson.dumps({"clientSecret": self.client_secret})

        # Kept naive (no timezone), since code outside this class may compare against it.
        self.access_token_scheduled_at = datetime.datetime.now()

        return scrapy.Request(
            f"{self.url_prefix}auth/token",
            method="POST",
            headers={"Authorization": self.request_token, "Content-Type": "application/json"},
            body=body,
            # handle_httpstatus_all lets unsuccessful responses reach parse_access_token, which reports them.
            # NOTE(review): "auth": False presumably tells ParaguayAuthMiddleware to let this request through
            # without a token - confirm against the middleware.
            meta={"attempt": attempt + 1, "auth": False, "handle_httpstatus_all": True},
            callback=self.parse_access_token,
            dont_filter=True,
            priority=1000,
        )

    def parse_access_token(self, response):
        """
        Store the new access token and yield the requests in the backlog, or retry if the response has no token.

        :raises AccessTokenError: if the response is unsuccessful, or if the maximum number of attempts is
                                  reached without getting a token
        """
        if not self.is_http_success(response):
            self.access_token_request_failed = True
            raise AccessTokenError(f"Authentication failed. Status code: {response.status}")

        token = response.json().get("accessToken")
        if token:
            # The token is a credential, so it is deliberately not written to the log.
            self.logger.info("New access token received")
            self.access_token = f"Bearer {token}"
            # Continue scraping where it stopped after getting the token.
            while self.requests_backlog:
                yield self.requests_backlog.pop(0)
            return

        attempt = response.request.meta["attempt"]
        # ">=" rather than "==", so that the retries stop even if the limit was somehow overshot.
        if attempt >= self.max_access_token_attempts:
            self.access_token_request_failed = True
            raise AccessTokenError("Max attempts to get an access token reached.")
        yield self.build_access_token_request(response.request.body, attempt=attempt)