import datetime
import orjson
import scrapy
from kingfisher_scrapy.base_spiders import BaseSpider
from kingfisher_scrapy.exceptions import AccessTokenError, MissingEnvVarError
from kingfisher_scrapy.util import components, date_range_by_year


class ParaguayHacienda(BaseSpider):
    """
    Domain
      Ministerio de Hacienda
    Caveats
      This dataset was last updated by the publisher in 2018.
    Spider arguments
      from_date
        Download only data from this year onward (YYYY format). Defaults to '2011'.
      until_date
        Download only data until this year (YYYY format). Defaults to '2018'.
    Environment variables
      KINGFISHER_PARAGUAY_HACIENDA_REQUEST_TOKEN
        To get an API account and request token go to https://datos.hacienda.gov.py/aplicaciones/new.
      KINGFISHER_PARAGUAY_HACIENDA_CLIENT_SECRET
        Your client secret generated.
    Swagger API documentation
      https://datos.hacienda.gov.py/odmh-api-v1/api-docs/
    """

    name = "paraguay_hacienda"
    custom_settings = {
        "CONCURRENT_REQUESTS": 1,
        "DOWNLOADER_MIDDLEWARES": {
            "kingfisher_scrapy.downloadermiddlewares.ParaguayAuthMiddleware": 543,
        },
    }

    # BaseSpider
    date_format = "year"
    default_from_date = "2011"
    default_until_date = "2018"
    date_required = True
    dont_truncate = True

    # ParaguayAuthMiddleware
    access_token = None
    access_token_scheduled_at = None
    # The maximum age is less than the API's limit, since we don't precisely control Scrapy's scheduler.
    access_token_maximum_age = 14 * 60
    access_token_request_failed = False
    # Class-level fallback only; from_crawler() gives each spider its own list.
    requests_backlog = []

    # Local
    max_access_token_attempts = 5
    url_prefix = "https://datos.hacienda.gov.py:443/odmh-api-v1/rest/api/v1/"
    # Identifiers for which a release package was already requested. A set keeps the
    # "already seen?" check constant-time. Class-level fallback only; see from_crawler().
    release_ids = set()

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        """
        Build the spider and load its API credentials from the crawler's settings.

        :raises MissingEnvVarError: if the request token or the client secret is not set
        """
        spider = super().from_crawler(crawler, *args, **kwargs)

        spider.request_token = crawler.settings.get("KINGFISHER_PARAGUAY_HACIENDA_REQUEST_TOKEN")
        spider.client_secret = crawler.settings.get("KINGFISHER_PARAGUAY_HACIENDA_CLIENT_SECRET")
        if spider.request_token is None or spider.client_secret is None:
            raise MissingEnvVarError(
                "KINGFISHER_PARAGUAY_HACIENDA_REQUEST_TOKEN and/or "
                "KINGFISHER_PARAGUAY_HACIENDA_CLIENT_SECRET is not set."
            )

        # Mutable containers declared on the class are shared by every instance, so give
        # this spider its own to keep state from leaking between spiders in one process.
        spider.requests_backlog = []
        spider.release_ids = set()

        return spider

    async def start(self):
        """Yield the request for the first page of the identifier list, for each year in the date range."""
        # Paraguay Hacienda has a paginated service that returns all the ids needed to get the
        # release packages, so we first iterate over that list.
        for year in date_range_by_year(self.from_date.year, self.until_date.year):
            yield scrapy.Request(
                f"{self.url_prefix}pagos/cdp?page=1&by_anho={year}",
                meta={"meta": True, "first": True, "year": year},
                # Don't filter duplicates: the same URL can be requested again after the token
                # expired, when the requests saved in requests_backlog are resumed.
                dont_filter=True,
            )

    def parse(self, response):
        """
        Handle both kinds of response, distinguished by the flags in the request's meta.

        - ``meta`` is true: the response is a page of the identifier list. Yield one release package
          request per identifier not seen before. If ``first`` is also true, yield the requests for
          pages 2 to ``totalPages`` of the same year as well.
        - ``meta`` is false: the response is a release package. Yield it as a file.
        """
        data = response.json()
        request_meta = response.request.meta

        # On the first page, schedule all the remaining pages of process ids for the year.
        if request_meta["first"]:
            year = request_meta["year"]
            total = data["meta"]["totalPages"]
            for page in range(2, total + 1):
                yield scrapy.Request(
                    f"{self.url_prefix}pagos/cdp?page={page}&by_anho={year}",
                    meta={"meta": True, "first": False, "year": year},
                    dont_filter=True,
                )

        # A meta request is a page that lists the process ids to query.
        if request_meta["meta"]:
            package_url_prefix = f"{self.url_prefix}ocds/release-package/"
            # Request the release package of each id, skipping empty and already-requested ids.
            for row in data["results"]:
                # Assumes idLlamado is a scalar (hashable) JSON value, as it is embedded in the URL.
                release_id = row["idLlamado"]
                if release_id and release_id not in self.release_ids:
                    self.release_ids.add(release_id)
                    yield self.build_request(
                        f"{package_url_prefix}{release_id}",
                        formatter=components(-1),
                        meta={
                            "meta": False,
                            "first": False,
                        },
                        dont_filter=True,
                    )
        else:
            yield self.build_file_from_response(response, data_type="release_package")

    def build_access_token_request(self, body=None, attempt=0):
        """
        Return the POST request for a new access token, and record when it was built.

        :param body: the request body; defaults to a JSON object containing the client secret
        :param attempt: the number of attempts made so far (the request's meta stores ``attempt + 1``)
        """
        self.logger.info("Requesting access token, attempt %s of %s", attempt + 1, self.max_access_token_attempts)

        if body is None:
            body = orjson.dumps({"clientSecret": self.client_secret})

        self.access_token_scheduled_at = datetime.datetime.now()

        return scrapy.Request(
            f"{self.url_prefix}auth/token",
            method="POST",
            headers={"Authorization": self.request_token, "Content-Type": "application/json"},
            body=body,
            # handle_httpstatus_all lets parse_access_token see error responses too.
            # "auth" is presumably read by ParaguayAuthMiddleware - confirm against the middleware.
            meta={"attempt": attempt + 1, "auth": False, "handle_httpstatus_all": True},
            callback=self.parse_access_token,
            dont_filter=True,
            priority=1000,
        )

    def parse_access_token(self, response):
        """
        Store the new access token and resume the backlog, or retry if the response has no token.

        :raises AccessTokenError: if the response is not successful, or if the maximum number of
                                  attempts is reached without getting a token
        """
        if not self.is_http_success(response):
            self.access_token_request_failed = True
            raise AccessTokenError(f"Authentication failed. Status code: {response.status}")

        token = response.json().get("accessToken")
        if token:
            # NOTE(review): this writes the bearer token to the log; consider redacting it.
            self.logger.info("New access token: %s", token)
            self.access_token = f"Bearer {token}"
            # Continue scraping where it stopped, now that there is a token.
            while self.requests_backlog:
                yield self.requests_backlog.pop(0)
            return

        attempt = response.request.meta["attempt"]
        # ">=" rather than "==", so that the retries stop even if the counter starts beyond the maximum.
        if attempt >= self.max_access_token_attempts:
            self.access_token_request_failed = True
            raise AccessTokenError("Max attempts to get an access token reached.")

        yield self.build_access_token_request(response.request.body, attempt=attempt)