Source code for kingfisher_scrapy.spiders.india_himachal_pradesh_civic_data_lab

import scrapy

from kingfisher_scrapy.base_spiders import SimpleSpider
from kingfisher_scrapy.exceptions import KingfisherScrapyError

# https://github.com/CivicDataLab/himachal-pradesh-health-procurement-OCDS/


class IndiaHimachalPradeshCivicDataLab(SimpleSpider):
    """
    Domain
      Himachal Pradesh State Government Finance Department
    Caveats
      This dataset was last updated by the publisher in 2020.
    Bulk download documentation
      https://github.com/CivicDataLab/himachal-pradesh-health-procurement-OCDS/
    """

    name = "india_himachal_pradesh_civic_data_lab"

    # BaseSpider
    unflatten = True
    unflatten_args = {
        "metatab_name": "Meta",
        "metatab_vertical_orientation": True,
        "metatab_schema": "https://standard.open-contracting.org/schema/1__1__5/release-package-schema.json",
    }

    # SimpleSpider
    data_type = "release_package"

    # Local
    github_repo = "CivicDataLab/himachal-pradesh-health-procurement-OCDS"

    async def start(self):
        """Yield the single request that lists the ``master`` tree of ``github_repo`` via the GitHub API.

        The response is handled by :meth:`parse_list`.
        """
        tree_url = f"https://api.github.com/repos/{self.github_repo}/git/trees/master"
        yield scrapy.Request(tree_url, callback=self.parse_list)

    def parse_list(self, response):
        """Yield one download request per ``.xlsx`` entry in the GitHub tree listing.

        The GitHub API is used only to enumerate the repository's files. The files themselves are fetched
        from github.com through a non-API URL, to avoid quota/rate limits.

        :param response: the response to the tree request issued by :meth:`start`
        :raises KingfisherScrapyError: if GitHub reports that the listing is truncated
        """
        listing = response.json()

        # https://docs.github.com/en/rest/reference/git#get-a-tree
        if data_is_truncated := listing["truncated"]:  # noqa: F841 - named only for readability
            raise KingfisherScrapyError("Truncated results returned when querying the file list from GitHub")

        for entry in listing["tree"]:
            path = entry["path"]
            # Only the spreadsheets are of interest; skip every other entry in the tree.
            if not path.endswith(".xlsx"):
                continue

            download_url = f"https://github.com/{self.github_repo}/raw/master/{path}?raw=true"
            # No callback is passed here, so the response goes to the spider's default callback
            # (presumably provided by SimpleSpider - confirm against the base class).
            yield scrapy.Request(download_url, meta={"file_name": path})