# Source code for kingfisher_scrapy.spiders.united_states_portland

from urllib.parse import urlencode

import scrapy

from kingfisher_scrapy.base_spiders import SimpleSpider


class UnitedStatesPortland(SimpleSpider):
    """
    Domain
      City of Portland
    Bulk download documentation
      https://www.portland.gov/business-opportunities/ocds/ocds-data-publication
    """

    name = "united_states_portland"

    # SimpleSpider
    data_type = "record_package"

    async def start(self):
        # Begin at the publication page, which links to the most recent Google Drive folder holding the JSON file.
        publication_url = "https://www.portland.gov/business-opportunities/ocds/ocds-data-publication"
        yield scrapy.Request(publication_url, callback=self.parse_page)

    def parse_page(self, response):
        # Take the first anchor whose href mentions drive.google.com and request that folder.
        folder_url = response.xpath('//a[contains(@href, "drive.google.com")]/@href').get()
        yield scrapy.Request(folder_url, callback=self.parse_folder)

    def parse_folder(self, response):
        # A `tr` element's `data-id` attribute is the id of the file to download. Only rows whose markup
        # contains "json" (i.e. that mention a JSON filename) are requested.
        for row in response.xpath("//tr[@data-id]"):
            if "json" not in row.get():
                continue
            file_id = row.attrib["data-id"]
            yield scrapy.Request(
                f"https://drive.google.com/uc?export=download&id={file_id}", callback=self.parse_file
            )

    def parse_file(self, response):
        # Google answers with a confirmation form: "FILE is too large for Google to scan for viruses. Would you
        # still like to download this file?" Re-submit the form's input values as a query string to its action URL.
        download_form = response.xpath('//form[@id="download-form"]')

        query = {}
        for field in ("id", "export", "confirm", "uuid"):
            query[field] = download_form.xpath(f".//input[@name='{field}']/@value").get()

        action = download_form.xpath("@action").get()
        yield scrapy.Request(f"{action}?{urlencode(query)}", meta={"file_name": "all.json"})