Source code for kingfisher_scrapy.spiders.united_states_portland
from urllib.parse import urlencode
import scrapy
from kingfisher_scrapy.base_spiders import SimpleSpider
[docs]
class UnitedStatesPortland(SimpleSpider):
"""
Domain
City of Portland
Bulk download documentation
https://www.portland.gov/business-opportunities/ocds/ocds-data-publication
"""
name = "united_states_portland"
# SimpleSpider
data_type = "record_package"
async def start(self):
# Get the page with the link to the most recent Google Drive folder containing the JSON file.
yield scrapy.Request(
"https://www.portland.gov/business-opportunities/ocds/ocds-data-publication", callback=self.parse_page
)
def parse_page(self, response):
# Follow the link to the most recent Google Drive folder containing the JSON file.
yield scrapy.Request(
response.xpath('//a[contains(@href, "drive.google.com")]/@href').get(), callback=self.parse_folder
)
def parse_folder(self, response):
# The id of the file to download is in the `data-id` attribute of a `tr` element that contains a JSON filename.
for tr in response.xpath("//tr[@data-id]"):
if "json" in tr.get():
yield scrapy.Request(
f"https://drive.google.com/uc?export=download&id={tr.attrib['data-id']}", callback=self.parse_file
)
def parse_file(self, response):
# Submit form: "FILE is too large for Google to scan for viruses. Would you still like to download this file?"
form = response.xpath('//form[@id="download-form"]')
params = {
key: form.xpath(f".//input[@name='{key}']/@value").get() for key in ("id", "export", "confirm", "uuid")
}
yield scrapy.Request(f"{form.xpath('@action').get()}?{urlencode(params)}", meta={"file_name": "all.json"})