simple scrapy example
# -*- coding: utf-8 -*-
import csv
import os

import scrapy
from scrapy.http import FormRequest

from my_scraper.models import TableRow

END_DATE = '01/01/2019'
BASE_PATH = os.path.dirname(__file__)
ID_FILE = os.path.join(BASE_PATH, 'ids.csv')
SEARCH_URL = 'https://xxx'


class RemoteTableSpider(scrapy.Spider):
    """
    Run with `scrapy crawl remote_table`.
    """
    name = 'remote_table'
    start_urls = ['https://xxx']

    def parse(self, response):
        """
        The first parse step requests the start_urls defined above.
        It scrapes no data but initializes the session, then submits
        one search form per row in the ID file.
        """
        with open(ID_FILE) as f:
            reader = csv.DictReader(f)
            rows = list(reader)

        for row in rows:
            formdata = {
                'Id': row['id'],
                'startDate': row['start_date'],
                'endDate': END_DATE,
                'Submit': 'Param*',
                'Sort': '1',
            }
            request = FormRequest(SEARCH_URL, formdata=formdata, callback=self.extract_table)
            yield request

    def extract_table(self, response):
        """
        Extract table rows from the response to the search POST.
        """
        table_rows = response.css('table#results tr')
        for row in table_rows:
            rowobj = TableRow(row)
            yield rowobj.serialize()
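
The spider expects an ids.csv file next to the spider module with at least id and start_date columns, since those are the only fields parse reads. A minimal sketch of that file; the values below are hypothetical placeholders, not data from the original gist:

id,start_date
10001,01/01/2015
10002,06/15/2016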
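
my_scraper.models.TableRow is not included in the gist. A minimal sketch of what such a model might look like, assuming it wraps the scrapy Selector for one <tr> and serializes its cell text into a dict; the column names are invented for illustration:

# Hypothetical sketch of my_scraper/models.py; the real model is not in the gist.
class TableRow:
    def __init__(self, selector):
        # Keep the scrapy Selector for a single <tr> element.
        self.selector = selector

    def serialize(self):
        # Pull the text out of each cell and pair it with placeholder column names.
        cells = self.selector.css('td::text').getall()
        return dict(zip(['id', 'date', 'amount'], cells))

With a model along those lines, running `scrapy crawl remote_table -o results.json` would write one serialized dict per scraped table row.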