simple scrapy example
# -*- coding: utf-8 -*-
import csv
import os

import scrapy
from scrapy.http import FormRequest

from my_scraper.models import TableRow

END_DATE = '01/01/2019'
BASE_PATH = os.path.dirname(__file__)
ID_FILE = os.path.join(BASE_PATH, 'ids.csv')
SEARCH_URL = 'https://xxx'


class RemoteTableSpider(scrapy.Spider):
    """
    Run with `scrapy crawl remote_table`.
    """
    name = 'remote_table'
    start_urls = ['https://xxx']

    def parse(self, response):
        """
        Scrapy fetches the start_urls defined above and hands the
        response here. This step scrapes no data; it only establishes
        the session (cookies) that the search POSTs below rely on.
        """
        with open(ID_FILE) as f:
            reader = csv.DictReader(f)
            rows = list(reader)

        # Issue one search POST per ID in the CSV.
        for row in rows:
            formdata = {
                'Id': row['id'],
                'startDate': row['start_date'],
                'endDate': END_DATE,
                'Submit': 'Param*',
                'Sort': '1',
            }
            request = FormRequest(SEARCH_URL, formdata=formdata,
                                  callback=self.extract_table)
            yield request

    def extract_table(self, response):
        """
        Extract table rows from the response to the search POST.
        """
        # Note: 'tr' also matches any header row; skip it here if the
        # TableRow model doesn't account for one.
        table_rows = response.css('table#results tr')
        for row in table_rows:
            rowobj = TableRow(row)
            yield rowobj.serialize()
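
The spider expects an ids.csv file next to the spider module, with at least the id and start_date columns that parse() reads. A minimal example with made-up values, assuming the date format matches END_DATE's MM/DD/YYYY:

id,start_date
12345,03/01/2015
67890,06/15/2016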
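
TableRow lives in my_scraper.models and is not included in this gist. Here is a minimal sketch of what it might look like, assuming it wraps a single <tr> Selector and that serialize() returns a plain dict for Scrapy to collect; the field handling is hypothetical:

# Hypothetical stand-in for my_scraper.models.TableRow -- the real
# model is not part of this gist.
class TableRow:
    def __init__(self, row):
        # `row` is a scrapy Selector for one <tr> element.
        self.cells = row.css('td::text').getall()

    def serialize(self):
        # Return a plain dict so Scrapy treats it as a scraped item.
        return {'cells': [cell.strip() for cell in self.cells]}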