Skip to content

Instantly share code, notes, and snippets.

@RGGH
Forked from dray89/Scrapy-tables
Created June 7, 2021 12:28
Show Gist options
  • Save RGGH/d20524fd14e2e5d3280f98c128676dfe to your computer and use it in GitHub Desktop.
Save RGGH/d20524fd14e2e5d3280f98c128676dfe to your computer and use it in GitHub Desktop.
How to scrape tables using Scrapy
import scrapy
import pandas
from ..items import YahooItem
class YahooSpider(scrapy.Spider):
    """Spider that scrapes the HTML table(s) on Yahoo Finance history pages.

    One request is issued per ticker in ``symbols``. Each response yields a
    single ``YahooItem`` with two fields:

    * ``title`` -- list of the text nodes found under the page ``<title>``.
    * ``data``  -- the table converted to a nested dict of the form
      ``{column_name: {date: value}}`` (``DataFrame.to_dict()`` of the table
      indexed by its ``Date`` column).
    """

    name = 'Yahoo'
    symbols = ["ADSK", "BA", "CAT", "EBAY", "GS", "HSY", "IBM", "JPM", "WMT", "SHOP",
               "T", "F", "TRI", "AMZN", "C", "A", "O", "B", "MSFT", "NVDA",
               "DIS", "AAL", "NFLX", "JNJ", "BAC", "GOOGL", "WFC"]
    start_urls = ['https://finance.yahoo.com/quote/{0}/history?p={0}'.format(x) for x in symbols]

    def parse(self, response):
        """Turn the table rows of ``response`` into one ``YahooItem``.

        The table is read row by row (``<tr>``) and cell by cell
        (``<th>``/``<td>``) rather than by slicing a flat list of text nodes
        into fixed-size groups: with flat slicing, a single cell holding zero
        or several text nodes shifts every following row out of alignment.

        Body rows whose cell count differs from the header row (for example
        rows whose cells span several columns) are skipped because they cannot
        be mapped onto the header's columns.

        Yields nothing, and logs a warning, when no header row containing a
        ``Date`` column is found, instead of raising ``IndexError``/``KeyError``.
        """
        title = response.xpath('//title//text()').extract()

        rows = []
        for tr in response.xpath('//table//tr'):
            # Join all text nodes inside a cell so each cell is exactly one value.
            cells = [
                ''.join(cell.xpath('.//text()').extract()).strip()
                for cell in tr.xpath('./th | ./td')
            ]
            if cells:
                rows.append(cells)

        if not rows or 'Date' not in rows[0]:
            # No table, or a table without the expected header: nothing to index by.
            self.logger.warning('No table with a Date column found at %s', response.url)
            return

        header = rows[0]
        # Keep only rows that line up with the header, column for column.
        body = [row for row in rows[1:] if len(row) == len(header)]
        dictionary = pandas.DataFrame(body, columns=header).set_index('Date').to_dict()

        items = YahooItem()
        items['title'] = title
        items['data'] = dictionary
        yield items
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment