Skip to content

Instantly share code, notes, and snippets.

@BlogBlocks
Created December 17, 2017 00:05
Show Gist options
  • Save BlogBlocks/d0bb1f94cda69a5e7c96d1742cc5adca to your computer and use it in GitHub Desktop.
Save BlogBlocks/d0bb1f94cda69a5e7c96d1742cc5adca to your computer and use it in GitHub Desktop.
spider from Stack Overflow
#!/usr/bin/env python3
import scrapy
class MySpider(scrapy.Spider):
    """Spider that prints the text of every ``.elem`` node on the start page.

    Nothing is yielded from ``parse``; all output goes to stdout.
    """

    name = 'myspider'
    start_urls = [
        'http://www.usfigureskating.org/leaderboard/results/2018/25073/SEGM001.html',
    ]

    def parse(self, response):
        """Print the page URL, then each ``.elem`` text node in document order.

        A text node containing ``'Elements'`` is not printed itself; a
        numbered ``--- N ---`` separator is printed in its place and the
        counter advances. Every other text node is printed unchanged.
        """
        print('url:', response.url)

        # Escape the literal "<<+" and "<+" sequences in the raw bytes before
        # parsing (presumably so the HTML parser does not read them as the
        # start of a tag). "<<+" must be handled first: escaping "<+" alone
        # would leave a stray "<" in front of it.
        escaped = response.body.replace(b'<<+', b'&lt;&lt;+')
        escaped = escaped.replace(b'<+', b'&lt;+')
        html = escaped.decode('utf-8')
        page = scrapy.Selector(text=html)

        section = 1
        for text in page.css('.elem::text').extract():
            if 'Elements' not in text:
                print(text)
                continue
            # An "Elements" node marks the start of a new numbered section.
            print('---', section, '---')
            section += 1
# --- runs standalone, without a Scrapy project ---
# The feed-export settings below are left commented out, so no file is
# written; MySpider.parse() only prints and yields no items.
from scrapy.crawler import CrawlerProcess

c = CrawlerProcess(
    settings={
        'USER_AGENT': 'Mozilla/5.0',

        # save in CSV or JSON
        #'FEED_FORMAT': 'csv', # 'json
        #'FEED_URI': 'output.csv', # 'output.json
    },
)
c.crawl(MySpider)
# Starts the crawl; returns once it has finished.
c.start()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment