Skip to content

Instantly share code, notes, and snippets.

@amalmurali47
Created November 25, 2017 19:39
Show Gist options
  • Save amalmurali47/bbb682beba62b3d31249d71a494bc500 to your computer and use it in GitHub Desktop.
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from scrapy.item import Item, Field
import logging
# Module-level logger named after this module (PEP 282 convention).
# getLogger() with no argument would hand back the *root* logger and
# mix this spider's output with every other library's.
logger = logging.getLogger(__name__)
class WebItem(Item):
    """Container for the data scraped from a single crawled page."""
    # NOTE(review): the pasted source had lost its indentation; restored here.
    url = Field()      # the response URL
    title = Field()    # text of the page's <title> tag (may be None)
    headers = Field()  # HTTP response headers, one "Key:v1,v2" entry per line
def getitem(response):
    """Build a WebItem from a crawled Scrapy response.

    Parameters:
        response: a scrapy Response; its URL, <title> text, and HTTP
            response headers are copied into the item.

    Returns:
        A WebItem with 'url', 'title', and 'headers' set. 'headers' is a
        single string with one "Key:val1,val2\n" entry per header.
    """
    item = WebItem()
    item['url'] = response.url
    item['title'] = response.css('title::text').extract_first()
    # Bug fix: .iteritems() is Python-2-only and raises AttributeError on
    # Python 3; .items() works on both plain dicts and scrapy's Headers
    # mapping. Keys and values arrive as bytes, so decode before joining.
    # str.join replaces the original quadratic `+=` concatenation loop
    # while producing byte-identical output (trailing newline included).
    lines = []
    for key, value in response.headers.items():
        lines.append(key.decode('utf-8') + ":" + ','.join(e.decode('utf-8') for e in value))
    item['headers'] = ''.join(line + "\n" for line in lines)
    return item
class MySpider(CrawlSpider):
    """Crawl every URL listed in *url_file* and emit a WebItem per page.

    Links found on a start page are fetched once (parse_url), but with
    follow=False the crawl does not recurse past that first hop.
    """
    name = 'BB-crawler'
    # Fail fast on slow hosts so a long URL list keeps moving.
    custom_settings = {'DOWNLOAD_TIMEOUT': 15}
    rules = (Rule(LinkExtractor(), callback='parse_url', follow=False), )

    def __init__(self, url_file=None, *args, **kwargs):
        """url_file: path to a text file with one start URL per line
        (passed on the command line as ``-a url_file=...``)."""
        super(MySpider, self).__init__(*args, **kwargs)
        # Fix: the original let url_file default to None and then crashed
        # inside open() with an opaque TypeError; raise a clear error
        # instead. Also stop shadowing the parameter with the file handle.
        if url_file is None:
            raise ValueError("MySpider requires a url_file argument (-a url_file=PATH)")
        with open(url_file, "rt") as fh:
            # Iterate the handle directly (no readlines()) and drop blank
            # lines so trailing newlines don't become empty start URLs.
            self.start_urls = [line.strip() for line in fh if line.strip()]

    def parse_start_url(self, response):
        """Emit an item for each start URL's own page."""
        return getitem(response)

    def parse_url(self, response):
        """Emit an item for each page linked from a start page."""
        return getitem(response)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment