Created
November 25, 2017 19:39
-
-
Save amalmurali47/bbb682beba62b3d31249d71a494bc500 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from scrapy.linkextractors import LinkExtractor | |
from scrapy.spiders import CrawlSpider, Rule | |
from scrapy.item import Item, Field | |
import logging | |
# Module-scoped logger: records carry this module's name instead of being
# attributed to the root logger, and can be filtered or configured separately.
logger = logging.getLogger(__name__)
class WebItem(Item):
    """Scraped record for a single page: its URL, title and response headers."""

    # Address of the response the item was built from.
    url = Field()
    # Text of the page's <title> element.
    title = Field()
    # Response headers flattened into a single newline-separated string.
    headers = Field()
def getitem(response):
    """Build a WebItem summarising a crawled response.

    Args:
        response: The Scrapy response to describe. Its ``url`` attribute,
            ``css()`` selector method and ``headers`` mapping are read.

    Returns:
        WebItem: ``url`` is the response URL, ``title`` is the first text
        node matched by ``title::text``, and ``headers`` contains one
        ``name:value1,value2`` line per header, each ending in a newline.
    """
    item = WebItem()
    item['url'] = response.url
    item['title'] = response.css('title::text').extract_first()

    # items() yields (name, [values...]) pairs of bytes on every Scrapy
    # release, whereas iteritems() only exists on Python 2-era versions.
    # Decoding with 'replace' keeps a stray non-UTF-8 header byte from
    # aborting the whole item.
    header_lines = [
        key.decode('utf-8', 'replace')
        + ":"
        + ','.join(value.decode('utf-8', 'replace') for value in values)
        + "\n"
        for key, values in response.headers.items()
    ]
    item['headers'] = ''.join(header_lines)
    return item
class MySpider(CrawlSpider):
    """Crawl spider seeded from a text file containing one URL per line.

    Every start page, and every page linked from a start page, is turned
    into a WebItem via ``getitem``.
    """

    name = 'BB-crawler'
    custom_settings = {'DOWNLOAD_TIMEOUT': 15}
    # follow=False: pages reached through this rule are scraped, but their
    # own links are not extracted, so the crawl stops one hop from the seeds.
    rules = (Rule(LinkExtractor(), callback='parse_url', follow=False), )

    def __init__(self, url_file=None, *args, **kwargs):
        """Load the start URLs from ``url_file``.

        Args:
            url_file: Path of a text file listing one start URL per line.
                Surrounding whitespace is stripped and blank lines are
                skipped.
            *args: Forwarded to ``CrawlSpider.__init__``.
            **kwargs: Forwarded to ``CrawlSpider.__init__``.

        Raises:
            TypeError: If ``url_file`` is not supplied.
            IOError: If the file cannot be opened (``OSError`` on Python 3).
        """
        if url_file is None:
            # open(None) would raise a TypeError anyway; fail with a message
            # that tells the user which spider argument is missing.
            raise TypeError(
                "url_file is required: pass the path of a file listing one "
                "start URL per line (e.g. -a url_file=urls.txt)"
            )
        super(MySpider, self).__init__(*args, **kwargs)
        with open(url_file, "rt") as handle:
            stripped = (line.strip() for line in handle)
            # Empty strings are not valid request URLs, so drop blank lines.
            self.start_urls = [url for url in stripped if url]

    def parse_start_url(self, response):
        """Return a WebItem for a response to one of the start URLs."""
        return getitem(response)

    def parse_url(self, response):
        """Return a WebItem for a page reached through the link rule."""
        return getitem(response)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment