Last active: August 23, 2016, 10:55
-
-
Save lammoth/63e19a45fea48e65a10b78f1377f2563 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
import scrapy | |
import re | |
from spider.items import SpiderItem | |
class GHDBSpider(scrapy.Spider):
    """Crawl the Exploit-DB Google Hacking Database (GHDB) listing.

    Walks the paginated category table at www.exploit-db.com, creates one
    SpiderItem per table row, and follows each row's detail link to fill in
    the remaining fields before yielding the finished item.
    """

    name = "ghdb"
    allowed_domains = ["www.exploit-db.com"]
    start_urls = [
        "https://www.exploit-db.com/google-hacking-database/?action=search&ghdb_search_page=1&ghdb_search_text=&ghdb_search_cat_id=0"
    ]

    def parse(self, response):
        """Parse one listing page: yield a detail-page request per row, then follow 'next'."""
        for sel in response.xpath('//table[@class="category-list"]/tbody//tr'):
            detail_url = sel.xpath('td/a[1]/@href').extract_first()
            if not detail_url:
                # Malformed/header row: the original extract()[0] would raise
                # IndexError here and abort the whole page.
                continue
            item = SpiderItem()
            item['category'] = sel.xpath(
                'td[@class="gd-description"]/a/text()').extract_first()
            # Pass the item under an explicit meta key. Scrapy adds its own
            # bookkeeping keys (depth, download_timeout, ...) to meta, so using
            # the item itself as the meta dict pollutes it and enrich_item
            # would end up returning a plain dict instead of the SpiderItem.
            yield scrapy.Request(detail_url, callback=self.enrich_item,
                                 meta={'item': item})

        # Pagination: follow the anchor whose link text is exactly 'next'.
        next_page = None
        for link in response.xpath('//div[@class="pagination"]//a'):
            if link.xpath('text()').extract_first() == 'next':
                next_page = link.xpath('@href').extract_first()
        if next_page:
            # Hrefs on the page may contain stray whitespace; strip it before
            # resolving against the current URL.
            url = response.urljoin(re.sub(r'\s', '', next_page))
            yield scrapy.Request(url, callback=self.parse)

    def enrich_item(self, response):
        """Fill the item carried in request meta with fields from the detail page."""
        item = response.meta['item']
        rows = response.xpath('//table[@class="category-list"]/tbody/tr/td')
        texts = rows.xpath('text()').extract()
        # NOTE(review): re.sub(r'\s', '', ...) removes ALL whitespace, including
        # spaces between words — kept as-is since downstream may rely on it.
        item['desc'] = re.sub(r'\s', '', texts[1])
        item['link'] = re.sub(r'\s', '', rows.xpath('a/@href').extract_first() or '')
        item['date'] = re.sub(r'\s', '', texts[4])
        item['summary'] = re.sub(r'\s', '', texts[5])
        return item
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
# See documentation in: | |
# http://doc.scrapy.org/en/latest/topics/items.html | |
import scrapy | |
class SpiderItem(scrapy.Item):
    """Container for one Google Hacking Database entry scraped from Exploit-DB."""

    # Fields filled in by the ghdb spider from the listing and detail pages.
    category = scrapy.Field()     # GHDB category name from the listing table
    date = scrapy.Field()         # publication date text
    desc = scrapy.Field()         # short description text
    summary = scrapy.Field()      # summary text
    link = scrapy.Field()         # href scraped from the detail page
    # Declared but not set by the spider visible in this file — presumably
    # populated elsewhere; verify before removing.
    title = scrapy.Field()
    source_link = scrapy.Field()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment.