# -*- coding: utf-8 -*-
# Scrapy spider that crawls the Exploit-DB Google Hacking Database (GHDB).
import re

import scrapy

from spider.items import SpiderItem


class GHDBSpider(scrapy.Spider):
    name = "ghdb"
    allowed_domains = ["www.exploit-db.com"]
    start_urls = [
        "https://www.exploit-db.com/google-hacking-database/?action=search&ghdb_search_page=1&ghdb_search_text=&ghdb_search_cat_id=0"
    ]

    def parse(self, response):
        # Each row of the category list becomes one item; the remaining fields
        # are filled in by a second request to the entry's detail page.
        for sel in response.xpath('//table[@class="category-list"]/tbody//tr'):
            item = SpiderItem()
            item['category'] = sel.xpath('td[@class="gd-description"]/a/text()').extract()[0]
            yield scrapy.Request(
                sel.xpath('td/a[1]/@href').extract()[0],
                callback=self.enrich_item,
                meta={'item': item},
            )

        # Follow the paginator's "next" link, if there is one.
        links = response.xpath('//div[@class="pagination"]//a')
        next_page = None
        for link in links:
            url_title = link.xpath('text()').extract()[0]
            if url_title == 'next':
                next_page = link.xpath('@href').extract()[0]
        if next_page:
            url = response.urljoin(re.sub(r'\s', '', next_page))
            yield scrapy.Request(url, callback=self.parse)

    def enrich_item(self, response):
        # Complete the item started in parse() with fields from the detail page.
        item = response.meta['item']
        rows = response.xpath('//table[@class="category-list"]/tbody/tr/td')
        item['desc'] = re.sub(r'\s', '', rows.xpath('text()').extract()[1])
        item['link'] = re.sub(r'\s', '', rows.xpath('a/@href').extract()[0])
        item['date'] = re.sub(r'\s', '', rows.xpath('text()').extract()[4])
        item['summary'] = re.sub(r'\s', '', rows.xpath('text()').extract()[5])
        return item
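
A minimal sketch of launching the spider programmatically; the file name run_ghdb.py and the module path spider.spiders.ghdb are assumptions, and running "scrapy crawl ghdb -o ghdb.json" from the project root does the same job via Scrapy's command line.

# run_ghdb.py -- hypothetical standalone runner; assumes it is executed from the
# project root so get_project_settings() can locate scrapy.cfg.
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from spider.spiders.ghdb import GHDBSpider  # assumed module path for the spider above

if __name__ == '__main__':
    process = CrawlerProcess(get_project_settings())
    process.crawl(GHDBSpider)
    process.start()  # blocks until the crawl has finished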
# -*- coding: utf-8 -*-
# spider/items.py -- item definition referenced by the import in the spider above.
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html
import scrapy


class SpiderItem(scrapy.Item):
    date = scrapy.Field()
    title = scrapy.Field()
    desc = scrapy.Field()
    summary = scrapy.Field()
    category = scrapy.Field()
    source_link = scrapy.Field()
    link = scrapy.Field()
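
For completeness, a minimal sketch of an item pipeline that would persist each scraped GHDB entry as a JSON line; the class name and output file are hypothetical, and Scrapy's built-in feed exports ("scrapy crawl ghdb -o ghdb.jl") already cover this case without custom code.

# pipelines.py -- hypothetical example, not part of the original gist.
import json


class GhdbJsonLinesPipeline(object):
    """Write every scraped GHDB item to a JSON-lines file."""

    def open_spider(self, spider):
        self.outfile = open('ghdb_items.jl', 'w')

    def close_spider(self, spider):
        self.outfile.close()

    def process_item(self, item, spider):
        self.outfile.write(json.dumps(dict(item)) + '\n')
        return item

To take effect, such a pipeline would be registered under ITEM_PIPELINES in the project's settings.py.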