Last active: August 23, 2016, 10:55
-
-
Save lammoth/63e19a45fea48e65a10b78f1377f2563 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
import scrapy | |
import re | |
from spider.items import SpiderItem | |
class GHDBSpider(scrapy.Spider):
    """Crawl the Exploit-DB Google Hacking Database (GHDB) listing.

    Walks the paginated category table at www.exploit-db.com, creates one
    SpiderItem per table row, and follows each row's detail link to fill in
    the remaining fields before yielding the finished item.
    """

    name = "ghdb"
    allowed_domains = ["www.exploit-db.com"]
    start_urls = [
        "https://www.exploit-db.com/google-hacking-database/?action=search&ghdb_search_page=1&ghdb_search_text=&ghdb_search_cat_id=0"
    ]

    def parse(self, response):
        """Parse one listing page: yield a detail-page request per row, then follow 'next'."""
        for sel in response.xpath('//table[@class="category-list"]/tbody//tr'):
            detail_url = sel.xpath('td/a[1]/@href').extract_first()
            if not detail_url:
                # Malformed/header row: the original extract()[0] would raise
                # IndexError here and abort the whole page.
                continue
            item = SpiderItem()
            item['category'] = sel.xpath(
                'td[@class="gd-description"]/a/text()').extract_first()
            # Pass the item under an explicit meta key. Scrapy adds its own
            # bookkeeping keys (depth, download_timeout, ...) to meta, so using
            # the item itself as the meta dict pollutes it and enrich_item
            # would end up returning a plain dict instead of the SpiderItem.
            yield scrapy.Request(detail_url, callback=self.enrich_item,
                                 meta={'item': item})

        # Pagination: follow the anchor whose link text is exactly 'next'.
        next_page = None
        for link in response.xpath('//div[@class="pagination"]//a'):
            if link.xpath('text()').extract_first() == 'next':
                next_page = link.xpath('@href').extract_first()
        if next_page:
            # Hrefs on the page may contain stray whitespace; strip it before
            # resolving against the current URL.
            url = response.urljoin(re.sub(r'\s', '', next_page))
            yield scrapy.Request(url, callback=self.parse)

    def enrich_item(self, response):
        """Fill the item carried in request meta with fields from the detail page."""
        item = response.meta['item']
        rows = response.xpath('//table[@class="category-list"]/tbody/tr/td')
        texts = rows.xpath('text()').extract()
        # NOTE(review): re.sub(r'\s', '', ...) removes ALL whitespace, including
        # spaces between words — kept as-is since downstream may rely on it.
        item['desc'] = re.sub(r'\s', '', texts[1])
        item['link'] = re.sub(r'\s', '', rows.xpath('a/@href').extract_first() or '')
        item['date'] = re.sub(r'\s', '', texts[4])
        item['summary'] = re.sub(r'\s', '', texts[5])
        return item
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
# See documentation in: | |
# http://doc.scrapy.org/en/latest/topics/items.html | |
import scrapy | |
class SpiderItem(scrapy.Item):
    """Container for one Google Hacking Database entry scraped from Exploit-DB."""

    # Fields filled in by the ghdb spider from the listing and detail pages.
    category = scrapy.Field()     # GHDB category name from the listing table
    date = scrapy.Field()         # publication date text
    desc = scrapy.Field()         # short description text
    summary = scrapy.Field()      # summary text
    link = scrapy.Field()         # href scraped from the detail page
    # Declared but not set by the spider visible in this file — presumably
    # populated elsewhere; verify before removing.
    title = scrapy.Field()
    source_link = scrapy.Field()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment.