# redapple/dform_spider.py Secret

## dform_spider.py
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from scrapy.selector import XmlXPathSelector

from scrapy.http import Request
import urlparse

#from DFORM.items import SecFormD
# item defined here but ideally should be imported with the line above
from scrapy.item import Item, Field

class SecFormD(Item):
    """Item populated by ``SecDform.parse_xml_document`` from one XML response."""

    # Text of the ``primaryIssuer/entityName`` node of the parsed XML document.
    company = Field()


class SecDform(CrawlSpider):
    """Crawl an SEC EDGAR search listing and emit one item per XML document.

    The crawl starts from a fixed EDGAR search URL (query: form type ``d``,
    state ``MN``, years 2008 to 2011).  Two link rules drive it:

    * links matched in the third column of the result table are fetched and
      handed to :meth:`parse_formd`;
    * the ``[NEXT]`` link in the first ``<center>`` element is followed so the
      same rules are applied to the next result page.

    :meth:`parse_formd` requests the document(s) linked from the page it
    receives, and :meth:`parse_xml_document` turns each XML response into a
    ``SecFormD`` item.
    """

    name = "DFORM"
    # Scrapy's offsite filtering reads ``allowed_domains`` (plural) and expects
    # bare domain names rather than URLs.  The previous ``allowed_domain``
    # attribute holding "http://www.sec.gov" was never consulted.
    allowed_domains = ["sec.gov"]
    start_urls = [
        "http://www.sec.gov/cgi-bin/srch-edgar?text=form-type%3Dd+and+state%3DMN&first=2008&last=2011",
    ]
    rules = (
        # Links in the result table: fetch them and pass the response to
        # parse_formd.  With a callback and no explicit ``follow``, the link
        # rules are not applied again to those pages.
        Rule(
            SgmlLinkExtractor(restrict_xpaths=('/html/body/div/table/tr/td[3]/a[2]',)),
            callback='parse_formd',
        ),
        # Next pages to scrape:
        # fetch the link with text '[NEXT]' in the first <center> tag (above the main <table>)
        # center[2] would select the [NEXT] link below the table
        Rule(
            SgmlLinkExtractor(restrict_xpaths=('/html/body/div/center[1]/a[contains(., "[NEXT]")]',)),
            follow=True,
        ),
    )

    def parse_formd(self, response):
        """Yield a request for every document linked from the ``formDiv`` table.

        :param response: HTML response of a page matched by the first rule.
        :returns: generator of ``Request`` objects whose callback is
            :meth:`parse_xml_document`; hrefs are resolved against
            ``response.url``.
        """
        hxs = HtmlXPathSelector(response)
        hrefs = hxs.select('//*[@id="formDiv"]/div/table/tr[3]/td[3]/a/@href').extract()
        for href in hrefs:
            yield Request(url=urlparse.urljoin(response.url, href), callback=self.parse_xml_document)

    def parse_xml_document(self, response):
        """Build a ``SecFormD`` item from an XML response.

        :param response: response expected to contain XML with a
            ``primaryIssuer/entityName`` node.
        :returns: a ``SecFormD`` with ``company`` set to the first matching
            text node, or ``None`` (after logging a warning) when the node is
            absent, instead of failing with ``IndexError``.
        """
        # Function-scope import keeps this block self-contained.
        import logging

        xxs = XmlXPathSelector(response)
        names = xxs.select('./primaryIssuer/entityName/text()').extract()
        if not names:
            self.log("No primaryIssuer/entityName found in %s" % response.url,
                     level=logging.WARNING)
            return None

        item = SecFormD()
        item["company"] = names[0]
        # XPaths to fix
        #item['company'] = site.select('//*[@id="collapsible1"]/div[1]/div[2]/div[2]/span[2]/text()').extract()
        #item['filling_date'] = site.select('//*[@id="collapsible40"]/div[1]/div[2]/div[5]/span[2]/text()').extract()
        #item['types_of_securities'] = site.select('//*[@id="collapsible37"]/div[1]/div[2]/div[1]/span[2]/text()').extract()
        #item['offering_amount'] = site.select('//*[@id="collapsible39"]/div[1]/div[2]/div[1]/span[2]/text()').extract()
        #item['sold_amount'] = site.select('//*[@id="collapsible39"]/div[1]/div[2]/div[2]/span[2]/text()').extract()
        #item['remaining'] = site.select('//*[@id="collapsible39"]/div[1]/div[2]/div[3]/span[2]/text()').extract()
        #item['investors_accredited'] = site.select('//*[@id="collapsible40"]/div[1]/div[2]/div[2]/span[2]/text()').extract()
        #item['investors_non_accredited'] = site.select('//*[@id="collapsible40"]/div[1]/div[2]/div[1]/span[2]/text()').extract()

        return item
	from scrapy.contrib.spiders import CrawlSpider, Rule
	from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
	from scrapy.selector import HtmlXPathSelector
	from scrapy.selector import XmlXPathSelector

	from scrapy.http import Request
	import urlparse

	#from DFORM.items import SecFormD
	# item defined here but ideally should be imported with the line above
	from scrapy.item import Item, Field

	class SecFormD(Item):
	    """Item populated by ``SecDform.parse_xml_document`` from one XML response."""

	    # Text of the ``primaryIssuer/entityName`` node of the parsed XML document.
	    company = Field()


	class SecDform(CrawlSpider):
	    """Crawl an SEC EDGAR search listing and emit one item per XML document.

	    The crawl starts from a fixed EDGAR search URL (query: form type ``d``,
	    state ``MN``, years 2008 to 2011).  Two link rules drive it:

	    * links matched in the third column of the result table are fetched and
	      handed to :meth:`parse_formd`;
	    * the ``[NEXT]`` link in the first ``<center>`` element is followed so the
	      same rules are applied to the next result page.

	    :meth:`parse_formd` requests the document(s) linked from the page it
	    receives, and :meth:`parse_xml_document` turns each XML response into a
	    ``SecFormD`` item.
	    """

	    name = "DFORM"
	    # Scrapy's offsite filtering reads ``allowed_domains`` (plural) and expects
	    # bare domain names rather than URLs.  The previous ``allowed_domain``
	    # attribute holding "http://www.sec.gov" was never consulted.
	    allowed_domains = ["sec.gov"]
	    start_urls = [
	        "http://www.sec.gov/cgi-bin/srch-edgar?text=form-type%3Dd+and+state%3DMN&first=2008&last=2011",
	    ]
	    rules = (
	        # Links in the result table: fetch them and pass the response to
	        # parse_formd.  With a callback and no explicit ``follow``, the link
	        # rules are not applied again to those pages.
	        Rule(
	            SgmlLinkExtractor(restrict_xpaths=('/html/body/div/table/tr/td[3]/a[2]',)),
	            callback='parse_formd',
	        ),
	        # Next pages to scrape:
	        # fetch the link with text '[NEXT]' in the first <center> tag (above the main <table>)
	        # center[2] would select the [NEXT] link below the table
	        Rule(
	            SgmlLinkExtractor(restrict_xpaths=('/html/body/div/center[1]/a[contains(., "[NEXT]")]',)),
	            follow=True,
	        ),
	    )

	    def parse_formd(self, response):
	        """Yield a request for every document linked from the ``formDiv`` table.

	        :param response: HTML response of a page matched by the first rule.
	        :returns: generator of ``Request`` objects whose callback is
	            :meth:`parse_xml_document`; hrefs are resolved against
	            ``response.url``.
	        """
	        hxs = HtmlXPathSelector(response)
	        hrefs = hxs.select('//*[@id="formDiv"]/div/table/tr[3]/td[3]/a/@href').extract()
	        for href in hrefs:
	            yield Request(url=urlparse.urljoin(response.url, href), callback=self.parse_xml_document)

	    def parse_xml_document(self, response):
	        """Build a ``SecFormD`` item from an XML response.

	        :param response: response expected to contain XML with a
	            ``primaryIssuer/entityName`` node.
	        :returns: a ``SecFormD`` with ``company`` set to the first matching
	            text node, or ``None`` (after logging a warning) when the node is
	            absent, instead of failing with ``IndexError``.
	        """
	        # Function-scope import keeps this block self-contained.
	        import logging

	        xxs = XmlXPathSelector(response)
	        names = xxs.select('./primaryIssuer/entityName/text()').extract()
	        if not names:
	            self.log("No primaryIssuer/entityName found in %s" % response.url,
	                     level=logging.WARNING)
	            return None

	        item = SecFormD()
	        item["company"] = names[0]
	        # XPaths to fix
	        #item['company'] = site.select('//*[@id="collapsible1"]/div[1]/div[2]/div[2]/span[2]/text()').extract()
	        #item['filling_date'] = site.select('//*[@id="collapsible40"]/div[1]/div[2]/div[5]/span[2]/text()').extract()
	        #item['types_of_securities'] = site.select('//*[@id="collapsible37"]/div[1]/div[2]/div[1]/span[2]/text()').extract()
	        #item['offering_amount'] = site.select('//*[@id="collapsible39"]/div[1]/div[2]/div[1]/span[2]/text()').extract()
	        #item['sold_amount'] = site.select('//*[@id="collapsible39"]/div[1]/div[2]/div[2]/span[2]/text()').extract()
	        #item['remaining'] = site.select('//*[@id="collapsible39"]/div[1]/div[2]/div[3]/span[2]/text()').extract()
	        #item['investors_accredited'] = site.select('//*[@id="collapsible40"]/div[1]/div[2]/div[2]/span[2]/text()').extract()
	        #item['investors_non_accredited'] = site.select('//*[@id="collapsible40"]/div[1]/div[2]/div[1]/span[2]/text()').extract()

	        return item