Skip to content

Instantly share code, notes, and snippets.

@redapple
Last active December 20, 2015 00:29
Show Gist options
  • Save redapple/02a55aa6aaac0df2fb75 to your computer and use it in GitHub Desktop.
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from scrapy.selector import XmlXPathSelector
from scrapy.http import Request
import urlparse
#from DFORM.items import SecFormD
# item defined here but ideally should be imported with the line above
from scrapy.item import Item, Field
class SecFormD(Item):
    """Item holding data scraped from one SEC Form D filing.

    Only ``company`` is populated today; the remaining fields are declared
    so the (currently commented-out) assignments in
    ``SecDform.parse_xml_document`` work once their XPaths are fixed —
    assigning an undeclared key to a scrapy Item raises KeyError.
    """
    company = Field()
    filling_date = Field()
    types_of_securities = Field()
    offering_amount = Field()
    sold_amount = Field()
    remaining = Field()
    investors_accredited = Field()
    investors_non_accredited = Field()
class SecDform(CrawlSpider):
    """Crawl SEC EDGAR full-text search results for Form D filings
    (state=MN, 2008-2011), follow each result to its filing index page,
    then fetch and parse the filing's XML document into a SecFormD item.
    """
    name = "DFORM"
    # BUG FIX: the Scrapy attribute is `allowed_domains` (plural) and holds
    # bare domain names, not URLs. The original `allowed_domain = ["http://www.sec.gov"]`
    # was silently ignored, so the offsite filter never applied.
    allowed_domains = ["sec.gov"]
    start_urls = [
        "http://www.sec.gov/cgi-bin/srch-edgar?text=form-type%3Dd+and+state%3DMN&first=2008&last=2011"
    ]
    rules = (
        # Each search-result row: follow the filing link and parse it.
        Rule(
            SgmlLinkExtractor(restrict_xpaths=('/html/body/div/table/tr/td[3]/a[2]')),
            callback='parse_formd',
            # follow=True intentionally omitted: the link rules need not be
            # re-applied on the form-d detail pages themselves.
        ),
        # Pagination: follow the link containing '[NEXT]' in the first
        # <center> tag (above the main <table>); center[2] would match the
        # duplicate [NEXT] link below the table.
        Rule(
            SgmlLinkExtractor(restrict_xpaths=('/html/body/div/center[1]/a[contains(., "[NEXT]")]')),
            follow=True
        ),
    )

    def parse_formd(self, response):
        """Parse a filing index page and request each linked XML document.

        Yields one Request per document href, dispatched to
        :meth:`parse_xml_document`.
        """
        hxs = HtmlXPathSelector(response)
        sites = hxs.select('//*[@id="formDiv"]/div/table/tr[3]/td[3]/a/@href').extract()
        for site in sites:
            # hrefs are relative; resolve them against the index page URL.
            yield Request(url=urlparse.urljoin(response.url, site),
                          callback=self.parse_xml_document)

    def parse_xml_document(self, response):
        """Extract fields from a Form D XML document into a SecFormD item."""
        xxs = XmlXPathSelector(response)
        item = SecFormD()
        # ROBUSTNESS: guard against a missed XPath — the original
        # `extract()[0]` raised IndexError when the node was absent.
        names = xxs.select('./primaryIssuer/entityName/text()').extract()
        item["company"] = names[0] if names else None
        # TODO: XPaths below target the HTML rendering, not this XML document,
        # and still need fixing before they can be enabled:
        # item['filling_date'] = site.select('//*[@id="collapsible40"]/div[1]/div[2]/div[5]/span[2]/text()').extract()
        # item['types_of_securities'] = site.select('//*[@id="collapsible37"]/div[1]/div[2]/div[1]/span[2]/text()').extract()
        # item['offering_amount'] = site.select('//*[@id="collapsible39"]/div[1]/div[2]/div[1]/span[2]/text()').extract()
        # item['sold_amount'] = site.select('//*[@id="collapsible39"]/div[1]/div[2]/div[2]/span[2]/text()').extract()
        # item['remaining'] = site.select('//*[@id="collapsible39"]/div[1]/div[2]/div[3]/span[2]/text()').extract()
        # item['investors_accredited'] = site.select('//*[@id="collapsible40"]/div[1]/div[2]/div[2]/span[2]/text()').extract()
        # item['investors_non_accredited'] = site.select('//*[@id="collapsible40"]/div[1]/div[2]/div[1]/span[2]/text()').extract()
        return item
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment