Skip to content

Instantly share code, notes, and snippets.

@hgbrian
Created August 15, 2014 23:30
Show Gist options
  • Save hgbrian/2cae6189f11375bc1080 to your computer and use it in GitHub Desktop.
Save hgbrian/2cae6189f11375bc1080 to your computer and use it in GitHub Desktop.
scrapy for iGEM
# -*- coding: utf-8 -*-
# Run with: scrapy crawl test1 -o test1.json
import scrapy
class IgemItem(scrapy.Item):
    """One team abstract scraped from an iGEM Jamboree abstracts page.

    Instances are created and filled in by ``Test1Spider.parse``; every
    field holds a plain string.
    """

    # Four characters taken from the response URL (the year-named subdomain
    # of the start URLs, e.g. "2008").
    year = scrapy.Field()

    # Text of the link inside the team's "mw-headline" span.
    title = scrapy.Field()

    # Italic text of the first following paragraph (2008 layout) or the
    # headline span's own text (later layouts).
    subtitle = scrapy.Field()

    # Text of the paragraph that follows the headline's parent element
    # (second paragraph for 2008, first paragraph otherwise).
    description = scrapy.Field()
class Test1Spider(scrapy.Spider):
    """Scrape team abstracts from the iGEM Jamboree pages listed below.

    Run with: ``scrapy crawl test1 -o test1.json``

    Each start URL is a single page listing every team's abstract for one
    year.  The markup differs between years, so ``parse`` chooses its
    XPath expressions based on the year found in the response URL.
    """

    name = "test1"
    # Domain restriction is left disabled.
    #allowed_domains = ["igem.org"]
    start_urls = (
        'http://2008.igem.org/Jamboree/Project_Abstract/Team_Abstracts',
        'http://2009.igem.org/Jamboree/Project_Abstract/Team_Abstracts',
        'http://2010.igem.org/Jamboree/Project_Abstract/Team_Abstracts',
        'http://2011.igem.org/Jamboree/Team_Abstracts',
        'http://2012.igem.org/Jamboree/Team_Abstracts',
        'http://2013.igem.org/Jamboree/Team_Abstracts',
    )

    # Headline selectors: most years nest the headline span inside an <h4>;
    # 2009 and 2010 are matched on the span alone.
    _HEADLINE_IN_H4 = '//h4/span[contains(@class, "mw-headline")]'
    _HEADLINE_ANY = '//span[contains(@class, "mw-headline")]'

    @staticmethod
    def _clean_text(sel):
        """Collapse a selector's extracted strings into one tidy string.

        All extracted fragments are concatenated, newlines are removed, and
        leading/trailing colons, spaces and other whitespace are stripped.
        An empty selection yields ``""`` because joining an empty list gives
        the empty string, so no special case is needed.
        """
        joined = ''.join(sel.extract())
        return joined.replace("\n", "").strip(": ").strip()

    @staticmethod
    def _year_from_url(url):
        """Return the four characters that follow the URL scheme.

        For the start URLs this is the year-named subdomain ("2008" ...
        "2013").  Splitting on "://" instead of slicing at a fixed offset
        keeps the result correct for both ``http://`` and ``https://`` URLs.
        """
        return url.split("://", 1)[-1][:4]

    def parse(self, response):
        """Yield one ``IgemItem`` per team headline found in *response*.

        The page layout is selected from the year in the response URL:

        * 2008: subtitle is the italic text of the first paragraph after
          the heading, description is the text of the second paragraph.
        * every other year: subtitle is the headline span's own text and
          description is the text of the first paragraph after the heading;
          2009 and 2010 additionally match headline spans outside ``<h4>``.
        """
        year = self._year_from_url(response.url)

        if year == "2008":
            headline_xp = self._HEADLINE_IN_H4
            subtitle_xp = '../following-sibling::p[1]/i/text()'
            description_xp = '../following-sibling::p[2]/text()'
        else:
            if year in ("2009", "2010"):
                headline_xp = self._HEADLINE_ANY
            else:
                headline_xp = self._HEADLINE_IN_H4
            subtitle_xp = "text()"
            description_xp = '../following-sibling::p[1]/text()'

        for sel in response.xpath(headline_xp):
            item = IgemItem()
            item['year'] = year
            item['title'] = self._clean_text(sel.xpath("a/text()"))
            item['subtitle'] = self._clean_text(sel.xpath(subtitle_xp))
            item['description'] = self._clean_text(sel.xpath(description_xp))
            yield item
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment