hgbrian/gist:2cae6189f11375bc1080

## gistfile1.py
# -*- coding: utf-8 -*-
# Run with: scrapy crawl test1 -o test1.json
import scrapy

class IgemItem(scrapy.Item):
    """Record produced by the spider for one headline on a Team_Abstracts page.

    All four fields are filled with plain strings by ``Test1Spider.parse``.
    """
    # Four characters taken from the page URL (the year in the start URLs).
    year = scrapy.Field()
    # Text of the link found inside the headline span.
    title = scrapy.Field()
    # Either the headline span's own text or the italic text of the first
    # following paragraph, depending on the page layout.
    subtitle = scrapy.Field()
    # Text of a paragraph that follows the headline's parent element.
    description = scrapy.Field()

class Test1Spider(scrapy.Spider):
    """Scrape team abstracts from the igem.org Jamboree pages (2008-2013).

    Every ``mw-headline`` span matched on a start page becomes one
    ``IgemItem``.  Three page layouts are handled, selected by the year
    that appears in the response URL:

    * 2008      -- ``h4`` headlines; the subtitle is the italic text of the
      first following paragraph, the description is the second paragraph.
    * 2009/2010 -- any ``mw-headline`` span; the subtitle is the span's own
      text, the description is the first following paragraph.
    * otherwise -- same as 2009/2010 but restricted to ``h4`` headlines.
    """
    name = "test1"
    #allowed_domains = ["igem.org"]
    start_urls = (
        'http://2008.igem.org/Jamboree/Project_Abstract/Team_Abstracts',
        'http://2009.igem.org/Jamboree/Project_Abstract/Team_Abstracts',
        'http://2010.igem.org/Jamboree/Project_Abstract/Team_Abstracts',
        'http://2011.igem.org/Jamboree/Team_Abstracts',
        'http://2012.igem.org/Jamboree/Team_Abstracts',
        'http://2013.igem.org/Jamboree/Team_Abstracts',
    )

    def parse(self, response):
        """Yield one ``IgemItem`` per headline found in ``response``.

        Args:
            response: the downloaded abstracts page; its URL selects the
                layout and supplies the ``year`` field.

        Yields:
            IgemItem with ``year``, ``title``, ``subtitle`` and
            ``description`` set to cleaned strings (empty when the
            corresponding XPath matches nothing).
        """
        def _clean(sel):
            # Joining an empty extraction already gives "", so no explicit
            # empty check is needed.  Newlines are dropped, then ':' and
            # spaces are trimmed from both ends before a final whitespace
            # strip.
            return ''.join(sel.extract()).replace("\n", "").strip(": ").strip()

        # The year is the first four characters after the scheme separator.
        # Splitting on "://" instead of slicing at a fixed offset keeps this
        # correct for "https://" URLs as well as the "http://" start URLs.
        year = response.url.split("://", 1)[-1][:4]

        # Choose the XPath expressions for this page's layout.
        if "2008" in response.url:
            headline_xp = '//h4/span[contains(@class, "mw-headline")]'
            subtitle_xp = '../following-sibling::p[1]/i/text()'
            description_xp = '../following-sibling::p[2]/text()'
        else:
            if any(yr in response.url for yr in ("2009", "2010")):
                headline_xp = '//span[contains(@class, "mw-headline")]'
            else:
                headline_xp = '//h4/span[contains(@class, "mw-headline")]'
            subtitle_xp = "text()"
            description_xp = '../following-sibling::p[1]/text()'

        for sel in response.xpath(headline_xp):
            item = IgemItem()
            item['year'] = year
            item['title'] = _clean(sel.xpath("a/text()"))
            item['subtitle'] = _clean(sel.xpath(subtitle_xp))
            item['description'] = _clean(sel.xpath(description_xp))
            yield item
	# -*- coding: utf-8 -*-
	# Run with: scrapy crawl test1 -o test1.json
	import scrapy

	class IgemItem(scrapy.Item):
	    """Record holding the four string fields scraped for one headline."""
	    # Four characters taken from the page URL (the year in the start URLs).
	    year = scrapy.Field()
	    # Text of the link found inside the headline span.
	    title = scrapy.Field()
	    # Headline span text or italic text of the following paragraph,
	    # depending on the page layout.
	    subtitle = scrapy.Field()
	    # Text of a paragraph that follows the headline's parent element.
	    description = scrapy.Field()

	class Test1Spider(scrapy.Spider):
	    """Scrape team abstracts from the igem.org Jamboree pages (2008-2013).

	    Every ``mw-headline`` span matched on a start page becomes one
	    ``IgemItem``.  The layout (which XPath expressions are used) is
	    selected by the year that appears in the response URL.
	    """
	    name = "test1"
	    #allowed_domains = ["igem.org"]
	    start_urls = (
	        'http://2008.igem.org/Jamboree/Project_Abstract/Team_Abstracts',
	        'http://2009.igem.org/Jamboree/Project_Abstract/Team_Abstracts',
	        'http://2010.igem.org/Jamboree/Project_Abstract/Team_Abstracts',
	        'http://2011.igem.org/Jamboree/Team_Abstracts',
	        'http://2012.igem.org/Jamboree/Team_Abstracts',
	        'http://2013.igem.org/Jamboree/Team_Abstracts',
	    )

	    def parse(self, response):
	        """Yield one ``IgemItem`` per headline found in ``response``.

	        Args:
	            response: the downloaded abstracts page; its URL selects the
	                layout and supplies the ``year`` field.

	        Yields:
	            IgemItem with ``year``, ``title``, ``subtitle`` and
	            ``description`` set to cleaned strings (empty when the
	            corresponding XPath matches nothing).
	        """
	        def _clean(sel):
	            # Joining an empty extraction already gives "", so no explicit
	            # empty check is needed.  Newlines are dropped, then ':' and
	            # spaces are trimmed from both ends before a final strip.
	            return ''.join(sel.extract()).replace("\n", "").strip(": ").strip()

	        # The year is the first four characters after the scheme
	        # separator; splitting on "://" instead of slicing at a fixed
	        # offset keeps this correct for "https://" URLs too.
	        year = response.url.split("://", 1)[-1][:4]

	        # Choose the XPath expressions for this page's layout.
	        if "2008" in response.url:
	            headline_xp = '//h4/span[contains(@class, "mw-headline")]'
	            subtitle_xp = '../following-sibling::p[1]/i/text()'
	            description_xp = '../following-sibling::p[2]/text()'
	        else:
	            if any(yr in response.url for yr in ("2009", "2010")):
	                headline_xp = '//span[contains(@class, "mw-headline")]'
	            else:
	                headline_xp = '//h4/span[contains(@class, "mw-headline")]'
	            subtitle_xp = "text()"
	            description_xp = '../following-sibling::p[1]/text()'

	        for sel in response.xpath(headline_xp):
	            item = IgemItem()
	            item['year'] = year
	            item['title'] = _clean(sel.xpath("a/text()"))
	            item['subtitle'] = _clean(sel.xpath(subtitle_xp))
	            item['description'] = _clean(sel.xpath(description_xp))
	            yield item