#1 Scrapy basics
--items.py: define the fields for the item. This time, a title and a link.
from scrapy.item import Item, Field

class CraigslistSampleItem(Item):
    # define the fields for your item here, like:
    # name = Field()
    title = Field()
    link = Field()
--test.py: set the spider name, the allowed domains, the start URLs, and the parse logic, and import the item fields from items.py.
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from tutorial.items import CraigslistSampleItem

class Myspider(BaseSpider):
    name = 'craig'
    allowed_domains = ["craigslist.org"]
    start_urls = ["http://sfbay.craigslist.org/sfc/npo/"]

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        titles = hxs.select("//p")
        items = []
        # use a distinct loop variable instead of shadowing "titles"
        for title in titles:
            item = CraigslistSampleItem()
            item["title"] = title.select("a/text()").extract()
            item["link"] = title.select("a/@href").extract()
            items.append(item)
        return items
--Move to the folder that contains scrapy.cfg, then run the crawl and export the results to a CSV file:
scrapy crawl craig -o items.csv -t csv
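For reference, BaseSpider and HtmlXPathSelector are deprecated in current Scrapy releases (1.x and later); the same spider would look roughly like the sketch below, using scrapy.Spider and response.xpath (untested against the live site):

import scrapy
from tutorial.items import CraigslistSampleItem

class CraigSpider(scrapy.Spider):
    name = "craig"
    allowed_domains = ["craigslist.org"]
    start_urls = ["http://sfbay.craigslist.org/sfc/npo/"]

    def parse(self, response):
        # response.xpath replaces HtmlXPathSelector; yield items one at a time
        for p in response.xpath("//p"):
            item = CraigslistSampleItem()
            item["title"] = p.xpath("a/text()").extract()
            item["link"] = p.xpath("a/@href").extract()
            yield item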
#2 Basic title scraping without any scraping libraries (standard library only)
import urllib
import re

urls = ["http://google.com", "http://nytimes.com", "http://CNN.com"]
# Compile the regex once so it can be passed as an argument later.
regex = '<title>(.+?)</title>'
pattern = re.compile(regex)

for url in urls:
    htmlfile = urllib.urlopen(url)
    htmltext = htmlfile.read()
    titles = re.findall(pattern, htmltext)
    print titles
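The same idea in Python 3, as a sketch: urllib was split into submodules, so urlopen lives in urllib.request, and it returns bytes, so the pattern must also be a bytes pattern:

import re
import urllib.request

urls = ["http://google.com", "http://nytimes.com", "http://CNN.com"]
pattern = re.compile(b'<title>(.+?)</title>')

for url in urls:
    htmltext = urllib.request.urlopen(url).read()  # bytes, not str
    print(pattern.findall(htmltext))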
#3 Scraping the locations of listed companies from Yahoo! Finance
--geopick.py: build the start URLs from a CSV list of company codes, then scrape each company's profile page.
# -*- coding: utf-8 -*-
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from geocompany.items import GeocompanyItem
import csv

# Read the company codes from the first row of the CSV file.
company_index = csv.reader(open("companyindex2bu1_563.csv", "rb"))
company_index = company_index.next()
company_count = 0

class Myspider(BaseSpider):
    name = 'company'
    allowed_domains = ["profile.yahoo.co.jp"]
    # build one profile-page URL per company code
    start_urls = []
    for i in company_index:
        start_urls.append("http://profile.yahoo.co.jp/fundamental/" + i)

    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        # address cell: the first table row following the third "yjMt" heading
        titles = hxs.select('//*[@class="yjMt"][3]/following::tr[1]/td[2]/text()').extract()
        names = hxs.select('//*/h1[1]/strong/text()').extract()
        item = GeocompanyItem()
        item["place"] = titles
        item["company_name"] = names
        item["url"] = response.url
        # Note: Scrapy fetches pages concurrently, so this counter records the
        # order in which responses arrive, not the order of start_urls.
        global company_count
        item["index"] = company_count
        company_count += 1
        return item
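The gist omits the items.py for this project; a minimal sketch, assuming only the four fields the spider above actually assigns:

from scrapy.item import Item, Field

class GeocompanyItem(Item):
    place = Field()         # head-office address text
    company_name = Field()  # company name from the page heading
    url = Field()           # profile page URL
    index = Field()         # running counter (response-arrival order)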