Skip to content

Instantly share code, notes, and snippets.

@t0mst0ne
Last active August 29, 2015 14:16
Show Gist options
  • Save t0mst0ne/18438a0cf7d9690e0852 to your computer and use it in GitHub Desktop.
Save t0mst0ne/18438a0cf7d9690e0852 to your computer and use it in GitHub Desktop.
Get cwb.gov.tw data
#!/usr/bin/env python
#coding:UTF-8
import pandas as pd
import scrapy
import requests
from scrapy.selector import Selector
from datetime import date
# Pull the list of observation stations from the CWB "all stations" page.
all_data = 'http://www.cwb.gov.tw/V7/observe/real/ALL.htm'
# Seconds to wait on each HTTP request; without a timeout a stalled server
# would block the script forever.
REQUEST_TIMEOUT = 30
r = requests.get(all_data, timeout=REQUEST_TIMEOUT)
r.encoding = 'utf-8'
sel = Selector(text=r.text)

# The station links look like <a href=\'C0F99.htm\'>NAME</a> in the page
# source.  Read the href and the label from the SAME anchor so that names and
# URLs can never drift out of step (extracting them in two independent passes
# mis-pairs them as soon as one anchor lacks text or has several text nodes).
# NOTE(review): the previous predicate contains(@href, "\") was parsed by
# Python as contains(@href, ""), which matches every anchor; '//a[@href]'
# selects the same set of hrefs.
URL = []
NAME = []
for anchor in sel.xpath('//a[@href]'):
    hrefs = anchor.xpath('@href').extract()
    texts = anchor.xpath('text()').extract()
    if not hrefs or not texts:
        continue
    # Drop the escaped-quote residue (\' or \") surrounding the file name so
    # it is not sent to the server as part of the URL.
    href = hrefs[0].strip().strip('\\\'"')
    if not href:
        continue
    URL.append(href)
    NAME.append(texts[0])
Observer = dict(zip(NAME, URL))

# Download the 24-hour observation table of every station.
frames = []
for k, v in Observer.items():
    print(k)
    url = 'http://www.cwb.gov.tw/V7/observe/24real/Data/' + v
    html = requests.get(url, timeout=REQUEST_TIMEOUT)
    html.encoding = 'utf-8'
    # The first table on the page holds the observations; its first row is
    # used as the header.
    df = pd.read_html(html.text, skiprows=0, header=0)[0]
    # Index the rows by the parsed observation-time column.
    df['date'] = pd.to_datetime(df[u'觀測時間'])
    df = df.set_index(['date'])
    df['location'] = k
    frames.append(df)

# Concatenate once instead of on every iteration (which re-copies all rows
# each time).  Reversing keeps the row order of the old "prepend newest
# station" accumulation.
df_today = pd.concat(frames[::-1]) if frames else pd.DataFrame()
df_today.to_csv('%s_cwb.csv' % (date.today()), encoding='utf-8')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment