Skip to content

Instantly share code, notes, and snippets.

@yingminc
Created September 26, 2017 06:05
Show Gist options
  • Save yingminc/2552f8e5df1878263a7ac7411ad048ae to your computer and use it in GitHub Desktop.
Save yingminc/2552f8e5df1878263a7ac7411ad048ae to your computer and use it in GitHub Desktop.
load weather
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# load weather data (US version)"
]
},
{
"cell_type": "code",
"execution_count": 184,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import pandas as pd\n",
"from datetime import datetime,timedelta\n",
"import re\n",
"import numpy as np"
]
},
{
"cell_type": "code",
"execution_count": 215,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"#set date range\n",
"d1 = datetime(2010,1,1)\n",
"d2 = datetime(2017,7,18)\n",
"\n",
"delta = d2-d1"
]
},
{
"cell_type": "code",
"execution_count": 216,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"span = []\n",
"for i in range(delta.days+1):\n",
" intdate = map(int, str(d1+timedelta(days=i)).split(' ')[0].split('-'))\n",
" span.append(intdate)"
]
},
{
"cell_type": "code",
"execution_count": 217,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"allhours=[]\n",
"for i in range(int(delta.total_seconds()//3600)+1):\n",
" intdate = (d1+timedelta(hours=i))\n",
" allhours.append(intdate)"
]
},
{
"cell_type": "code",
"execution_count": 218,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"41"
]
},
"execution_count": 218,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(span)"
]
},
{
"cell_type": "code",
"execution_count": 219,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[2017, 6, 8]\n",
"[2017, 6, 9]\n",
"[2017, 6, 10]\n",
"[2017, 6, 11]\n",
"[2017, 6, 12]\n",
"[2017, 6, 13]\n",
"[2017, 6, 14]\n",
"[2017, 6, 15]\n",
"[2017, 6, 16]\n",
"[2017, 6, 17]\n",
"[2017, 6, 18]\n",
"[2017, 6, 19]\n",
"[2017, 6, 20]\n",
"[2017, 6, 21]\n",
"[2017, 6, 22]\n",
"[2017, 6, 23]\n",
"[2017, 6, 24]\n",
"[2017, 6, 25]\n",
"[2017, 6, 26]\n",
"[2017, 6, 27]\n",
"[2017, 6, 28]\n",
"[2017, 6, 29]\n",
"[2017, 6, 30]\n",
"[2017, 7, 1]\n",
"[2017, 7, 2]\n",
"[2017, 7, 3]\n",
"[2017, 7, 4]\n",
"[2017, 7, 5]\n",
"[2017, 7, 6]\n",
"[2017, 7, 7]\n",
"[2017, 7, 8]\n",
"[2017, 7, 9]\n",
"[2017, 7, 10]\n",
"[2017, 7, 11]\n",
"[2017, 7, 12]\n",
"[2017, 7, 13]\n",
"[2017, 7, 14]\n",
"[2017, 7, 15]\n",
"[2017, 7, 16]\n",
"[2017, 7, 17]\n",
"[2017, 7, 18]\n"
]
}
],
"source": [
"#loop the dates\n",
"for ind, day in enumerate(span):\n",
" print day\n",
" \n",
" #load data from url\n",
" y,m,d = (day[0],day[1],day[2])\n",
" url = 'https://www.wunderground.com/history/airport/KBWI/{}/{}/{}/DailyHistory.html?req_city=Baltimore&req_state=MD&req_statename=&reqdb.zip=21240&reqdb.magic=7&reqdb.wmo=99999'.format(y,m,d)\n",
" w = pd.read_html(url)[4]\n",
" \n",
" #select target columns and unify names\n",
" cols = w.columns\n",
" w.rename(columns={cols[0]:'time'},inplace=True)\n",
" w = w[['time','Temp.','Dew Point', 'Humidity', 'Pressure','Visibility','Wind Dir','Wind Speed','Gust Speed', 'Precip','Events','Conditions']]\n",
" cols = w.columns\n",
" \n",
" #reform time format\n",
" w['datetime'] = [('-').join(map(str,(y,m,d)))+' '+'00'+':'+t.split(':')[1].split(' ')[0] if('12'in t.split(':')[0])and ('AM'in t)\n",
" else ('-').join(map(str,(y,m,d)))+' '+t.split(' ')[0] if 'AM' in t \n",
" else ('-').join(map(str,(y,m,d)))+' '+'12'+':'+t.split(':')[1].split(' ')[0] if('12'in t.split(':')[0])and ('PM'in t)\n",
" else ('-').join(map(str,(y,m,d)))+' '+str(int(t.split(':')[0])+12)+':'+t.split(':')[1].split(' ')[0]\n",
" for t in w[cols[0]]]\n",
" \n",
" #remove the units \n",
" w.fillna(value='-',inplace=True)\n",
" w[cols[7]]=[v.split(' ')[0] for v in w[cols[7]]]\n",
" w[cols[8]]=[v.split(' ')[0] for v in w[cols[8]]]\n",
" \n",
" for i in [1,2,3,4,5,7,8,9]:\n",
" w[cols[i]]=[re.sub('[^-0-9.]','',v) for v in w[cols[i]]]\n",
" \n",
" #reset column names\n",
" w.columns = ['time','temp.(c)', 'dew_point(c)','humidity(%)', 'pressure(hpa)','visibiliy(km)', \n",
" 'wind_dir', 'wind_speed(km/h)','gust_speed(km/h)','precip(mm)','events','conditon','datetime']\n",
" \n",
" #set hourly time\n",
" w['datetime'] = pd.to_datetime(w['datetime'])\n",
" w['datetime_hourly'] = [t+timedelta(minutes=(60-t.minute)) if (60-t.minute)<30 \n",
" else t-timedelta(minutes=t.minute) for t in w['datetime']]\n",
" w['to_whole_point(min)'] = [(60-t.minute) if (60-t.minute)<30 \n",
" else t.minute for t in w['datetime']]\n",
" \n",
" #remove the duplicates\n",
" w.sort_values(['datetime_hourly','to_whole_point(min)'],inplace=True)\n",
" w.drop_duplicates(subset='datetime_hourly',keep='first',inplace=True)\n",
" \n",
" #save file\n",
" with open ('../weather_bwi_t.csv','a') as f:\n",
" #if ind ==0:\n",
" #w.to_csv(f, index= False, header = True,encoding = 'utf-8')\n",
" #else:\n",
" w.to_csv(f, index= False, header = None,encoding = 'utf-8')\n",
"\n",
"#total = pd.concat(frames)\n",
"#total['location'] = \n",
"\n",
"#total.to_csv('../total_weather_us.csv', index= False, header = True, encoding = 'utf-8')\n"
]
},
{
"cell_type": "code",
"execution_count": 221,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
" time temp.(c) dew_point(c) humidity(%) pressure(hpa) \\\n",
"0 12:54 AM 6.1 13.3 57.0 1020.6 \n",
"1 1:54 AM 6.7 12.8 62.0 1021.0 \n",
"2 2:54 AM 8.9 12.8 74.0 1021.0 \n",
"3 3:54 AM 8.3 12.8 71.0 1020.9 \n",
"4 4:54 AM 7.8 12.8 68.0 1021.0 \n",
"\n",
" visibiliy(km) wind_dir wind_speed(km/h) gust_speed(km/h) precip(mm) \\\n",
"0 16.1 NW 7.4 NaN NaN \n",
"1 16.1 WNW 11.1 NaN NaN \n",
"2 16.1 West 9.3 NaN NaN \n",
"3 16.1 WNW 11.1 NaN NaN \n",
"4 16.1 WNW 14.8 NaN NaN \n",
"\n",
" events conditon datetime datetime_hourly \\\n",
"0 - Partly Cloudy 2002-01-01 00:54:00 2002-01-01 01:00:00 \n",
"1 - Mostly Cloudy 2002-01-01 01:54:00 2002-01-01 02:00:00 \n",
"2 - Mostly Cloudy 2002-01-01 02:54:00 2002-01-01 03:00:00 \n",
"3 - Clear 2002-01-01 03:54:00 2002-01-01 04:00:00 \n",
"4 - Clear 2002-01-01 04:54:00 2002-01-01 05:00:00 \n",
"\n",
" to_whole_point(min) \n",
"0 6 \n",
"1 6 \n",
"2 6 \n",
"3 6 \n",
"4 6 \n"
]
}
],
"source": [
"#recheck the cross-date duplicates\n",
"d = pd.read_csv('../weather_bwi_t.csv',header=0)\n",
"print d.head()\n",
"d.drop_duplicates(subset='datetime_hourly',keep='first',inplace=True)\n",
"d.columns = ['time','temp.(c)', 'dew_point(c)','humidity(%)', 'pressure(hpa)','visibiliy(km)', \n",
" 'wind_dir', 'wind_speed(km/h)','gust_speed(km/h)','precip(mm)','events',\n",
" 'conditon','datetime','datetime_hourly','to_whole_point(min)']\n",
"d['temp.(c)']=[np.nan if t == '-' else t for t in d['temp.(c)']]\n",
"d['temp.(c)'].astype(float)\n",
"d.to_csv('../weather_bwi_t.csv',header=True, index=False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.13"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment