yingminc/load_weather_us.ipynb

## load_weather_us.ipynb
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# load weather data (US version)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 184,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "from datetime import datetime,timedelta\n",
    "import re\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 215,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "#set date range\n",
    "d1 = datetime(2010,1,1)\n",
    "d2 = datetime(2017,7,18)\n",
    "\n",
    "delta = d2-d1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 216,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "span = []\n",
    "for i in range(delta.days+1):\n",
    "     intdate = map(int, str(d1+timedelta(days=i)).split(' ')[0].split('-'))\n",
    "     span.append(intdate)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 217,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "allhours=[]\n",
    "for i in range(int(delta.total_seconds()//3600)+1):\n",
    "     intdate = (d1+timedelta(hours=i))\n",
    "     allhours.append(intdate)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 218,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "41"
      ]
     },
     "execution_count": 218,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(span)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 219,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[2017, 6, 8]\n",
      "[2017, 6, 9]\n",
      "[2017, 6, 10]\n",
      "[2017, 6, 11]\n",
      "[2017, 6, 12]\n",
      "[2017, 6, 13]\n",
      "[2017, 6, 14]\n",
      "[2017, 6, 15]\n",
      "[2017, 6, 16]\n",
      "[2017, 6, 17]\n",
      "[2017, 6, 18]\n",
      "[2017, 6, 19]\n",
      "[2017, 6, 20]\n",
      "[2017, 6, 21]\n",
      "[2017, 6, 22]\n",
      "[2017, 6, 23]\n",
      "[2017, 6, 24]\n",
      "[2017, 6, 25]\n",
      "[2017, 6, 26]\n",
      "[2017, 6, 27]\n",
      "[2017, 6, 28]\n",
      "[2017, 6, 29]\n",
      "[2017, 6, 30]\n",
      "[2017, 7, 1]\n",
      "[2017, 7, 2]\n",
      "[2017, 7, 3]\n",
      "[2017, 7, 4]\n",
      "[2017, 7, 5]\n",
      "[2017, 7, 6]\n",
      "[2017, 7, 7]\n",
      "[2017, 7, 8]\n",
      "[2017, 7, 9]\n",
      "[2017, 7, 10]\n",
      "[2017, 7, 11]\n",
      "[2017, 7, 12]\n",
      "[2017, 7, 13]\n",
      "[2017, 7, 14]\n",
      "[2017, 7, 15]\n",
      "[2017, 7, 16]\n",
      "[2017, 7, 17]\n",
      "[2017, 7, 18]\n"
     ]
    }
   ],
   "source": [
    "#loop the dates\n",
    "for ind, day in enumerate(span):\n",
    "    print day\n",
    "    \n",
    "    #load data from url\n",
    "    y,m,d = (day[0],day[1],day[2])\n",
    "    url = 'https://www.wunderground.com/history/airport/KBWI/{}/{}/{}/DailyHistory.html?req_city=Baltimore&req_state=MD&req_statename=&reqdb.zip=21240&reqdb.magic=7&reqdb.wmo=99999'.format(y,m,d)\n",
    "    w = pd.read_html(url)[4]\n",
    "    \n",
    "    #select target columns and unify names\n",
    "    cols = w.columns\n",
    "    w.rename(columns={cols[0]:'time'},inplace=True)\n",
    "    w = w[['time','Temp.','Dew Point', 'Humidity', 'Pressure','Visibility','Wind Dir','Wind Speed','Gust Speed', 'Precip','Events','Conditions']]\n",
    "    cols = w.columns\n",
    "    \n",
    "    #reform time format\n",
    "    w['datetime'] = [('-').join(map(str,(y,m,d)))+' '+'00'+':'+t.split(':')[1].split(' ')[0] if('12'in t.split(':')[0])and ('AM'in t)\n",
    "             else ('-').join(map(str,(y,m,d)))+' '+t.split(' ')[0] if 'AM' in t \n",
    "             else ('-').join(map(str,(y,m,d)))+' '+'12'+':'+t.split(':')[1].split(' ')[0] if('12'in t.split(':')[0])and ('PM'in t)\n",
    "             else ('-').join(map(str,(y,m,d)))+' '+str(int(t.split(':')[0])+12)+':'+t.split(':')[1].split(' ')[0]\n",
    "             for t in w[cols[0]]]\n",
    "    \n",
    "    #remove the units \n",
    "    w.fillna(value='-',inplace=True)\n",
    "    w[cols[7]]=[v.split(' ')[0] for v in w[cols[7]]]\n",
    "    w[cols[8]]=[v.split(' ')[0] for v in w[cols[8]]]\n",
    "    \n",
    "    for i in [1,2,3,4,5,7,8,9]:\n",
    "        w[cols[i]]=[re.sub('[^-0-9.]','',v) for v in w[cols[i]]]\n",
    "        \n",
    "    #reset column names\n",
    "    w.columns = ['time','temp.(c)', 'dew_point(c)','humidity(%)', 'pressure(hpa)','visibiliy(km)', \n",
    "             'wind_dir', 'wind_speed(km/h)','gust_speed(km/h)','precip(mm)','events','conditon','datetime']\n",
    "    \n",
    "    #set hourly time\n",
    "    w['datetime'] = pd.to_datetime(w['datetime'])\n",
    "    w['datetime_hourly'] = [t+timedelta(minutes=(60-t.minute)) if (60-t.minute)<30 \n",
    "                            else t-timedelta(minutes=t.minute) for t in w['datetime']]\n",
    "    w['to_whole_point(min)'] = [(60-t.minute) if (60-t.minute)<30 \n",
    "                            else t.minute for t in w['datetime']]\n",
    "    \n",
    "    #remove the duplicates\n",
    "    w.sort_values(['datetime_hourly','to_whole_point(min)'],inplace=True)\n",
    "    w.drop_duplicates(subset='datetime_hourly',keep='first',inplace=True)\n",
    "    \n",
    "    #save file\n",
    "    with open ('../weather_bwi_t.csv','a') as f:\n",
    "        #if ind ==0:\n",
    "            #w.to_csv(f, index= False, header = True,encoding = 'utf-8')\n",
    "        #else:\n",
    "        w.to_csv(f, index= False, header = None,encoding = 'utf-8')\n",
    "\n",
    "#total = pd.concat(frames)\n",
    "#total['location'] = \n",
    "\n",
    "#total.to_csv('../total_weather_us.csv', index= False, header = True, encoding = 'utf-8')\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 221,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "       time  temp.(c)  dew_point(c)  humidity(%)  pressure(hpa)  \\\n",
      "0  12:54 AM       6.1          13.3         57.0         1020.6   \n",
      "1   1:54 AM       6.7          12.8         62.0         1021.0   \n",
      "2   2:54 AM       8.9          12.8         74.0         1021.0   \n",
      "3   3:54 AM       8.3          12.8         71.0         1020.9   \n",
      "4   4:54 AM       7.8          12.8         68.0         1021.0   \n",
      "\n",
      "   visibiliy(km) wind_dir  wind_speed(km/h)  gust_speed(km/h)  precip(mm)  \\\n",
      "0           16.1       NW               7.4               NaN         NaN   \n",
      "1           16.1      WNW              11.1               NaN         NaN   \n",
      "2           16.1     West               9.3               NaN         NaN   \n",
      "3           16.1      WNW              11.1               NaN         NaN   \n",
      "4           16.1      WNW              14.8               NaN         NaN   \n",
      "\n",
      "  events       conditon             datetime      datetime_hourly  \\\n",
      "0      -  Partly Cloudy  2002-01-01 00:54:00  2002-01-01 01:00:00   \n",
      "1      -  Mostly Cloudy  2002-01-01 01:54:00  2002-01-01 02:00:00   \n",
      "2      -  Mostly Cloudy  2002-01-01 02:54:00  2002-01-01 03:00:00   \n",
      "3      -          Clear  2002-01-01 03:54:00  2002-01-01 04:00:00   \n",
      "4      -          Clear  2002-01-01 04:54:00  2002-01-01 05:00:00   \n",
      "\n",
      "   to_whole_point(min)  \n",
      "0                    6  \n",
      "1                    6  \n",
      "2                    6  \n",
      "3                    6  \n",
      "4                    6  \n"
     ]
    }
   ],
   "source": [
    "#recheck the cross-date duplicates\n",
    "d = pd.read_csv('../weather_bwi_t.csv',header=0)\n",
    "print d.head()\n",
    "d.drop_duplicates(subset='datetime_hourly',keep='first',inplace=True)\n",
    "d.columns = ['time','temp.(c)', 'dew_point(c)','humidity(%)', 'pressure(hpa)','visibiliy(km)', \n",
    "             'wind_dir', 'wind_speed(km/h)','gust_speed(km/h)','precip(mm)','events',\n",
    "             'conditon','datetime','datetime_hourly','to_whole_point(min)']\n",
    "d['temp.(c)']=[np.nan if t == '-' else t for t in d['temp.(c)']]\n",
    "d['temp.(c)'].astype(float)\n",
    "d.to_csv('../weather_bwi_t.csv',header=True, index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
	{
	"cells": [
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"# load weather data (US version)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 184,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"import pandas as pd\n",
	"from datetime import datetime,timedelta\n",
	"import re\n",
	"import numpy as np"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 215,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"#set date range\n",
	"d1 = datetime(2010,1,1)\n",
	"d2 = datetime(2017,7,18)\n",
	"\n",
	"delta = d2-d1"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 216,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"span = []\n",
	"for i in range(delta.days+1):\n",
	" intdate = map(int, str(d1+timedelta(days=i)).split(' ')[0].split('-'))\n",
	" span.append(intdate)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 217,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": [
	"allhours=[]\n",
	"for i in range(int(delta.total_seconds()//3600)+1):\n",
	" intdate = (d1+timedelta(hours=i))\n",
	" allhours.append(intdate)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 218,
	"metadata": {},
	"outputs": [
	{
	"data": {
	"text/plain": [
	"41"
	]
	},
	"execution_count": 218,
	"metadata": {},
	"output_type": "execute_result"
	}
	],
	"source": [
	"len(span)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 219,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"[2017, 6, 8]\n",
	"[2017, 6, 9]\n",
	"[2017, 6, 10]\n",
	"[2017, 6, 11]\n",
	"[2017, 6, 12]\n",
	"[2017, 6, 13]\n",
	"[2017, 6, 14]\n",
	"[2017, 6, 15]\n",
	"[2017, 6, 16]\n",
	"[2017, 6, 17]\n",
	"[2017, 6, 18]\n",
	"[2017, 6, 19]\n",
	"[2017, 6, 20]\n",
	"[2017, 6, 21]\n",
	"[2017, 6, 22]\n",
	"[2017, 6, 23]\n",
	"[2017, 6, 24]\n",
	"[2017, 6, 25]\n",
	"[2017, 6, 26]\n",
	"[2017, 6, 27]\n",
	"[2017, 6, 28]\n",
	"[2017, 6, 29]\n",
	"[2017, 6, 30]\n",
	"[2017, 7, 1]\n",
	"[2017, 7, 2]\n",
	"[2017, 7, 3]\n",
	"[2017, 7, 4]\n",
	"[2017, 7, 5]\n",
	"[2017, 7, 6]\n",
	"[2017, 7, 7]\n",
	"[2017, 7, 8]\n",
	"[2017, 7, 9]\n",
	"[2017, 7, 10]\n",
	"[2017, 7, 11]\n",
	"[2017, 7, 12]\n",
	"[2017, 7, 13]\n",
	"[2017, 7, 14]\n",
	"[2017, 7, 15]\n",
	"[2017, 7, 16]\n",
	"[2017, 7, 17]\n",
	"[2017, 7, 18]\n"
	]
	}
	],
	"source": [
	"#loop the dates\n",
	"for ind, day in enumerate(span):\n",
	" print day\n",
	" \n",
	" #load data from url\n",
	" y,m,d = (day[0],day[1],day[2])\n",
	" url = 'https://www.wunderground.com/history/airport/KBWI/{}/{}/{}/DailyHistory.html?req_city=Baltimore&req_state=MD&req_statename=&reqdb.zip=21240&reqdb.magic=7&reqdb.wmo=99999'.format(y,m,d)\n",
	" w = pd.read_html(url)[4]\n",
	" \n",
	" #select target columns and unify names\n",
	" cols = w.columns\n",
	" w.rename(columns={cols[0]:'time'},inplace=True)\n",
	" w = w[['time','Temp.','Dew Point', 'Humidity', 'Pressure','Visibility','Wind Dir','Wind Speed','Gust Speed', 'Precip','Events','Conditions']]\n",
	" cols = w.columns\n",
	" \n",
	" #reform time format\n",
	" w['datetime'] = [('-').join(map(str,(y,m,d)))+' '+'00'+':'+t.split(':')[1].split(' ')[0] if('12'in t.split(':')[0])and ('AM'in t)\n",
	" else ('-').join(map(str,(y,m,d)))+' '+t.split(' ')[0] if 'AM' in t \n",
	" else ('-').join(map(str,(y,m,d)))+' '+'12'+':'+t.split(':')[1].split(' ')[0] if('12'in t.split(':')[0])and ('PM'in t)\n",
	" else ('-').join(map(str,(y,m,d)))+' '+str(int(t.split(':')[0])+12)+':'+t.split(':')[1].split(' ')[0]\n",
	" for t in w[cols[0]]]\n",
	" \n",
	" #remove the units \n",
	" w.fillna(value='-',inplace=True)\n",
	" w[cols[7]]=[v.split(' ')[0] for v in w[cols[7]]]\n",
	" w[cols[8]]=[v.split(' ')[0] for v in w[cols[8]]]\n",
	" \n",
	" for i in [1,2,3,4,5,7,8,9]:\n",
	" w[cols[i]]=[re.sub('[^-0-9.]','',v) for v in w[cols[i]]]\n",
	" \n",
	" #reset column names\n",
	" w.columns = ['time','temp.(c)', 'dew_point(c)','humidity(%)', 'pressure(hpa)','visibiliy(km)', \n",
	" 'wind_dir', 'wind_speed(km/h)','gust_speed(km/h)','precip(mm)','events','conditon','datetime']\n",
	" \n",
	" #set hourly time\n",
	" w['datetime'] = pd.to_datetime(w['datetime'])\n",
	" w['datetime_hourly'] = [t+timedelta(minutes=(60-t.minute)) if (60-t.minute)<30 \n",
	" else t-timedelta(minutes=t.minute) for t in w['datetime']]\n",
	" w['to_whole_point(min)'] = [(60-t.minute) if (60-t.minute)<30 \n",
	" else t.minute for t in w['datetime']]\n",
	" \n",
	" #remove the duplicates\n",
	" w.sort_values(['datetime_hourly','to_whole_point(min)'],inplace=True)\n",
	" w.drop_duplicates(subset='datetime_hourly',keep='first',inplace=True)\n",
	" \n",
	" #save file\n",
	" with open ('../weather_bwi_t.csv','a') as f:\n",
	" #if ind ==0:\n",
	" #w.to_csv(f, index= False, header = True,encoding = 'utf-8')\n",
	" #else:\n",
	" w.to_csv(f, index= False, header = None,encoding = 'utf-8')\n",
	"\n",
	"#total = pd.concat(frames)\n",
	"#total['location'] = \n",
	"\n",
	"#total.to_csv('../total_weather_us.csv', index= False, header = True, encoding = 'utf-8')\n"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 221,
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	" time temp.(c) dew_point(c) humidity(%) pressure(hpa) \\\n",
	"0 12:54 AM 6.1 13.3 57.0 1020.6 \n",
	"1 1:54 AM 6.7 12.8 62.0 1021.0 \n",
	"2 2:54 AM 8.9 12.8 74.0 1021.0 \n",
	"3 3:54 AM 8.3 12.8 71.0 1020.9 \n",
	"4 4:54 AM 7.8 12.8 68.0 1021.0 \n",
	"\n",
	" visibiliy(km) wind_dir wind_speed(km/h) gust_speed(km/h) precip(mm) \\\n",
	"0 16.1 NW 7.4 NaN NaN \n",
	"1 16.1 WNW 11.1 NaN NaN \n",
	"2 16.1 West 9.3 NaN NaN \n",
	"3 16.1 WNW 11.1 NaN NaN \n",
	"4 16.1 WNW 14.8 NaN NaN \n",
	"\n",
	" events conditon datetime datetime_hourly \\\n",
	"0 - Partly Cloudy 2002-01-01 00:54:00 2002-01-01 01:00:00 \n",
	"1 - Mostly Cloudy 2002-01-01 01:54:00 2002-01-01 02:00:00 \n",
	"2 - Mostly Cloudy 2002-01-01 02:54:00 2002-01-01 03:00:00 \n",
	"3 - Clear 2002-01-01 03:54:00 2002-01-01 04:00:00 \n",
	"4 - Clear 2002-01-01 04:54:00 2002-01-01 05:00:00 \n",
	"\n",
	" to_whole_point(min) \n",
	"0 6 \n",
	"1 6 \n",
	"2 6 \n",
	"3 6 \n",
	"4 6 \n"
	]
	}
	],
	"source": [
	"#recheck the cross-date duplicates\n",
	"d = pd.read_csv('../weather_bwi_t.csv',header=0)\n",
	"print d.head()\n",
	"d.drop_duplicates(subset='datetime_hourly',keep='first',inplace=True)\n",
	"d.columns = ['time','temp.(c)', 'dew_point(c)','humidity(%)', 'pressure(hpa)','visibiliy(km)', \n",
	" 'wind_dir', 'wind_speed(km/h)','gust_speed(km/h)','precip(mm)','events',\n",
	" 'conditon','datetime','datetime_hourly','to_whole_point(min)']\n",
	"d['temp.(c)']=[np.nan if t == '-' else t for t in d['temp.(c)']]\n",
	"d['temp.(c)'].astype(float)\n",
	"d.to_csv('../weather_bwi_t.csv',header=True, index=False)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": []
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": []
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 2",
	"language": "python",
	"name": "python2"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 2
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython2",
	"version": "2.7.13"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 2
	}