Skip to content

Instantly share code, notes, and snippets.

@willycs40
Last active August 29, 2015 14:19
Show Gist options
  • Save willycs40/de92b5828930f751f279 to your computer and use it in GitHub Desktop.
Save willycs40/de92b5828930f751f279 to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Display the source blob
Display the rendered blob
Raw
{
"metadata": {
"name": "",
"signature": "sha256:6e4589971dcfa711282b8954c23ea8483c2ac6eab66be8effd1aabe69cd37607"
},
"nbformat": 3,
"nbformat_minor": 0,
"worksheets": [
{
"cells": [
{
"cell_type": "code",
"collapsed": false,
"input": [
"import pandas as pd\n",
"from sqlalchemy import create_engine # database connection\n",
"import datetime as dt\n",
"from IPython.display import display"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 1
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"csv_path = r\"C:\\Users\\will.cubitt-smith\\Local\\Projects\\ServiceViewWeather\\all_weather.txt\""
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 2
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"display(pd.read_csv(csv_path, nrows=2).head())"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Site Code</th>\n",
" <th>Site Name</th>\n",
" <th>Latitude</th>\n",
" <th>Longitude</th>\n",
" <th>Region</th>\n",
" <th>Observation Time</th>\n",
" <th>Observation Date</th>\n",
" <th>Wind Direction</th>\n",
" <th>Wind Speed</th>\n",
" <th>Wind Gust</th>\n",
" <th>Visibility</th>\n",
" <th>Screen Temperature</th>\n",
" <th>Pressure</th>\n",
" <th>Pressure Tendency</th>\n",
" <th>Significant Weather</th>\n",
" <th>Unnamed: 15</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>3008</td>\n",
" <td>FAIR ISLE (3008)</td>\n",
" <td>59.530</td>\n",
" <td>-1.630</td>\n",
" <td>Orkney &amp; Shetland</td>\n",
" <td>07:00</td>\n",
" <td>2013-10-01</td>\n",
" <td>SSE</td>\n",
" <td>22</td>\n",
" <td>30</td>\n",
" <td>NaN</td>\n",
" <td>11.4</td>\n",
" <td>1016</td>\n",
" <td>R</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>3031</td>\n",
" <td>LOCH GLACARNOCH SAWS (3031)</td>\n",
" <td>57.725</td>\n",
" <td>-4.896</td>\n",
" <td>Highland &amp; Eilean Siar</td>\n",
" <td>07:00</td>\n",
" <td>2013-10-01</td>\n",
" <td>E</td>\n",
" <td>9</td>\n",
" <td>NaN</td>\n",
" <td>17000</td>\n",
" <td>3.9</td>\n",
" <td>1012</td>\n",
" <td>F</td>\n",
" <td>Sunny (Day)</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"metadata": {},
"output_type": "display_data",
"text": [
" Site Code Site Name Latitude Longitude \\\n",
"0 3008 FAIR ISLE (3008) 59.530 -1.630 \n",
"1 3031 LOCH GLACARNOCH SAWS (3031) 57.725 -4.896 \n",
"\n",
" Region Observation Time Observation Date Wind Direction \\\n",
"0 Orkney & Shetland 07:00 2013-10-01 SSE \n",
"1 Highland & Eilean Siar 07:00 2013-10-01 E \n",
"\n",
" Wind Speed Wind Gust Visibility Screen Temperature Pressure \\\n",
"0 22 30 NaN 11.4 1016 \n",
"1 9 NaN 17000 3.9 1012 \n",
"\n",
" Pressure Tendency Significant Weather Unnamed: 15 \n",
"0 R NaN NaN \n",
"1 F Sunny (Day) NaN "
]
}
],
"prompt_number": 3
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"sqllite_engine = create_engine('sqlite:///Weather.db') \n"
],
"language": "python",
"metadata": {},
"outputs": [],
"prompt_number": 11
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"start = dt.datetime.now()\n",
"chunksize = 20000\n",
"j = 0\n",
"index_start = 1\n",
"\n",
"for df in pd.read_csv(csv_path, chunksize=chunksize, iterator=True, encoding='utf-8'):\n",
" \n",
" df = df.rename(columns={c: c.replace(' ', '') for c in df.columns}) # Remove spaces from columns\n",
"\n",
" df['ObservationDate'] = pd.to_datetime(df['ObservationDate']) # Convert to datetimes\n",
" df['ObservationTime'] = df['ObservationTime'].str.slice(0,2)\n",
" df['ObservationTime'] = df['ObservationTime'].astype(int)\n",
" \n",
" df.index += index_start\n",
" \n",
" #print(df.head())\n",
" # Remove the un-interesting columns\n",
" #columns = ['Agency', 'CreatedDate', 'ClosedDate', 'ComplaintType', 'Descriptor',\n",
" # 'CreatedDate', 'ClosedDate', 'TimeToCompletion',\n",
" # 'City']\n",
"\n",
" #for c in df.columns:\n",
" # if c not in columns:\n",
" # df = df.drop(c, axis=1) \n",
"\n",
" j+=1\n",
" print '{} seconds: completed {} rows'.format((dt.datetime.now() - start).seconds, j*chunksize)\n",
"\n",
" df.to_sql('Weather', sqllite_engine, if_exists='append', index=False)\n",
" index_start = df.index[-1] + 1"
],
"language": "python",
"metadata": {},
"outputs": [
{
"output_type": "stream",
"stream": "stdout",
"text": [
"0 seconds: completed 20000 rows\n",
"4 seconds: completed 40000 rows"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"6 seconds: completed 60000 rows"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"8 seconds: completed 80000 rows"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"9 seconds: completed 100000 rows"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"12 seconds: completed 120000 rows"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"13 seconds: completed 140000 rows"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"15 seconds: completed 160000 rows"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"17 seconds: completed 180000 rows"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"19 seconds: completed 200000 rows"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"20 seconds: completed 220000 rows"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"22 seconds: completed 240000 rows"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"23 seconds: completed 260000 rows"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"24 seconds: completed 280000 rows"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"26 seconds: completed 300000 rows"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"27 seconds: completed 320000 rows"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"29 seconds: completed 340000 rows"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"30 seconds: completed 360000 rows"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"32 seconds: completed 380000 rows"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"33 seconds: completed 400000 rows"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"34 seconds: completed 420000 rows"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"36 seconds: completed 440000 rows"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"38 seconds: completed 460000 rows"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"39 seconds: completed 480000 rows"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"40 seconds: completed 500000 rows"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"42 seconds: completed 520000 rows"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"43 seconds: completed 540000 rows"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"45 seconds: completed 560000 rows"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"46 seconds: completed 580000 rows"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"48 seconds: completed 600000 rows"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"49 seconds: completed 620000 rows"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"51 seconds: completed 640000 rows"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"52 seconds: completed 660000 rows"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"54 seconds: completed 680000 rows"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"55 seconds: completed 700000 rows"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"57 seconds: completed 720000 rows"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"58 seconds: completed 740000 rows"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"60 seconds: completed 760000 rows"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"61 seconds: completed 780000 rows"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"63 seconds: completed 800000 rows"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"64 seconds: completed 820000 rows"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"66 seconds: completed 840000 rows"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"67 seconds: completed 860000 rows"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"69 seconds: completed 880000 rows"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"70 seconds: completed 900000 rows"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"72 seconds: completed 920000 rows"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"73 seconds: completed 940000 rows"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"75 seconds: completed 960000 rows"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"76 seconds: completed 980000 rows"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"78 seconds: completed 1000000 rows"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"79 seconds: completed 1020000 rows"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"81 seconds: completed 1040000 rows"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"83 seconds: completed 1060000 rows"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"84 seconds: completed 1080000 rows"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"85 seconds: completed 1100000 rows"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"87 seconds: completed 1120000 rows"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"89 seconds: completed 1140000 rows"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"90 seconds: completed 1160000 rows"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"92 seconds: completed 1180000 rows"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"94 seconds: completed 1200000 rows"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"95 seconds: completed 1220000 rows"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"97 seconds: completed 1240000 rows"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"98 seconds: completed 1260000 rows"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"100 seconds: completed 1280000 rows"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"102 seconds: completed 1300000 rows"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"103 seconds: completed 1320000 rows"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"106 seconds: completed 1340000 rows"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"108 seconds: completed 1360000 rows"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n",
"110 seconds: completed 1380000 rows"
]
},
{
"output_type": "stream",
"stream": "stdout",
"text": [
"\n"
]
}
],
"prompt_number": 12
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"df = pd.read_sql_query('SELECT * FROM Weather LIMIT 3', disk_engine)\n",
"df.head()"
],
"language": "python",
"metadata": {},
"outputs": [
{
"html": [
"<div style=\"max-height:1000px;max-width:1500px;overflow:auto;\">\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>SiteCode</th>\n",
" <th>SiteName</th>\n",
" <th>Latitude</th>\n",
" <th>Longitude</th>\n",
" <th>Region</th>\n",
" <th>ObservationTime</th>\n",
" <th>ObservationDate</th>\n",
" <th>WindDirection</th>\n",
" <th>WindSpeed</th>\n",
" <th>WindGust</th>\n",
" <th>Visibility</th>\n",
" <th>ScreenTemperature</th>\n",
" <th>Pressure</th>\n",
" <th>PressureTendency</th>\n",
" <th>SignificantWeather</th>\n",
" <th>Unnamed:15</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>3008</td>\n",
" <td>FAIR ISLE (3008)</td>\n",
" <td>59.530</td>\n",
" <td>-1.630</td>\n",
" <td>Orkney &amp; Shetland</td>\n",
" <td>7</td>\n",
" <td>2013-10-01 00:00:00.000000</td>\n",
" <td>SSE</td>\n",
" <td>22</td>\n",
" <td>30</td>\n",
" <td>NaN</td>\n",
" <td>11.4</td>\n",
" <td>1016</td>\n",
" <td>R</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>3031</td>\n",
" <td>LOCH GLACARNOCH SAWS (3031)</td>\n",
" <td>57.725</td>\n",
" <td>-4.896</td>\n",
" <td>Highland &amp; Eilean Siar</td>\n",
" <td>7</td>\n",
" <td>2013-10-01 00:00:00.000000</td>\n",
" <td>E</td>\n",
" <td>9</td>\n",
" <td>NaN</td>\n",
" <td>17000</td>\n",
" <td>3.9</td>\n",
" <td>1012</td>\n",
" <td>F</td>\n",
" <td>Sunny (Day)</td>\n",
" <td>None</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3041</td>\n",
" <td>AONACH MOR (3041)</td>\n",
" <td>56.820</td>\n",
" <td>-4.970</td>\n",
" <td>Highland &amp; Eilean Siar</td>\n",
" <td>7</td>\n",
" <td>2013-10-01 00:00:00.000000</td>\n",
" <td>SE</td>\n",
" <td>37</td>\n",
" <td>56</td>\n",
" <td>NaN</td>\n",
" <td>2.0</td>\n",
" <td>NaN</td>\n",
" <td>#</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"metadata": {},
"output_type": "pyout",
"prompt_number": 13,
"text": [
" SiteCode SiteName Latitude Longitude \\\n",
"0 3008 FAIR ISLE (3008) 59.530 -1.630 \n",
"1 3031 LOCH GLACARNOCH SAWS (3031) 57.725 -4.896 \n",
"2 3041 AONACH MOR (3041) 56.820 -4.970 \n",
"\n",
" Region ObservationTime ObservationDate \\\n",
"0 Orkney & Shetland 7 2013-10-01 00:00:00.000000 \n",
"1 Highland & Eilean Siar 7 2013-10-01 00:00:00.000000 \n",
"2 Highland & Eilean Siar 7 2013-10-01 00:00:00.000000 \n",
"\n",
" WindDirection WindSpeed WindGust Visibility ScreenTemperature Pressure \\\n",
"0 SSE 22 30 NaN 11.4 1016 \n",
"1 E 9 NaN 17000 3.9 1012 \n",
"2 SE 37 56 NaN 2.0 NaN \n",
"\n",
" PressureTendency SignificantWeather Unnamed:15 \n",
"0 R None None \n",
"1 F Sunny (Day) None \n",
"2 # None None "
]
}
],
"prompt_number": 13
},
{
"cell_type": "code",
"collapsed": false,
"input": [
"df.dtypes"
],
"language": "python",
"metadata": {},
"outputs": [
{
"metadata": {},
"output_type": "pyout",
"prompt_number": 10,
"text": [
"SiteCode int64\n",
"SiteName object\n",
"Latitude float64\n",
"Longitude float64\n",
"Region object\n",
"ObservationTime int64\n",
"ObservationDate object\n",
"WindDirection object\n",
"WindSpeed int64\n",
"WindGust float64\n",
"Visibility float64\n",
"ScreenTemperature float64\n",
"Pressure float64\n",
"PressureTendency object\n",
"SignificantWeather object\n",
"Unnamed:15 object\n",
"dtype: object"
]
}
],
"prompt_number": 10
}
],
"metadata": {}
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment