View gps_overlay.r
# Overlay the small areas from the census data | |
# load small area files - remember this needs to be in GPS form for matching. | |
# NOTE(review): readShapePoly() presumably comes from the maptools package, which has
# been retired - confirm, and consider sf::st_read() if this script is revived.
map_data <- readShapePoly('Census2011_Small_Areas_generalised20m/small_areas_gps.shp') | |
# Assign a small area and electoral district to each property with a GPS coordinate. | |
# The assignment of points to polygons is done using the sp::over() function. | |
# Inputs are a SpatialPoints (house locations) set, and SpatialPolygons (boundary shapes) | |
# Rows with a missing latitude are dropped from both `coords` and `data` using the same
# filter, so the two stay row-aligned. Coordinates are passed as (longitude, latitude),
# i.e. x first, then y.
# NOTE(review): the `.()` column selection assumes ppr_data is a data.table - confirm.
spatial_points <- SpatialPointsDataFrame(coords = ppr_data[!is.na(latitude),.(longitude,latitude)], data=ppr_data[!is.na(latitude), .(input_string, postcode)]) | |
# Presumably yields, for each point, the attributes of the polygon that contains it
# (NA where no polygon matches) - verify against the sp::over() documentation.
polygon_overlap <- over(spatial_points, map_data) |
View python batch geocoding.py
""" | |
Python script for batch geocoding of addresses using the Google Geocoding API. | |
This script allows for massive lists of addresses to be geocoded for free by pausing when the | |
geocoder hits the free rate limit set by Google (2500 per day). If you have an API key for paid | |
geocoding from Google, set it in the API key section. | |
Addresses for geocoding can be specified in a list of strings "addresses". In this script, addresses | |
come from a csv file with a column "Address". Adjust the code to your own requirements as needed. | |
After every 500 successful geocode operations, a temporary file with results is recorded in case of | |
script failure / loss of connection later. | |
Addresses and data are held in memory, so this script may need to be adjusted to process files line |
View Pandas index - changing data with loc.py
# Change the first name of all rows with an ID greater than 2000 to "John" | |
# (the boolean mask picks the rows, "first_name" picks the column that is overwritten)
data.loc[data['id'] > 2000, "first_name"] = "John" | |
# NOTE(review): the statement below repeats the one above verbatim - likely a duplicated paste. | |
data.loc[data['id'] > 2000, "first_name"] = "John" |
View Pandas index - ix selections.py
# NOTE: .ix was deprecated in pandas 0.20.0 and removed in pandas 1.0.0, so these
# examples only run on older pandas; use .loc (labels) or .iloc (positions) instead.
# Each `==` below compares two selections element-wise and yields a boolean DataFrame.
# ix indexing works just the same as .loc when passed strings | |
data.ix[['Andrade']] == data.loc[['Andrade']] | |
# ix indexing works the same as .iloc when passed integers. | |
data.ix[[33]] == data.iloc[[33]] | |
# ix only works in both modes when the index of the DataFrame is NOT an integer itself. |
View Pandas index - loc selection examples.py
# Select rows with first name Antonio, and all columns from 'city' to 'email' (inclusive) | |
data.loc[data['first_name'] == 'Antonio', 'city':'email'] | |
# Select rows where the email column ends with 'hotmail.com', include all columns | |
data.loc[data['email'].str.endswith("hotmail.com")] | |
# Select rows with first_name equal to any of the listed values, all columns | |
data.loc[data['first_name'].isin(['France', 'Tyisha', 'Eric'])] | |
View Pandas Index - Select rows with loc.py
# Select rows with index values 'Andrade' and 'Veness', with all columns between 'city' and 'email' | |
data.loc[['Andrade', 'Veness'], 'city':'email'] | |
# Select every row from label 'Andrade' through 'Veness' (a label slice, inclusive at both | |
# ends - not just those two rows), with just 'first_name', 'address' and 'city' columns
data.loc['Andrade':'Veness', ['first_name', 'address', 'city']] | |
# Change the index to be based on the 'id' column (inplace=True modifies data itself) | |
data.set_index('id', inplace=True) | |
# select the row(s) with 'id' = 487 | |
data.loc[487] |
View Pandas Index - Setting index for iloc.py
# Use the 'last_name' column as the row index, modifying data in place
data.set_index("last_name", inplace=True) | |
# Preview the first rows to check the new index
data.head() |
View Pandas Index - Multi iloc selections.py
# Multiple row and column selections using iloc and DataFrame | |
# Slices are positional and exclude the stop position; lists pick exact positions.
data.iloc[0:5] # first five rows of dataframe | |
data.iloc[:, 0:2] # first two columns of data frame with all rows | |
data.iloc[[0,3,6,24], [0,5,6]] # 1st, 4th, 7th, 25th row + 1st 6th 7th columns. | |
data.iloc[0:5, 5:8] # first 5 rows and 6th, 7th, 8th columns of data frame (county -> phone1). |
View Pandas Index - Single iloc selections.py
# Single selections using iloc and DataFrame | |
# A single integer position returns one row or one column as a Series;
# negative positions count back from the end.
# Rows: | |
data.iloc[0] # first row of data frame (Aleshia Tomkiewicz) - Note a Series data type output. | |
data.iloc[1] # second row of data frame (Evan Zigomalas) | |
data.iloc[-1] # last row of data frame (Mi Richan) | |
# Columns: | |
data.iloc[:,0] # first column of data frame (first_name) | |
data.iloc[:,1] # second column of data frame (last_name) | |
data.iloc[:,-1] # last column of data frame (id) |
View Pandas Index - Loading Data.py
import pandas as pd | |
import random | |
# read the sample data straight from the hosted CSV file (requires network access). | |
data = pd.read_csv('https://s3-eu-west-1.amazonaws.com/shanebucket/downloads/uk-500.csv') | |
# set a numeric id for use as an index for examples: one random integer per row, | |
# drawn from 0..1000 inclusive. The generator is not seeded here, so ids differ between
# runs, and values may repeat - they are not guaranteed to be unique.
data['id'] = [random.randint(0,1000) for x in range(data.shape[0])] | |
# show the first five rows
data.head(5) |