shanealynn

## gps_overlay.r
# Census small-area overlay ----
# Read the small-area boundary polygons. The shapefile has to be in GPS
# (longitude/latitude) form so it can be matched against the property
# coordinates used below.
map_data <- readShapePoly(
  'Census2011_Small_Areas_generalised20m/small_areas_gps.shp'
)

# Build the point layer from the properties that have a latitude: the
# coordinates are (longitude, latitude) and the attributes carried along are
# the input string and postcode of those same rows.
spatial_points <- SpatialPointsDataFrame(
  coords = ppr_data[!is.na(latitude), .(longitude, latitude)],
  data = ppr_data[!is.na(latitude), .(input_string, postcode)]
)

# Match every property point against the boundary polygons with sp::over(),
# which is how each property gets its small area and electoral district.
polygon_overlap <- over(spatial_points, map_data)

## python batch geocoding.py
"""
Python script for batch geocoding of addresses using the Google Geocoding API.
This script allows for massive lists of addresses to be geocoded for free by pausing when the
geocoder hits the free rate limit set by Google (2500 per day).  If you have an API key for paid
geocoding from Google, set it in the API key section.
Addresses for geocoding can be specified in a list of strings "addresses". In this script, addresses
come from a csv file with a column "Address". Adjust the code to your own requirements as needed.
After every 500 successful geocode operations, a temporary file with results is recorded in case of
script failure / loss of connection later.
Addresses and data are held in memory, so this script may need to be adjusted to process files line by line.

## Pandas index - changing data with loc.py
# Set first_name to "John" on every row whose 'id' is greater than 2000.
# The boolean mask selects the rows and the column label selects the cell to
# overwrite in each of them; assigning through .loc updates `data` itself.
# (The snippet previously repeated this identical assignment twice; running it
# once has the same effect.)
data.loc[data['id'] > 2000, "first_name"] = "John"

## Pandas index - ix selections.py

# Compare the legacy .ix indexer against the explicit indexers on the same selection.
# NOTE(review): .ix was deprecated in pandas 0.20 and removed in pandas 1.0, so these
# lines presumably target an older pandas release -- confirm the pinned version.
# Given a list of string labels, .ix is compared with label-based .loc.
data.ix[['Andrade']] == data.loc[['Andrade']]
# Given a list of integers, .ix is compared with position-based .iloc.
data.ix[[33]] == data.iloc[[33]]

# .ix only works in both modes when the index of the DataFrame is NOT an integer itself.

## Pandas index - loc selection examples.py

# Select rows whose first_name is 'Antonio', keeping the columns from 'city' through 'email'
data.loc[data['first_name'] == 'Antonio', 'city':'email']

# Select rows where the email column ends with 'hotmail.com', include all columns
data.loc[data['email'].str.endswith("hotmail.com")]

# Select rows whose first_name is one of the listed values, all columns
data.loc[data['first_name'].isin(['France', 'Tyisha', 'Eric'])]


## Pandas Index - Select rows with loc.py

# Select rows with index values 'Andrade' and 'Veness', with all columns between 'city' and 'email'
data.loc[['Andrade', 'Veness'], 'city':'email']
# Select with a label slice from 'Andrade' to 'Veness' (the span of rows between the two
# labels, rather than only those two), keeping just 'first_name', 'address' and 'city'
data.loc['Andrade':'Veness', ['first_name', 'address', 'city']]

# Change the index to be based on the 'id' column (in place, so `data` itself is re-indexed)
data.set_index('id', inplace=True)
# Select the row with 'id' = 487
data.loc[487]

## Pandas Index - Setting index for iloc.py
# Make the 'last_name' column the index of `data`, modifying the DataFrame in place.
data.set_index("last_name", inplace=True)
# Show the first rows to inspect the result of the re-indexing.
data.head()

## Pandas Index - Multi iloc selections.py
# Selecting several rows and/or columns at once with iloc (integer positions)
data.iloc[:5]  # rows at positions 0-4, i.e. the first five rows
data.iloc[:, :2]  # every row, first two columns
data.iloc[[0, 3, 6, 24], [0, 5, 6]]  # 1st, 4th, 7th, 25th rows + 1st, 6th, 7th columns
data.iloc[:5, 5:8]  # first five rows and the 6th, 7th, 8th columns (county -> phone1)

## Pandas Index - Single iloc selections.py
# Picking one row or one column at a time with iloc
# Rows:
data.iloc[0]  # first row (Aleshia Tomkiewicz) - a single row comes back as a Series
data.iloc[1]  # second row (Evan Zigomalas)
data.iloc[-1]  # last row (Mi Richan)
# Columns:
data.iloc[:, 0]  # first column (first_name)
data.iloc[:, 1]  # second column (last_name)
data.iloc[:, -1]  # last column (id)

## Pandas Index - Loading Data.py

import pandas as pd
import random

# Load the sample contacts CSV directly from its download URL.
data = pd.read_csv('https://s3-eu-west-1.amazonaws.com/shanebucket/downloads/uk-500.csv')
# Give every row a random integer id (0 to 1000) to use as an index in the examples.
# Each id is drawn independently, so repeated values are possible.
data['id'] = [random.randint(0, 1000) for _ in range(len(data))]

data.head(5)
	# Census small-area overlay ----
	# Read the small-area boundary polygons. The shapefile has to be in GPS
	# (longitude/latitude) form so it can be matched against the property
	# coordinates used below.
	map_data <- readShapePoly(
	  'Census2011_Small_Areas_generalised20m/small_areas_gps.shp'
	)

	# Build the point layer from the properties that have a latitude: the
	# coordinates are (longitude, latitude) and the attributes carried along are
	# the input string and postcode of those same rows.
	spatial_points <- SpatialPointsDataFrame(
	  coords = ppr_data[!is.na(latitude), .(longitude, latitude)],
	  data = ppr_data[!is.na(latitude), .(input_string, postcode)]
	)

	# Match every property point against the boundary polygons with sp::over(),
	# which is how each property gets its small area and electoral district.
	polygon_overlap <- over(spatial_points, map_data)
	"""
	Python script for batch geocoding of addresses using the Google Geocoding API.
	This script allows for massive lists of addresses to be geocoded for free by pausing when the
	geocoder hits the free rate limit set by Google (2500 per day). If you have an API key for paid
	geocoding from Google, set it in the API key section.
	Addresses for geocoding can be specified in a list of strings "addresses". In this script, addresses
	come from a csv file with a column "Address". Adjust the code to your own requirements as needed.
	After every 500 successful geocode operations, a temporary file with results is recorded in case of
	script failure / loss of connection later.
	Addresses and data are held in memory, so this script may need to be adjusted to process files line by line.
	# Set first_name to "John" on every row whose 'id' is greater than 2000.
	# The boolean mask selects the rows and the column label selects the cell to
	# overwrite in each of them; assigning through .loc updates `data` itself.
	# (The snippet previously repeated this identical assignment twice; running it
	# once has the same effect.)
	data.loc[data['id'] > 2000, "first_name"] = "John"

	# Compare the legacy .ix indexer against the explicit indexers on the same selection.
	# NOTE(review): .ix was deprecated in pandas 0.20 and removed in pandas 1.0, so these
	# lines presumably target an older pandas release -- confirm the pinned version.
	# Given a list of string labels, .ix is compared with label-based .loc.
	data.ix[['Andrade']] == data.loc[['Andrade']]
	# Given a list of integers, .ix is compared with position-based .iloc.
	data.ix[[33]] == data.iloc[[33]]

	# .ix only works in both modes when the index of the DataFrame is NOT an integer itself.

	# Select rows whose first_name is 'Antonio', keeping the columns from 'city' through 'email'
	data.loc[data['first_name'] == 'Antonio', 'city':'email']

	# Select rows where the email column ends with 'hotmail.com', include all columns
	data.loc[data['email'].str.endswith("hotmail.com")]

	# Select rows whose first_name is one of the listed values, all columns
	data.loc[data['first_name'].isin(['France', 'Tyisha', 'Eric'])]

	# Select rows with index values 'Andrade' and 'Veness', with all columns between 'city' and 'email'
	data.loc[['Andrade', 'Veness'], 'city':'email']
	# Select with a label slice from 'Andrade' to 'Veness' (the span of rows between the two
	# labels, rather than only those two), keeping just 'first_name', 'address' and 'city'
	data.loc['Andrade':'Veness', ['first_name', 'address', 'city']]

	# Change the index to be based on the 'id' column (in place, so `data` itself is re-indexed)
	data.set_index('id', inplace=True)
	# Select the row with 'id' = 487
	data.loc[487]
	# Selecting several rows and/or columns at once with iloc (integer positions)
	data.iloc[:5]  # rows at positions 0-4, i.e. the first five rows
	data.iloc[:, :2]  # every row, first two columns
	data.iloc[[0, 3, 6, 24], [0, 5, 6]]  # 1st, 4th, 7th, 25th rows + 1st, 6th, 7th columns
	data.iloc[:5, 5:8]  # first five rows and the 6th, 7th, 8th columns (county -> phone1)
	# Picking one row or one column at a time with iloc
	# Rows:
	data.iloc[0]  # first row (Aleshia Tomkiewicz) - a single row comes back as a Series
	data.iloc[1]  # second row (Evan Zigomalas)
	data.iloc[-1]  # last row (Mi Richan)
	# Columns:
	data.iloc[:, 0]  # first column (first_name)
	data.iloc[:, 1]  # second column (last_name)
	data.iloc[:, -1]  # last column (id)

	import pandas as pd
	import random

	# Load the sample contacts CSV directly from its download URL.
	data = pd.read_csv('https://s3-eu-west-1.amazonaws.com/shanebucket/downloads/uk-500.csv')
	# Give every row a random integer id (0 to 1000) to use as an index in the examples.
	# Each id is drawn independently, so repeated values are possible.
	data['id'] = [random.randint(0, 1000) for _ in range(len(data))]

	data.head(5)