Skip to content

Instantly share code, notes, and snippets.

@3vivekb
Last active January 31, 2019 18:19
Show Gist options
  • Save 3vivekb/6d88d1e65838999c2373ba29cb46c8f1 to your computer and use it in GitHub Desktop.
Frequently used Python Pandas Data Cleaning and grouping commands
import glob
import json
import os
import shutil

import pandas as pd
#Combine multiple csv's into one df
# glob collects every gps_*.csv in the current directory; each file is read
# with its first row as the header, then everything is concatenated with a
# fresh 0..n-1 index. (The original also built an unused empty DataFrame.)
all_files = glob.glob('./gps_*.csv')
frames = []
for file_ in all_files:
    frames.append(pd.read_csv(file_, header=0))
# NOTE(review): pd.concat raises ValueError when `frames` is empty, i.e. when
# no gps_*.csv files are found — guard against that if it can happen.
df = pd.concat(frames, ignore_index=True)
#Transitime Data Formatting
# http://strftime.org/ for reference
df['time'] = pd.to_datetime(df['time'], format='%Y-%m-%d %H:%M:%S.%f')
#Ridership Data Time Formatting
df['Date_Time'] = pd.to_datetime(df['Date'], format='%m/%d/%Y %I:%M:%S %p')
df['Year'] = df['Date_Time'].dt.year
df['Month'] = df['Date_Time'].dt.month
# BUG FIX: the original computed year + month as an arithmetic sum
# (e.g. 2016 + 3 == 2019), which collapses distinct months onto the same
# value. Emit a proper "YYYY-MM" label instead.
df['Year-Month'] = df['Date_Time'].dt.strftime('%Y-%m')
#There is some duplicate data we need to clean out namely March 2016
df = df.drop_duplicates()
#Get latency of time data, convert from timedelta to seconds
df['latency'] = df['time_processed'] - df['time']
# .astype('timedelta64[s]') no longer converts a timedelta column to numeric
# seconds in pandas 2.x; dt.total_seconds() is the supported conversion
# (returns float seconds, including any fractional part the old truncating
# cast dropped).
df['latency_seconds'] = df['latency'].dt.total_seconds()
# Count records per (source, vehicle, hour-of-day).
df.groupby([df['source'], df['vehicle_id'], df['time'].dt.hour])['source'].count()
# Split a hyphen-delimited string column and keep only the first piece.
df['long_name_with_underscores'].str.split('-').str[0]
#How to quickly add a feature to a DataFrame. It's super slow to edit them so it's almost always better to just
#make a new dataframe with the current rows plus whichever columns you wish.
od = []
for i, r in OD_unique.iterrows():
    # r.to_dict() keeps the row's original dtypes; the old
    # json.loads(r.to_json()) round-trip was slower and coerced
    # timestamps/NaN through JSON on the way.
    row = r.to_dict()
    row['from_point'] = Point(r.from_longitude, r.from_latitude)
    row['to_point'] = Point(r.to_longitude, r.to_latitude)
    od.append(row)
od = pd.DataFrame(od)
#Or if you are specifically working with geodata this method is faster:
#https://gis.stackexchange.com/questions/174159/convert-a-pandas-dataframe-to-a-geodataframe
from geopandas import GeoDataFrame
from shapely.geometry import Point

# Build point geometries from the Lon/Lat columns, then drop the raw columns.
geometry = [Point(xy) for xy in zip(df.Lon, df.Lat)]
df = df.drop(['Lon', 'Lat'], axis=1)
# The {'init': 'epsg:4326'} dict form is deprecated (pyproj >= 2); pass the
# authority string directly.
geo_df = GeoDataFrame(df, crs='EPSG:4326', geometry=geometry)
# Remove rows where the geometry is null!
geo_df = geo_df.loc[geo_df['geometry'].notnull()]
#Remove spaces in the column names, replace with _
geo_df.rename(columns=lambda x: x.replace(' ', '_'), inplace=True)
#Converting a CRS
# 'EPSG:4326' authority string replaces the deprecated {'init': ...} dict
# form (pyproj >= 2).
geo_df = geo_df.to_crs('EPSG:4326')
# If the output file/directory already exists, remove it first so to_file
# does not fail on stale output.
try:
    os.remove('geospatial_output.geojson')
except OSError:
    pass
try:
    shutil.rmtree('geospatial_output/')
except FileNotFoundError:
    pass
# Write the file to GeoJSON and ESRI Shapefile
geo_df.to_file('geospatial_output.geojson', driver='GeoJSON')
geo_df.to_file('geospatial_output/', driver='ESRI Shapefile')
# https://codeburst.io/dealing-with-datetimes-like-a-pro-in-pandas-b80d3d808a7f
# Localize the naive timestamp column 't' to Pacific time, then convert the
# index to UTC, and move 't' back out of the index.
assignments = assignments.set_index('t')
assignments.index = assignments.index.tz_localize('America/Los_Angeles').tz_convert('UTC')
assignments = assignments.reset_index()
#Create a json from a dataframe and push it to an api
#http://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.to_json.html
data = assignments.to_json(orient='records', date_format='epoch', date_unit='ms')
# Build the payload as a dict instead of splicing raw JSON strings together —
# the old '{"avl":' + data + '}' concatenation is fragile and harder to read;
# json.loads still validates the serialized records.
d = {'avl': json.loads(data)}
r = requests.post(url='http://url', json=d)
#Display more than 20 columns of data
pd.get_option("display.max_columns")
pd.set_option("display.max_columns", 40)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment