Skip to content

Instantly share code, notes, and snippets.

@3vivekb
Last active January 31, 2019 18:19
Show Gist options
  • Save 3vivekb/6d88d1e65838999c2373ba29cb46c8f1 to your computer and use it in GitHub Desktop.
Frequently used Python Pandas Data Cleaning and grouping commands
import glob
import json
import os
import shutil

import pandas as pd
#Combine multiple csv's into one df
# glob collects every gps_*.csv in the current directory; each file is read
# with its first row as the header, then everything is concatenated with a
# fresh 0..n-1 index. (The original also built an unused empty DataFrame.)
all_files = glob.glob('./gps_*.csv')
frames = []
for file_ in all_files:
    frames.append(pd.read_csv(file_, header=0))
# NOTE(review): pd.concat raises ValueError when `frames` is empty, i.e. when
# no gps_*.csv files are found — guard against that if it can happen.
df = pd.concat(frames, ignore_index=True)
#Transitime Data Formatting
# http://strftime.org/ for reference
df['time'] = pd.to_datetime(df['time'], format='%Y-%m-%d %H:%M:%S.%f')
#Ridership Data Time Formatting
df['Date_Time'] = pd.to_datetime(df['Date'], format='%m/%d/%Y %I:%M:%S %p')
df['Year'] = df['Date_Time'].dt.year
df['Month'] = df['Date_Time'].dt.month
# BUG FIX: the original computed year + month as an arithmetic sum
# (e.g. 2016 + 3 == 2019), which collapses distinct months onto the same
# value. Emit a proper "YYYY-MM" label instead.
df['Year-Month'] = df['Date_Time'].dt.strftime('%Y-%m')
#There is some duplicate data we need to clean out namely March 2016
df = df.drop_duplicates()
#Get latency of time data, convert from timedelta to seconds
df['latency'] = df['time_processed'] - df['time']
# .astype('timedelta64[s]') no longer converts a timedelta column to numeric
# seconds in pandas 2.x; dt.total_seconds() is the supported conversion
# (returns float seconds, including any fractional part the old truncating
# cast dropped).
df['latency_seconds'] = df['latency'].dt.total_seconds()
# Count records per (source, vehicle, hour-of-day).
df.groupby([df['source'], df['vehicle_id'], df['time'].dt.hour])['source'].count()
# Split a hyphen-delimited string column and keep only the first piece.
df['long_name_with_underscores'].str.split('-').str[0]
#How to quickly add a feature to a DataFrame. It's super slow to edit them so it's almost always better to just
#make a new dataframe with the current rows plus whichever columns you wish.
od = []
for i, r in OD_unique.iterrows():
    # r.to_dict() keeps the row's original dtypes; the old
    # json.loads(r.to_json()) round-trip was slower and coerced
    # timestamps/NaN through JSON on the way.
    row = r.to_dict()
    row['from_point'] = Point(r.from_longitude, r.from_latitude)
    row['to_point'] = Point(r.to_longitude, r.to_latitude)
    od.append(row)
od = pd.DataFrame(od)
#Or if you are specifically working with geodata this method is faster:
#https://gis.stackexchange.com/questions/174159/convert-a-pandas-dataframe-to-a-geodataframe
from geopandas import GeoDataFrame
from shapely.geometry import Point

# Build point geometries from the Lon/Lat columns, then drop the raw columns.
geometry = [Point(xy) for xy in zip(df.Lon, df.Lat)]
df = df.drop(['Lon', 'Lat'], axis=1)
# The {'init': 'epsg:4326'} dict form is deprecated (pyproj >= 2); pass the
# authority string directly.
geo_df = GeoDataFrame(df, crs='EPSG:4326', geometry=geometry)
# Remove rows where the geometry is null!
geo_df = geo_df.loc[geo_df['geometry'].notnull()]
#Remove spaces in the column names, replace with _
geo_df.rename(columns=lambda x: x.replace(' ', '_'), inplace=True)
#Converting a CRS
# 'EPSG:4326' authority string replaces the deprecated {'init': ...} dict
# form (pyproj >= 2).
geo_df = geo_df.to_crs('EPSG:4326')
# If the output file/directory already exists, remove it first so to_file
# does not fail on stale output.
try:
    os.remove('geospatial_output.geojson')
except OSError:
    pass
try:
    shutil.rmtree('geospatial_output/')
except FileNotFoundError:
    pass
# Write the file to GeoJSON and ESRI Shapefile
geo_df.to_file('geospatial_output.geojson', driver='GeoJSON')
geo_df.to_file('geospatial_output/', driver='ESRI Shapefile')
# https://codeburst.io/dealing-with-datetimes-like-a-pro-in-pandas-b80d3d808a7f
# Localize the naive timestamp column 't' to Pacific time, then convert the
# index to UTC, and move 't' back out of the index.
assignments = assignments.set_index('t')
assignments.index = assignments.index.tz_localize('America/Los_Angeles').tz_convert('UTC')
assignments = assignments.reset_index()
#Create a json from a dataframe and push it to an api
#http://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.to_json.html
data = assignments.to_json(orient='records', date_format='epoch', date_unit='ms')
# Build the payload as a dict instead of splicing raw JSON strings together —
# the old '{"avl":' + data + '}' concatenation is fragile and harder to read;
# json.loads still validates the serialized records.
d = {'avl': json.loads(data)}
r = requests.post(url='http://url', json=d)
#Display more than 20 columns of data
pd.get_option("display.max_columns")
pd.set_option("display.max_columns", 40)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment