Skip to content

Instantly share code, notes, and snippets.

@rdmtinez
Last active October 20, 2017 13:12
Show Gist options
  • Save rdmtinez/b50caa212082fe23f0af9c11995602f8 to your computer and use it in GitHub Desktop.
Save rdmtinez/b50caa212082fe23f0af9c11995602f8 to your computer and use it in GitHub Desktop.
#import os
#import pandas as pd
#import datetime as dt
####you want to optimize the conversion and create a summary report####
def raw_converter_main(raw_dir, wrt_dir, wrt_massisve=False):
"""This function is designed to condense the raw csv files, made by the
'download_csv.py' function held by Christian Glunk and the Deepfiled
Connect team, into a more maneagable form"""
#print everything ('all') to console without print() statement
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'
import os
import pandas as pd
import datetime as dt
os.chdir(raw_dir)
csv_list = os.listdir(raw_dir)
#relevant variables in raw csvs to extract
extract_vars = ['AN1', 'DTC', 'HM1', 'HM2',
'HM3', 'HM4', 'T1', 'T2',
'T3', 'T4', 'latitude',
'longitude', 'measurement_time']
#for DevID column indexing
b_count = 0
p_count = 0
for csv in csv_list[0:2]:
with open(csv) as f:
#import csv, parse datetime and set as index
raw_df = pd.read_csv(f)
#parse_dates=True
#draw relevant columns from rawdf
df = raw_df[extract_vars]
#Numerizing instruments
imei = csv[7:-4]
df.insert(0, column='IMEI', value=imei)
#renames initial columns columns
new_names = {'AN1':'Volts', 'DTC':'DigiTemp', 'measurement_time':'DateTime',
'longitude':'Lng', 'latitude':'Lat'}
df = df.rename(columns=new_names)
print("During OPEN_csv_df")
print(df)
#DateTime as datetime
df['DateTime'] = pd.to_datetime(df['DateTime'])
#fills missing lat, long values for the sake of row completeness
cols = ['Lat', 'Lng']
df[cols] = df[cols].ffill()
df[cols] = df[cols].bfill()
#call cleaning functions
df = rc_remove(df, imei)
df = var_insert(df, imei)
#write new csvs, and performs some last cleaning steps
if 'HM2' in df:
#insert premium_count for later indexing based on individual number,
#clear out any remaining NaNs, set DateTime as index
"""can be done at a later step"""
#p_count += 1
#df.insert(0, column='DevID', value=p_count)
#df = df[pd.notnull(df['T2'])]
#df = df[pd.notnull(df['T1'])]
#write to wrt_dir
print("writing premium_", imei)
df.to_csv(wrt_dir+'premium/premium_'+imei+'.csv', index=False)
else:
#insert basic_count for later indexing on individual number,
#clear out any remaining NaNs
"""can be done in a later step"""
#b_count +=1
#df.insert(0, column='DevID', value=b_count)
#df = df[pd.notnull(df['T1'])]
#wrtite to wrt_dir
print("writing basic_", imei)
df.to_csv(wrt_dir+'basic/basic_'+imei+'.csv', index=False)
#if wrt_massive:
#call massive writing function
def rc_remove(df, imei):
"""This function removes row columns and values from the the raw_df.
It also assigns non-ambigious temp column names. In the raw files some
temperature columns are in T1, the switches to T2, etc"""
import pandas as pd
#remove rows where Voltage = Null
df = df[pd.notnull(df['Volts'])]
keep during raw conversion, but
#drop rows where voltage values are -255.00
#(UNSURE WHY THIS '-255.0' value EXISTS)
"""df = df[df['Volts'] != -255.0]
keep for raw conversion, but remove change during
manipultion and analysis"""
#keep the the columns that are at least 30% populated
df = df.dropna(thresh=len(df)*.30, axis=1)
#This removes the ambiguity in the names of T# columns of certain csvs which temperature
#measurements go into several columns, as well as removing rows with NaN values in column
vars_len = len(df.columns.values)
if vars_len <9:
df.columns.values[-4] = 'T1'
else:
df.columns.values[-4] = 'T2'
df.columns.values[-5] = 'T1'
print('during REMOVE df')
return df
def var_insert(df, imei):
"""This function shifts around columns, inserts new variables into
into the dataframe"""
import math
from datetime import timedelta
#insert calculated volumetric water content measurements #and change in volumetric water
#content and populate for two subsequent values under DateTime if their difference == 30min
#populate the dVWC column with the difference of the adjoining values under VWC.
last_col = len(df.columns.values) -1
df.insert(last_col, column='VWC', value=None)
df['VWC'] = .000494*(df['Volts']*1000)-.544
##Take the following into account during manipulation
"""df.insert(last_col+1, column='dVWC', value=None)
df.loc[(df['DateTime'].shift(-1) - df['DateTime']) == timedelta(minutes=30),
'dVWC'] = df['VWC'].shift(-1) - df['VWC']
#Then shift those values by one row remove top row holding NaN
#fill rest of NaN with preceeding real value
df['dVWC'] = df['dVWC'].shift(+1)
df = df[1:]
df['dVWC'] = df['dVWC'].ffill()"""
#insert 'month', 'week', 'day' columns... these can be set as indeces
#for used for simpler visualization, ##better to add during other algos
""" df.insert(1, column="Month", value = df['DateTime'].dt.strftime('%b'))
df.insert(2, column="Week", value = df['DateTime'].dt.week)
df.insert(3, column="Day", value = df['DateTime'].dt.day)"""
return df
raw_dir = 'C:/Users/MAR8RNG/BOSCH-PROJECTS/1-Humidity/python/raw_CSVs/'
wrt_dir = 'C:/Users/MAR8RNG/BOSCH-PROJECTS/1-Humidity/python/condensed_CSVs/'
raw_converter_main(raw_dir, wrt_dir)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment