rdmtinez/condenser

## condenser
#import os
#import pandas as pd
#import datetime as dt

####you want to optimize the conversion and create a summary report####

def raw_converter_main(raw_dir, wrt_dir, wrt_massisve=False):
    """This function is designed to condense the raw csv files, made by the
    'download_csv.py' function held by Christian Glunk and the Deepfiled
    Connect team, into a more maneagable form"""

    #print everything ('all') to console without print() statement
    from IPython.core.interactiveshell import InteractiveShell
    InteractiveShell.ast_node_interactivity = 'all'

    import os
    import pandas as pd
    import datetime as dt

    os.chdir(raw_dir)
    csv_list = os.listdir(raw_dir)

    #relevant variables in raw csvs to extract
    extract_vars = ['AN1', 'DTC', 'HM1', 'HM2',
                    'HM3', 'HM4', 'T1', 'T2',
                    'T3', 'T4', 'latitude',
                    'longitude', 'measurement_time']

    #for DevID column indexing
    b_count = 0
    p_count = 0

    for csv in csv_list[0:2]:


        with open(csv) as f:

            #import csv, parse datetime and set as index
            raw_df = pd.read_csv(f)
                #parse_dates=True
            #draw relevant columns from rawdf
            df = raw_df[extract_vars]

            #Numerizing instruments
            imei = csv[7:-4]
            df.insert(0, column='IMEI', value=imei)

            #renames initial columns columns
            new_names = {'AN1':'Volts', 'DTC':'DigiTemp', 'measurement_time':'DateTime',
                        'longitude':'Lng', 'latitude':'Lat'}

            df = df.rename(columns=new_names)
            print("During OPEN_csv_df")
            print(df)

            #DateTime as datetime
            df['DateTime'] = pd.to_datetime(df['DateTime'])


            #fills missing lat, long values for the sake of row completeness
            cols = ['Lat', 'Lng']
            df[cols] = df[cols].ffill()
            df[cols] = df[cols].bfill()

            #call cleaning functions
            df = rc_remove(df, imei)
            df = var_insert(df, imei)


            #write new csvs, and performs some last cleaning steps
            if 'HM2' in df:
                #insert premium_count for later indexing based on individual number,
                #clear out any remaining NaNs, set DateTime as index
                """can be done at a later step"""
                #p_count += 1
                #df.insert(0, column='DevID', value=p_count)

                #df = df[pd.notnull(df['T2'])]
                #df = df[pd.notnull(df['T1'])]

                #write to wrt_dir
                print("writing premium_", imei)
                df.to_csv(wrt_dir+'premium/premium_'+imei+'.csv', index=False)


            else:
                #insert basic_count for later indexing on individual number,
                #clear out any remaining NaNs
                """can be done in a later step"""
                #b_count +=1
                #df.insert(0, column='DevID', value=b_count)

                #df = df[pd.notnull(df['T1'])]

                #wrtite to wrt_dir
                print("writing basic_", imei)
                df.to_csv(wrt_dir+'basic/basic_'+imei+'.csv', index=False)

            #if wrt_massive:
            #call massive writing function


def rc_remove(df, imei):
    """This function removes row columns and values from the the raw_df.
    It also assigns non-ambigious temp column names. In the raw files some
    temperature columns are in T1, the switches to T2, etc"""

    import pandas as pd

    #remove rows where Voltage = Null
    df = df[pd.notnull(df['Volts'])]
    keep during raw conversion, but

    #drop rows where voltage values are -255.00
    #(UNSURE WHY THIS '-255.0' value EXISTS)

    """df = df[df['Volts'] != -255.0]
    keep for raw conversion, but remove change during
    manipultion and analysis"""

    #keep the the columns that are at least 30% populated
    df = df.dropna(thresh=len(df)*.30, axis=1)

    #This removes the ambiguity in the names of T# columns of certain csvs which temperature
    #measurements go into several columns, as well as removing rows with NaN values in column
    vars_len = len(df.columns.values)

    if vars_len <9:
        df.columns.values[-4] = 'T1'

    else:
        df.columns.values[-4] = 'T2'
        df.columns.values[-5] = 'T1'

    print('during REMOVE df')
    return df


def var_insert(df, imei):
    """This function shifts around columns, inserts new variables into
    into the dataframe"""

    import math
    from datetime import timedelta

    #insert calculated volumetric water content measurements #and change in volumetric water
    #content and populate for two subsequent values under DateTime if their difference == 30min
    #populate the dVWC column with the difference of the adjoining values under VWC.
    last_col = len(df.columns.values) -1
    df.insert(last_col, column='VWC', value=None)
    df['VWC'] = .000494*(df['Volts']*1000)-.544

    ##Take the following into account during manipulation
    """df.insert(last_col+1, column='dVWC', value=None)
    df.loc[(df['DateTime'].shift(-1) - df['DateTime']) == timedelta(minutes=30),
           'dVWC'] = df['VWC'].shift(-1) - df['VWC']


    #Then shift those values by one row remove top row holding NaN
    #fill rest of NaN with preceeding real value
    df['dVWC'] = df['dVWC'].shift(+1)
    df = df[1:]
    df['dVWC'] = df['dVWC'].ffill()"""


    #insert 'month', 'week', 'day' columns... these can be set as indeces
    #for used for simpler visualization, ##better to add during other algos
    """ df.insert(1, column="Month", value = df['DateTime'].dt.strftime('%b'))
    df.insert(2, column="Week", value = df['DateTime'].dt.week)
    df.insert(3, column="Day", value = df['DateTime'].dt.day)"""

    return df

raw_dir = 'C:/Users/MAR8RNG/BOSCH-PROJECTS/1-Humidity/python/raw_CSVs/'
wrt_dir = 'C:/Users/MAR8RNG/BOSCH-PROJECTS/1-Humidity/python/condensed_CSVs/'

raw_converter_main(raw_dir, wrt_dir)
	#import os
	#import pandas as pd
	#import datetime as dt

	####you want to optimize the conversion and create a summary report####

	def raw_converter_main(raw_dir, wrt_dir, wrt_massisve=False):
	"""This function is designed to condense the raw csv files, made by the
	'download_csv.py' function held by Christian Glunk and the Deepfiled
	Connect team, into a more maneagable form"""

	#print everything ('all') to console without print() statement
	from IPython.core.interactiveshell import InteractiveShell
	InteractiveShell.ast_node_interactivity = 'all'

	import os
	import pandas as pd
	import datetime as dt

	os.chdir(raw_dir)
	csv_list = os.listdir(raw_dir)

	#relevant variables in raw csvs to extract
	extract_vars = ['AN1', 'DTC', 'HM1', 'HM2',
	'HM3', 'HM4', 'T1', 'T2',
	'T3', 'T4', 'latitude',
	'longitude', 'measurement_time']

	#for DevID column indexing
	b_count = 0
	p_count = 0

	for csv in csv_list[0:2]:


	with open(csv) as f:

	#import csv, parse datetime and set as index
	raw_df = pd.read_csv(f)
	#parse_dates=True
	#draw relevant columns from rawdf
	df = raw_df[extract_vars]

	#Numerizing instruments
	imei = csv[7:-4]
	df.insert(0, column='IMEI', value=imei)

	#renames initial columns columns
	new_names = {'AN1':'Volts', 'DTC':'DigiTemp', 'measurement_time':'DateTime',
	'longitude':'Lng', 'latitude':'Lat'}

	df = df.rename(columns=new_names)
	print("During OPEN_csv_df")
	print(df)

	#DateTime as datetime
	df['DateTime'] = pd.to_datetime(df['DateTime'])


	#fills missing lat, long values for the sake of row completeness
	cols = ['Lat', 'Lng']
	df[cols] = df[cols].ffill()
	df[cols] = df[cols].bfill()

	#call cleaning functions
	df = rc_remove(df, imei)
	df = var_insert(df, imei)



	#write new csvs, and performs some last cleaning steps
	if 'HM2' in df:
	#insert premium_count for later indexing based on individual number,
	#clear out any remaining NaNs, set DateTime as index
	"""can be done at a later step"""
	#p_count += 1
	#df.insert(0, column='DevID', value=p_count)

	#df = df[pd.notnull(df['T2'])]
	#df = df[pd.notnull(df['T1'])]

	#write to wrt_dir
	print("writing premium_", imei)
	df.to_csv(wrt_dir+'premium/premium_'+imei+'.csv', index=False)


	else:
	#insert basic_count for later indexing on individual number,
	#clear out any remaining NaNs
	"""can be done in a later step"""
	#b_count +=1
	#df.insert(0, column='DevID', value=b_count)

	#df = df[pd.notnull(df['T1'])]

	#wrtite to wrt_dir
	print("writing basic_", imei)
	df.to_csv(wrt_dir+'basic/basic_'+imei+'.csv', index=False)

	#if wrt_massive:
	#call massive writing function




	def rc_remove(df, imei):
	"""This function removes row columns and values from the the raw_df.
	It also assigns non-ambigious temp column names. In the raw files some
	temperature columns are in T1, the switches to T2, etc"""

	import pandas as pd

	#remove rows where Voltage = Null
	df = df[pd.notnull(df['Volts'])]
	keep during raw conversion, but

	#drop rows where voltage values are -255.00
	#(UNSURE WHY THIS '-255.0' value EXISTS)

	"""df = df[df['Volts'] != -255.0]
	keep for raw conversion, but remove change during
	manipultion and analysis"""

	#keep the the columns that are at least 30% populated
	df = df.dropna(thresh=len(df)*.30, axis=1)

	#This removes the ambiguity in the names of T# columns of certain csvs which temperature
	#measurements go into several columns, as well as removing rows with NaN values in column
	vars_len = len(df.columns.values)

	if vars_len <9:
	df.columns.values[-4] = 'T1'

	else:
	df.columns.values[-4] = 'T2'
	df.columns.values[-5] = 'T1'

	print('during REMOVE df')
	return df



	def var_insert(df, imei):
	"""This function shifts around columns, inserts new variables into
	into the dataframe"""

	import math
	from datetime import timedelta

	#insert calculated volumetric water content measurements #and change in volumetric water
	#content and populate for two subsequent values under DateTime if their difference == 30min
	#populate the dVWC column with the difference of the adjoining values under VWC.
	last_col = len(df.columns.values) -1
	df.insert(last_col, column='VWC', value=None)
	df['VWC'] = .000494(df['Volts']1000)-.544

	##Take the following into account during manipulation
	"""df.insert(last_col+1, column='dVWC', value=None)
	df.loc[(df['DateTime'].shift(-1) - df['DateTime']) == timedelta(minutes=30),
	'dVWC'] = df['VWC'].shift(-1) - df['VWC']


	#Then shift those values by one row remove top row holding NaN
	#fill rest of NaN with preceeding real value
	df['dVWC'] = df['dVWC'].shift(+1)
	df = df[1:]
	df['dVWC'] = df['dVWC'].ffill()"""


	#insert 'month', 'week', 'day' columns... these can be set as indeces
	#for used for simpler visualization, ##better to add during other algos
	""" df.insert(1, column="Month", value = df['DateTime'].dt.strftime('%b'))
	df.insert(2, column="Week", value = df['DateTime'].dt.week)
	df.insert(3, column="Day", value = df['DateTime'].dt.day)"""

	return df

	raw_dir = 'C:/Users/MAR8RNG/BOSCH-PROJECTS/1-Humidity/python/raw_CSVs/'
	wrt_dir = 'C:/Users/MAR8RNG/BOSCH-PROJECTS/1-Humidity/python/condensed_CSVs/'

	raw_converter_main(raw_dir, wrt_dir)