seumasmorrison/hist_concat.py

## hist_concat.py
# -*- coding: utf-8 -*-
"""
Created on Fri Feb 06 14:38:11 2015

@author: le12jm
"""

from datetime import datetime
#from hebtools.common import wave_power
import glob
import os
import pandas as pd
import logging
import sys
logging.basicConfig(stream=sys.stderr, level=logging.INFO)

# Column names handed to pd.read_csv (names=...) for files matched by a
# pattern that does not end in 'w', i.e. the .his files.
his_columns = ['date_time', 'tp', 'dirp', 'sprp', 'tz', 'hm0', 'ti', 't1',
               'tc', 'tdw2', 'tdw1', 'tpc', 'nu','eps','qp','ss','tref','tsea',
               'bat']

# Column names handed to pd.read_csv for files matched by a pattern ending
# in 'w', i.e. the .hiw files.
hiw_columns = ['date_time','% no reception errors','hmax','tmax','h(1/10)',
               't(1/10)','h1/3','t1/3','Hav','Tav','Eps','#Waves']

# Glob patterns for the monthly files.
# '*$*.his' matches .his names that contain a '$'; it is not referenced by
# the code visible in this file.
matching_string_buoy_his = '*$*.his'
# '*[!$]}*.his' matches .his names that contain a '}' whose preceding
# character is anything but '$'.
matching_string_computed_his = '*[!$]}*.his'
matching_string_hiw = '*.hiw'
# Not referenced by the code visible in this file.
# NOTE(review): presumably a water depth for the commented-out wave_power
# import - confirm the meaning and units before relying on it.
depth = 65

# File-type key -> glob pattern.  load() iterates this mapping and uses each
# key as the HDF5 key for that file type's data.
matching_file_types = {'his':matching_string_computed_his, 'hiw':matching_string_hiw}

def strip_non_directories(path):
    """Return the names of the entries directly inside *path* that are directories.

    The names are relative to *path* and keep the order in which
    ``os.listdir`` reports them.
    """
    directory_names = []
    for entry in os.listdir(path):
        if os.path.isdir(os.path.join(path, entry)):
            directory_names.append(entry)
    return directory_names

def get_historical_dataframe(buoy_path, matching_string):
    """Build one time-indexed DataFrame from a buoy's monthly history files.

    Walks ``buoy_path/<year>/<month>/`` and reads, for every month directory,
    one file matching ``matching_string``.  The frames are concatenated,
    sorted by timestamp and pickled to ``<buoy_path>_<ext>_dataframe``.
    Thirty and sixty minute means of the numeric columns are written to
    ``<buoy_path>_<period>_<ext>.xlsx``.  ``<ext>`` is the last three
    characters of ``matching_string``.

    Parameters
    ----------
    buoy_path : str
        Directory whose sub-directories are treated as years, each of which
        holds month sub-directories.
    matching_string : str
        Glob pattern for the files to read.  A pattern ending in ``'w'``
        selects ``hiw_columns``; anything else selects ``his_columns``.

    Returns
    -------
    pandas.DataFrame or None
        The 30 minute resampled frame, or ``None`` when no month directory
        contained a matching file.
    """
    logging.info(("buoy_path", buoy_path))
    columns = hiw_columns if matching_string.endswith('w') else his_columns
    suffix = matching_string[-3:]
    # Timestamp given to rows whose date_time field is missing.
    # NOTE(review): this stretches the resampled range back to 1970 whenever
    # such a row exists - confirm whether those rows should be dropped.
    epoch = datetime(1970, 1, 1)

    def parse_date_time(raw):
        # read_csv yields float NaN for an empty field, so comparing against
        # the string 'nan' alone would let the NaN through to the slice.
        if pd.isnull(raw) or raw == 'nan':
            return epoch
        # The last five characters are dropped before parsing; the format
        # string has no directive for them.
        return datetime.strptime(raw[:-5], "%Y-%m-%dT%H:%M:%S")

    def resample_write_xlsx(df, period):
        # resample() only groups the rows, so the mean is requested
        # explicitly.  Text columns (date_time) cannot be averaged and are
        # left out.
        numeric_df = df.select_dtypes(include=['number'])
        resampled_df = numeric_df.resample(period).mean()
        resampled_df.to_excel(buoy_path + '_' + period + '_' + suffix + '.xlsx')
        return resampled_df

    df_list = []
    years = strip_non_directories(buoy_path)
    logging.info(("years", years))
    for year in years:
        year_path = os.path.join(buoy_path, year)
        for month in strip_non_directories(year_path):
            month_path = os.path.join(year_path, month)
            # sorted() makes the choice deterministic when several files match.
            matches = sorted(glob.glob(os.path.join(month_path, matching_string)))
            if not matches:
                logging.warning("No file found matching %s in %s",
                                matching_string, month_path)
                continue
            df = pd.read_csv(matches[0], names=columns)
            df.index = pd.DatetimeIndex(
                [parse_date_time(raw) for raw in df['date_time'].values])
            df_list.append(df)

    if not df_list:
        return None

    large_df = pd.concat(df_list).sort_index()
    large_df.to_pickle(buoy_path + '_' + suffix + '_dataframe')
    thirty_min_resample = resample_write_xlsx(large_df, '30Min')
    resample_write_xlsx(large_df, '60Min')
    return thirty_min_resample


def load(buoy_path):
    """Store the resampled history of every known file type in one HDF5 file.

    For each ``key -> pattern`` entry of ``matching_file_types`` the frame
    returned by ``get_historical_dataframe(buoy_path, pattern)`` is written
    to ``<buoy_path>/hist.h5`` under ``key``.  File types for which no data
    was found are logged and skipped.

    Parameters
    ----------
    buoy_path : str
        Buoy directory passed on to ``get_historical_dataframe``; the HDF5
        file is created inside it.
    """
    hdf_path = os.path.join(buoy_path, 'hist.h5')
    # items() exists on both Python 2 and 3, unlike iteritems().
    for key, value in matching_file_types.items():
        logging.info(("file type", key))
        hist_df = get_historical_dataframe(buoy_path, value)
        if hist_df is None:
            # Nothing matched this pattern, so there is nothing to store.
            logging.warning("No %s data found under %s", key, buoy_path)
            continue
        hist_df.to_hdf(hdf_path, key=key)
	# -*- coding: utf-8 -*-
	"""
	Created on Fri Feb 06 14:38:11 2015

	@author: le12jm
	"""

	from datetime import datetime
	#from hebtools.common import wave_power
	import glob
	import os
	import pandas as pd
	import logging
	import sys
	logging.basicConfig(stream=sys.stderr, level=logging.INFO)

	# Column names handed to pd.read_csv (names=...) for files matched by a
	# pattern that does not end in 'w', i.e. the .his files.
	his_columns = ['date_time', 'tp', 'dirp', 'sprp', 'tz', 'hm0', 'ti', 't1',
	               'tc', 'tdw2', 'tdw1', 'tpc', 'nu', 'eps', 'qp', 'ss', 'tref',
	               'tsea', 'bat']

	# Column names handed to pd.read_csv for files matched by a pattern
	# ending in 'w', i.e. the .hiw files.
	hiw_columns = ['date_time', '% no reception errors', 'hmax', 'tmax',
	               'h(1/10)', 't(1/10)', 'h1/3', 't1/3', 'Hav', 'Tav', 'Eps',
	               '#Waves']

	# Glob patterns for the monthly files.  The '*' wildcards are required:
	# without them the patterns only match files literally named '$.his' or
	# '<one char>}.his'.
	matching_string_buoy_his = '*$*.his'
	matching_string_computed_his = '*[!$]}*.his'
	matching_string_hiw = '*.hiw'
	# Not referenced by the code visible in this file.
	# NOTE(review): presumably a water depth for the commented-out wave_power
	# import - confirm the meaning and units before relying on it.
	depth = 65

	# File-type key -> glob pattern; load() uses each key as the HDF5 key.
	matching_file_types = {'his': matching_string_computed_his,
	                       'hiw': matching_string_hiw}

	def strip_non_directories(path):
		"""Return the names of the entries directly inside *path* that are directories.

		The names are relative to *path* and keep the order in which
		``os.listdir`` reports them.
		"""
		return [entry for entry in os.listdir(path)
		        if os.path.isdir(os.path.join(path, entry))]

	def get_historical_dataframe(buoy_path, matching_string):
		"""Build one time-indexed DataFrame from a buoy's monthly history files.

		Walks ``buoy_path/<year>/<month>/`` and reads, for every month
		directory, one file matching ``matching_string``.  The frames are
		concatenated, sorted by timestamp and pickled to
		``<buoy_path>_<ext>_dataframe``.  Thirty and sixty minute means of the
		numeric columns are written to ``<buoy_path>_<period>_<ext>.xlsx``.
		``<ext>`` is the last three characters of ``matching_string``.

		Parameters
		----------
		buoy_path : str
			Directory whose sub-directories are treated as years, each of
			which holds month sub-directories.
		matching_string : str
			Glob pattern for the files to read.  A pattern ending in ``'w'``
			selects ``hiw_columns``; anything else selects ``his_columns``.

		Returns
		-------
		pandas.DataFrame or None
			The 30 minute resampled frame, or ``None`` when no month
			directory contained a matching file.
		"""
		logging.info(("buoy_path", buoy_path))
		columns = hiw_columns if matching_string.endswith('w') else his_columns
		suffix = matching_string[-3:]
		# Timestamp given to rows whose date_time field is missing.
		# NOTE(review): this stretches the resampled range back to 1970
		# whenever such a row exists - confirm whether to drop those rows.
		epoch = datetime(1970, 1, 1)

		def parse_date_time(raw):
			# read_csv yields float NaN for an empty field, so comparing
			# against the string 'nan' alone would let the NaN through to
			# the slice.
			if pd.isnull(raw) or raw == 'nan':
				return epoch
			# The last five characters are dropped before parsing; the
			# format string has no directive for them.
			return datetime.strptime(raw[:-5], "%Y-%m-%dT%H:%M:%S")

		def resample_write_xlsx(df, period):
			# resample() only groups the rows, so the mean is requested
			# explicitly.  Text columns (date_time) cannot be averaged and
			# are left out.
			numeric_df = df.select_dtypes(include=['number'])
			resampled_df = numeric_df.resample(period).mean()
			resampled_df.to_excel(buoy_path + '_' + period + '_' + suffix + '.xlsx')
			return resampled_df

		df_list = []
		years = strip_non_directories(buoy_path)
		logging.info(("years", years))
		for year in years:
			year_path = os.path.join(buoy_path, year)
			for month in strip_non_directories(year_path):
				month_path = os.path.join(year_path, month)
				# sorted() makes the choice deterministic when several
				# files match.
				matches = sorted(glob.glob(os.path.join(month_path, matching_string)))
				if not matches:
					logging.warning("No file found matching %s in %s",
					                matching_string, month_path)
					continue
				df = pd.read_csv(matches[0], names=columns)
				df.index = pd.DatetimeIndex(
					[parse_date_time(raw) for raw in df['date_time'].values])
				df_list.append(df)

		if not df_list:
			return None

		large_df = pd.concat(df_list).sort_index()
		large_df.to_pickle(buoy_path + '_' + suffix + '_dataframe')
		thirty_min_resample = resample_write_xlsx(large_df, '30Min')
		resample_write_xlsx(large_df, '60Min')
		return thirty_min_resample


	def load(buoy_path):
		"""Store the resampled history of every known file type in one HDF5 file.

		For each ``key -> pattern`` entry of ``matching_file_types`` the frame
		returned by ``get_historical_dataframe(buoy_path, pattern)`` is
		written to ``<buoy_path>/hist.h5`` under ``key``.  File types for
		which no data was found are logged and skipped.

		Parameters
		----------
		buoy_path : str
			Buoy directory passed on to ``get_historical_dataframe``; the
			HDF5 file is created inside it.
		"""
		hdf_path = os.path.join(buoy_path, 'hist.h5')
		# items() exists on both Python 2 and 3, unlike iteritems().
		for key, value in matching_file_types.items():
			logging.info(("file type", key))
			hist_df = get_historical_dataframe(buoy_path, value)
			if hist_df is None:
				# Nothing matched this pattern, so there is nothing to store.
				logging.warning("No %s data found under %s", key, buoy_path)
				continue
			hist_df.to_hdf(hdf_path, key=key)