seumasmorrison/Concatenating_his_hiw_files_from_Datawell_MKIII_directory_data_and_generating_Excel_Workbooks.ipynb

## Concatenating_his_hiw_files_from_Datawell_MKIII_directory_data_and_generating_Excel_Workbooks.ipynb
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "#Updated script for Python 3 based on https://gist.github.com/seumasmorrison/1abaa2308044814167a9\n",
    "from datetime import datetime\n",
    "import glob\n",
    "import os\n",
    "import pandas as pd\n",
    "\n",
    "# Column names handed to pd.read_csv for the .his files (19 fields).\n",
    "his_columns = [\n",
    "    'date_time', 'tp', 'dirp', 'sprp', 'tz', 'hm0', 'ti', 't1', 'tc', 'tdw2',\n",
    "    'tdw1', 'tpc', 'nu', 'eps', 'qp', 'ss', 'tref', 'tsea', 'bat',\n",
    "]\n",
    "\n",
    "# Column names handed to pd.read_csv for the .hiw files (12 fields).\n",
    "hiw_columns = [\n",
    "    'date_time', '% no reception errors', 'hmax', 'tmax', 'h(1/10)', 't(1/10)',\n",
    "    'h1/3', 't1/3', 'Hav', 'Tav', 'Eps', '#Waves',\n",
    "]\n",
    "\n",
    "# glob patterns, matched against the file names inside a month directory.\n",
    "matching_string_hiw = '*.hiw'\n",
    "# .his names that contain a '$' (not used by matching_file_types below).\n",
    "matching_string_buoy_his = '*$*.his'\n",
    "# .his names in which some '}' is preceded by a character other than '$'.\n",
    "matching_string_computed_his = '*[!$]}*.his'\n",
    "\n",
    "# File type -> glob pattern; one entry is processed per file type.\n",
    "matching_file_types = {\n",
    "    'his': matching_string_computed_his,\n",
    "    'hiw': matching_string_hiw,\n",
    "}\n",
    "\n",
    "def strip_non_directories(path):\n",
    "    \"\"\"Return the names of the entries in `path` that are directories.\n",
    "\n",
    "    The names are relative to `path` and keep the order of os.listdir.\n",
    "    \"\"\"\n",
    "    sub_directories = []\n",
    "    for entry in os.listdir(path):\n",
    "        if os.path.isdir(os.path.join(path, entry)):\n",
    "            sub_directories.append(entry)\n",
    "    return sub_directories\n",
    "\n",
    "def get_historical_dataframe(buoy_path, matching_string):\n",
    "    \"\"\"Concatenate one file per month directory into a single DataFrame.\n",
    "\n",
    "    Walks buoy_path/year/month/, reads the first file in each month\n",
    "    directory that matches the glob pattern `matching_string` and indexes\n",
    "    its rows by the timestamp parsed from the 'date_time' column.\n",
    "\n",
    "    Side effects: the combined, time-sorted frame is pickled to\n",
    "    buoy_path + '_' + ext + '_dataframe', and its 30 and 60 minute means are\n",
    "    written to buoy_path + '_' + period + '_' + ext + '.xlsx', where ext is\n",
    "    the last three characters of `matching_string`.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    buoy_path : str\n",
    "        Directory holding one sub-directory per year.\n",
    "    matching_string : str\n",
    "        glob pattern for the monthly file; a pattern ending in 'w' selects\n",
    "        hiw_columns, anything else his_columns.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    pandas.DataFrame or None\n",
    "        The 30 minute mean of the numeric columns, or None when no month\n",
    "        directory contained a matching file.\n",
    "    \"\"\"\n",
    "    print(\"buoy_path\", buoy_path)\n",
    "    extension = matching_string[-3:]\n",
    "    # The column layout depends only on the pattern, so choose it once.\n",
    "    columns = hiw_columns if matching_string[-1] == 'w' else his_columns\n",
    "    missing_timestamp = datetime(1970, 1, 1)\n",
    "\n",
    "    def parse_date_time(value):\n",
    "        # read_csv yields float NaN (never the string 'nan') for an empty\n",
    "        # field; such rows are parked at the epoch so the index stays a\n",
    "        # valid DatetimeIndex instead of raising on the slice below.\n",
    "        if pd.isnull(value):\n",
    "            return missing_timestamp\n",
    "        # The last five characters are dropped before parsing (presumably a\n",
    "        # '.000Z'-style suffix - TODO confirm against a sample file).\n",
    "        return datetime.strptime(value[:-5], \"%Y-%m-%dT%H:%M:%S\")\n",
    "\n",
    "    df_list = []\n",
    "    years = strip_non_directories(buoy_path)\n",
    "    print(\"years\", years)\n",
    "    for year in years:\n",
    "        year_path = os.path.join(buoy_path, year)\n",
    "        for month in strip_non_directories(year_path):\n",
    "            matches = glob.glob(os.path.join(year_path, month, matching_string))\n",
    "            if not matches:\n",
    "                print(\"No file found matching\", matching_string)\n",
    "                continue\n",
    "            # Only the first match of each month directory is read.\n",
    "            df = pd.read_csv(matches[0], names=columns)\n",
    "            df.index = pd.DatetimeIndex(\n",
    "                [parse_date_time(value) for value in df['date_time'].values])\n",
    "            df_list.append(df)\n",
    "    if not df_list:\n",
    "        return None\n",
    "    large_df = pd.concat(df_list).sort_index()\n",
    "    large_df.to_pickle(buoy_path + '_' + extension + '_dataframe')\n",
    "\n",
    "    def resample_write_xlsx(df, period):\n",
    "        # resample() on its own returns a deferred Resampler that cannot be\n",
    "        # written to Excel, so aggregate explicitly. Non-numeric columns\n",
    "        # (e.g. the raw 'date_time' strings) cannot be averaged and are\n",
    "        # left out.\n",
    "        numeric_df = df.select_dtypes(include=['number'])\n",
    "        resampled_df = numeric_df.resample(period).mean()\n",
    "        resampled_df.to_excel(buoy_path + '_' + period + '_' + extension + '.xlsx')\n",
    "        return resampled_df\n",
    "\n",
    "    thirty_min_resample = resample_write_xlsx(large_df, '30Min')\n",
    "    resample_write_xlsx(large_df, '60Min')\n",
    "    return thirty_min_resample\n",
    "\n",
    "\n",
    "def load(buoy_path):\n",
    "    \"\"\"Build the frame for every file type and store it in hist.h5.\n",
    "\n",
    "    For each entry of matching_file_types the frame returned by\n",
    "    get_historical_dataframe is written to hist.h5 inside `buoy_path`,\n",
    "    using the file type ('his' or 'hiw') as the HDF key. File types for\n",
    "    which no data was found are skipped.\n",
    "    \"\"\"\n",
    "    hdf_path = os.path.join(buoy_path, 'hist.h5')\n",
    "    for key, value in matching_file_types.items():\n",
    "        print(key)\n",
    "        hist_df = get_historical_dataframe(buoy_path, value)\n",
    "        if hist_df is None:\n",
    "            # Nothing matched this pattern, so there is no frame to store.\n",
    "            print(\"No data found for\", key)\n",
    "            continue\n",
    "        hist_df.to_hdf(hdf_path, key=key)"
   ]
  }
 ],
 "metadata": {
  "anaconda-cloud": {},
  "kernelspec": {
   "display_name": "Python [default]",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}
	{
	"cells": [
	{
	"cell_type": "code",
	"execution_count": 7,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"#Updated script for Python 3 based on https://gist.github.com/seumasmorrison/1abaa2308044814167a9\n",
	"from datetime import datetime\n",
	"import glob\n",
	"import os\n",
	"import pandas as pd\n",
	"\n",
	"# Column names handed to pd.read_csv for the .his files (19 fields).\n",
	"his_columns = [\n",
	"    'date_time', 'tp', 'dirp', 'sprp', 'tz', 'hm0', 'ti', 't1', 'tc', 'tdw2',\n",
	"    'tdw1', 'tpc', 'nu', 'eps', 'qp', 'ss', 'tref', 'tsea', 'bat',\n",
	"]\n",
	"\n",
	"# Column names handed to pd.read_csv for the .hiw files (12 fields).\n",
	"hiw_columns = [\n",
	"    'date_time', '% no reception errors', 'hmax', 'tmax', 'h(1/10)', 't(1/10)',\n",
	"    'h1/3', 't1/3', 'Hav', 'Tav', 'Eps', '#Waves',\n",
	"]\n",
	"\n",
	"# glob patterns, matched against the file names inside a month directory.\n",
	"# The '*' wildcards are required: without them a pattern only matches a\n",
	"# one or two character file stem and never finds a monthly file.\n",
	"matching_string_buoy_his = '*$*.his'\n",
	"matching_string_computed_his = '*[!$]}*.his'\n",
	"matching_string_hiw = '*.hiw'\n",
	"\n",
	"# File type -> glob pattern; one entry is processed per file type.\n",
	"matching_file_types = {'his': matching_string_computed_his,\n",
	"                       'hiw': matching_string_hiw}\n",
	"\n",
	"def strip_non_directories(path):\n",
	"    \"\"\"Return the names of the entries in `path` that are directories.\n",
	"\n",
	"    The names are relative to `path` and keep the order of os.listdir.\n",
	"    \"\"\"\n",
	"    sub_directories = []\n",
	"    for entry in os.listdir(path):\n",
	"        if os.path.isdir(os.path.join(path, entry)):\n",
	"            sub_directories.append(entry)\n",
	"    return sub_directories\n",
	"\n",
	"def get_historical_dataframe(buoy_path, matching_string):\n",
	"    \"\"\"Concatenate one file per month directory into a single DataFrame.\n",
	"\n",
	"    Walks buoy_path/year/month/, reads the first file in each month\n",
	"    directory that matches the glob pattern `matching_string` and indexes\n",
	"    its rows by the timestamp parsed from the 'date_time' column.\n",
	"\n",
	"    Side effects: the combined, time-sorted frame is pickled to\n",
	"    buoy_path + '_' + ext + '_dataframe', and its 30 and 60 minute means are\n",
	"    written to buoy_path + '_' + period + '_' + ext + '.xlsx', where ext is\n",
	"    the last three characters of `matching_string`.\n",
	"\n",
	"    Parameters\n",
	"    ----------\n",
	"    buoy_path : str\n",
	"        Directory holding one sub-directory per year.\n",
	"    matching_string : str\n",
	"        glob pattern for the monthly file; a pattern ending in 'w' selects\n",
	"        hiw_columns, anything else his_columns.\n",
	"\n",
	"    Returns\n",
	"    -------\n",
	"    pandas.DataFrame or None\n",
	"        The 30 minute mean of the numeric columns, or None when no month\n",
	"        directory contained a matching file.\n",
	"    \"\"\"\n",
	"    print(\"buoy_path\", buoy_path)\n",
	"    extension = matching_string[-3:]\n",
	"    # The column layout depends only on the pattern, so choose it once.\n",
	"    columns = hiw_columns if matching_string[-1] == 'w' else his_columns\n",
	"    missing_timestamp = datetime(1970, 1, 1)\n",
	"\n",
	"    def parse_date_time(value):\n",
	"        # read_csv yields float NaN (never the string 'nan') for an empty\n",
	"        # field; such rows are parked at the epoch so the index stays a\n",
	"        # valid DatetimeIndex instead of raising on the slice below.\n",
	"        if pd.isnull(value):\n",
	"            return missing_timestamp\n",
	"        # The last five characters are dropped before parsing (presumably a\n",
	"        # '.000Z'-style suffix - TODO confirm against a sample file).\n",
	"        return datetime.strptime(value[:-5], \"%Y-%m-%dT%H:%M:%S\")\n",
	"\n",
	"    df_list = []\n",
	"    years = strip_non_directories(buoy_path)\n",
	"    print(\"years\", years)\n",
	"    for year in years:\n",
	"        year_path = os.path.join(buoy_path, year)\n",
	"        for month in strip_non_directories(year_path):\n",
	"            matches = glob.glob(os.path.join(year_path, month, matching_string))\n",
	"            if not matches:\n",
	"                print(\"No file found matching\", matching_string)\n",
	"                continue\n",
	"            # Only the first match of each month directory is read.\n",
	"            df = pd.read_csv(matches[0], names=columns)\n",
	"            df.index = pd.DatetimeIndex(\n",
	"                [parse_date_time(value) for value in df['date_time'].values])\n",
	"            df_list.append(df)\n",
	"    if not df_list:\n",
	"        return None\n",
	"    large_df = pd.concat(df_list).sort_index()\n",
	"    large_df.to_pickle(buoy_path + '_' + extension + '_dataframe')\n",
	"\n",
	"    def resample_write_xlsx(df, period):\n",
	"        # resample() on its own returns a deferred Resampler that cannot be\n",
	"        # written to Excel, so aggregate explicitly. Non-numeric columns\n",
	"        # (e.g. the raw 'date_time' strings) cannot be averaged and are\n",
	"        # left out.\n",
	"        numeric_df = df.select_dtypes(include=['number'])\n",
	"        resampled_df = numeric_df.resample(period).mean()\n",
	"        resampled_df.to_excel(buoy_path + '_' + period + '_' + extension + '.xlsx')\n",
	"        return resampled_df\n",
	"\n",
	"    thirty_min_resample = resample_write_xlsx(large_df, '30Min')\n",
	"    resample_write_xlsx(large_df, '60Min')\n",
	"    return thirty_min_resample\n",
	"\n",
	"\n",
	"def load(buoy_path):\n",
	"    \"\"\"Build the frame for every file type and store it in hist.h5.\n",
	"\n",
	"    For each entry of matching_file_types the frame returned by\n",
	"    get_historical_dataframe is written to hist.h5 inside `buoy_path`,\n",
	"    using the file type ('his' or 'hiw') as the HDF key. File types for\n",
	"    which no data was found are skipped.\n",
	"    \"\"\"\n",
	"    hdf_path = os.path.join(buoy_path, 'hist.h5')\n",
	"    for key, value in matching_file_types.items():\n",
	"        print(key)\n",
	"        hist_df = get_historical_dataframe(buoy_path, value)\n",
	"        if hist_df is None:\n",
	"            # Nothing matched this pattern, so there is no frame to store.\n",
	"            print(\"No data found for\", key)\n",
	"            continue\n",
	"        hist_df.to_hdf(hdf_path, key=key)"
	]
	}
	],
	"metadata": {
	"anaconda-cloud": {},
	"kernelspec": {
	"display_name": "Python [default]",
	"language": "python",
	"name": "python3"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.5.2"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 1
	}