@n-lavrenko
Last active May 22, 2020 14:45
Python script that reads .csv files containing historical bar and tick data (in separate files), automatically detects winter weeks from the data itself, shifts the timezone, removes weekend rows, subtracts one hour during winter weeks, and writes the updated data to new files.
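For reference, the input files are expected in the column layout defined by bars_cols_names / ticks_cols_names below, with dates as %Y%m%d and times as %H:%M:%S. The values in these sample rows are made up; only the shape matters:

20200102,00:00:00,0.70132,0.70145,0.70121,0.70139,87,0,2   (bar row: date, time, open, high, low, close, tick_vol, vol, spread)
20200102,00:00:00,0.70132,0.70135                          (tick row: date, time, bid, ask)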
# I use this script to make Dukascopy historical data identical to the data of the FxClub forex broker.
# That lets me run deep historical optimization in MetaTrader 5 and use the results in real trading on FxClub.
# You can edit the variables below (and the code itself) to shape the Dukascopy historical data as you need.
# Good luck.
import time

import pandas as pd

symbol = 'AUDUSD'
# Raw strings avoid accidental backslash escapes in Windows paths.
path_to_bars_file = rf'D:\TickstoryDownloads\{symbol}\{symbol}_mt5_bars.csv'
path_to_bars_edited_file = rf'D:\TickstoryDownloads\{symbol}\{symbol}_bars_ducas_clean_UTC+3.csv'
path_to_tick_file = rf'D:\TickstoryDownloads\{symbol}\{symbol}_mt5_ticks.csv'
path_to_ticks_edited_file = rf'D:\TickstoryDownloads\{symbol}\{symbol}_ticks_ducas_clean_UTC+3.csv'

bars_cols_names = ['date', 'time', 'open', 'high', 'low', 'close', 'tick_vol', 'vol', 'spread']
bars_dtype = {'date': object, 'open': str, 'high': str, 'low': str, 'close': str, 'tick_vol': str, 'vol': str}
ticks_cols_names = ['date', 'time', 'bid', 'ask']
ticks_dtype = {'date': object, 'bid': str, 'ask': str}

years_count = 14
bars_chunk_size = 100000
ticks_chunk_size = 1000000
increase_hours_timezone = 3
is_print_technical_info = False
def get_reader(path, cols, d_type, chunk_size):
    # Stream the CSV in chunks so files of many gigabytes never have to fit in RAM.
    return pd.read_csv(path, dtype=d_type, iterator=True, sep=',', header=None, names=cols,
                       usecols=range(0, len(cols)), chunksize=chunk_size)
def get_winter_weeks():
    # Weeks that still contain weekend bars after the +3h shift are treated as
    # "winter" weeks; they get an extra -1h correction in read_edit_write().
    weeks_to_change = []
    reader = get_reader(path_to_bars_file, bars_cols_names, bars_dtype, bars_chunk_size)
    for chunk in reader:
        chunk['date_time'] = pd.to_datetime(chunk['date'] + chunk['time'], format='%Y%m%d%H:%M:%S')
        chunk['date_time'] = chunk['date_time'] + pd.Timedelta(hours=increase_hours_timezone)
        chunk['dayofweek'] = chunk['date_time'].dt.dayofweek
        chunk['is_day_off'] = chunk['dayofweek'] > 4
        # .dt.week was removed in pandas 2.0; isocalendar().week is the replacement.
        chunk['week'] = chunk['date_time'].dt.isocalendar().week
        chunk['year_week'] = chunk['date_time'].dt.year.astype('str') + '/' + chunk['week'].astype('str')
        unique = chunk.loc[chunk['is_day_off'], 'year_week'].unique()
        weeks_to_change.extend(unique)
    # dict.fromkeys de-duplicates while preserving first-seen order.
    result = list(dict.fromkeys(weeks_to_change))
    if is_print_technical_info:
        print(result)
    return result
def date_exclude_creator():
    # Build per-year [start, end] boundaries for the New Year's Day and
    # Christmas windows that will be cut from the data.
    start_year = 2007
    new_year_date_start = '01-01 00:00:00'
    new_year_date_end = '01-01 23:59:59'
    christmas_date_start = '12-24 22:00:00'
    christmas_date_end = '12-26 02:59:59'
    _all_new_years_start = []
    _all_new_years_end = []
    _all_christmases_start = []
    _all_christmases_end = []
    for d in range(years_count):
        _all_new_years_start.append(str(start_year + d) + '-' + new_year_date_start)
        _all_new_years_end.append(str(start_year + d) + '-' + new_year_date_end)
        _all_christmases_start.append(str(start_year + d) + '-' + christmas_date_start)
        _all_christmases_end.append(str(start_year + d) + '-' + christmas_date_end)
    if is_print_technical_info:
        print(_all_new_years_start, _all_new_years_end, _all_christmases_start, _all_christmases_end)
    return _all_new_years_start, _all_new_years_end, _all_christmases_start, _all_christmases_end
def filter_dates(chunk):
    # Drop rows that fall inside any of the yearly New Year / Christmas windows.
    for index in range(years_count):
        y_s = pd.to_datetime(all_new_years_start[index])
        y_e = pd.to_datetime(all_new_years_end[index])
        x_s = pd.to_datetime(all_christmases_start[index])
        x_e = pd.to_datetime(all_christmases_end[index])
        mask_year = (chunk['date_time'] >= y_s) & (chunk['date_time'] <= y_e)
        mask_xmas = (chunk['date_time'] >= x_s) & (chunk['date_time'] <= x_e)
        chunk = chunk[~(mask_year | mask_xmas)]
    return chunk
def read_edit_write(is_bars):
    if is_bars:
        reader = get_reader(path_to_bars_file, bars_cols_names, bars_dtype, bars_chunk_size)
        cols_names = bars_cols_names
        path_to_write = path_to_bars_edited_file
    else:
        reader = get_reader(path_to_tick_file, ticks_cols_names, ticks_dtype, ticks_chunk_size)
        cols_names = ticks_cols_names
        path_to_write = path_to_ticks_edited_file
    # newline='' stops Python from translating '\n' into '\r\n' on Windows,
    # which would otherwise double the line endings written by to_csv.
    with open(path_to_write, 'w', newline='') as out_file:
        for chunk in reader:
            chunk['date_time'] = pd.to_datetime(chunk['date'] + chunk['time'], format='%Y%m%d%H:%M:%S')
            chunk['date_time'] = chunk['date_time'] + pd.Timedelta(hours=increase_hours_timezone)
            chunk['dayofweek'] = chunk['date_time'].dt.dayofweek
            chunk['week'] = chunk['date_time'].dt.isocalendar().week
            chunk['year_week'] = chunk['date_time'].dt.year.astype('str') + '/' + chunk['week'].astype('str')
            # Subtract one extra hour in winter weeks; other rows become NaT here
            # and keep their shifted time via fillna below.
            winter = chunk['year_week'].isin(unique_weeks)
            chunk['updated_date_time'] = chunk.loc[winter, 'date_time'] - pd.Timedelta(hours=1)
            chunk['updated_date_time'] = chunk['updated_date_time'].fillna(chunk['date_time'])
            # Drop weekend rows (dayofweek 5 = Saturday, 6 = Sunday).
            chunk = chunk[chunk['dayofweek'] < 5]
            # Drop the New Year's Day and Christmas windows; .copy() avoids
            # SettingWithCopyWarning on the assignments below.
            chunk = filter_dates(chunk).copy()
            chunk['date'] = chunk['updated_date_time'].dt.strftime('%Y%m%d')
            chunk['time'] = chunk['updated_date_time'].dt.strftime('%H:%M:%S')
            # lineterminator was named line_terminator before pandas 2.0.
            chunk[cols_names].to_csv(out_file, sep=',', header=False, lineterminator='\n', index=False)
def edit_history():
    start_time = time.perf_counter()
    read_edit_write(True)
    print(f'Bars of {symbol} are done! Time spent: {(time.perf_counter() - start_time) / 60:0.2f} minutes')
    start_time = time.perf_counter()
    read_edit_write(False)
    print(f'Ticks of {symbol} are done! Time spent: {(time.perf_counter() - start_time) / 3600:0.2f} hours')


print(f'Editing bars and ticks of {symbol} has started.')
(all_new_years_start, all_new_years_end, all_christmases_start, all_christmases_end) = date_exclude_creator()
unique_weeks = get_winter_weeks()
edit_history()
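A quick sanity check worth running after the script finishes (not part of the original script; it simply re-reads the edited bars file in chunks and asserts that the weekend filter worked):

import pandas as pd

def check_no_weekends(path, cols):
    # Re-read the edited file chunk by chunk and verify no Saturday/Sunday rows remain.
    for chunk in pd.read_csv(path, sep=',', header=None, names=cols, dtype=str, chunksize=100000):
        dt = pd.to_datetime(chunk['date'] + chunk['time'], format='%Y%m%d%H:%M:%S')
        assert (dt.dt.dayofweek < 5).all(), 'weekend rows found'
    print('OK: no weekend rows in', path)

check_no_weekends(path_to_bars_edited_file, bars_cols_names)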
@n-lavrenko (Author)
This script uses the pandas library and can read huge files: for example, I read one file that is 11 GB.
P.S. I also cut some data on 01.01 and on 25-26 December of each year (see the date_exclude_creator() function).
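For anyone wondering how an 11 GB file fits in memory: with chunksize=, pandas only materializes one chunk of rows at a time, so file size is effectively unbounded. A minimal standalone sketch of that pattern (the filename here is made up):

import pandas as pd

total_rows = 0
# Each iteration yields an ordinary DataFrame of at most 1,000,000 rows;
# the rest of the file stays on disk until its chunk is requested.
for chunk in pd.read_csv('huge_ticks.csv', sep=',', header=None,
                         names=['date', 'time', 'bid', 'ask'], dtype=str, chunksize=1000000):
    total_rows += len(chunk)
print(total_rows, 'rows processed without loading the whole file')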
