Last active
May 22, 2020 14:45
-
-
Save n-lavrenko/dc2bc42b20be5d6d5286fc28b9a9bcf1 to your computer and use it in GitHub Desktop.
Python script that read .csv files that contain historical data of bars and ticks (in separate files), auto define winter weeks, based on data in file; change timezone, remove historical data on Saturdays and decrease 1 hour at winter weeks and write updated data to the new file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# I use this script for make historical data of Ducascopy identical with FxClub forex broker. | |
# Then I can make deep historical optimization in MetaTrader 5 and use results in real trading on FxClub. | |
# You can edit some variables in code and code itself to edit Ducascopy historical data to make data as you need. | |
# Good luck. | |
import time | |
import pandas as pd | |
symbol = 'AUDUSD' | |
path_to_bars_file = f'D:\TickstoryDownloads\\{symbol}\\{symbol}_mt5_bars.csv' | |
path_to_bars_edited_file = f'D:\TickstoryDownloads\\{symbol}\\{symbol}_bars_ducas_clean_UTC+3.csv' | |
path_to_tick_file = f'D:\TickstoryDownloads\\{symbol}\\{symbol}_mt5_ticks.csv' | |
path_to_ticks_edited_file = f'D:\TickstoryDownloads\\{symbol}\\{symbol}_ticks_ducas_clean_UTC+3.csv' | |
bars_cols_names = ['date', 'time', 'open', 'high', 'low', 'close', 'tick_vol', 'vol', 'spread'] | |
bars_dtype = {'date': object, 'open': str, 'high': str, 'low': str, 'close': str, 'tick_vol': str, 'vol': str} | |
ticks_cols_names = ['date', 'time', 'bid', 'ask'] | |
ticks_dtype = {'date': object, 'bid': str, 'ask': str} | |
years_count = 14 | |
bars_chunk_size = 100000 | |
ticks_chunk_size = 1000000 | |
increase_hours_timezone = 3 | |
is_print_technical_info = False | |
def get_reader(path, cols, d_type, chunk_size): | |
return pd.read_csv(path, dtype=d_type, iterator=True, sep=',', header=None, names=cols, | |
usecols=range(0, len(cols)), chunksize=chunk_size) | |
def get_winter_weeks(): | |
all_weeks_should_changed = [] | |
reader = get_reader(path_to_bars_file, bars_cols_names, bars_dtype, bars_chunk_size) | |
for chunk in reader: | |
chunk['date_time'] = pd.to_datetime(chunk['date'] + chunk['time'], format='%Y%m%d%H:%M:%S') | |
chunk['date_time'] = chunk['date_time'] + pd.Timedelta(hours=increase_hours_timezone) | |
chunk['dayofweek'] = chunk['date_time'].dt.dayofweek | |
chunk['is_day_of'] = chunk['dayofweek'] > 4 | |
chunk['week'] = chunk['date_time'].dt.week | |
chunk['year_week'] = chunk['date_time'].dt.year.astype('str') + '/' + chunk['week'].astype('str') | |
mask = chunk['is_day_of'] | |
unique = chunk.loc[mask]['year_week'].unique() | |
all_weeks_should_changed.extend(unique) | |
result = list(dict.fromkeys(all_weeks_should_changed)) | |
if is_print_technical_info: | |
print(result) | |
return result | |
def date_exclude_creater(): | |
start_year = 2007 | |
new_year_date_start = '01-01 00:00:00' | |
new_year_date_end = '01-01 23:59:59' | |
cristmas_date_start = '12-24 22:00:00' | |
cristmas_date_end = '12-26 02:59:59' | |
_all_new_years_start = [] | |
_all_new_years_end = [] | |
_all_cristmases_start = [] | |
_all_cristmases_end = [] | |
for d in range(years_count): | |
_all_new_years_start.append(str(start_year + d) + '-' + new_year_date_start) | |
_all_new_years_end.append(str(start_year + d) + '-' + new_year_date_end) | |
_all_cristmases_start.append(str(start_year + d) + '-' + cristmas_date_start) | |
_all_cristmases_end.append(str(start_year + d) + '-' + cristmas_date_end) | |
if is_print_technical_info: | |
print(_all_new_years_start, _all_new_years_end, _all_cristmases_start, _all_cristmases_end) | |
return _all_new_years_start, _all_new_years_end, _all_cristmases_start, _all_cristmases_end | |
def filter_dates(chunk): | |
for index in range(years_count): | |
y_s = pd.to_datetime(all_new_years_start[index]) | |
y_e = pd.to_datetime(all_new_years_end[index]) | |
x_s = pd.to_datetime(all_cristmases_start[index]) | |
x_e = pd.to_datetime(all_cristmases_end[index]) | |
mask_year = (chunk['date_time'] >= y_s) & (chunk['date_time'] <= y_e) | |
mask_xmas = (chunk['date_time'] >= x_s) & (chunk['date_time'] <= x_e) | |
chunk.drop(chunk.loc[mask_xmas].index, inplace=True) | |
chunk.drop(chunk.loc[mask_year].index, inplace=True) | |
return chunk | |
def read_edit_write(is_bars): | |
if is_bars: | |
reader = get_reader(path_to_bars_file, bars_cols_names, bars_dtype, bars_chunk_size) | |
cols_names = bars_cols_names | |
path_to_write = path_to_bars_edited_file | |
else: | |
reader = get_reader(path_to_tick_file, ticks_cols_names, ticks_dtype, ticks_chunk_size) | |
cols_names = ticks_cols_names | |
path_to_write = path_to_ticks_edited_file | |
with open(path_to_write, 'w') as path: | |
for chunk in reader: | |
chunk['date_time'] = pd.to_datetime(chunk['date'] + chunk['time'], format='%Y%m%d%H:%M:%S') | |
chunk['date_time'] = chunk['date_time'] + pd.Timedelta(hours=increase_hours_timezone) | |
chunk['dayofweek'] = chunk['date_time'].dt.dayofweek | |
chunk['week'] = chunk['date_time'].dt.week | |
chunk['year_week'] = chunk['date_time'].dt.year.astype('str') + '/' + chunk['week'].astype('str') | |
chunk['updated_date_time'] = chunk[chunk['year_week'].isin(unique_weeks)]['date_time'] - pd.Timedelta(hours=1) | |
chunk['updated_date_time'].fillna(chunk['date_time'], inplace=True) | |
# Filter saturdays: | |
chunk = chunk[chunk['dayofweek'] < 5] | |
# filter data by day !01.01, limit 25.12 till 20:00 | |
chunk = filter_dates(chunk) | |
chunk['date'] = chunk['updated_date_time'].dt.strftime('%Y%m%d') | |
chunk['time'] = chunk['updated_date_time'].dt.strftime('%H:%M:%S') | |
chunk[cols_names].to_csv(path, sep=',', header=False, line_terminator='\n', index=None) | |
def edit_history(): | |
start_time = time.perf_counter() | |
read_edit_write(True) | |
print(f'Bars of {symbol} is done! Spent: {(time.perf_counter() - start_time) / 60 :0.2f} minutes') | |
start_time = time.perf_counter() | |
read_edit_write(False) | |
print(f'Ticks of {symbol} is done! Spent: {(time.perf_counter() - start_time) / 60 / 60 :0.2f} hours') | |
print(f'Editing bars and ticks of {symbol} is started.') | |
(all_new_years_start, all_new_years_end, all_cristmases_start, all_cristmases_end) = date_exclude_creater() | |
unique_weeks = get_winter_weeks() | |
edit_history() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
This script use pandas library and can read huge files. For example I am read one file with size of 11 Gb.
P.S. also I cut some data on 01.01 and 25-26 December of each year (see function
date_exclude_creater()
).