A Python script to find overlapping holdings between ETFs
# v5: fixed a few bugs, added overlap percentages (by u/Mahasamat)
# v4: Support for Deka, HSBC, JP Morgan, and Amundi added.
#     The results now show the members of each section of a Venn diagram,
#     i.e. HoldingA will only appear in the section ('ETFA', 'ETFB', 'ETFC') and no longer in ('ETFA', 'ETFB'), ('ETFA', 'ETFC') or ('ETFB', 'ETFC'),
#     even though HoldingA is present in all three ETFs.
#     The unique holdings of each ETF are also shown.
# v3: Now supports more providers, and overlaps of more than two ETFs.
# Before running, run:
#     pip install openpyxl xlrd pylightxl
# Supports ETFs from iShares, Invesco, Xtrackers, Lyxor, SPDR, L&G, Deka, HSBC, JP Morgan and Amundi for now.
# Vanguard doesn't seem to provide any csv/xlsx files with the individual holdings of an ETF including ISINs,
# so it won't work with Vanguard's ETFs unless someone can point me to the relevant files or
# has other ideas.
# ETFs from other providers will most likely not work without modifications. You can request support for them,
# but make sure the provider offers csv or xlsx files that contain the ISINs of the holdings.
# Usage:
# 1. Download the csv/xlsx files from the providers' websites and place them all in one folder.
# 2. Rename the csv/xlsx files to <ticker>_<provider><.csv/.xlsx>.
#    The first four characters of the filename are used to identify the ETF in the results, so any label you recognise will do.
#    <provider> has to be ishares/invesco/xtrackers/lyxor/spdr/lng/deka/hsbc/jpmorgan/amundi (case sensitive, exact match).
#    The file extension has to stay the same as downloaded.
# 3. Download/copy & paste this script into a file in the same folder.
# 4. Run the script with: python etf_overlap.py <filenames with extensions in quotes, separated by spaces>
#    For example:
#    python etf_overlap.py '2B76_ishares.csv' 'IQQQ_ishares.csv' 'IUSN_ishares.csv' 'QDVE_ishares.csv' 'IS3N_ishares.csv' 'IQQH_ishares.csv'
import csv
import sys
import itertools
import pprint
import openpyxl
import pylightxl
import xlrd
import re
from copy import deepcopy

pp = pprint.PrettyPrinter()
dict_etf = {}
dict_overlap = {}
list_etf = []
list_pair = []
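# dict_etf maps the 4-character ticker prefix of each file to the list of holding
# ISINs read from it; dict_overlap maps a tuple of those tickers to the ISINs that
# are common to exactly that set of ETFs.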
py_reg = re.compile(r".*\.py")
currency_reg = re.compile(r"_CURRENCY.*")
isin_reg = re.compile(r"^[a-zA-Z0-9]{12}$")
xlsx_reg = re.compile(r".*\.xlsx$")
xls_reg = re.compile(r".*\.xls$")
csv_reg = re.compile(r".*\.csv$")
xtrackers_reg = re.compile(r".*xtrackers\.xlsx")
ishares_reg = re.compile(r".*ishares\.csv")
invesco_reg = re.compile(r".*invesco\.xlsx")
lyxor_reg = re.compile(r".*lyxor\.xls")
spdr_reg = re.compile(r".*spdr\.xlsx")
lng_reg = re.compile(r".*lng\.csv")
deka_reg = re.compile(r".*deka\.xlsx")
hsbc_reg = re.compile(r".*hsbc\.xls")
jpmorgan_reg = re.compile(r".*jpmorgan\.xlsx")
amundi_reg = re.compile(r".*amundi\.xlsx")
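# filter_junk() keeps only entries that look like an ISIN (12 alphanumeric
# characters) and drops blanks, placeholders and currency/cash rows that some
# providers mix into the holdings column.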
def filter_junk(item):
    junk = ['', None, '-', '_CURRENCY']
    if item in junk:
        return False
    elif bool(re.match(currency_reg, item)):
        return False
    elif bool(re.match(isin_reg, item)):
        return True
    else:
        return False
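# open_file() reads one column of ISINs from a holdings file. start_row is the
# first row that contains a holding, start_col is the column with the ISINs
# (a letter for the pylightxl/openpyxl readers, a 1-based number for the
# xlrd/csv readers), and the file extension decides which reader is used.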
def open_file(filename, start_row, start_col):
    if bool(re.match(xlsx_reg, filename)):
        if not bool(re.match(xtrackers_reg, filename)) and not bool(re.match(deka_reg, filename)):
            wb = pylightxl.readxl(fn=filename)
            for i in range(start_row, wb.ws(ws=wb.ws_names[0]).size[0] + 1):
                dict_etf[filename[0:4]].append(wb.ws(ws=wb.ws_names[0]).address(address=start_col + str(i)))
        else:
            wb = openpyxl.load_workbook(filename)
            wa = wb[wb.sheetnames[0]]
            for i in range(start_row, wa.max_row + 1):
                dict_etf[filename[0:4]].append(wa[start_col + str(i)].value)
    elif bool(re.match(xls_reg, filename)):
        wb = xlrd.open_workbook(filename)
        wa = wb.sheet_by_name(wb.sheet_names()[0])
        for i in range(start_row - 1, wa.nrows):
            dict_etf[filename[0:4]].append(wa.cell_value(i, start_col - 1))
    elif bool(re.match(csv_reg, filename)):
        with open(filename) as csv_file:
            reader = csv.reader(csv_file, delimiter=',')
            rows = list(reader)
            # Drop the header rows above the first holding; popping index 0 each
            # time removes exactly the first start_row - 1 rows.
            for i in range(0, start_row - 1):
                rows.pop(0)
            while len(rows[0]) < start_col:
                rows.pop(0)
            while len(rows[-1]) < start_col:
                rows.pop(-1)
            for each in rows:
                dict_etf[filename[0:4]].append(each[start_col - 1])
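# Each provider branch below passes the row and column at which the ISIN column
# starts in that provider's download. To add a new provider, compile a matching
# filename regex above and add an elif here with the appropriate start_row and
# start_col for its file layout.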
print('Processing:')
for each_file in sys.argv[1:]:
    if bool(re.match(py_reg, each_file)):
        continue
    dict_etf[each_file[0:4]] = []
    print(each_file)
    if bool(re.match(ishares_reg, each_file)):
        open_file(each_file, 3, 10)
    elif bool(re.match(xtrackers_reg, each_file)):
        open_file(each_file, 5, 'C')
    elif bool(re.match(invesco_reg, each_file)):
        open_file(each_file, 7, 'B')
    elif bool(re.match(lyxor_reg, each_file)):
        open_file(each_file, 15, 4)
    elif bool(re.match(spdr_reg, each_file)):
        open_file(each_file, 7, 'A')
    elif bool(re.match(lng_reg, each_file)):
        open_file(each_file, 16, 2)
    elif bool(re.match(deka_reg, each_file)):
        open_file(each_file, 2, 'C')
    elif bool(re.match(hsbc_reg, each_file)):
        open_file(each_file, 12, 1)
    elif bool(re.match(jpmorgan_reg, each_file)):
        open_file(each_file, 9, 'B')
    elif bool(re.match(amundi_reg, each_file)):
        open_file(each_file, 18, 'B')
    else:
        print(f'Unknown provider {each_file}')
print('')
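# list_etf keeps the ETF labels in order, and filter_junk() strips everything
# from each holdings list that is not a 12-character ISIN. dict_etf_copy is a
# working copy from which holdings are removed as they are assigned to a Venn
# section, so whatever is left at the end is unique to a single ETF.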
for each_etf in dict_etf:
    list_etf.append(each_etf)
    dict_etf[each_etf] = list(filter(filter_junk, dict_etf[each_etf]))
list_pair = []
dict_etf_copy = deepcopy(dict_etf)
if len(list_etf) < 1:
    print('No ETF given')
    exit()
elif len(list_etf) == 1:
    print('Only 1 ETF, no overlap')
    exit()
elif len(list_etf) > 1:
    if len(list_etf) == 2:
        each_pair = (list_etf[0], list_etf[1])
        dict_overlap[each_pair] = list(set(dict_etf[each_pair[0]]) & set(dict_etf[each_pair[1]]))
        if len(dict_overlap[each_pair]) == 0:
            del dict_overlap[each_pair]
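    # With more than two ETFs, build every combination of ETFs from the largest
    # down to pairs. Each holding is assigned to the largest combination that
    # still contains it and is then removed from the working copies, so a holding
    # shared by three ETFs ends up only in the three-ETF section of the Venn
    # diagram and no longer in any of the pair sections.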
    elif len(list_etf) > 2:
        for i in range(len(list_etf), 1, -1):
            list_pair.append(list(itertools.combinations(list_etf, r=i)))
        argument_list = []
        for each_length in list_pair:
            for each_combi in each_length:
                for each_etf in each_combi:
                    argument_list.append(dict_etf_copy[each_etf])
                dict_overlap[each_combi] = list(set(argument_list[0]).intersection(*argument_list))
                for each_holding in dict_overlap[each_combi]:
                    for each_etf in dict_etf:
                        try:
                            dict_etf_copy[each_etf].pop(dict_etf_copy[each_etf].index(each_holding))
                        except ValueError:
                            continue
                argument_list = []
        list_pair = []
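# For every non-empty overlap, print its size as a percentage of each ETF's
# total (filtered) number of holdings.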
for etfs, overlaps in dict_overlap.items():
    n_over = len(overlaps)
    if n_over:
        percents = [f"{e} {n_over/len(dict_etf[e])*100:.2f}%" for e in etfs]
        pp.pprint(percents)
print('')
print('Overlapping holdings in ETFs:')
pp.pprint(dict_overlap)
print('')
print('Unique holdings in each ETF:')
pp.pprint(dict_etf_copy)
Hello! I'm not sure if this is a bug, but I'm getting different results when I increase the number of ETFs being compared. Here are some screenshots below:
Notice (RBOT, HEAL) has 1 overlap
Notice (QDVE, RBOT) has 14 overlaps
Notice (RBOT, HEAL) has no overlap
Notice (QDVE, RBOT) has 1 overlap