Skip to content

Instantly share code, notes, and snippets.

@kacaukacau
Last active June 19, 2021 08:45
Show Gist options
  • Save kacaukacau/df5a42ac3a5b75a3c13a67fb81a28593 to your computer and use it in GitHub Desktop.
Save kacaukacau/df5a42ac3a5b75a3c13a67fb81a28593 to your computer and use it in GitHub Desktop.
A python script to find overlapping holdings between ETFs
# v5: fixed a few bugs, added a overlap percentages by u/Mahasamat
# v4: Support for Deka, HSBC, JP Morgan, and Amundi added.
# Now the results will show members of each section in a Venn diagram.
# i.e. HoldingA will only be in section ('ETFA', 'ETFB', 'ETFC') and not in ('ETFA', 'ETFB'), ('ETFA', 'ETFC') and ('ETFB', 'ETFC') anymore,
# even though HoldingA is present in all three ETFs.
# Showing also unique holdings in ETFs.
# v3: Now supports more providers, and also overlap of more than two ETFs
# Before running, run
# pip install openpyxl xlrd pylightxl
# Supports ETFs from iShares, Invesco, Xtrackers, Lyxor, SPDR, L&G, Deka, HSBC, JP Morgan and Amundi for now.
# Vanguard doesn't seem to provide any csv/xlsx files with individual holdings of an ETF with ISIN,
# so it won't work with Vanguard's ETFs unless someone else can guide me to the relevant files or
# has other ideas.
# ETFs from other providers will most probably not work without modifications, so if you have any suggestions, you can request them to be supported,
# but make sure they have csv or xlsx files that have ISINs of the holdings in the file.
# Usage:
# 1. Download the csv/xlsx files from the providers' websites, place them all in a folder.
# 2. Rename the csv/xlsx files to <ticker>_<provider><.csv/.xlsx>.
# The first four letters of the filename is used to identify the ETFs in the results, so as long as you know what it means, it doesn't matter.
# <provider> has to be ishares/invesco/xtrackers/lyxor/spdr/lng/deka/hsbc/jpmorgan/amundi (case sensitive, exact match).
# <.csv/.xlsx> have to be the same as downloaded.
# 3. Download/copy & paste this script into a file in the same folder.
# 4. Run the script with: python etf_overlap.py <filenames with extensions in quotes, separated by space>
# For example:
# python etf_overlap.py '2B76_ishares.csv' 'IQQQ_ishares.csv' 'IUSN_ishares.csv' 'QDVE_ishares.csv' 'IS3N_ishares.csv' 'IQQH_ishares.csv'
import csv
import sys
import itertools
import pprint
import openpyxl
import pylightxl
import xlrd
import re
from copy import deepcopy
pp = pprint.PrettyPrinter()
dict_etf = {}
dict_overlap = {}
list_etf = []
list_pair = []
py_reg = re.compile(".*\.py")
currency_reg = re.compile("_CURRENCY.*")
isin_reg = re.compile("^[a-zA-Z0-9]{12}$")
xlsx_reg = re.compile(".*\.xlsx$")
xlsx_reg = re.compile(".*\.xlsx$")
xls_reg = re.compile(".*\.xls$")
csv_reg = re.compile(".*\.csv$")
xtrackers_reg = re.compile(".*xtrackers.xlsx")
ishares_reg = re.compile(".*ishares.csv")
invesco_reg = re.compile(".*invesco.xlsx")
lyxor_reg = re.compile(".*lyxor.xls")
spdr_reg = re.compile(".*spdr.xlsx")
lng_reg = re.compile(".*lng.csv")
deka_reg = re.compile(".*deka.xlsx")
hsbc_reg = re.compile(".*hsbc.xls")
jpmorgan_reg = re.compile(".*jpmorgan.xlsx")
amundi_reg = re.compile(".*amundi.xlsx")
def filter_junk(item):
junk = ['', None, '-', '_CURRENCY']
if item in junk:
return False
elif bool(re.match(currency_reg, item)):
return False
elif bool(re.match(isin_reg, item)):
return True
else:
return False
def open_file(filename,start_row,start_col):
if bool(re.match(xlsx_reg, filename)):
if not bool(re.match(xtrackers_reg, filename)) and not bool(re.match(deka_reg, filename)):
wb = pylightxl.readxl(fn=filename)
for i in range(start_row,wb.ws(ws=wb.ws_names[0]).size[0] + 1):
dict_etf[filename[0:4]].append(wb.ws(ws=wb.ws_names[0]).address(address=start_col + str(i)))
else:
wb = openpyxl.load_workbook(filename)
wa = wb[wb.sheetnames[0]]
for i in range(start_row,wa.max_row + 1):
dict_etf[filename[0:4]].append(wa[start_col + str(i)].value)
elif bool(re.match(xls_reg, filename)):
wb = xlrd.open_workbook(filename)
wa = wb.sheet_by_name(wb.sheet_names()[0])
for i in range(start_row - 1,wa.nrows):
dict_etf[filename[0:4]].append(wa.cell_value(i,start_col - 1))
elif bool(re.match(csv_reg, filename)):
with open(filename) as csv_file:
reader = csv.reader(csv_file, delimiter=',')
rows = list(reader)
for i in range(0,start_row - 1):
rows.pop(i)
while len(rows[0]) < start_col:
rows.pop(0)
while len(rows[-1]) < start_col:
rows.pop(-1)
for each in rows:
dict_etf[filename[0:4]].append(each[start_col - 1])
print('Processing:')
for each_file in sys.argv[1:]:
if bool(re.match(py_reg, each_file)):
continue
dict_etf[each_file[0:4]] = []
print(each_file)
if bool(re.match(ishares_reg, each_file)):
open_file(each_file,3,10)
elif bool(re.match(xtrackers_reg, each_file)):
open_file(each_file,5,'C')
elif bool(re.match(invesco_reg, each_file)):
open_file(each_file,7,'B')
elif bool(re.match(lyxor_reg, each_file)):
open_file(each_file,15,4)
elif bool(re.match(spdr_reg, each_file)):
open_file(each_file,7,'A')
elif bool(re.match(lng_reg, each_file)):
open_file(each_file,16,2)
elif bool(re.match(deka_reg, each_file)):
open_file(each_file,2,'C')
elif bool(re.match(hsbc_reg, each_file)):
open_file(each_file,12,1)
elif bool(re.match(jpmorgan_reg, each_file)):
open_file(each_file,9,'B')
elif bool(re.match(amundi_reg, each_file)):
open_file(each_file,18,'B')
else:
print(f'Unknown provider {each_file}')
print('')
for each_etf in dict_etf:
list_etf.append(each_etf)
dict_etf[each_etf] = list(filter(filter_junk, dict_etf[each_etf]))
list_pair = []
dict_etf_copy = deepcopy(dict_etf)
if len(list_etf) < 1:
print('No ETF given')
exit()
elif len(list_etf) == 1:
print('Only 1 ETF, no overlap')
exit()
elif len(list_etf) > 1:
if len(list_etf) == 2:
each_pair = (list_etf[0], list_etf[1])
dict_overlap[each_pair] = list(set(dict_etf[each_pair[0]]) & set(dict_etf[each_pair[1]]))
if len(dict_overlap[each_pair]) == 0:
del dict_overlap[each_pair]
elif len(list_etf) > 2:
for i in range(len(list_etf), 1, -1):
list_pair.append(list(itertools.combinations(list_etf, r=i)))
argument_list = []
for each_length in list_pair:
for each_combi in each_length:
for each_etf in each_combi:
argument_list.append(dict_etf_copy[each_etf])
dict_overlap[each_combi] = list(set(argument_list[0]).intersection(*argument_list))
for each_holding in dict_overlap[each_combi]:
for each_etf in dict_etf:
try:
dict_etf_copy[each_etf].pop(dict_etf_copy[each_etf].index(each_holding))
except ValueError:
continue
argument_list = []
list_pair = []
for etfs, overlaps in dict_overlap.items():
n_over = len(overlaps)
if n_over:
percents = [f"{e} {n_over/len(dict_etf[e])*100:.2f}%" for e in etfs]
pp.pprint(percents)
print('')
print('Overlapping holdings in ETFs:')
pp.pprint(dict_overlap)
print('')
print('Unique holdings in each ETF:')
pp.pprint(dict_etf_copy)
@DennisDurairaj
Copy link

DennisDurairaj commented Jun 7, 2021

Hello! I'm not sure if this is a bug, but I'm getting different results when I increase the number of etfs being compared, here are some screenshots below -

image

Notice (RBOT, HEAL) has 1 overlap
Notice (QDVE, RBOT) has 14 overlaps

image
image

Notice (RBOT, HEAL) has no overlap
Notice (QDVE, RBOT) has 1 overlap

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment