Skip to content

Instantly share code, notes, and snippets.

@RodrigoPrior
Created October 14, 2016 04:25
Show Gist options
  • Save RodrigoPrior/53dc19e9d305fb333054310f0338caf5 to your computer and use it in GitHub Desktop.
Save RodrigoPrior/53dc19e9d305fb333054310f0338caf5 to your computer and use it in GitHub Desktop.
Banco do Brasil investment funds - grab and compare
from lxml import html
import requests
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import time
from sklearn.cluster import KMeans
from sklearn import preprocessing
import seaborn as sns
# Print wide DataFrames without wrapping them across several lines.
pd.set_option('expand_frame_repr', False)

# Fetch the Banco do Brasil fund profitability page and parse it into an
# lxml element tree.
url = 'http://www37.bb.com.br/portalbb/tabelaRentabilidade/rentabilidade/gfi7,802,9085,9089,1.bbx?tipo=1'
# A timeout keeps the script from hanging forever on an unresponsive server,
# and raise_for_status() stops it from parsing an HTTP error page as data.
page = requests.get(url, timeout=30)
page.raise_for_status()
tree = html.fromstring(page.content)

# Remove every cell that spans 13 columns (presumably the fund-class heading
# rows - TODO confirm against the live page) so that only data cells remain.
# xpath() returns a list, so removing nodes while looping over it is safe.
for cell in tree.xpath('//table//tr//td[@colspan="13"]'):
    cell.getparent().remove(cell)
# Column names for the scraped table, one per cell of a data row (14 in all).
# They are hard-coded instead of being read from the page header
# (tree.xpath('//table//th/text()')[4:19]).
columns = [
    'fundos',        # fund name
    'lixo',          # "junk": placeholder cell, dropped before clustering
    'dia',
    'acum_mes',
    'setembro',
    '2016',
    '12m',
    '24m',
    '36m',
    'pl_medio_12m',
    'taxaadm_aa',
    'data_cotacao',  # parsed as a date later on
    'cota',
    'data_inicio',   # parsed as a date later on
]
# Every text node found inside a table cell, in document order.
fundos = tree.xpath('//table//tr//td/text()')

# Per-cell clean-up: ',' becomes '.', while tabs, newlines, '%' and spaces
# are removed.
_cell_cleanup = str.maketrans(',', '.', '\t\n% ')

# Slice the flat list of cells into rows of 14 values.  Row starts stop 48
# entries before the end of the list, so the trailing text nodes never begin
# a row.
a = [
    [cell.translate(_cell_cleanup) for cell in fundos[start:start + 14]]
    for start in range(0, len(fundos) - 48, 14)
]
def _to_numeric_if_possible(series):
    """Return *series* converted to numbers, or unchanged if it is not numeric."""
    try:
        return pd.to_numeric(series)
    except (ValueError, TypeError):
        # Text columns (e.g. fund names) are deliberately left as they are.
        return series


# Build the frame without a blanket dtype: the rows mix fund names, dates and
# numbers, and current pandas raises when dtype='float64' cannot be applied to
# every column (older releases silently fell back instead).
df = pd.DataFrame(a, columns=columns)

# NOTE(review): the dates are presumably day-first (dd/mm/yyyy); if so,
# pd.to_datetime may need dayfirst=True - confirm against the scraped values.
date_columns = ['data_cotacao', 'data_inicio']
df[date_columns] = df[date_columns].apply(pd.to_datetime)

# Empty cells become missing values; a lone '-' is treated as zero.
df.replace('', np.nan, inplace=True)
df.replace('-', 0, inplace=True)

# Convert the remaining columns to numbers wherever the content allows it,
# now that the placeholders above no longer get in the way.
other_columns = df.columns.drop(date_columns)
df[other_columns] = df[other_columns].apply(_to_numeric_if_possible)
# These two columns must be numeric for the clustering step, so a failure
# here is raised instead of being tolerated.
df[['24m', '36m']] = df[['24m', '36m']].apply(pd.to_numeric)

# Keep a timestamped snapshot of the full table before trimming it.
df.to_csv(time.strftime('%Y%m%d-%H%M%S')+'_bb.csv')

# Only the fund name and the features used for clustering are kept.
df.drop(['lixo', 'dia', 'acum_mes', 'setembro', 'data_cotacao', 'cota',
         'data_inicio'], axis=1, inplace=True)
# Feature matrix for clustering: the six numeric columns, standardised by
# StandardScaler.
feature_columns = ['2016', '12m', '24m', '36m', 'pl_medio_12m', 'taxaadm_aa']
scaler = preprocessing.StandardScaler()
X = scaler.fit_transform(df[feature_columns].values)

# Group the funds into 10 clusters; the fixed seed makes the run repeatable.
kmeans = KMeans(n_clusters=10, random_state=0)
kmeans.fit(X)
df['labels'] = kmeans.labels_

# Pairwise scatter plots coloured by cluster, saved as a PDF.
sns.pairplot(df, hue="labels")
plt.savefig('./bb_clustes.pdf')
# Order the rows by cluster label.  sort_values() returns a new frame, so the
# result has to be assigned back; otherwise the call has no effect and the
# clusters below are reported in arbitrary order.
df = df.sort_values(['labels'])

# Print one report per cluster, best 36-month performers first.
for i in df['labels'].unique():
    print('Report for cluster', i)
    print(df[df['labels'] == i].
          sort_values(['36m', 'pl_medio_12m', 'taxaadm_aa'], ascending=False))
    print('='*80)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment