Skip to content

Instantly share code, notes, and snippets.

@sergiolucero
Last active July 17, 2019 05:05
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save sergiolucero/4e2bd1e389cded8c4e83a051f05ab36d to your computer and use it in GitHub Desktop.
Save sergiolucero/4e2bd1e389cded8c4e83a051f05ab36d to your computer and use it in GitHub Desktop.
Scraping CNTV 1/2
import requests, pandas as pd
from bs4 import BeautifulSoup
# --- Scrape the monthly complaint tables linked from the CNTV index page ---
# Result: `top`, one DataFrame with every monthly table stacked, plus a
# 'mes' column holding the month string taken from each link's URL.

# Do not truncate long cell text when frames are displayed. None is the
# supported "no limit" value (-1 is deprecated since pandas 1.0).
pd.set_option('display.max_colwidth', None)

base_url = 'https://www.cntv.cl/cntv/site/tax/port/all/taxport_16___1.html'
# Fail fast on a hung connection or an HTTP error instead of silently
# parsing an error page.
response = requests.get(base_url, timeout=30)
response.raise_for_status()
bs = BeautifulSoup(response.text, 'lxml')

# Keep only anchors that carry an href and whose text contains 'más';
# those are the links followed below.
links = [link for link in bs.find_all('a', href=True) if 'más' in link.text]
print(f'{len(links)} meses de denuncias')

# Collect the per-month frames in a list and concatenate once at the end:
# DataFrame.append was removed in pandas 2.0 and re-copied the accumulated
# frame on every iteration.
frames = []
for link in links:  # visit each linked page and collect its table
    link_url = link['href']
    mes = link_url.split('/')[-2][:7]  # url -> e.g. 2019-05
    print(mes, end=',')
    # thousands='.' so that figures written like 1.234 parse as 1234.
    link_df = pd.read_html('https://www.cntv.cl' + link_url, thousands='.')[0]
    link_df['mes'] = mes
    frames.append(link_df)

if not frames:
    raise ValueError(f'no monthly report links found at {base_url}')

# ignore_index gives the stacked frame a unique row index.
top = pd.concat(frames, ignore_index=True)
# Assumes every scraped table has exactly these five columns in this order
# (the sixth is the 'mes' column added above) -- TODO confirm against site.
top.columns = ['PROGRAMA', 'FECHA', 'DENUNCIAS', 'CONTENIDOS', 'CANAL', 'mes']
top['CANAL'] = top.CANAL.str.upper()
import seaborn as sns
import matplotlib.pyplot as plt
# --- Heatmap: summed DENUNCIAS per channel (rows) and month (columns) ---
fig, ax = plt.subplots(1, figsize=(20, 9))

# Restrict to the five channels with the most rows in `top`.
top5 = top.CANAL.value_counts().index[:5]
# .copy() so the column assignments below write to an independent frame
# rather than a filtered slice (avoids SettingWithCopyWarning).
top = top[top.CANAL.isin(top5)].copy()

# Fold 2019-07 into 2019-06 (presumably July was incomplete at scrape
# time -- confirm). regex=False: these are literal substitutions, and the
# default for `regex` differs between pandas versions.
top['mes'] = top['mes'].str.replace('2019-07', '2019-06', regex=False)
# Shorten the label so it fits the y axis.
top['CANAL'] = top['CANAL'].str.replace('CANAL 13', 'C13', regex=False)

# 'mes' is a 'YYYY-MM' string, so a lexicographic >= '2018' keeps 2018 onward.
pdf = top[top.mes >= '2018'].pivot_table(
    index='CANAL', columns='mes', values='DENUNCIAS', aggfunc='sum')

p = sns.heatmap(pdf, annot=True, fmt='.0f', ax=ax,
                annot_kws={'size': 18}, cmap='YlOrRd')
plt.xticks(rotation=30)
p.set_xlabel("mes", fontsize=20, fontweight='bold', color='orange')
p.set_ylabel("CANAL", fontsize=20, fontweight='bold', color='red')
# NOTE(review): the month and complaint counts in the title are hard-coded
# and will go stale if the scraped data changes.
_ = ax.set_title('En 18 Meses, 10278 denuncias al Consejo Nacional de Televisión [Fuente:cntv.cl & Data: quant.cl]',
                 size=18, color='purple', fontweight='semibold')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment