Last active
July 17, 2019 05:05
-
-
Save sergiolucero/4e2bd1e389cded8c4e83a051f05ab36d to your computer and use it in GitHub Desktop.
Scraping CNTV 1/2
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import requests, pandas as pd | |
from bs4 import BeautifulSoup | |
# Scrape the CNTV index page, follow every 'más' link found on it, and stack
# the first HTML table of each linked page into one DataFrame named `top`.

# Do not truncate cell contents when frames are displayed. None is the
# supported "no limit" value; the legacy -1 sentinel is deprecated.
pd.set_option('display.max_colwidth', None)

base_url = 'https://www.cntv.cl/cntv/site/tax/port/all/taxport_16___1.html'

# Fail fast on a hung connection or an HTTP error instead of parsing an
# error page as if it were the index.
response = requests.get(base_url, timeout=30)
response.raise_for_status()
bs = BeautifulSoup(response.text, 'lxml')

# Keep only the anchors whose text contains 'más'; each one is treated as
# the link to one month of complaints.
links = [link for link in bs.find_all('a') if 'más' in link.text]
print(f'{len(links)} meses de denuncias')

# Collect the per-month frames in a list and concatenate once at the end:
# DataFrame.append was removed in pandas 2.0 and copied the whole frame on
# every iteration.
frames = []
for link in links:  # visit each link and collect roughly one year of data
    link_url = link['href']
    mes = link_url.split('/')[-2][:7]  # url -> e.g. 2019-05
    print(mes, end=',')
    # thousands='.' makes read_html parse figures such as '1.234' as 1234.
    link_df = pd.read_html('https://www.cntv.cl' + link_url, thousands='.')[0]
    link_df['mes'] = mes
    frames.append(link_df)

if not frames:
    # Without this guard the failure would be an opaque length-mismatch
    # error when the column names are assigned below.
    raise ValueError(f"no 'más' links found at {base_url}")

top = pd.concat(frames, ignore_index=True)
# Assumes every scraped table has exactly these five columns, in this order,
# followed by the 'mes' column added above -- TODO confirm against the site.
top.columns = ['PROGRAMA', 'FECHA', 'DENUNCIAS', 'CONTENIDOS', 'CANAL', 'mes']
# Upper-case the channel names so differently-cased spellings group together.
top['CANAL'] = top.CANAL.str.upper()
import seaborn as sns | |
import matplotlib.pyplot as plt | |
# Plot a heatmap of total complaints per channel (rows) and month (columns)
# for the five channels that appear most often in `top`.
fig, ax = plt.subplots(1, figsize=(20, 9))

# The five channels with the most rows in the scraped table.
top5 = top.CANAL.value_counts().index[:5]
# .copy() makes the filtered frame independent, so the column assignments
# below do not raise SettingWithCopyWarning or write into a temporary slice.
top = top[top.CANAL.isin(top5)].copy()

# Fold July 2019 into June 2019.
# NOTE(review): presumably July was only partially published at scrape
# time -- confirm before reusing this with newer data.
# regex=False: both replacements are literal, and the default value of
# `regex` differs between pandas versions.
top['mes'] = top['mes'].str.replace('2019-07', '2019-06', regex=False)
# Shorter label for the heatmap axis.
top['CANAL'] = top['CANAL'].str.replace('CANAL 13', 'C13', regex=False)

# 'mes' holds 'YYYY-MM' strings, so the string comparison against '2018'
# keeps every month from 2018-01 onwards.
pdf = top[top.mes >= '2018'].pivot_table(
    index='CANAL', columns='mes', values='DENUNCIAS', aggfunc='sum')

p = sns.heatmap(pdf, annot=True, fmt='.0f', ax=ax,
                annot_kws={'size': 18}, cmap='YlOrRd')
plt.xticks(rotation=30)
p.set_xlabel("mes", fontsize=20, fontweight='bold', color='orange')
p.set_ylabel("CANAL", fontsize=20, fontweight='bold', color='red')
# NOTE(review): the month and complaint counts in the title are hard-coded
# and will not track the scraped data -- update them by hand when re-running.
_ = ax.set_title('En 18 Meses, 10278 denuncias al Consejo Nacional de Televisión [Fuente:cntv.cl & Data: quant.cl]',
                 size=18, color='purple', fontweight='semibold')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment