Last active
October 25, 2018 20:13
-
-
Save aaizemberg/616552f12dce4c91e22fb6417af4e741 to your computer and use it in GitHub Desktop.
Las noticias del día 25/10/2018 (GoogleNews & Tableau)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*-
"""Dump Google News RSS topic feeds (``ned=es_ar``) into ``googlenews_ar.tsv``.

For every topic in ``lista`` the matching RSS feed is downloaded and each
entry except the first one is turned into a row with the columns
``Title``, ``Source``, ``Date``, ``url``, ``Category`` and ``Size``.
The rows are sorted by ``Size`` (largest first) and written as a
tab-separated, UTF-8 encoded file without the index column.
"""
import feedparser
import pandas as pd

# Feed topic letter (value of the ``topic=`` query parameter) and the
# category label stored in the output for entries of that feed.
lista = [
    {"letra": "n", "categoria": "Nacional"},
    {"letra": "s", "categoria": "Deportes"},
    {"letra": "m", "categoria": "Salud"},
    {"letra": "t", "categoria": "Ciencia y Tecnología"},
    {"letra": "e", "categoria": "Entretenimiento"},
    {"letra": "b", "categoria": "Economía"},
]

data = []
for topic in lista:
    feed_url = ('https://news.google.com/news?pz=1&cf=all&ned=es_ar&hl=es&topic='
                + topic['letra'] + '&output=rss')
    feed = feedparser.parse(feed_url)

    # The first entry of every feed is skipped, as the original script did.
    # NOTE(review): presumably it is not a regular article -- confirm.
    for entry in feed.entries[1:]:
        # Titles look like "<headline> - <source>": the headline is the
        # first piece and the source is the last one.
        title_parts = entry.title.split(' - ')
        titulo = title_parts[0]
        medio = title_parts[-1]
        fecha = entry.published

        # The link starts with the news.google part and carries the
        # publisher's address after "&url=". Fall back to the whole link
        # when that marker is missing instead of aborting the run.
        link_parts = entry.link.split('&url=')
        article_url = link_parts[1] if len(link_parts) > 1 else entry.link

        categoria = topic['categoria']

        # The summary is HTML; the bold fragment containing
        # "artículos informativos »" holds the number of related articles,
        # written with "." as thousands separator. Default to 1 when no
        # such fragment exists.
        cantidad = 1
        for fragment in entry.summary.replace("</b>", "<b>").split('<b>'):
            if u"artículos informativos »" in fragment:
                try:
                    cantidad = int(fragment.split(' ')[1].replace(".", ""))
                except (IndexError, ValueError):
                    # Unexpected fragment layout: keep the previous count
                    # rather than losing every row collected so far.
                    pass

        data.append([titulo, medio, fecha, article_url, categoria, cantidad])

df = pd.DataFrame(data, columns=['Title', 'Source', 'Date', 'url', 'Category', 'Size'])
df = df.sort_values(by='Size', ascending=False)
df.to_csv('googlenews_ar.tsv', sep='\t', header=True, index=False, encoding='utf-8')
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<!DOCTYPE html>
<html>
<head>
  <meta charset="utf-8">
  <meta name="viewport" content="width=device-width">
  <title>GoogleNews - 25/10/2018</title>
</head>
<body>
  <!-- Tableau Public embed of the "GoogleNews/GoogleNews" view.
       The <noscript> image is the static fallback; the <object> is hidden
       (display:none) and carries the embed parameters read by viz_v1.js. -->
  <div class='tableauPlaceholder' id='viz1540498161350' style='position: relative'>
    <noscript><a href='#'><img alt=' ' src='https://public.tableau.com/static/images/Go/GoogleNews/GoogleNews/1_rss.png' style='border: none' /></a></noscript>
    <object class='tableauViz' style='display:none;'>
      <param name='host_url' value='https%3A%2F%2Fpublic.tableau.com%2F' />
      <param name='embed_code_version' value='3' />
      <param name='site_root' value='' />
      <param name='name' value='GoogleNews/GoogleNews' />
      <param name='tabs' value='no' />
      <param name='toolbar' value='yes' />
      <param name='static_image' value='https://public.tableau.com/static/images/Go/GoogleNews/GoogleNews/1.png' />
      <param name='animate_transition' value='yes' />
      <param name='display_static_image' value='yes' />
      <param name='display_spinner' value='yes' />
      <param name='display_overlay' value='yes' />
      <param name='display_count' value='yes' />
    </object>
  </div>
  <script type='text/javascript'>
    var divElement = document.getElementById('viz1540498161350');
    var vizElement = divElement.getElementsByTagName('object')[0];
    // Full width; height is 75% of the placeholder's rendered width.
    vizElement.style.width='100%';
    vizElement.style.height=(divElement.offsetWidth*0.75)+'px';
    // Load the Tableau script by inserting it right before the <object>.
    var scriptElement = document.createElement('script');
    scriptElement.src = 'https://public.tableau.com/javascripts/api/viz_v1.js';
    vizElement.parentNode.insertBefore(scriptElement, vizElement);
  </script>
</body>
</html>
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment