Created
December 24, 2019 20:21
-
-
Save pimienta/269a62b8604060572c7d856a4625c808 to your computer and use it in GitHub Desktop.
Archivos para crear una nube de palabras en forma de árbol de Navidad, creada a partir de las transcripciones de los discursos de Andrés Manuel López Obrador disponibles en su página web.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import scrapy | |
class LopezSpider(scrapy.Spider):
    """Crawl the transcript archive at lopezobrador.org.mx.

    Starting from the transcript index, the spider follows every entry link
    on each listing page, then the "next" pagination link, and yields one
    item per ``<article>`` element found on each entry page.
    """

    name = "Lopez"
    start_urls = ["https://lopezobrador.org.mx/transcripciones/"]

    def parse(self, response):
        """Schedule every entry linked from a listing page, then the next page."""
        for href in response.css('.entry-title a::attr("href")'):
            # response.follow resolves relative hrefs against the page URL,
            # whereas a bare scrapy.Request needs an absolute URL. This also
            # matches how the pagination link is handled below.
            yield response.follow(href.get(), callback=self.parse_item)

        next_page = response.css('li a.next::attr("href")').get()
        if next_page is not None:
            yield response.follow(next_page, callback=self.parse)

    def parse_item(self, response):
        """Yield the title and body markup of each article on an entry page.

        Both values come straight from ``.get()`` on the matching selector,
        so they are ``None`` when the article has no such element.
        """
        for article in response.css('article'):
            yield {
                'title': article.css(".entry-title").get(),
                'content': article.css(".entry-content").get(),
            }
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from bs4 import BeautifulSoup | |
import json | |
# Build corpus.txt: the plain text of every transcript in salida.json whose
# title does not mention "conferencia de prensa", joined into one string.
# salida.json must hold a JSON array of objects with 'title' and 'content'
# keys (presumably the feed exported by the spider -- confirm).

# Fragments stripped from each transcript before it joins the corpus.
BOILERPLATE = (
    "+++++",
    "PRESIDENTE ANDRÉS MANUEL LÓPEZ OBRADOR",
    "2019, Año del Caudillo del Sur, Emiliano Zapata",
)

# Explicit UTF-8 so accented characters do not depend on the platform locale.
with open('salida.json', encoding='utf-8') as f:
    transcripts = json.load(f)

pieces = []
for t in transcripts:
    # Either field may be missing or None when the scraped page lacked the
    # element; treat a missing title as "no match" and skip empty bodies.
    title = t.get('title') or ''
    content = t.get('content')
    if content is None or "conferencia de prensa" in title:
        continue

    # Name the parser explicitly: otherwise BeautifulSoup picks whichever
    # one happens to be installed and warns about the guess.
    text = BeautifulSoup(content, 'html.parser').get_text()
    for fragment in BOILERPLATE:
        text = text.replace(fragment, '')
    pieces.append(text.strip())

# Each piece is prefixed with a single space, exactly as the incremental
# concatenation did, but joined once instead of growing a string in a loop.
corpus = ''.join(' ' + piece for piece in pieces)

with open('corpus.txt', 'w', encoding='utf-8') as output:
    output.write(corpus)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import stylecloud | |
# Custom stop words (Spanish function words and filler terms) excluded from
# the word cloud. Duplicate entries ('las', 'por', 'estas') were removed;
# the remaining order is unchanged.
# NOTE(review): 'amigas amigos' and 'muchas gracias' are two-word phrases;
# they only take effect if the stop-word filter compares whole phrases rather
# than single tokens -- confirm against the wordcloud stop-word handling.
sw = [
    'a', 'de', 'la', 'se', 'en', 'el', 'las',
    'los', 'que', 'mil', 'millones', 'pesos',
    'también', 'por', 'eso', 'esto', 'aquello', 'desde',
    'no', 'un', 'unas', 'unos', 'ya', 'pero', 'porque',
    'entonces', 'tanto', 'qué', 'como', 'son', 'este',
    'estas', 'cuando', 'para', 'según', 'sin',
    'sobre', 'del', 'aquí', 'con', 'nada', 'todo', 'más',
    'es', 'lo', 'al', 'esa', 'una', 'les', 'ahora', 'si',
    'hay', 'así', 'estar', 'ser', 'va', 'año', 'nos',
    'sí', 'está', 'sólo', 'le', 'ese', 'luego', 'ustedes',
    'tiene', 'estamos', 'tener', 'ello', 'da', 'mucho',
    'gusto', 'tenemos', 'pues', 'ante', 'además', 'otro',
    'ahí', 'ver', 'sea', 'mismo', 'hasta', 'me', 'fue', 'cuánto',
    'tienen', 'hacer', 'muy', 'había', 'queremos', 'siempre',
    'sino', 'allá', 'allí', 'cómo', 'su', 'voy', 'van', 'decir',
    'esta', 'haciendo', 'haya', 'cosa', 'toda', 'todos', 'caso',
    'era', 'ellos', 'ellas', 'él', 'ciento', 'saben', 'cabo',
    'llevar', 'seguir', 'vamos', 'cada', 'día', 'todas',
    'esos', 'decía', 'ir', 'uno', 'entre', 'dos', 'tres',
    'digo', 'repito', 'tengo', 'vez', 'otros', 'estoy',
    'todavía', 'claro', 'llamada', 'pop', 'miren', 'tenía',
    'están', 'otra', 'llegar', 'estos', 'amigas amigos',
    'dije', 'mucha', 'tenga', 'haber', 'poco', 'muchas gracias',
    'quiero', 'puedo', 'donde',
]

# Render the corpus as a tree-shaped word cloud (Font Awesome "tree" icon).
# No output path is given, so the library default is used.
stylecloud.gen_stylecloud(
    file_path='corpus.txt',
    custom_stopwords=sw,
    icon_name='fas fa-tree',
    size=1024,
    max_font_size=400,
)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment