Last active
October 19, 2016 15:50
-
-
Save danielperezr88/45c9bc93c268758b2584babb48640db3 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import json | |
from datetime import date, timedelta | |
from collections import Counter | |
import unicodedata | |
from time import sleep | |
import MySQLdb.connections | |
import regex as re | |
import string | |
import os | |
from google.protobuf import timestamp_pb2 | |
from gcloud import storage | |
punctuation = list(string.punctuation)
import csv

# Spanish stop words, loaded below from a CSV pulled out of the pickle bucket.
lista_stop_words_spanish = []

CONFIG_BUCKET = 'configs-hf'
PICKLE_BUCKET = 'pickles-python'


def _download_blob_if_missing(bucket_name, blob_name):
    """Fetch *blob_name* from the given GCS bucket into services/ unless already cached locally."""
    local_path = os.path.join("services", blob_name)
    if not os.path.exists(local_path):
        client = storage.Client()
        cblob = client.get_bucket(bucket_name).get_blob(blob_name)
        # 'with' guarantees the handle is closed even if the download raises.
        with open(local_path, 'wb') as fp:
            cblob.download_to_file(fp)


## Download pickles and config file from pickle bucket
configfile = 'observatoriohf.py'
stpwrdfile = 'stop_words_spanish.csv'
_download_blob_if_missing(CONFIG_BUCKET, configfile)
_download_blob_if_missing(PICKLE_BUCKET, stpwrdfile)

# observatoriohf.py was just downloaded into services/ above; it holds the DB credentials.
import observatoriohf

# One stop word per CSV row; only the first column is meaningful.
with open(os.path.join("services", stpwrdfile), "r") as f:
    for line in csv.reader(f):
        lista_stop_words_spanish.append(line[0])
def quitar_acentos(input_str):
    """Return *input_str* with diacritics stripped.

    Decomposes to NFKD form so accented letters split into a base character
    plus combining marks, then drops every combining mark.
    """
    decomposed = unicodedata.normalize('NFKD', input_str)
    kept = [ch for ch in decomposed if not unicodedata.combining(ch)]
    return u"".join(kept)
# Pre-compiled URL validator with named groups (scheme, credentials, hostname, port,
# path, filename, query, fragment).  Used below to keep only well-formed http(s)/ftp
# links when bucketing tweet tokens; note the path group requires a trailing slash
# segment before the filename.
check_url_validity = re.compile(r'^((?P<scheme>https?|ftp):\/)?\/?((?P<username>.*?)(:(?P<password>.*?)|)@)?(?P<hostname>[^:\/\s]+)(?P<port>:([^\/]*))?(?P<path>(\/\w+)*\/)(?P<filename>[-\w.]+[^#?\s]*)?(?P<query>\?([^#]*))?(?P<fragment>#(.*))?$')
from AbstractServiceClass import AbstractServiceClass | |
class FreqCounter(AbstractServiceClass):
    """Daily batch job: for yesterday's tweets, counts mentions, hashtags, URLs and
    concepts per (search_id, sentiment) bucket and stores the top-50 rankings in the
    `frecuents` (JSON) and `frecuents_tableau` (semicolon-separated) tables.
    """

    def __init__(self):
        AbstractServiceClass.__init__(self)

    def _open_connection(self):
        """Open a MySQL connection using the credentials from observatoriohf."""
        return MySQLdb.connections.Connection(
            user=observatoriohf.dbuser,
            passwd=observatoriohf.dbpassword,
            host=observatoriohf.dbhost,
            db=observatoriohf.dbdatabase,
            charset='latin1'
        )

    def _clean_tweet_text(self, texto_tweet):
        """Lowercase *texto_tweet* and strip punctuation, the 'rt' marker and
        purely-numeric tokens, while preserving punctuation inside URLs."""
        # Punctuation that never appears inside URLs can go unconditionally.
        aux = re.sub(r'[-,;]', r' ', texto_tweet).lower()
        # URL-style punctuation is removed only at the start/end of a word.
        aux = re.sub(r'([.:?!¿¡]+ )|([.:?!¿¡]+$)|(^[.:?!¿¡]+)|( [.:?!¿¡]+)', r' ', aux)
        # Drop the retweet marker "rt" wherever it stands alone.
        aux = re.sub(r'(\Wrt\W)|(^rt\W)|(\Wrt$)', r' ', aux)
        # Drop tokens consisting only of non-letter characters (numeric data).
        return re.sub(r'(\W[^a-z\W]+\W)|(^[^a-z\W]+\W)|(\W[^a-z\W]+$)', r' ', aux)

    def _classify_tokens(self, tokens):
        """Split *tokens* into (mentions, hashtags, valid URLs, concepts).

        Concepts are whatever remains after filtering Spanish stop words; URLs
        must both start with 'http' and match check_url_validity.
        """
        lideres, hashtags, urls, concepts = [], [], [], []
        for elemento in tokens:
            if elemento.startswith('@'):
                lideres.append(elemento)
            elif elemento.startswith('#'):
                hashtags.append(elemento)
            elif elemento.startswith('http') and re.match(check_url_validity, elemento) is not None:
                urls.append(elemento)
            elif elemento not in lista_stop_words_spanish:
                concepts.append(elemento)
        return lideres, hashtags, urls, concepts

    def _store_frequents(self, sid, yesterday, mood, rankings):
        """Insert one (search_id, sentiment) row of top-50 rankings into both
        output tables; DB errors are logged, not raised (best-effort job)."""
        def tableau(pairs):
            # "term, count; term, count; ... ;" — the flat format Tableau ingests.
            return '; '.join([', '.join([str(val) for val in tpl]) for tpl in pairs]) + ';'

        try:
            conn2 = self._open_connection()
            cursor = conn2.cursor()
            anadir_registro = ("INSERT INTO frecuents (search_id, created_at, concepts, leaders, hashtags, urls, sentiment) VALUES ( %s, %s, %s, %s, %s, %s, %s)")
            registro = (str(sid), yesterday,
                        json.dumps(rankings['concepts']), json.dumps(rankings['leaders']),
                        json.dumps(rankings['hashtags']), json.dumps(rankings['urls']),
                        str(mood))
            cursor.execute(anadir_registro, registro)
            conn2.commit()
            sleep(5)  # throttle between inserts, keeping the original batch cadence
            anadir_registro = ("INSERT INTO frecuents_tableau (search_id, created_at, concepts, leaders, hashtags, urls, sentiment) VALUES ( %s, %s, %s, %s, %s, %s, %s)")
            registro = (str(sid), yesterday,
                        tableau(rankings['concepts']), tableau(rankings['leaders']),
                        tableau(rankings['hashtags']), tableau(rankings['urls']),
                        str(mood))
            cursor.execute(anadir_registro, registro)
            conn2.commit()
            sleep(5)
            cursor.close()
            conn2.close()
        except MySQLdb.Error as err:
            # BUG FIX: the original caught mysql.connector.Error, but mysql.connector
            # is never imported — any DB error raised NameError instead of being logged.
            self.log(err)

    def __doit__(self):
        """Run the daily aggregation for yesterday's date."""
        yesterday = date.today() - timedelta(1)
        con = self._open_connection()
        try:
            # Monthly-sharded tables: cnt_*_YYYY_MM.
            db_suffix = yesterday.strftime("%Y_%m")
            cur = con.cursor()
            # Which sentiment buckets exist for yesterday (polarity thresholds +/-38)?
            # Table-name suffixes cannot be parameterized; all VALUES are.
            cur.execute(
                "SELECT distinct(IF(t1.polarity>38,'pos',IF(t1.polarity<-38,'neg','unk'))) as sentiment "
                "FROM cnt_extra_" + db_suffix + " AS t1 JOIN cnt_info_" + db_suffix +
                " AS t2 ON t1.cnt_id = t2.cnt_id WHERE date(t1.created_at) = %s",
                (str(yesterday),))
            con.commit()
            moods = [row[0] for row in cur]
            if not moods:
                self.log("Couldn't find tweets for %s" % (str(yesterday),))
                return
            for mood in moods:
                # Accumulate cleaned tweet text keyed by search_id for this sentiment.
                texto = {}
                select_str = (
                    "select t1.content as content, t3.search_id as search_id, "
                    "IF(t2.polarity>38,'pos',IF(t2.polarity<-38,'neg','unk')) as sentiment "
                    "from cnt_scraped_" + db_suffix + " as t1 "
                    "JOIN cnt_extra_" + db_suffix + " as t2 on t1.cnt_id=t2.cnt_id "
                    "join cnt_info_" + db_suffix + " as t3 on t1.cnt_id=t3.cnt_id "
                    "where date(t2.created_at) = %s HAVING sentiment = %s")
                cur.execute(select_str, (str(yesterday), str(mood)))
                for row in cur:
                    if row[1] not in texto:
                        texto[row[1]] = ""
                    texto[row[1]] += ' ' + self._clean_tweet_text(row[0])
                con.commit()
                for sid in texto:
                    lideres, hashtags, urls, concepts = self._classify_tokens(texto[sid].split())
                    rankings = {}
                    # Keep the original log order: hashtags, leaders, concepts, urls.
                    for label, items in (('hashtags', hashtags), ('leaders', lideres),
                                         ('concepts', concepts), ('urls', urls)):
                        counts = Counter(items)
                        rankings[label] = counts.most_common(50)
                        self.log(counts.most_common(10))
                    self._store_frequents(sid, yesterday, mood, rankings)
            cur.close()
        finally:
            # Always release the connection, including the no-moods early return.
            con.close()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment