import csv
import json
import os
import string
import unicodedata
from collections import Counter
from datetime import date, timedelta
from time import sleep

import MySQLdb.connections
import regex as re
from gcloud import storage
from google.protobuf import timestamp_pb2

#from nltk.corpus import stopwords
#stop = stopwords.words('spanish')

punctuation = list(string.punctuation)
lista_stop_words_spanish = []

CONFIG_BUCKET = 'configs-hf'
PICKLE_BUCKET = 'pickles-python'
## Download the config file from the config bucket and the stopword list
## from the pickle bucket, unless they are already present locally
configfile = 'observatoriohf.py'
stpwrdfile = 'stop_words_spanish.csv'

if not os.path.exists(os.path.join("services", configfile)):
    client = storage.Client()
    cblob = client.get_bucket(CONFIG_BUCKET).get_blob(configfile)
    with open(os.path.join("services", configfile), 'wb') as fp:
        cblob.download_to_file(fp)

if not os.path.exists(os.path.join("services", stpwrdfile)):
    client = storage.Client()
    cblob = client.get_bucket(PICKLE_BUCKET).get_blob(stpwrdfile)
    with open(os.path.join("services", stpwrdfile), 'wb') as fp:
        cblob.download_to_file(fp)

import observatoriohf

with open(os.path.join("services", stpwrdfile), "r") as f:
    reader = csv.reader(f)
    for line in reader:
        lista_stop_words_spanish.append(line[0])
def quitar_acentos(input_str):
    nfkd_form = unicodedata.normalize('NFKD', input_str)
    return u"".join([c for c in nfkd_form if not unicodedata.combining(c)])
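# quitar_acentos strips diacritics by NFKD-decomposing the string and dropping
# the combining marks. A quick sanity check (example value, not from the gist):
#
#   >>> quitar_acentos(u'canción aquí')
#   u'cancion aqui'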
check_url_validity = re.compile(r'^((?P<scheme>https?|ftp):\/)?\/?((?P<username>.*?)(:(?P<password>.*?)|)@)?(?P<hostname>[^:\/\s]+)(?P<port>:([^\/]*))?(?P<path>(\/\w+)*\/)(?P<filename>[-\w.]+[^#?\s]*)?(?P<query>\?([^#]*))?(?P<fragment>#(.*))?$')
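# check_url_validity exposes named groups for the URL parts. Illustrative
# only (the sample URL below is not from the gist); note that the mandatory
# path group means a bare domain with no trailing '/' does not match:
#
#   >>> m = re.match(check_url_validity, 'http://example.com/path/page?q=1')
#   >>> m.group('hostname'), m.group('query')
#   ('example.com', '?q=1')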
from AbstractServiceClass import AbstractServiceClass


class FreqCounter(AbstractServiceClass):

    def __init__(self):
        AbstractServiceClass.__init__(self)
    def __doit__(self):
        yesterday = date.today() - timedelta(1)
        con = MySQLdb.connections.Connection(
            user=observatoriohf.dbuser,
            passwd=observatoriohf.dbpassword,
            host=observatoriohf.dbhost,
            db=observatoriohf.dbdatabase,
            charset='latin1'
        )
        #yesterday = date.today()
        db_suffix = yesterday.strftime("%Y_%m")
        cur = con.cursor()
        # Table names cannot be parameterized, so the month suffix is
        # interpolated directly; the date goes through a driver placeholder
        cur.execute(
            "SELECT DISTINCT IF(t1.polarity > 38, 'pos', IF(t1.polarity < -38, 'neg', 'unk')) AS sentiment"
            " FROM cnt_extra_" + db_suffix + " AS t1"
            " JOIN cnt_info_" + db_suffix + " AS t2 ON t1.cnt_id = t2.cnt_id"
            " WHERE date(t1.created_at) = %s",
            (str(yesterday),)
        )
        con.commit()
        moods = [row[0] for row in cur]
        if not moods:
            self.log("Couldn't find tweets for %s" % (str(yesterday),))
            con.close()
            return
        texto = {}
        lideres = {}
        hashtags = {}
        urls = {}
        concepts = {}
        for mood in moods:
            if mood not in texto:
                texto[mood] = {}
                lideres[mood] = {}
                hashtags[mood] = {}
                urls[mood] = {}
                concepts[mood] = {}
            select_str = (
                "SELECT t1.content AS content, t3.search_id AS search_id,"
                " IF(t2.polarity > 38, 'pos', IF(t2.polarity < -38, 'neg', 'unk')) AS sentiment"
                " FROM cnt_scraped_" + db_suffix + " AS t1"
                " JOIN cnt_extra_" + db_suffix + " AS t2 ON t1.cnt_id = t2.cnt_id"
                " JOIN cnt_info_" + db_suffix + " AS t3 ON t1.cnt_id = t3.cnt_id"
                " WHERE date(t2.created_at) = %s"
                " HAVING sentiment = %s"
            )
            cur.execute(select_str, (str(yesterday), str(mood)))
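            # For illustration, with db_suffix == '2016_10' and mood == 'pos'
            # the statement sent to MySQL is roughly (example values, not from
            # the gist):
            #
            #   SELECT t1.content, t3.search_id, IF(t2.polarity > 38, ...) AS sentiment
            #   FROM cnt_scraped_2016_10 AS t1
            #   JOIN cnt_extra_2016_10 AS t2 ON t1.cnt_id = t2.cnt_id
            #   JOIN cnt_info_2016_10 AS t3 ON t1.cnt_id = t3.cnt_id
            #   WHERE date(t2.created_at) = '2016-10-18'
            #   HAVING sentiment = 'pos'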
            # Collect the tweets to analyse from the cur.execute selection
            for row in cur:
                if row[1] not in texto[mood]:
                    texto[mood][row[1]] = ""
                # Bytearray to utf-8 string
                texto_tweet = row[0]
                # Strip punctuation that never belongs to URLs
                aux = re.sub(r'[-,;]', r' ', texto_tweet).lower()
                # Strip URL-style punctuation only when it appears at the
                # start or end of a word
                aux = re.sub(r'([.:?!¿¡]+ )|([.:?!¿¡]+$)|(^[.:?!¿¡]+)|( [.:?!¿¡]+)', r' ', aux)
                # Strip "rt"
                aux = re.sub(r'(\Wrt\W)|(^rt\W)|(\Wrt$)', r' ', aux)
                # Strip numeric tokens (words consisting solely of digits)
                texto[mood][row[1]] += ' ' + re.sub(r'(\W[^a-z\W]+\W)|(^[^a-z\W]+\W)|(\W[^a-z\W]+$)', r' ', aux)
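                # Tracing the pipeline on a made-up tweet (not from the gist):
                #   'RT @user: Gran día! 100 http://t.co/x'
                # lowercases, drops the ':' and '!' at word edges, removes the
                # leading 'rt' and the all-digit token '100', and leaves the
                # URL intact, giving roughly ' @user gran día http://t.co/x'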
            con.commit()
            for sid in texto[mood]:
                lideres[mood][sid] = []
                hashtags[mood][sid] = []
                urls[mood][sid] = []
                concepts[mood][sid] = []
            for sid in texto[mood]:
                lista_texto = texto[mood][sid].split()
                for elemento in lista_texto:
                    if elemento.startswith('@'):
                        lideres[mood][sid].append(elemento)
                    elif elemento.startswith('#'):
                        hashtags[mood][sid].append(elemento)
                    elif elemento.startswith('http') and re.match(check_url_validity, elemento) is not None:
                        urls[mood][sid].append(elemento)
                    else:
                        # drop the stopwords
                        if elemento not in lista_stop_words_spanish:
                            concepts[mood][sid].append(elemento)
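                # Routing examples (made-up tokens, not from the gist):
                #   '@jdoe'        -> lideres  (opinion leaders)
                #   '#trending'    -> hashtags
                #   'http://e.co/' -> urls     (only if the regex accepts it)
                #   'palabra'      -> concepts (unless it is a stopword)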
""" -------------------- HASHTAGS ----------------------------- """
count_all_hashtags = Counter()
#actualizamos el contador que es como +=
count_all_hashtags.update(hashtags[mood][sid])
hashtags_frecuentes = count_all_hashtags.most_common(50)
self.log(count_all_hashtags.most_common(10))
""" -------------------- LIDERES DE OPINION ---------------- """
count_all_lideres = Counter()
count_all_lideres.update(lideres[mood][sid])
lideres_frecuentes = count_all_lideres.most_common(50)
self.log(count_all_lideres.most_common(10))
"""-------------------- CONCEPTOS ---------------------------- """
count_all_concepts = Counter()
count_all_concepts.update(concepts[mood][sid])
concepts_frecuentes = count_all_concepts.most_common(50)
self.log(count_all_concepts.most_common(10))
"""-------------------- URL´S -------------------------------- """
count_all_urls = Counter()
count_all_urls.update(urls[mood][sid])
urls_frecuentes = count_all_urls.most_common(50)
self.log(count_all_urls.most_common(10))
                concepts_json = json.dumps(concepts_frecuentes)
                lideres_json = json.dumps(lideres_frecuentes)
                hashtags_json = json.dumps(hashtags_frecuentes)
                urls_json = json.dumps(urls_frecuentes)
                concepts_tableau = '; '.join([', '.join([str(val) for val in tpl]) for tpl in concepts_frecuentes]) + ';'
                lideres_tableau = '; '.join([', '.join([str(val) for val in tpl]) for tpl in lideres_frecuentes]) + ';'
                hashtags_tableau = '; '.join([', '.join([str(val) for val in tpl]) for tpl in hashtags_frecuentes]) + ';'
                urls_tableau = '; '.join([', '.join([str(val) for val in tpl]) for tpl in urls_frecuentes]) + ';'
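                # Shape of the two serializations, on made-up counts:
                #   [('#a', 3), ('#b', 1)] becomes
                #   JSON:    '[["#a", 3], ["#b", 1]]'
                #   Tableau: '#a, 3; #b, 1;'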
                try:
                    conn2 = MySQLdb.connections.Connection(
                        user=observatoriohf.dbuser,
                        passwd=observatoriohf.dbpassword,
                        host=observatoriohf.dbhost,
                        db=observatoriohf.dbdatabase,
                        charset='latin1'
                    )
                    cursor = conn2.cursor()
                    anadir_registro = ("INSERT INTO frecuents"
                                       " (search_id, created_at, concepts, leaders, hashtags, urls, sentiment)"
                                       " VALUES (%s, %s, %s, %s, %s, %s, %s)")
                    registro = (str(sid), yesterday, concepts_json, lideres_json, hashtags_json, urls_json, str(mood))
                    cursor.execute(anadir_registro, registro)
                    conn2.commit()
                    sleep(5)
                    anadir_registro = ("INSERT INTO frecuents_tableau"
                                       " (search_id, created_at, concepts, leaders, hashtags, urls, sentiment)"
                                       " VALUES (%s, %s, %s, %s, %s, %s, %s)")
                    registro = (str(sid), yesterday, concepts_tableau, lideres_tableau, hashtags_tableau, urls_tableau, str(mood))
                    cursor.execute(anadir_registro, registro)
                    conn2.commit()
                    sleep(5)
                    cursor.close()
                    conn2.close()
                except MySQLdb.Error as err:
                    self.log(err)
        cur.close()
        con.close()