Last active
October 19, 2016 15:50
-
-
Save danielperezr88/45c9bc93c268758b2584babb48640db3 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import json | |
from datetime import date, timedelta | |
from collections import Counter | |
import unicodedata | |
from time import sleep | |
import MySQLdb.connections | |
import regex as re | |
import string | |
import os | |
from google.protobuf import timestamp_pb2 | |
from gcloud import storage | |
punctuation = list(string.punctuation)
import csv

# Spanish stop words, loaded below from a CSV pulled out of the pickle bucket.
lista_stop_words_spanish = []

CONFIG_BUCKET = 'configs-hf'
PICKLE_BUCKET = 'pickles-python'


def _download_blob_if_missing(bucket_name, blob_name):
    """Fetch *blob_name* from the given GCS bucket into services/ unless already cached locally."""
    local_path = os.path.join("services", blob_name)
    if not os.path.exists(local_path):
        client = storage.Client()
        cblob = client.get_bucket(bucket_name).get_blob(blob_name)
        # 'with' guarantees the handle is closed even if the download raises.
        with open(local_path, 'wb') as fp:
            cblob.download_to_file(fp)


## Download pickles and config file from pickle bucket
configfile = 'observatoriohf.py'
stpwrdfile = 'stop_words_spanish.csv'
_download_blob_if_missing(CONFIG_BUCKET, configfile)
_download_blob_if_missing(PICKLE_BUCKET, stpwrdfile)

# observatoriohf.py was just downloaded into services/ above; it holds the DB credentials.
import observatoriohf

# One stop word per CSV row; only the first column is meaningful.
with open(os.path.join("services", stpwrdfile), "r") as f:
    for line in csv.reader(f):
        lista_stop_words_spanish.append(line[0])
def quitar_acentos(input_str):
    """Return *input_str* with diacritics stripped.

    Decomposes to NFKD form so accented letters split into a base character
    plus combining marks, then drops every combining mark.
    """
    decomposed = unicodedata.normalize('NFKD', input_str)
    kept = [ch for ch in decomposed if not unicodedata.combining(ch)]
    return u"".join(kept)
# Pre-compiled URL validator with named groups (scheme, credentials, hostname, port,
# path, filename, query, fragment).  Used below to keep only well-formed http(s)/ftp
# links when bucketing tweet tokens; note the path group requires a trailing slash
# segment before the filename.
check_url_validity = re.compile(r'^((?P<scheme>https?|ftp):\/)?\/?((?P<username>.*?)(:(?P<password>.*?)|)@)?(?P<hostname>[^:\/\s]+)(?P<port>:([^\/]*))?(?P<path>(\/\w+)*\/)(?P<filename>[-\w.]+[^#?\s]*)?(?P<query>\?([^#]*))?(?P<fragment>#(.*))?$')
from AbstractServiceClass import AbstractServiceClass | |
class FreqCounter(AbstractServiceClass):
    """Daily batch job: for yesterday's tweets, counts mentions, hashtags, URLs and
    concepts per (search_id, sentiment) bucket and stores the top-50 rankings in the
    `frecuents` (JSON) and `frecuents_tableau` (semicolon-separated) tables.
    """

    def __init__(self):
        AbstractServiceClass.__init__(self)

    def _open_connection(self):
        """Open a MySQL connection using the credentials from observatoriohf."""
        return MySQLdb.connections.Connection(
            user=observatoriohf.dbuser,
            passwd=observatoriohf.dbpassword,
            host=observatoriohf.dbhost,
            db=observatoriohf.dbdatabase,
            charset='latin1'
        )

    def _clean_tweet_text(self, texto_tweet):
        """Lowercase *texto_tweet* and strip punctuation, the 'rt' marker and
        purely-numeric tokens, while preserving punctuation inside URLs."""
        # Punctuation that never appears inside URLs can go unconditionally.
        aux = re.sub(r'[-,;]', r' ', texto_tweet).lower()
        # URL-style punctuation is removed only at the start/end of a word.
        aux = re.sub(r'([.:?!¿¡]+ )|([.:?!¿¡]+$)|(^[.:?!¿¡]+)|( [.:?!¿¡]+)', r' ', aux)
        # Drop the retweet marker "rt" wherever it stands alone.
        aux = re.sub(r'(\Wrt\W)|(^rt\W)|(\Wrt$)', r' ', aux)
        # Drop tokens consisting only of non-letter characters (numeric data).
        return re.sub(r'(\W[^a-z\W]+\W)|(^[^a-z\W]+\W)|(\W[^a-z\W]+$)', r' ', aux)

    def _classify_tokens(self, tokens):
        """Split *tokens* into (mentions, hashtags, valid URLs, concepts).

        Concepts are whatever remains after filtering Spanish stop words; URLs
        must both start with 'http' and match check_url_validity.
        """
        lideres, hashtags, urls, concepts = [], [], [], []
        for elemento in tokens:
            if elemento.startswith('@'):
                lideres.append(elemento)
            elif elemento.startswith('#'):
                hashtags.append(elemento)
            elif elemento.startswith('http') and re.match(check_url_validity, elemento) is not None:
                urls.append(elemento)
            elif elemento not in lista_stop_words_spanish:
                concepts.append(elemento)
        return lideres, hashtags, urls, concepts

    def _store_frequents(self, sid, yesterday, mood, rankings):
        """Insert one (search_id, sentiment) row of top-50 rankings into both
        output tables; DB errors are logged, not raised (best-effort job)."""
        def tableau(pairs):
            # "term, count; term, count; ... ;" — the flat format Tableau ingests.
            return '; '.join([', '.join([str(val) for val in tpl]) for tpl in pairs]) + ';'

        try:
            conn2 = self._open_connection()
            cursor = conn2.cursor()
            anadir_registro = ("INSERT INTO frecuents (search_id, created_at, concepts, leaders, hashtags, urls, sentiment) VALUES ( %s, %s, %s, %s, %s, %s, %s)")
            registro = (str(sid), yesterday,
                        json.dumps(rankings['concepts']), json.dumps(rankings['leaders']),
                        json.dumps(rankings['hashtags']), json.dumps(rankings['urls']),
                        str(mood))
            cursor.execute(anadir_registro, registro)
            conn2.commit()
            sleep(5)  # throttle between inserts, keeping the original batch cadence
            anadir_registro = ("INSERT INTO frecuents_tableau (search_id, created_at, concepts, leaders, hashtags, urls, sentiment) VALUES ( %s, %s, %s, %s, %s, %s, %s)")
            registro = (str(sid), yesterday,
                        tableau(rankings['concepts']), tableau(rankings['leaders']),
                        tableau(rankings['hashtags']), tableau(rankings['urls']),
                        str(mood))
            cursor.execute(anadir_registro, registro)
            conn2.commit()
            sleep(5)
            cursor.close()
            conn2.close()
        except MySQLdb.Error as err:
            # BUG FIX: the original caught mysql.connector.Error, but mysql.connector
            # is never imported — any DB error raised NameError instead of being logged.
            self.log(err)

    def __doit__(self):
        """Run the daily aggregation for yesterday's date."""
        yesterday = date.today() - timedelta(1)
        con = self._open_connection()
        try:
            # Monthly-sharded tables: cnt_*_YYYY_MM.
            db_suffix = yesterday.strftime("%Y_%m")
            cur = con.cursor()
            # Which sentiment buckets exist for yesterday (polarity thresholds +/-38)?
            # Table-name suffixes cannot be parameterized; all VALUES are.
            cur.execute(
                "SELECT distinct(IF(t1.polarity>38,'pos',IF(t1.polarity<-38,'neg','unk'))) as sentiment "
                "FROM cnt_extra_" + db_suffix + " AS t1 JOIN cnt_info_" + db_suffix +
                " AS t2 ON t1.cnt_id = t2.cnt_id WHERE date(t1.created_at) = %s",
                (str(yesterday),))
            con.commit()
            moods = [row[0] for row in cur]
            if not moods:
                self.log("Couldn't find tweets for %s" % (str(yesterday),))
                return
            for mood in moods:
                # Accumulate cleaned tweet text keyed by search_id for this sentiment.
                texto = {}
                select_str = (
                    "select t1.content as content, t3.search_id as search_id, "
                    "IF(t2.polarity>38,'pos',IF(t2.polarity<-38,'neg','unk')) as sentiment "
                    "from cnt_scraped_" + db_suffix + " as t1 "
                    "JOIN cnt_extra_" + db_suffix + " as t2 on t1.cnt_id=t2.cnt_id "
                    "join cnt_info_" + db_suffix + " as t3 on t1.cnt_id=t3.cnt_id "
                    "where date(t2.created_at) = %s HAVING sentiment = %s")
                cur.execute(select_str, (str(yesterday), str(mood)))
                for row in cur:
                    if row[1] not in texto:
                        texto[row[1]] = ""
                    texto[row[1]] += ' ' + self._clean_tweet_text(row[0])
                con.commit()
                for sid in texto:
                    lideres, hashtags, urls, concepts = self._classify_tokens(texto[sid].split())
                    rankings = {}
                    # Keep the original log order: hashtags, leaders, concepts, urls.
                    for label, items in (('hashtags', hashtags), ('leaders', lideres),
                                         ('concepts', concepts), ('urls', urls)):
                        counts = Counter(items)
                        rankings[label] = counts.most_common(50)
                        self.log(counts.most_common(10))
                    self._store_frequents(sid, yesterday, mood, rankings)
            cur.close()
        finally:
            # Always release the connection, including the no-moods early return.
            con.close()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment