eiriks/ism_detector.py

## ism_detector.py
#!/usr/bin/env python
# encoding: utf-8
"""
Created by Eirik Stavelin in 2015
Copyright (c) 2015 Eirik Stavelin. All rights reserved.

Modul for å finne -ismer i tekst, for å klare opp et veddemål (ingen vant).
Resultater: http://stavelin.com/uib/ismer

"""

import sys, time, string
import os

import pymysql
from collections import Counter

import logging  #DEBUG, INFO, WARNING, ERROR, CRITICAL
# Route every record from DEBUG upwards to stderr; stdout is used by the
# progress bar and the printed results.
logging.basicConfig(level=logging.DEBUG, stream=sys.stderr)
logging.info("we are running")


def connect():
    """Connect to MySQL through the MAMP unix socket and select the ``nrk`` database.

    Returns:
        A ``(connection, cursor)`` tuple.

    Raises:
        Exception: whatever was raised while connecting, creating the cursor
            or selecting the database. The failure is logged, then re-raised.
    """
    rdbms_username = "******"
    rdbms_password = "******"
    try:
        connection = pymysql.connect(
            unix_socket="/Applications/MAMP/tmp/mysql/mysql.sock",
            user=rdbms_username,
            passwd=rdbms_password,
            db="nrk",
            charset='utf8',
        )
        cur = connection.cursor()
        cur.execute("USE nrk;")
    except Exception:
        logging.error("kunne ikke logge på databasen")
        # Re-raise instead of falling through and returning None: callers
        # unpack the result into two names, so a None return would only
        # surface later as an unrelated TypeError.
        raise
    logging.debug("koblet til MySQL")
    return connection, cur


def disconnect(connection):
    """Close *connection* if it is truthy; do nothing for ``None``/falsy values.

    Args:
        connection: object exposing ``close()``, or ``None``.
    """
    if connection:
        connection.close()
        # The logging module has no ``notset`` function (NOTSET is only a
        # level constant), so log the message at DEBUG level instead.
        logging.debug("koblet av mysql")

def update_progress(progress):
    """Redraw a ten-character text progress bar on stdout.

    Args:
        progress: fraction completed. Ints are converted to float. Any other
            non-float value is shown as 0 with an error note. Negative values
            are shown as 0 ("Halt...") and values >= 1 as 1 ("Done...").
    """
    bar_width = 10  # number of characters between the brackets
    note = ""
    if isinstance(progress, int):
        progress = float(progress)

    if not isinstance(progress, float):
        progress, note = 0, "error: progress var must be float\r\n"
    elif progress < 0:
        progress, note = 0, "Halt...\r\n"
    elif progress >= 1:
        progress, note = 1, "Done...\r\n"

    filled = int(round(bar_width * progress))
    bar = "#" * filled + "-" * (bar_width - filled)
    # The leading "\r" moves the cursor back to the start of the line so each
    # call overwrites the previous bar.
    sys.stdout.write("\rPercent: [{0}] {1}% {2}".format(bar, progress * 100, note))
    sys.stdout.flush()


_STOPWORD_CACHE = {}  # language name -> frozenset of stopwords, filled on first use


def _norwegian_stopwords():
    """Return NLTK's Norwegian stopwords as a frozenset, built only once."""
    if 'norwegian' not in _STOPWORD_CACHE:
        from nltk.corpus import stopwords
        # Entries may be byte strings or text depending on the NLTK/Python
        # version in use; only byte strings need decoding.
        _STOPWORD_CACHE['norwegian'] = frozenset(
            word.decode('utf-8') if isinstance(word, bytes) else word
            for word in stopwords.words('norwegian')
        )
    return _STOPWORD_CACHE['norwegian']


def find_isms(text):
    """Return the sorted -ism words found in *text*.

    The text is lower-cased and tokenized with NLTK's ``wordpunct_tokenize``;
    tokens ending in 'isme', 'ismen' or 'ismene' are kept unless they are
    Norwegian stopwords or punctuation. Repeated words are kept as duplicates.

    Args:
        text: the text to scan. ``None`` or an empty string gives ``[]``.

    Returns:
        A sorted list of matching lower-cased tokens.
    """
    if not text:
        # Nothing to tokenize (e.g. a NULL column value read from the database).
        return []

    from nltk import wordpunct_tokenize

    stop_words = _norwegian_stopwords()
    return sorted(
        word
        for word in wordpunct_tokenize(text.lower())
        if word.endswith(('isme', 'ismen', 'ismene'))
        and word not in stop_words
        and word not in string.punctuation
    )

def loop_text():
    """Store the -ism words of every row of ``nrk2013b_ism_tbl`` in its ``ismer`` column.

    Fetches title + full_text and id for all rows ordered by id, runs
    find_isms() on each text, writes the comma-joined result back (committing
    after every row), prints what was written and redraws the progress bar.
    The connection is closed when the loop ends, including on error.
    """
    connection, cur = connect()
    try:
        mysql_query = '''SELECT CONCAT(title, " ", full_text) as text, id FROM
                    nrk2013b_ism_tbl order by id '''
        cur.execute(mysql_query)
        mysql_rows = cur.fetchall()
        total_rows = float(len(mysql_rows))  # float so the progress ratio is not truncated

        for current_row, (text, row_id) in enumerate(mysql_rows, 1):
            isms = find_isms(text)
            if isms is not None:
                # Single-argument print(...) gives the same output as the
                # print statement and also parses on Python 3.
                print("%s %s" % (isms, row_id))

                cur.execute(""" UPDATE nrk2013b_ism_tbl SET ismer=%s WHERE id=%s
                         """, (','.join(isms), row_id))
                connection.commit()
                print("Number of rows updated: %s" % cur.rowcount)

            update_progress(current_row / total_rows)
    finally:
        connection.close()

def _strip_ism_suffix(word):
    """Return *word* with one trailing 'ismene', 'ismen' or 'isme' removed."""
    for suffix in ('ismene', 'ismen', 'isme'):
        if word.endswith(suffix):
            # Stop after the first match so only a single ending is removed.
            return word[:-len(suffix)]
    return word


def get_results():
    """Print the 20 most common -ism stems stored in ``nrk2013b_ism_tbl``.

    Reads every non-empty ``ismer`` value, splits it on commas, strips the
    -isme/-ismen/-ismene ending from each word and counts the stems. The
    result is printed as a list of ``(stem, count)`` tuples.
    """
    connection, cur = connect()
    try:
        # A variant that also excluded template "nrk_alfa" used to be assigned
        # first but was overwritten before use; this is the query that ran.
        cur.execute('''SELECT ismer FROM nrk2013b_ism_tbl WHERE `ismer` !=""''')
        mysql_rows = cur.fetchall()
    finally:
        connection.close()

    cnt = Counter(
        _strip_ism_suffix(isme)
        for row in mysql_rows
        for isme in row[0].split(",")
    )
    print(cnt.most_common(20))


if __name__ == '__main__':
    # Script entry point: tag every row first, then print the frequency table.
    for step in (loop_text, get_results):
        step()
    raise SystemExit(0)
	#!/usr/bin/env python
	# encoding: utf-8
	"""
	Created by Eirik Stavelin in 2015
	Copyright (c) 2015 Eirik Stavelin. All rights reserved.

	Modul for å finne -ismer i tekst, for å klare opp et veddemål (ingen vant).
	Resultater: http://stavelin.com/uib/ismer

	"""

	import sys, time, string
	import os

	import pymysql
	from collections import Counter

	import logging #DEBUG, INFO, WARNING, ERROR, CRITICAL
	# Route every record from DEBUG upwards to stderr.
	logging.basicConfig(level=logging.DEBUG, stream=sys.stderr)
	logging.info("we are running")


	def connect():
	    """Connect to MySQL through the MAMP unix socket and select the ``nrk`` database.

	    Returns:
	        A ``(connection, cursor)`` tuple.

	    Raises:
	        Exception: whatever was raised while connecting, creating the cursor
	            or selecting the database. The failure is logged, then re-raised.
	    """
	    rdbms_username = "******"
	    rdbms_password = "******"
	    try:
	        connection = pymysql.connect(
	            unix_socket="/Applications/MAMP/tmp/mysql/mysql.sock",
	            user=rdbms_username,
	            passwd=rdbms_password,
	            db="nrk",
	            charset='utf8',
	        )
	        cur = connection.cursor()
	        cur.execute("USE nrk;")
	    except Exception:
	        logging.error("kunne ikke logge på databasen")
	        # Re-raise instead of falling through and returning None: callers
	        # unpack the result into two names, so a None return would only
	        # surface later as an unrelated TypeError.
	        raise
	    logging.debug("koblet til MySQL")
	    return connection, cur


	def disconnect(connection):
	    """Close *connection* if it is truthy; do nothing for ``None``/falsy values.

	    Args:
	        connection: object exposing ``close()``, or ``None``.
	    """
	    if connection:
	        connection.close()
	        # The logging module has no ``notset`` function (NOTSET is only a
	        # level constant), so log the message at DEBUG level instead.
	        logging.debug("koblet av mysql")

	def update_progress(progress):
	    """Redraw a ten-character text progress bar on stdout.

	    Args:
	        progress: fraction completed. Ints are converted to float. Any other
	            non-float value is shown as 0 with an error note. Negative values
	            are shown as 0 ("Halt...") and values >= 1 as 1 ("Done...").
	    """
	    bar_width = 10  # number of characters between the brackets
	    note = ""
	    if isinstance(progress, int):
	        progress = float(progress)

	    if not isinstance(progress, float):
	        progress, note = 0, "error: progress var must be float\r\n"
	    elif progress < 0:
	        progress, note = 0, "Halt...\r\n"
	    elif progress >= 1:
	        progress, note = 1, "Done...\r\n"

	    filled = int(round(bar_width * progress))
	    # String repetition needs explicit ``*`` operators to build the bar.
	    bar = "#" * filled + "-" * (bar_width - filled)
	    # The leading "\r" moves the cursor back to the start of the line so each
	    # call overwrites the previous bar.
	    sys.stdout.write("\rPercent: [{0}] {1}% {2}".format(bar, progress * 100, note))
	    sys.stdout.flush()


	_STOPWORD_CACHE = {}  # language name -> frozenset of stopwords, filled on first use


	def _norwegian_stopwords():
	    """Return NLTK's Norwegian stopwords as a frozenset, built only once."""
	    if 'norwegian' not in _STOPWORD_CACHE:
	        from nltk.corpus import stopwords
	        # Entries may be byte strings or text depending on the NLTK/Python
	        # version in use; only byte strings need decoding.
	        _STOPWORD_CACHE['norwegian'] = frozenset(
	            word.decode('utf-8') if isinstance(word, bytes) else word
	            for word in stopwords.words('norwegian')
	        )
	    return _STOPWORD_CACHE['norwegian']


	def find_isms(text):
	    """Return the sorted -ism words found in *text*.

	    The text is lower-cased and tokenized with NLTK's ``wordpunct_tokenize``;
	    tokens ending in 'isme', 'ismen' or 'ismene' are kept unless they are
	    Norwegian stopwords or punctuation. Repeated words are kept as duplicates.

	    Args:
	        text: the text to scan. ``None`` or an empty string gives ``[]``.

	    Returns:
	        A sorted list of matching lower-cased tokens.
	    """
	    if not text:
	        # Nothing to tokenize (e.g. a NULL column value read from the database).
	        return []

	    from nltk import wordpunct_tokenize

	    stop_words = _norwegian_stopwords()
	    return sorted(
	        word
	        for word in wordpunct_tokenize(text.lower())
	        if word.endswith(('isme', 'ismen', 'ismene'))
	        and word not in stop_words
	        and word not in string.punctuation
	    )

	def loop_text():
	    """Store the -ism words of every row of ``nrk2013b_ism_tbl`` in its ``ismer`` column.

	    Fetches title + full_text and id for all rows ordered by id, runs
	    find_isms() on each text, writes the comma-joined result back (committing
	    after every row), prints what was written and redraws the progress bar.
	    The connection is closed when the loop ends, including on error.
	    """
	    connection, cur = connect()
	    try:
	        mysql_query = '''SELECT CONCAT(title, " ", full_text) as text, id FROM
	                    nrk2013b_ism_tbl order by id '''
	        cur.execute(mysql_query)
	        mysql_rows = cur.fetchall()
	        total_rows = float(len(mysql_rows))  # float so the progress ratio is not truncated

	        for current_row, (text, row_id) in enumerate(mysql_rows, 1):
	            isms = find_isms(text)
	            if isms is not None:
	                # Single-argument print(...) gives the same output as the
	                # print statement and also parses on Python 3.
	                print("%s %s" % (isms, row_id))

	                cur.execute(""" UPDATE nrk2013b_ism_tbl SET ismer=%s WHERE id=%s
	                         """, (','.join(isms), row_id))
	                connection.commit()
	                print("Number of rows updated: %s" % cur.rowcount)

	            update_progress(current_row / total_rows)
	    finally:
	        connection.close()

	def _strip_ism_suffix(word):
	    """Return *word* with one trailing 'ismene', 'ismen' or 'isme' removed."""
	    for suffix in ('ismene', 'ismen', 'isme'):
	        if word.endswith(suffix):
	            # Stop after the first match so only a single ending is removed.
	            return word[:-len(suffix)]
	    return word


	def get_results():
	    """Print the 20 most common -ism stems stored in ``nrk2013b_ism_tbl``.

	    Reads every non-empty ``ismer`` value, splits it on commas, strips the
	    -isme/-ismen/-ismene ending from each word and counts the stems. The
	    result is printed as a list of ``(stem, count)`` tuples.
	    """
	    connection, cur = connect()
	    try:
	        # A variant that also excluded template "nrk_alfa" used to be assigned
	        # first but was overwritten before use; this is the query that ran.
	        cur.execute('''SELECT ismer FROM nrk2013b_ism_tbl WHERE `ismer` !=""''')
	        mysql_rows = cur.fetchall()
	    finally:
	        connection.close()

	    cnt = Counter(
	        _strip_ism_suffix(isme)
	        for row in mysql_rows
	        for isme in row[0].split(",")
	    )
	    print(cnt.most_common(20))




	if __name__ == '__main__':
	    # Script entry point: tag every row first, then print the frequency table.
	    loop_text()
	    get_results()
	    sys.exit(0)