Skip to content

Instantly share code, notes, and snippets.

@eiriks
Last active August 29, 2015 14:22
Show Gist options
  • Save eiriks/34edb6f2bff9adde6502 to your computer and use it in GitHub Desktop.
Save eiriks/34edb6f2bff9adde6502 to your computer and use it in GitHub Desktop.
Modul for å finne -ismer i tekst, for å klare opp et veddemål. (ingen vant). Resultater: http://stavelin.com/uib/ismer
#!/usr/bin/env python
# encoding: utf-8
"""
Created by Eirik Stavelin in 2015
Copyright (c) 2015 Eirik Stavelin. All rights reserved.
Modul for å finne -ismer i tekst, for å klare opp et veddemål (ingen vant).
Resultater: http://stavelin.com/uib/ismer
"""
import sys, time, string
import os
import pymysql
from collections import Counter
import logging #DEBUG, INFO, WARNING, ERROR, CRITICAL
# Send every log record of level DEBUG and above to stderr, then emit a
# start-up message so it is visible that the script has begun.
logging.basicConfig(level=logging.DEBUG, stream=sys.stderr)
logging.info("we are running")
def connect():
    """Open a connection to the ``nrk`` MySQL database and return it with a cursor.

    The connection goes through the MAMP unix socket, so no hostname is
    involved. The credentials are masked in the published source.

    Returns:
        tuple: ``(connection, cursor)``; the cursor has already executed
        ``USE nrk;``.

    Raises:
        Exception: whatever the driver raised while connecting, creating the
            cursor or selecting the database. The error is logged first and
            any half-opened connection is closed before re-raising.
    """
    rdbms_username = "******"
    rdbms_password = "******"
    connection = None
    try:
        connection = pymysql.connect(
            unix_socket="/Applications/MAMP/tmp/mysql/mysql.sock",
            user=rdbms_username,
            passwd=rdbms_password,
            db="nrk",
            charset='utf8',
        )
        cur = connection.cursor()
        cur.execute("USE nrk;")
    except Exception:
        # Swallowing the error here would hand callers an implicit None, and
        # their ``connection, cur = connect()`` unpacking would then fail with
        # an unrelated TypeError. Log with traceback and let the real error
        # propagate instead.
        logging.exception("kunne ikke logge på databasen")
        if connection is not None:
            connection.close()
        raise
    logging.debug("koblet til MySQL")
    return connection, cur
def disconnect(connection):
    """Close *connection* if one was supplied.

    Args:
        connection: An object with a ``close()`` method, or a falsy value
            (e.g. ``None``) when there is nothing to close.
    """
    if connection:
        connection.close()
        # The logging module has no ``notset`` function (``logging.NOTSET`` is
        # only a level constant), so calling it raised AttributeError after
        # every successful close. DEBUG matches the level used for the
        # "koblet til MySQL" message in connect().
        logging.debug("koblet av mysql")
def update_progress(progress):
    """Redraw a single-line text progress bar on stdout.

    Args:
        progress: Fraction completed. Ints are converted to float; any other
            non-float value is reported as an error and drawn as 0. Values
            below 0 are drawn as 0 ("Halt...") and values of 1 or more are
            drawn as 1 ("Done...").
    """
    bar_width = 10  # number of character cells inside the brackets
    note = ""

    if isinstance(progress, int):
        progress = float(progress)
    elif not isinstance(progress, float):
        progress = 0
        note = "error: progress var must be float\r\n"

    # Clamp to the [0, 1] range; the clamped values are plain ints, so the
    # percentage below renders as "0" / "100" rather than "0.0" / "100.0".
    if progress < 0:
        progress = 0
        note = "Halt...\r\n"
    elif progress >= 1:
        progress = 1
        note = "Done...\r\n"

    filled = int(round(bar_width * progress))
    bar = "#" * filled + "-" * (bar_width - filled)
    # The leading carriage return makes each call overwrite the previous bar.
    sys.stdout.write("\rPercent: [{0}] {1}% {2}".format(bar, progress * 100, note))
    sys.stdout.flush()
_NORWEGIAN_STOPWORDS = None  # filled on first use by _norwegian_stopwords()


def _norwegian_stopwords():
    """Return NLTK's Norwegian stopwords as a cached frozenset of text strings."""
    global _NORWEGIAN_STOPWORDS
    if _NORWEGIAN_STOPWORDS is None:
        from nltk.corpus import stopwords
        # The original code decoded every entry from UTF-8, which implies the
        # corpus returned byte strings in its environment. Decode only when
        # that is actually the case so text strings are accepted too.
        _NORWEGIAN_STOPWORDS = frozenset(
            word.decode('utf-8') if isinstance(word, bytes) else word
            for word in stopwords.words('norwegian')
        )
    return _NORWEGIAN_STOPWORDS


def find_isms(text):
    """Return the sorted list of "-isme" words found in *text*.

    The text is lower-cased and tokenized with NLTK's ``wordpunct_tokenize``.
    Norwegian stopwords and punctuation tokens are skipped, and every
    remaining token ending in 'isme', 'ismen' or 'ismene' is collected.
    Duplicates are kept.

    Args:
        text: The text to scan. ``None`` or an empty string yields ``[]``.

    Returns:
        list: Matching tokens in sorted order, possibly empty.
    """
    if not text:
        # Nothing to tokenize; also avoids calling .lower() on None.
        return []

    from nltk import wordpunct_tokenize

    # Loading the stopword corpus is loop-invariant work; this function is
    # called once per database row, so the set is built once and reused.
    stopwords = _norwegian_stopwords()
    suffixes = ('isme', 'ismen', 'ismene')

    isms_list = []
    for word in wordpunct_tokenize(text.lower()):
        if word in stopwords or word in string.punctuation:
            continue
        if word.endswith(suffixes):
            isms_list.append(word)
    isms_list.sort()
    return isms_list
def loop_text():
    """Extract "-isme" words for every row and store them in the ``ismer`` column.

    Reads the title and full text of every row in ``nrk2013b_ism_tbl``
    (ordered by id), runs ``find_isms`` on the combined text, and writes the
    comma-joined result back to that row. Each update is committed
    individually and a progress bar is redrawn after every row.

    The database connection is closed when the function finishes, whether or
    not an error occurred.
    """
    connection, cur = connect()
    try:
        # Add e.g. "LIMIT 10000 OFFSET 119999" to process only a slice.
        mysql_query = '''SELECT CONCAT(title, " ", full_text) as text, id FROM
            nrk2013b_ism_tbl order by id '''
        cur.execute(mysql_query)
        mysql_rows = cur.fetchall()
        total_rows = float(len(mysql_rows))

        for current_row, row in enumerate(mysql_rows, 1):
            text, row_id = row[0], row[1]
            isms = find_isms(text)
            if isms is not None:
                # Single-argument print(...) gives the same output under the
                # Python 2 print statement and the Python 3 print function.
                print("%s %s" % (isms, row_id))
                cur.execute(
                    """ UPDATE nrk2013b_ism_tbl SET ismer=%s WHERE id=%s """,
                    (','.join(isms), row_id),
                )
                connection.commit()
                print("Number of rows updated: %s" % cur.rowcount)
            update_progress(current_row / total_rows)
    finally:
        # Close directly so the connection is released even if a query or
        # the tokenizer raises part-way through.
        connection.close()
def _strip_ism_suffix(word):
    """Return *word* without one trailing 'ismene', 'ismen' or 'isme'."""
    # Longest suffix first, and stop after the first match: un-chained checks
    # could strip a second suffix from what was left of the word.
    for suffix in ('ismene', 'ismen', 'isme'):
        if word.endswith(suffix):
            return word[:-len(suffix)]
    return word


def get_results():
    """Print the 20 most common "-isme" stems stored in the database.

    Reads every non-empty ``ismer`` value from ``nrk2013b_ism_tbl``, splits it
    on commas, strips the 'isme' / 'ismen' / 'ismene' ending from each word
    so that inflected forms are counted together, and prints
    ``Counter.most_common(20)`` of the resulting stems.

    The database connection is closed once the rows have been fetched.
    """
    # Alternative filter that was assigned and then immediately overwritten:
    #   ... WHERE `ismer` !="" AND `template`!="nrk_alfa"
    query = '''SELECT ismer FROM nrk2013b_ism_tbl WHERE `ismer` !=""'''

    connection, cur = connect()
    try:
        cur.execute(query)
        mysql_rows = cur.fetchall()
    finally:
        connection.close()

    cnt = Counter()
    for row in mysql_rows:
        for isme in row[0].split(","):
            cnt[_strip_ism_suffix(isme)] += 1

    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print(cnt.most_common(20))
if __name__ == "__main__":
    # Script entry point: first fill the ``ismer`` column, then print the
    # aggregated counts, and finally exit with status 0.
    for step in (loop_text, get_results):
        step()
    sys.exit(0)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment