Last active
August 29, 2015 14:22
-
-
Save eiriks/34edb6f2bff9adde6502 to your computer and use it in GitHub Desktop.
Modul for å finne -ismer i tekst, for å klare opp et veddemål. (ingen vant). Resultater: http://stavelin.com/uib/ismer
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# encoding: utf-8 | |
""" | |
Created by Eirik Stavelin on 2015 | |
Copyright (c) 2015 Eirik Stavelin. All rights reserved. | |
Module for finding -isms in text, written to settle a bet (nobody won).
Results: http://stavelin.com/uib/ismer
""" | |
import sys, time, string | |
import os | |
import pymysql | |
from collections import Counter | |
import logging #DEBUG, INFO, WARNING, ERROR, CRITICAL | |
# Route every log record (DEBUG and above) to stderr, then announce start-up.
logging.basicConfig(level=logging.DEBUG, stream=sys.stderr)
logging.info("we are running")
def connect():
    """Open a connection to the MAMP MySQL server and select the ``nrk`` database.

    Returns:
        tuple: ``(connection, cursor)`` -- the open pymysql connection and a
        cursor created from it.

    Raises:
        pymysql.Error: if connecting, creating the cursor or selecting the
            database fails. The failure is logged (with traceback) and then
            re-raised. Callers unpack the return value into two names, so
            silently returning ``None`` would only surface later as an
            unrelated ``TypeError``.
    """
    rdbms_username = "******"
    rdbms_password = "******"
    connection = None
    try:
        connection = pymysql.connect(
            unix_socket="/Applications/MAMP/tmp/mysql/mysql.sock",
            user=rdbms_username,
            passwd=rdbms_password,
            db="nrk",
            charset='utf8'
        )
        cur = connection.cursor()
        cur.execute("USE nrk;")
    except pymysql.Error:
        logging.exception("kunne ikke logge på databasen")
        # Do not leak a half-initialised connection when a later step failed.
        if connection is not None:
            connection.close()
        raise
    logging.debug("koblet til MySQL")
    return connection, cur
def disconnect(connection):
    """Close *connection* if one was given; do nothing for a falsy value.

    Args:
        connection: an object with a ``close()`` method (the connection
            returned by ``connect``), or ``None``.

    Returns:
        None
    """
    if connection:
        connection.close()
        # The logging module has no ``notset`` function (``logging.NOTSET`` is
        # a level constant), so log at DEBUG like the connect message does.
        logging.debug("koblet av mysql")
def update_progress(progress):
    """Redraw a ten-slot text progress bar on stdout.

    Args:
        progress: fraction completed. Ints are converted to float; any other
            non-float value is reported as an error and drawn as 0. Values
            below 0 are drawn as 0 ("Halt..."), values of 1 or more are
            drawn as 1 ("Done...").

    The line starts with a carriage return so that successive calls
    overwrite each other on a terminal.
    """
    bar_width = 10  # number of character slots in the bar
    note = ""
    if isinstance(progress, int):
        progress = float(progress)
    if not isinstance(progress, float):
        progress, note = 0, "error: progress var must be float\r\n"
    if progress < 0:
        progress, note = 0, "Halt...\r\n"
    if progress >= 1:
        progress, note = 1, "Done...\r\n"
    filled = int(round(bar_width * progress))
    bar = "#" * filled + "-" * (bar_width - filled)
    line = "\rPercent: [{0}] {1}% {2}".format(bar, progress * 100, note)
    sys.stdout.write(line)
    sys.stdout.flush()
# Word endings that make a token count as an "-ism".
_ISM_SUFFIXES = ('isme', 'ismen', 'ismene')

# Holds the stopword set once it has been loaded, so the NLTK corpus is read
# a single time instead of once per call.
_STOPWORD_CACHE = []


def _norwegian_stopwords():
    """Return NLTK's Norwegian stopwords as a frozenset of text strings, loaded once."""
    if not _STOPWORD_CACHE:
        from nltk.corpus import stopwords as stopword_corpus
        _STOPWORD_CACHE.append(frozenset(
            # Decode only when the corpus hands back byte strings; text
            # strings are used unchanged.
            word.decode('utf-8') if isinstance(word, bytes) else word
            for word in stopword_corpus.words('norwegian')
        ))
    return _STOPWORD_CACHE[0]


def find_isms(text):
    """Return every token in *text* that ends in "isme", "ismen" or "ismene".

    The text is lower-cased and split with NLTK's ``wordpunct_tokenize``;
    Norwegian stopwords are ignored.

    Args:
        text: the text to scan. ``None`` or an empty string yields an empty
            list (a SQL ``CONCAT`` with a NULL argument produces ``None``).

    Returns:
        list: the matching tokens, lower-cased and sorted, duplicates kept.
    """
    if not text:
        return []
    from nltk import wordpunct_tokenize
    stopword_set = _norwegian_stopwords()
    # No separate punctuation filter is needed: a token that ends in "isme"
    # contains letters and so can never be a piece of string.punctuation.
    return sorted(
        word for word in wordpunct_tokenize(text.lower())
        if word.endswith(_ISM_SUFFIXES) and word not in stopword_set
    )
def loop_text():
    """Tag every row of ``nrk2013b_ism_tbl`` with the isms found in its text.

    Reads title + full text for all rows (ordered by id), runs ``find_isms``
    on each, and stores the result as a comma-separated string in the
    ``ismer`` column. Each update is committed immediately, and a progress
    bar is redrawn after every row. The connection is closed when the
    function finishes, including when it fails part-way.
    """
    connection, cur = connect()
    try:
        mysql_query = ('SELECT CONCAT(title, " ", full_text) as text, id '
                       'FROM nrk2013b_ism_tbl order by id')
        cur.execute(mysql_query)
        mysql_rows = cur.fetchall()
        total_rows = float(len(mysql_rows))  # float so the ratio below is not integer division
        for current_row, row in enumerate(mysql_rows, 1):
            text, row_id = row[0], row[1]
            isms = find_isms(text)
            if isms is not None:
                # %-formatting prints the same line under Python 2 and 3.
                print("%s %s" % (isms, row_id))
                cur.execute(
                    "UPDATE nrk2013b_ism_tbl SET ismer=%s WHERE id=%s",
                    (','.join(isms), row_id))
                connection.commit()
                print("Number of rows updated: %s" % cur.rowcount)
            update_progress(current_row / total_rows)
    finally:
        connection.close()
def get_results():
    """Print the 20 most common ism stems stored in ``nrk2013b_ism_tbl``.

    Reads the comma-separated ``ismer`` column of every row where it is
    non-empty, strips one trailing "ismene", "ismen" or "isme" from each
    entry, and counts the remaining stems.
    """
    connection, cur = connect()
    try:
        # To leave out a template, add e.g.  AND `template`!="nrk_alfa"
        cur.execute('SELECT ismer FROM nrk2013b_ism_tbl WHERE `ismer` !=""')
        mysql_rows = cur.fetchall()
    finally:
        connection.close()

    cnt = Counter()
    # Longest suffix first, and stop at the first match, so exactly one
    # suffix is removed per word.
    suffixes = ('ismene', 'ismen', 'isme')
    for row in mysql_rows:
        for isme in row[0].split(","):
            for suffix in suffixes:
                if isme.endswith(suffix):
                    isme = isme[:-len(suffix)]
                    break
            cnt[isme] += 1
    print(cnt.most_common(20))
if __name__ == '__main__':
    # Tag all rows first, then print the frequency table, then exit with status 0.
    loop_text()
    get_results()
    raise SystemExit(0)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment