Skip to content

Instantly share code, notes, and snippets.

@k-saka
Created April 30, 2010 04:16
Show Gist options
  • Save k-saka/384733 to your computer and use it in GitHub Desktop.
Save k-saka/384733 to your computer and use it in GitHub Desktop.
# -*- coding: utf-8 -*-
import collections
import sqlite3
import time

import MeCab
def connect_db():
    """Open and return a sqlite3 connection to the ``tl.db`` database file.

    The path is relative, so it is resolved against the current working
    directory of the process.
    """
    return sqlite3.connect('tl.db')
def get_text():
    """Return the data to be analysed: every row of the ``status`` table.

    A connection is opened through connect_db() and the result of
    executing the query on it is returned directly, so the caller
    iterates over the rows itself. The connection is not closed here.
    """
    return connect_db().execute(u'select * from status')
# Tagger shared by every word_select() call; created on first use so that
# importing this module does not require a working MeCab dictionary.
_TAGGER = None


def _get_tagger():
    """Return the shared ChaSen-format MeCab tagger, creating it once."""
    global _TAGGER
    if _TAGGER is None:
        _TAGGER = MeCab.Tagger('-Ochasen')
    return _TAGGER


def word_select(text):
    """Run morphological analysis on *text* and return the selected words.

    The text is parsed into MeCab's linked list of nodes. The first node
    and the last node (the one whose ``next`` is empty) are never
    examined; every node in between whose ``posid`` lies in 38..47,
    except 40, contributes its ``surface`` to the result.

    Args:
        text: The text to analyse, in whatever form the MeCab binding
            accepts (the caller in this file passes UTF-8 encoded bytes).

    Returns:
        A list of the selected surface strings, in order of appearance.
    """
    words = []
    # The tagger is reused across calls instead of being rebuilt for every
    # input text, which is what the caller's per-row loop would otherwise do.
    node = _get_tagger().parseToNode(text).next
    while node.next:
        # NOTE(review): posid values index the dictionary's part-of-speech
        # table; with the IPA dictionary 38-47 appear to be the general and
        # proper noun classes and 40 the adjectival-noun stem - confirm
        # against the installed dictionary's pos-id.def.
        if 38 <= node.posid <= 47 and node.posid != 40:
            words.append(node.surface)
        node = node.next
    return words
def main():
    """Count the words selected from every status row and print the totals.

    Each row returned by get_text() is analysed with word_select(); the
    occurrences of every returned word are tallied and then printed one
    per line as ``<word> <count>``, highest count first.
    """
    noun = collections.Counter()
    for row in get_text():
        # Index 2 of the row is the field that gets analysed; it is encoded
        # to UTF-8 before being handed to word_select().
        # NOTE(review): presumably this is the status text column - confirm
        # against the schema of the status table.
        noun.update(word_select(row[2].encode('utf-8')))
    # most_common() sorts by count in descending order and keeps the
    # first-seen order among equal counts, like a stable reverse sort.
    for word, count in noun.most_common():
        # Single-argument form prints the same text on Python 2 and 3.
        print('%s %s' % (word, count))
# Run the word count only when this file is executed as a script, so that
# importing it as a module has no side effects.
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment