Skip to content

Instantly share code, notes, and snippets.

@ukyo
Created August 8, 2011 12:26
Show Gist options
  • Save ukyo/1131664 to your computer and use it in GitHub Desktop.
Save ukyo/1131664 to your computer and use it in GitHub Desktop.
dump from mongodb
#!/usr/bin/python
#coding: utf8
import re
import pymongo
import MeCab
# Substitution helper for http/https/ftp URLs.  The pattern is written as a
# raw string so the regex backslashes are literal instead of relying on
# Python passing unknown string escapes (\/, \@, \$) through unchanged.
sub_url = re.compile(
    r"(https?|ftp)(:\/\/[-_.!~*'()a-zA-Z0-9;\/?:\@&=+\$,%#]+)").sub
# Substitution helper for @mentions and #hashtags made of ASCII letters,
# digits and underscores.
sub_user_hash = re.compile(r'(@|#)[a-zA-Z0-9_]+').sub


def delete(text):
    """Return *text* with URLs, @mentions and #hashtags removed.

    URLs are stripped first, then mentions/hashtags are stripped from what
    is left.  Each match is replaced by the empty string; surrounding
    whitespace is left untouched.
    """
    return sub_user_hash('', sub_url('', text))
# Part-of-speech tag used for nouns in the MeCab feature string, and the
# noun sub-categories that are dropped (numerals, non-independent nouns).
NOUN = '名詞'
SKIPPED_NOUN_KINDS = ('数', '非自律')


def extract_nouns(parsed):
    """Return the surface forms of the wanted nouns in MeCab output.

    *parsed* is the text returned by ``Tagger.parse``: one
    ``surface<TAB>pos,sub-pos,...`` line per token, followed by ``EOS``.
    A token is kept when its first feature field is the noun tag and its
    second field is not one of the skipped sub-categories.

    The feature fields are compared exactly rather than searching the
    whole line, so tokens such as ``接頭詞,名詞接続`` (which merely
    contain the noun tag as a substring) are not mistaken for nouns, and
    the surface form itself can never influence the decision.
    """
    nouns = []
    for line in parsed.split('\n'):
        # "EOS" and the trailing empty line have no tab, so their feature
        # string is empty and they fall through the noun test below.
        surface, _, features = line.partition('\t')
        fields = features.split(',')
        if fields[0] != NOUN:
            continue
        if len(fields) > 1 and fields[1] in SKIPPED_NOUN_KINDS:
            continue
        nouns.append(surface)
    return nouns


def main():
    """Print one comma-separated line of nouns for every stored tweet.

    Reads every document of the ``tweets`` collection in the ``twitter``
    database, strips URLs/mentions/hashtags from its ``text`` field with
    ``delete`` and runs the result through MeCab.
    """
    # pymongo.Connection no longer exists in PyMongo 3+; use MongoClient
    # when the installed driver provides it and fall back otherwise.
    client_cls = getattr(pymongo, 'MongoClient', None) or pymongo.Connection
    tweets = client_cls().twitter.tweets
    tagger = MeCab.Tagger(
        '-u /home/ukyo/mecab-dic-overdrive/misc/dic/wikipedia.dic')
    for tweet in tweets.find():
        text = delete(tweet['text'])
        if not isinstance(text, str):
            # Python 2: the driver hands back unicode, while the tagger is
            # fed a UTF-8 byte string.  On Python 3 the text is already a
            # native str and is passed through as-is.
            text = text.encode('utf8')
        print(','.join(extract_nouns(tagger.parse(text))))


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment