@satomacoto
Created April 12, 2012 03:25
Parser with MeCab
#! /usr/bin/python
# -*- encoding: utf-8 -*-
'''
Parser with MeCab
'''
import MeCab
import glob
import re
import os


class Parser:

    def __init__(self):
        self.client = MeCab.Tagger()
        # matches HTML entities (e.g. "&amp;") and ASCII word characters
        self.p = re.compile(r"&.{1,5}?;|\w")
        # collect stopwords from every list under stopwords/, one entry per line
        self.stopwords = []
        basedir = os.path.dirname(os.path.abspath(__file__))
        for file in glob.glob(basedir + '/stopwords/*/*.txt'):
            self.stopwords += [line.strip() for line in open(file).readlines()]
        self.stopwords.append('the')

    def removeSymbols(self, string):
        """ strip the symbols listed under stopwords/symbol/ """
        symbols = []
        basedir = os.path.dirname(os.path.abspath(__file__))
        for file in glob.glob(basedir + '/stopwords/symbol/*.txt'):
            symbols += [line.strip() for line in open(file).readlines()]
        for symbol in symbols:
            string = string.replace(symbol, '')
        return string

    def clean(self, string):
        """ remove any nasty grammar tokens from string """
        string = self.p.sub(' ', string)
        string = string.lower()
        return string

    def removeStopwords(self, words):
        """ Remove common words which have no search value """
        return [word for word in words if word not in self.stopwords]

    def tokenize(self, string, pos="形容詞,形容動詞,感動詞,副詞,連体詞,名詞,動詞"):
        """ break string up into tokens, keeping only the given parts of speech """
        m = self.client.parseToNode(string)
        res = []
        while m:
            f = m.feature.split(",")
            # keep the surface form when its part of speech is in the filter
            if f[0] in pos or pos == "":
                res += [m.surface]
            m = m.next
        return res


if __name__ == "__main__":
    p = Parser()
    text = "隣の客はよく柿食う客だ。"
    cleaned = p.clean(text)
    symbolsRemoved = p.removeSymbols(cleaned)
    tokens = p.tokenize(cleaned)
    removed = p.removeStopwords(tokens)
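
As a usage sketch (the file and directory names here are assumptions, not part of the gist): if the script above is saved as mecab_parser.py next to a stopwords/ directory whose subfolders contain one entry per line, the class could be driven from another script like this. The exact tokens MeCab produces depend on the installed dictionary, so the expected output shown in the comments is only approximate.

# -*- coding: utf-8 -*-
# usage_sketch.py -- hypothetical driver for the Parser class above.
# Assumes the gist is saved as mecab_parser.py and that a stopwords/
# directory with one-word-per-line *.txt files sits next to it.
from mecab_parser import Parser

p = Parser()
text = "隣の客はよく柿食う客だ。"

# clean() lowercases and blanks out HTML entities / ASCII word characters,
# removeSymbols() strips listed symbols, tokenize() runs MeCab and keeps only
# the parts of speech named in its default argument.
tokens = p.tokenize(p.removeSymbols(p.clean(text)))
keywords = p.removeStopwords(tokens)

# With the default IPADIC dictionary the POS filter keeps roughly
# 隣 / 客 / よく / 柿 / 食う / 客 from the example sentence.
for word in keywords:
    print(word)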