@satomacoto
Created April 12, 2012 03:25
Parser with MeCab
#! /usr/bin/python
# -*- encoding: utf-8 -*-
'''
Parser with MeCab
'''
import MeCab
import glob
import re
import os


class Parser:

    def __init__(self):
        self.client = MeCab.Tagger()
        # matches HTML entities (e.g. "&amp;") and ASCII word characters
        self.p = re.compile(r"&.{1,5}?;|\w")
        # collect stopwords from every list under stopwords/, one entry per line
        self.stopwords = []
        basedir = os.path.dirname(os.path.abspath(__file__))
        for file in glob.glob(basedir + '/stopwords/*/*.txt'):
            self.stopwords += [line.strip() for line in open(file).readlines()]
        self.stopwords.append('the')

    def removeSymbols(self, string):
        """ strip the symbols listed under stopwords/symbol/ """
        symbols = []
        basedir = os.path.dirname(os.path.abspath(__file__))
        for file in glob.glob(basedir + '/stopwords/symbol/*.txt'):
            symbols += [line.strip() for line in open(file).readlines()]
        for symbol in symbols:
            string = string.replace(symbol, '')
        return string

    def clean(self, string):
        """ remove any nasty grammar tokens from string """
        string = self.p.sub(' ', string)
        string = string.lower()
        return string

    def removeStopwords(self, words):
        """ Remove common words which have no search value """
        return [word for word in words if word not in self.stopwords]

    def tokenize(self, string, pos="形容詞,形容動詞,感動詞,副詞,連体詞,名詞,動詞"):
        """ break string up into tokens, keeping only the given parts of speech """
        m = self.client.parseToNode(string)
        res = []
        while m:
            f = m.feature.split(",")
            # keep the surface form when its part of speech is in the filter
            if f[0] in pos or pos == "":
                res += [m.surface]
            m = m.next
        return res


if __name__ == "__main__":
    p = Parser()
    text = "隣の客はよく柿食う客だ。"
    cleaned = p.clean(text)
    symbolsRemoved = p.removeSymbols(cleaned)
    tokens = p.tokenize(cleaned)
    removed = p.removeStopwords(tokens)
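
As a usage sketch (the file and directory names here are assumptions, not part of the gist): if the script above is saved as mecab_parser.py next to a stopwords/ directory whose subfolders contain one entry per line, the class could be driven from another script like this. The exact tokens MeCab produces depend on the installed dictionary, so the expected output shown in the comments is only approximate.

# -*- coding: utf-8 -*-
# usage_sketch.py -- hypothetical driver for the Parser class above.
# Assumes the gist is saved as mecab_parser.py and that a stopwords/
# directory with one-word-per-line *.txt files sits next to it.
from mecab_parser import Parser

p = Parser()
text = "隣の客はよく柿食う客だ。"

# clean() lowercases and blanks out HTML entities / ASCII word characters,
# removeSymbols() strips listed symbols, tokenize() runs MeCab and keeps only
# the parts of speech named in its default argument.
tokens = p.tokenize(p.removeSymbols(p.clean(text)))
keywords = p.removeStopwords(tokens)

# With the default IPADIC dictionary the POS filter keeps roughly
# 隣 / 客 / よく / 柿 / 食う / 客 from the example sentence.
for word in keywords:
    print(word)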