Created May 24, 2020 13:03
Read a dataset from files and perform binary classification of the text
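The script expects each input file (hinokuma.txt, kuzuha.txt, test.txt) to contain one video title per line. The actual titles are not included in this gist; the following two lines are made-up illustrations of the expected layout:

【実況】ホラーゲームをやってみた
雑談配信のまとめ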
import numpy as np
import MeCab
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import LogisticRegression


# Tokenize text with MeCab, keeping only nouns (POS '名詞'), and return
# the surface forms joined by spaces.
def split_text_only_noun(text):
    tagger = MeCab.Tagger()
    words = []
    # Drop the trailing 'EOS' line from the MeCab output.
    for c in tagger.parse(text).splitlines()[:-1]:
        surface, feature = c.split('\t')
        pos = feature.split(',')[0]
        if pos == '名詞':
            words.append(surface)
    return ' '.join(words)
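# For example, with the default IPA dictionary MeCab splits
# '今日は良い天気' into 今日 / は / 良い / 天気, so
# split_text_only_noun('今日は良い天気') returns '今日 天気'
# (the exact tokens depend on the installed dictionary).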


# Tokenize text with MeCab, keeping every token.
def split_text_all(text):
    tagger = MeCab.Tagger()
    words = []
    for c in tagger.parse(text).splitlines()[:-1]:
        surface, _feature = c.split('\t')
        words.append(surface)
    return ' '.join(words)


# Strip bracketed tags such as 【...】 from a title.
def remove_subtitle(title):
    return re.sub('【.*?】', '', title)


# Strip the trailing newline from a line read from a file.
def remove_newline(title):
    return re.sub('\n', '', title)
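# For example, remove_subtitle('【歌ってみた】サンプルタイトル') returns
# 'サンプルタイトル'. Note that remove_subtitle is defined here but never
# called in the pipeline below.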


class TextVectorizer:
    def __init__(self):
        self.__count = CountVectorizer()
        self.__tfidf = TfidfTransformer(use_idf=True, norm='l2',
                                        smooth_idf=True)

    # Fit the vocabulary and idf weights on the titles in file_path_list
    # (one title per line) and return the tf-idf matrix as a dense array.
    def train(self, file_path_list):
        title_list = []
        for file_path in file_path_list:
            with open(file_path) as f:
                for line in f:
                    title_list.append(split_text_all(remove_newline(line)))
        docs = np.array(title_list)
        # bow (bag of words): learn the vocabulary, then count occurrences
        self.__count.fit(docs)
        bags = self.__count.transform(docs)
        # tf-idf: reweight the counts by idf and l2-normalize each row
        self.__tfidf.fit(bags)
        tf_idf = self.__tfidf.transform(bags)
        return tf_idf.toarray()

    # Vectorize unseen titles using the vocabulary and idf weights learned
    # in train(); transform only, no fitting.
    def test(self, file_path_list):
        title_list = []
        for file_path in file_path_list:
            with open(file_path) as f:
                for line in f:
                    title_list.append(split_text_all(remove_newline(line)))
        docs = np.array(title_list)
        # bow (bag of words)
        bags = self.__count.transform(docs)
        # tf-idf
        tf_idf = self.__tfidf.transform(bags)
        return tf_idf.toarray()


if __name__ == "__main__":
    textVectorizer = TextVectorizer()
    # One file per class: the first 474 rows come from hinokuma.txt,
    # the remaining 455 from kuzuha.txt (929 titles in total).
    file_path_list = ['./hinokuma.txt', './kuzuha.txt']
    title_bags_array = textVectorizer.train(file_path_list)
    print('len: ', len(title_bags_array))
    X_train = np.array(title_bags_array).reshape((929, -1))
    # Label hinokuma titles 1 and kuzuha titles 0.
    y_train = np.r_[np.ones(474), np.zeros(455)]
    model = LogisticRegression()
    model.fit(X_train, y_train)
    test_path_list = ['./test.txt']
    test_bags_array = textVectorizer.test(test_path_list)
    np.set_printoptions(precision=2)
    print(model.predict_proba(test_bags_array)[:, 1])
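A rough way to sanity-check the classifier, sketched below: hold out part of the 929 labeled titles with scikit-learn's train_test_split and score the model on the held-out rows. This assumes the snippet is appended inside the __main__ block above, and since the vocabulary and idf weights were fitted on all titles, the resulting score is only an approximate check.

    from sklearn.model_selection import train_test_split
    from sklearn.metrics import accuracy_score

    # Hold out 20% of the labeled titles and score the model on them.
    X_tr, X_te, y_tr, y_te = train_test_split(
        X_train, y_train, test_size=0.2, random_state=0)
    clf = LogisticRegression()
    clf.fit(X_tr, y_tr)
    print('held-out accuracy:', accuracy_score(y_te, clf.predict(X_te)))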