Created May 24, 2020 13:03
Read a dataset from files and perform binary classification of the text
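The script expects each input file (hinokuma.txt, kuzuha.txt, test.txt) to contain one video title per line. The actual titles are not included in this gist; the following two lines are made-up illustrations of the expected layout:

【実況】ホラーゲームをやってみた
雑談配信のまとめ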
import numpy as np
import MeCab
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import LogisticRegression


# Tokenize text with MeCab, keeping only nouns (POS '名詞'), and return
# the surface forms joined by spaces.
def split_text_only_noun(text):
    tagger = MeCab.Tagger()
    words = []
    # Drop the trailing 'EOS' line from the MeCab output.
    for c in tagger.parse(text).splitlines()[:-1]:
        surface, feature = c.split('\t')
        pos = feature.split(',')[0]
        if pos == '名詞':
            words.append(surface)
    return ' '.join(words)
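# For example, with the default IPA dictionary MeCab splits
# '今日は良い天気' into 今日 / は / 良い / 天気, so
# split_text_only_noun('今日は良い天気') returns '今日 天気'
# (the exact tokens depend on the installed dictionary).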


# Tokenize text with MeCab, keeping every token.
def split_text_all(text):
    tagger = MeCab.Tagger()
    words = []
    for c in tagger.parse(text).splitlines()[:-1]:
        surface, _feature = c.split('\t')
        words.append(surface)
    return ' '.join(words)


# Strip bracketed tags such as 【...】 from a title.
def remove_subtitle(title):
    return re.sub('【.*?】', '', title)


# Strip the trailing newline from a line read from a file.
def remove_newline(title):
    return re.sub('\n', '', title)
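# For example, remove_subtitle('【歌ってみた】サンプルタイトル') returns
# 'サンプルタイトル'. Note that remove_subtitle is defined here but never
# called in the pipeline below.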


class TextVectorizer:
    def __init__(self):
        self.__count = CountVectorizer()
        self.__tfidf = TfidfTransformer(use_idf=True, norm='l2',
                                        smooth_idf=True)

    # Fit the vocabulary and idf weights on the titles in file_path_list
    # (one title per line) and return the tf-idf matrix as a dense array.
    def train(self, file_path_list):
        title_list = []
        for file_path in file_path_list:
            with open(file_path) as f:
                for line in f:
                    title_list.append(split_text_all(remove_newline(line)))
        docs = np.array(title_list)
        # bow (bag of words): learn the vocabulary, then count occurrences
        self.__count.fit(docs)
        bags = self.__count.transform(docs)
        # tf-idf: reweight the counts by idf and l2-normalize each row
        self.__tfidf.fit(bags)
        tf_idf = self.__tfidf.transform(bags)
        return tf_idf.toarray()

    # Vectorize unseen titles using the vocabulary and idf weights learned
    # in train(); transform only, no fitting.
    def test(self, file_path_list):
        title_list = []
        for file_path in file_path_list:
            with open(file_path) as f:
                for line in f:
                    title_list.append(split_text_all(remove_newline(line)))
        docs = np.array(title_list)
        # bow (bag of words)
        bags = self.__count.transform(docs)
        # tf-idf
        tf_idf = self.__tfidf.transform(bags)
        return tf_idf.toarray()


if __name__ == "__main__":
    textVectorizer = TextVectorizer()
    # One file per class: the first 474 rows come from hinokuma.txt,
    # the remaining 455 from kuzuha.txt (929 titles in total).
    file_path_list = ['./hinokuma.txt', './kuzuha.txt']
    title_bags_array = textVectorizer.train(file_path_list)
    print('len: ', len(title_bags_array))
    X_train = np.array(title_bags_array).reshape((929, -1))
    # Label hinokuma titles 1 and kuzuha titles 0.
    y_train = np.r_[np.ones(474), np.zeros(455)]
    model = LogisticRegression()
    model.fit(X_train, y_train)
    test_path_list = ['./test.txt']
    test_bags_array = textVectorizer.test(test_path_list)
    np.set_printoptions(precision=2)
    print(model.predict_proba(test_bags_array)[:, 1])
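A rough way to sanity-check the classifier, sketched below: hold out part of the 929 labeled titles with scikit-learn's train_test_split and score the model on the held-out rows. This assumes the snippet is appended inside the __main__ block above, and since the vocabulary and idf weights were fitted on all titles, the resulting score is only an approximate check.

    from sklearn.model_selection import train_test_split
    from sklearn.metrics import accuracy_score

    # Hold out 20% of the labeled titles and score the model on them.
    X_tr, X_te, y_tr, y_te = train_test_split(
        X_train, y_train, test_size=0.2, random_state=0)
    clf = LogisticRegression()
    clf.fit(X_tr, y_tr)
    print('held-out accuracy:', accuracy_score(y_te, clf.predict(X_te)))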