Skip to content

Instantly share code, notes, and snippets.

@kouya17
Created May 24, 2020 13:03
Show Gist options
  • Save kouya17/9ef766522d387a86a2a88e375e1e82f3 to your computer and use it in GitHub Desktop.
Save kouya17/9ef766522d387a86a2a88e375e1e82f3 to your computer and use it in GitHub Desktop.
ファイルからデータセットを読み込んで、テキストの二値分類を行う (Load a dataset from files and perform binary classification of text)
import numpy as np
import MeCab
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import LogisticRegression
def split_text_only_noun(text):
    """Tokenize *text* with MeCab, keeping only nouns, joined by spaces.

    MeCab's ``parse`` output has one ``surface\\tfeature,...`` line per
    token followed by a trailing ``EOS`` line (dropped via ``[:-1]``).
    """
    tagger = MeCab.Tagger()
    words = []
    for line in tagger.parse(text).splitlines()[:-1]:
        # partition() never raises, unlike the strict two-way unpack of
        # split('\t'), which crashes on lines without exactly one tab.
        surface, sep, feature = line.partition('\t')
        if not sep:
            continue  # skip malformed lines defensively
        pos = feature.split(',')[0]
        if pos == '名詞':  # part-of-speech tag for "noun"
            words.append(surface)
    return ' '.join(words)
def split_text_all(text):
    """Tokenize *text* with MeCab and return all surface forms space-joined.

    The trailing ``EOS`` line of MeCab's output is dropped via ``[:-1]``.
    """
    tagger = MeCab.Tagger()
    # partition('\t')[0] takes the surface form; unlike the original strict
    # unpack of split('\t') it cannot raise on lines without exactly one tab.
    surfaces = [line.partition('\t')[0]
                for line in tagger.parse(text).splitlines()[:-1]]
    return ' '.join(surfaces)
def remove_subtitle(title):
    """Strip every 【...】 bracketed subtitle segment from *title*."""
    subtitle_pattern = re.compile('【.*?】')
    return subtitle_pattern.sub('', title)
def remove_newline(title):
    """Return *title* with all newline characters removed.

    Uses plain ``str.replace`` instead of the original ``re.sub``: the
    pattern is a single literal character, so no regex machinery is needed.
    """
    return title.replace('\n', '')
class TextVectorizer:
    """Fit a bag-of-words + tf-idf pipeline on text files and vectorize them.

    ``train`` must be called before ``test``: ``test`` reuses the vocabulary
    and idf weights learned during training.
    """

    def __init__(self):
        self.__count = CountVectorizer()
        self.__tfidf = TfidfTransformer(use_idf=True, norm='l2',
                                        smooth_idf=True)

    @staticmethod
    def _load_titles(file_path_list):
        """Read every line of every file, strip newlines, and tokenize.

        Shared by ``train`` and ``test`` (the loading loop was previously
        duplicated in both methods).
        """
        title_list = []
        for file_path in file_path_list:
            with open(file_path) as f:
                for line in f:
                    title_list.append(split_text_all(remove_newline(line)))
        return np.array(title_list)

    def train(self, file_path_list):
        """Fit vocabulary and idf weights; return the tf-idf matrix.

        Note: the original called the deprecated ``get_feature_names()``
        (removed in scikit-learn 1.2) and discarded the result, and set
        ``np.set_printoptions`` globally — both side effects are removed.
        """
        docs = self._load_titles(file_path_list)
        # bag of words: learn the vocabulary and count term occurrences
        bags = self.__count.fit_transform(docs)
        # tf-idf: learn idf weights and re-weight the counts
        tf_idf = self.__tfidf.fit_transform(bags)
        return tf_idf.toarray()

    def test(self, file_path_list):
        """Vectorize new files with the already-fitted pipeline."""
        docs = self._load_titles(file_path_list)
        # transform only — vocabulary/idf must come from train()
        bags = self.__count.transform(docs)
        tf_idf = self.__tfidf.transform(bags)
        return tf_idf.toarray()
if __name__ == "__main__":
    text_vectorizer = TextVectorizer()
    file_path_list = ['./hinokuma.txt', './kuzuha.txt']
    title_bags_array = text_vectorizer.train(file_path_list)
    print('len: ', len(title_bags_array))
    # Derive per-file sample counts instead of hard-coding 929/474/455,
    # so the script keeps working when the dataset files change.
    counts = []
    for path in file_path_list:
        with open(path) as f:
            counts.append(sum(1 for _ in f))
    X_train = np.array(title_bags_array).reshape((sum(counts), -1))
    # Label convention: first file -> positive class (1), second -> 0.
    y_train = np.r_[np.ones(counts[0]), np.zeros(counts[1])]
    model = LogisticRegression()
    model.fit(X_train, y_train)
    test_path_list = ['./test.txt']
    test_bags_array = text_vectorizer.test(test_path_list)
    # Probability of the positive class for each test title
    print(model.predict_proba(test_bags_array)[:, 1])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment