Created
October 18, 2017 02:13
-
-
Save yatszhash/7b3d7c8749f014639aaa5f2687197abd to your computer and use it in GitHub Desktop.
Japanese TF-IDF vectorizer for the scikit-learn pipeline
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# coding: utf-8 | |
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer | |
from sklearn.base import BaseEstimator | |
import MeCab | |
import numpy as np | |
from itertools import chain | |
class JpTfidfVectorizer(TfidfVectorizer):
    """TF-IDF vectorizer for Japanese text, usable in a scikit-learn pipeline.

    Documents are segmented with MeCab in wakati-gaki mode (space-separated
    surface forms) before being fed to an internal ``TfidfVectorizer``.
    The public interface mirrors ``TfidfVectorizer``.
    """

    # Shared MeCab tokenizer; "-Owakati" makes parse() emit space-separated tokens.
    tagger = MeCab.Tagger("-Owakati")

    def __init__(self, input='content', encoding='utf-8', decode_error='strict', strip_accents=None, lowercase=True,
                 preprocessor=None, tokenizer=None, analyzer='word', stop_words=None, token_pattern=r'(?u)\b\w\w+\b',
                 ngram_range=(1, 1), max_df=1.0, min_df=1, max_features=None, vocabulary=None, binary=False,
                 dtype=np.int64, norm='l2', use_idf=True, smooth_idf=True, sublinear_tf=False):
        # Forward every parameter by keyword: positional forwarding silently
        # mis-assigns arguments if scikit-learn reorders the signature.
        super(JpTfidfVectorizer, self).__init__(
            input=input, encoding=encoding, decode_error=decode_error,
            strip_accents=strip_accents, lowercase=lowercase,
            preprocessor=preprocessor, tokenizer=tokenizer, analyzer=analyzer,
            stop_words=stop_words, token_pattern=token_pattern,
            ngram_range=ngram_range, max_df=max_df, min_df=min_df,
            max_features=max_features, vocabulary=vocabulary, binary=binary,
            dtype=dtype, norm=norm, use_idf=use_idf, smooth_idf=smooth_idf,
            sublinear_tf=sublinear_tf)
        # All actual fitting/transforming is delegated to this inner vectorizer,
        # which always receives MeCab-preprocessed documents.
        self._tfidf_vectorizer = TfidfVectorizer(
            input=input, encoding=encoding, decode_error=decode_error,
            strip_accents=strip_accents, lowercase=lowercase,
            preprocessor=preprocessor, tokenizer=tokenizer, analyzer=analyzer,
            stop_words=stop_words, token_pattern=token_pattern,
            ngram_range=ngram_range, max_df=max_df, min_df=min_df,
            max_features=max_features, vocabulary=vocabulary, binary=binary,
            dtype=dtype, norm=norm, use_idf=use_idf, smooth_idf=smooth_idf,
            sublinear_tf=sublinear_tf)
        self.feature_names = ["tfidf"]

    def get_feature_name(self):
        """Return the fixed feature-group label(s) for this transformer."""
        return self.feature_names

    def fit(self, X, y=None):
        """Tokenize ``X`` with MeCab and fit the inner vectorizer.

        Returns ``self`` so calls can be chained (sklearn convention).
        """
        preprocessed_X = self.mecab_preprocessor(X)
        self._tfidf_vectorizer.fit(preprocessed_X, y)
        return self

    def fit_transform(self, X, y=None):
        """Tokenize ``X``, fit the inner vectorizer, and return the TF-IDF matrix."""
        preprocessed_X = self.mecab_preprocessor(X)
        # Forward y for consistency with fit(); TfidfVectorizer ignores it.
        return self._tfidf_vectorizer.fit_transform(preprocessed_X, y)

    def transform(self, X, copy=True):
        """Tokenize ``X`` and transform it with the already-fitted inner vectorizer."""
        preprocessed_X = self.mecab_preprocessor(X)
        return self._tfidf_vectorizer.transform(preprocessed_X, copy=copy)

    @classmethod
    def mecab_preprocessor(cls, X):
        """Apply MeCab segmentation to each document in ``X``.

        Expects an already-materialized sequence of documents, so it cannot
        serve as a ``preprocessor=`` callable inside a pipeline step that
        feeds single documents.
        """
        return [cls.into_joined_word(x) for x in X]

    @classmethod
    def into_joined_word(cls, x):
        """Segment one document ``x`` and return it as a single wakati string.

        NOTE(review): this iterates over ``x`` and parses each element
        separately — if ``x`` is a plain string that means each *character*
        is parsed on its own. Presumably documents are lists of sentences;
        confirm against the caller.
        """
        # parse() appends a trailing newline; strip it before joining.
        mecab_remove_reline = lambda s: cls.tagger.parse(s).replace("\n", "")
        mecabed_X = map(mecab_remove_reline, x)
        return "".join(chain.from_iterable(mecabed_X))

    def set_params(self, **params):
        """Set parameters on both this estimator and the inner vectorizer.

        Returns ``self``, matching ``BaseEstimator.set_params`` so that
        chained calls and sklearn tooling keep working.
        """
        super().set_params(**params)
        self._tfidf_vectorizer.set_params(**params)
        return self
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Where can we get a Japanese stopwords list? どうも