Skip to content

Instantly share code, notes, and snippets.

@yatszhash
Created October 18, 2017 02:13
Show Gist options
  • Save yatszhash/7b3d7c8749f014639aaa5f2687197abd to your computer and use it in GitHub Desktop.
Save yatszhash/7b3d7c8749f014639aaa5f2687197abd to your computer and use it in GitHub Desktop.
Japanese TF-IDF vectorizer for scikit-learn pipeline
# coding: utf-8
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.base import BaseEstimator
import MeCab
import numpy as np
from itertools import chain
class JpTfidfVectorizer(TfidfVectorizer):
    """TF-IDF vectorizer for Japanese text.

    Segments raw Japanese input with MeCab (``-Owakati``, space-separated
    word segmentation) and delegates the actual fitting/transforming to an
    internal scikit-learn ``TfidfVectorizer``.
    """

    # Shared MeCab tagger; "-Owakati" makes parse() emit space-separated
    # surface forms terminated by a newline.
    tagger = MeCab.Tagger("-Owakati")

    def __init__(self, input='content', encoding='utf-8', decode_error='strict',
                 strip_accents=None, lowercase=True, preprocessor=None,
                 tokenizer=None, analyzer='word', stop_words=None,
                 token_pattern=r'(?u)\b\w\w+\b', ngram_range=(1, 1),
                 max_df=1.0, min_df=1, max_features=None, vocabulary=None,
                 binary=False, dtype=np.int64, norm='l2', use_idf=True,
                 smooth_idf=True, sublinear_tf=False):
        # NOTE(review): dtype defaults to np.int64 here while sklearn's own
        # TfidfVectorizer defaults to float64 — kept as-is for backward
        # compatibility with existing callers.
        super(JpTfidfVectorizer, self).__init__(
            input, encoding, decode_error, strip_accents, lowercase,
            preprocessor, tokenizer, analyzer, stop_words, token_pattern,
            ngram_range, max_df, min_df, max_features, vocabulary, binary,
            dtype, norm, use_idf, smooth_idf, sublinear_tf)
        # All real work is delegated to this inner vectorizer so that the
        # MeCab preprocessing can run first on every code path.
        self._tfidf_vectorizer = TfidfVectorizer(
            input, encoding, decode_error, strip_accents, lowercase,
            preprocessor, tokenizer, analyzer, stop_words, token_pattern,
            ngram_range, max_df, min_df, max_features, vocabulary, binary,
            dtype, norm, use_idf, smooth_idf, sublinear_tf)
        self.feature_names = ["tfidf"]

    def get_feature_name(self):
        """Return the fixed feature-group label list (``["tfidf"]``)."""
        return self.feature_names

    def fit(self, X, y=None):
        """Segment *X* with MeCab, then fit the inner vectorizer.

        Returns ``self``, per the scikit-learn estimator contract.
        """
        self._tfidf_vectorizer.fit(self.mecab_preprocessor(X), y)
        return self

    def fit_transform(self, X, y=None):
        """Segment *X* with MeCab, then fit and transform in one pass."""
        # Fix: forward ``y`` to the inner vectorizer (it was silently dropped
        # before). TfidfVectorizer ignores y, so results are unchanged, but
        # the estimator contract is now honored.
        return self._tfidf_vectorizer.fit_transform(self.mecab_preprocessor(X), y)

    def transform(self, X, copy=True):
        """Segment *X* with MeCab, then transform with the fitted vectorizer."""
        return self._tfidf_vectorizer.transform(self.mecab_preprocessor(X), copy)

    @classmethod
    def mecab_preprocessor(cls, X):
        # Preprocessor for an array of inputs — it operates on the whole
        # collection at once, so it can't be plugged in as the per-document
        # ``preprocessor`` argument of a sklearn pipeline step.
        return [cls.into_joined_word(x) for x in X]

    @classmethod
    def into_joined_word(cls, x):
        """Segment each fragment of *x* with MeCab and concatenate the results.

        ``x`` is iterated element-wise — presumably an iterable of text
        segments (note that a plain ``str`` would be parsed character by
        character); TODO confirm against callers.
        """
        # -Owakati output ends with "\n"; strip it, keeping the trailing
        # space that separates the last word of one fragment from the next.
        segmented = (cls.tagger.parse(fragment).replace("\n", "")
                     for fragment in x)
        # The original joined via chain.from_iterable (flattening to single
        # characters first), which is equivalent to joining the strings
        # directly — the redundant flattening pass is removed.
        return "".join(segmented)

    def set_params(self, **params):
        """Set parameters on both this estimator and the inner vectorizer.

        Returns ``self`` as required by the scikit-learn API (the original
        returned ``None``, which breaks chained calls and sklearn utilities
        such as ``clone``-and-configure patterns).
        """
        super().set_params(**params)
        self._tfidf_vectorizer.set_params(**params)
        return self
@meshiguge
Copy link

where can we get the Japanese stopwords list ? どうも

@rahulkrishnan98
Copy link

where can we get the Japanese stopwords list? どうも

Stop words-ja has a dictionary of stopwords.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment