Skip to content

Instantly share code, notes, and snippets.

@tkazusa
Created April 27, 2019 19:57
Show Gist options
  • Save tkazusa/37127320deb9d93acb7d3bda5b0c0b59 to your computer and use it in GitHub Desktop.
Save tkazusa/37127320deb9d93acb7d3bda5b0c0b59 to your computer and use it in GitHub Desktop.
import numpy as np
import pandas as pd
from gensim import corpora, models
from gensim.matutils import corpus2dense, corpus2csc
def tfidfvectorizer(df: pd.DataFrame) -> np.ndarray:
    """Build a dense TF-IDF matrix for the ``name`` column of *df*.

    Every entry of ``df.name`` is split on whitespace into tokens, a gensim
    ``Dictionary`` is fitted on those tokens, and a ``TfidfModel`` trained on
    the resulting bag-of-words corpus re-weights each document.

    NOTE(review): a later definition with the same name exists further down
    this file; if both live in the same module, that one shadows this one.

    Args:
        df: Frame whose ``name`` column holds one whitespace-delimited
            string per document.

    Returns:
        A 2-D array with one row per document (in the order of ``df.name``)
        and one column per dictionary term id.
    """
    # Read from the argument, not from a module-level ``train`` frame.
    texts = [document.split() for document in df.name]
    dct = corpora.Dictionary(texts)
    corpus = [dct.doc2bow(tokens) for tokens in texts]
    tfidf = models.TfidfModel(corpus)
    # corpus2dense requires ``num_terms`` as its own argument and lays the
    # documents out as columns (terms x docs); convert the whole corpus in
    # one call and transpose so that documents end up on the rows.
    dense = corpus2dense(
        tfidf[corpus],
        num_terms=len(dct),
        num_docs=len(corpus),
    )
    return dense.T
def tfidfvectorizer(df: pd.DataFrame) -> np.ndarray:
    """Compute TF-IDF weights for ``df.name``, one sparse conversion per row.

    The strings in ``df.name`` are whitespace-tokenised, mapped to
    bag-of-words vectors through a gensim ``Dictionary``, and re-weighted by
    a ``TfidfModel`` fitted on that same corpus.

    Args:
        df: Object whose ``name`` attribute iterates over the document
            strings (presumably a DataFrame column).

    Returns:
        A list with one entry per document; each entry is the result of
        calling ``list()`` on the matrix ``corpus2csc`` builds for that
        single document.

    NOTE(review): the signature advertises ``np.ndarray`` but the value
    built here is a list of lists - confirm which shape callers expect.
    """
    tokenized = [name.split() for name in df.name]
    vocabulary = corpora.Dictionary(tokenized)
    bows = [vocabulary.doc2bow(tokens) for tokens in tokenized]
    weighted = models.TfidfModel(bows)[bows]

    vectors = []
    for weighted_doc in weighted:
        # Wrap the document in a one-element corpus so it is converted alone.
        single_doc = corpus2csc([weighted_doc], len(vocabulary))
        vectors.append(list(single_doc))
    return vectors
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment