-
-
Save pbellon/575041e22320b8bc011929421a9d6263 to your computer and use it in GitHub Desktop.
Create DTM matrices with chunks
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
import re | |
from nltk import ngrams | |
from sklearn.feature_extraction.text import CountVectorizer | |
from django_pandas.io import read_frame | |
from scipy import sparse | |
# our Article model class. Here we're going to rely on its "content" field that | |
# hold the article's textual content. It's the center of our analysis | |
from idlm.models import Article | |
# Stop/tool words to exclude from the vocabulary.
stop_words = [
    # our stop words list goes here; then we add an "EOF" word that is
    # useful when creating the vocabulary (see process_vocabulary).
    'EOF',
]
# Small helper that finds any stop/tool word inside a term.
# Fix: each word is passed through re.escape() so a stop word containing a
# regex metacharacter (e.g. '.', '+') cannot break or widen the pattern.
# NOTE(review): this is a substring match, not a whole-word match — 'EOF'
# would also match inside a longer token; confirm that is intended.
search_stop_words = re.compile("(%s)" % '|'.join(map(re.escape, stop_words))).search
# Ex: getchunks([0,1,2,3,4,5,6,7], 4) => [[0,1,2,3], [4,5,6,7]]
def getchunks(iterable, chunk_size):
    """Yield successive slices of *iterable*, each at most *chunk_size* long.

    An input shorter than chunk_size is yielded whole; otherwise the last
    chunk holds the remainder (fewer than chunk_size items).

    Bug fix: the original made the *last* chunk absorb the remainder, so it
    could grow to 2*chunk_size - 1 elements (e.g. 10 items with chunk_size=4
    yielded [4 items, 6 items]). Every chunk is now capped at chunk_size.
    """
    size = len(iterable)
    if size < chunk_size:
        # shorter than a single chunk: hand the whole thing back
        yield iterable
        return
    for start in range(0, size, chunk_size):
        yield iterable[start:start + chunk_size]
def getngrams(texts):
    """Collect the distinct 1-, 2- and 3-grams found across *texts*.

    Returns a dict used as an ordered set (term -> 1). The documents are
    joined with an 'EOF' sentinel token: because all documents are processed
    as one flat stream, any n-gram straddling two documents contains 'EOF'
    and is discarded by the stop-word filter, so terms from different
    documents are never associated.
    """
    vocab = dict()
    # one flat token stream with the sentinel between documents
    tokens = ' EOF '.join(texts).split(' ')
    # walk 1-grams, 2-grams and 3-grams over the stream in lockstep
    for triple in zip(ngrams(tokens, 1), ngrams(tokens, 2), ngrams(tokens, 3)):
        for gram in triple:
            term = ' '.join(gram)
            if term not in vocab and not search_stop_words(term):
                vocab[term] = 1
    return vocab
# Creates a sorted vocabulary/dictionary from a bunch of documents
def process_vocabulary(texts):
    """Return the sorted list of n-gram terms extracted from *texts*.

    Bug fix: under Python 3, dict.keys() returns a view object with no
    .sort() method, so the original `keys.sort()` raised AttributeError.
    sorted() works on both Python 2 and 3 and returns the list that
    CountVectorizer expects as its fixed vocabulary. The local variable is
    also renamed so it no longer shadows nltk's `ngrams` import.
    """
    terms = getngrams(texts)
    return sorted(terms)
# Conversion of django objects to a DataFrame
df = read_frame(Article.objects.all()[:1000])
# Vocabulary creation
vocabulary = process_vocabulary(df['content'])
# Model initialization: the vocabulary is fixed up-front, so every chunk is
# vectorized against the same columns and the chunk matrices stay stackable.
model = CountVectorizer(
    ngram_range=(1, 3), analyzer='word', stop_words=stop_words,
    vocabulary=vocabulary
)
# Bug fix: the original loop was `getchunks(texts, df['content'])`, which
# raised NameError (`texts` is undefined) and passed the arguments in the
# wrong order — getchunks() expects (iterable, chunk_size).
CHUNK_SIZE = 100
dtm_chunked = []
for chunk in getchunks(df['content'], CHUNK_SIZE):
    # vocabulary is fixed, so transform() suffices (fitting would be a no-op)
    dtm_chunked.append(model.transform(chunk))
# Concatenate the per-chunk document-term matrices vertically.
dtm = sparse.vstack(dtm_chunked)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
In the chunking loop, `getchunks(texts, df['content'])` should be `getchunks(df['content'], chunk_size)` — `texts` is undefined (NameError) and the arguments are passed in the wrong order.