@pbellon
Last active September 25, 2018 19:42
Create DTM matrices with chunks
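# Overview: build a document-term matrix (DTM) for a set of Django Article
# objects without vectorizing everything at once. A fixed 1- to 3-gram
# vocabulary is computed first, then the articles are vectorized chunk by
# chunk with CountVectorizer and the resulting sparse matrices are stacked
# with scipy.sparse.vstack.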
import pandas as pd
import re
from nltk import ngrams
from sklearn.feature_extraction.text import CountVectorizer
from django_pandas.io import read_frame
from scipy import sparse
# our Article model class. Here we're going to rely on its "content" field,
# which holds the article's textual content. It's the center of our analysis.
from idlm.models import Article
stop_words = [
    # our stop words list goes here, then we add an 'EOF' word that will be
    # useful when creating the vocabulary (see process_vocabulary).
    'EOF',
]
# small function to find all words that we identified as stop/tool words
search_stop_words = re.compile("(%s)" % '|'.join(stop_words)).search
# Ex: getchunks([0,1,2,3,4,5,6,7], 4) => [[0,1,2,3], [4,5,6,7]]
def getchunks(iterable, chunk_size):
    size = len(iterable)
    if size < chunk_size:
        # smaller than a single chunk: yield everything at once
        yield iterable
        return
    chunks_nb = int(size / chunk_size)
    iter_ints = range(0, chunks_nb)
    for i in iter_ints:
        j = i * chunk_size
        if i + 1 < chunks_nb:
            k = j + chunk_size
            yield iterable[j:k]
        else:
            # last chunk takes any remainder
            yield iterable[j:]
def getngrams(texts):
    _ngrams = dict()
    # The 'EOF' marker is needed because we produce ngrams over a whole batch
    # of documents in a single pass; without it, the vocabulary built
    # afterwards would contain ngrams spanning two unrelated documents.
    allterms = ' EOF '.join(texts).split(' ')
    # create 1-grams, 2-grams and 3-grams and zip() them all
    for g in zip(ngrams(allterms, 1), ngrams(allterms, 2), ngrams(allterms, 3)):
        for w in map(lambda w: ' '.join(w), g):
            if w not in _ngrams and not search_stop_words(w):
                _ngrams[w] = 1
    return _ngrams
# Creates a vocabulary/dictionary from a bunch of documents
def process_vocabulary(texts):
    # don't shadow the nltk.ngrams import; sort for a deterministic vocabulary
    grams = getngrams(texts)
    return sorted(grams.keys())
# Conversion of django objects to a DataFrame
df = read_frame(Article.objects.all()[:1000])
# Vocabulary creation
vocabulary = process_vocabulary(df['content'])
# Model initialization
model = CountVectorizer(
    ngram_range=(1, 3), analyzer='word', stop_words=stop_words,
    vocabulary=vocabulary
)
# Vectorize the articles chunk by chunk to keep memory usage low.
# chunk_size is arbitrary here; tune it to the available memory.
chunk_size = 100
dtm_chunked = []
for chunk in getchunks(df['content'], chunk_size):
    dtm_chunked.append(model.fit_transform(chunk))
# concatenate the per-chunk matrices into a single sparse DTM
dtm = sparse.vstack(dtm_chunked)
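# A quick sanity check one might run on the result: with the fixed vocabulary,
# dtm has one row per article and one column per vocabulary entry, so the
# column sums give overall term frequencies. (top_terms is a hypothetical
# helper, not part of the pipeline above.)
def top_terms(dtm, vocabulary, n=20):
    counts = dtm.sum(axis=0).A1  # column sums, flattened to a 1-D array
    freqs = pd.Series(counts, index=list(vocabulary))
    return freqs.sort_values(ascending=False).head(n)
# top_terms(dtm, vocabulary) -> the 20 most frequent 1-3 grams across articles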
jaclynweiser commented Sep 25, 2018

Line 68 - getchunks(texts, df['content']) should be getchunks(df['content'], defined chunksize)
