-
-
Save pbellon/575041e22320b8bc011929421a9d6263 to your computer and use it in GitHub Desktop.
Create DTM matrices with chunks
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
import re | |
from nltk import ngrams | |
from sklearn.feature_extraction.text import CountVectorizer | |
from django_pandas.io import read_frame | |
from scipy import sparse | |
# our Article model class. Here we're going to rely on its "content" field that | |
# hold the article's textual content. It's the center of our analysis | |
from idlm.models import Article | |
# Stop/tool words to exclude from the vocabulary.
stop_words = [
    # our stop words list goes here; then we add an "EOF" word that is
    # useful when creating the vocabulary (see process_vocabulary).
    'EOF',
]
# Small helper that finds any stop/tool word inside a term.
# Fix: each word is passed through re.escape() so a stop word containing a
# regex metacharacter (e.g. '.', '+') cannot break or widen the pattern.
# NOTE(review): this is a substring match, not a whole-word match — 'EOF'
# would also match inside a longer token; confirm that is intended.
search_stop_words = re.compile("(%s)" % '|'.join(map(re.escape, stop_words))).search
# Ex: getchunks([0,1,2,3,4,5,6,7], 4) => [[0,1,2,3], [4,5,6,7]]
def getchunks(iterable, chunk_size):
    """Yield successive slices of *iterable*, each at most *chunk_size* long.

    An input shorter than chunk_size is yielded whole; otherwise the last
    chunk holds the remainder (fewer than chunk_size items).

    Bug fix: the original made the *last* chunk absorb the remainder, so it
    could grow to 2*chunk_size - 1 elements (e.g. 10 items with chunk_size=4
    yielded [4 items, 6 items]). Every chunk is now capped at chunk_size.
    """
    size = len(iterable)
    if size < chunk_size:
        # shorter than a single chunk: hand the whole thing back
        yield iterable
        return
    for start in range(0, size, chunk_size):
        yield iterable[start:start + chunk_size]
def getngrams(texts):
    """Collect the distinct 1-, 2- and 3-grams found across *texts*.

    Returns a dict used as an ordered set (term -> 1). The documents are
    joined with an 'EOF' sentinel token: because all documents are processed
    as one flat stream, any n-gram straddling two documents contains 'EOF'
    and is discarded by the stop-word filter, so terms from different
    documents are never associated.
    """
    vocab = dict()
    # one flat token stream with the sentinel between documents
    tokens = ' EOF '.join(texts).split(' ')
    # walk 1-grams, 2-grams and 3-grams over the stream in lockstep
    for triple in zip(ngrams(tokens, 1), ngrams(tokens, 2), ngrams(tokens, 3)):
        for gram in triple:
            term = ' '.join(gram)
            if term not in vocab and not search_stop_words(term):
                vocab[term] = 1
    return vocab
# Creates a sorted vocabulary/dictionary from a bunch of documents
def process_vocabulary(texts):
    """Return the sorted list of n-gram terms extracted from *texts*.

    Bug fix: under Python 3, dict.keys() returns a view object with no
    .sort() method, so the original `keys.sort()` raised AttributeError.
    sorted() works on both Python 2 and 3 and returns the list that
    CountVectorizer expects as its fixed vocabulary. The local variable is
    also renamed so it no longer shadows nltk's `ngrams` import.
    """
    terms = getngrams(texts)
    return sorted(terms)
# Conversion of django objects to a DataFrame
df = read_frame(Article.objects.all()[:1000])
# Vocabulary creation
vocabulary = process_vocabulary(df['content'])
# Model initialization: the vocabulary is fixed up-front, so every chunk is
# vectorized against the same columns and the chunk matrices stay stackable.
model = CountVectorizer(
    ngram_range=(1, 3), analyzer='word', stop_words=stop_words,
    vocabulary=vocabulary
)
# Bug fix: the original loop was `getchunks(texts, df['content'])`, which
# raised NameError (`texts` is undefined) and passed the arguments in the
# wrong order — getchunks() expects (iterable, chunk_size).
CHUNK_SIZE = 100
dtm_chunked = []
for chunk in getchunks(df['content'], CHUNK_SIZE):
    # vocabulary is fixed, so transform() suffices (fitting would be a no-op)
    dtm_chunked.append(model.transform(chunk))
# Concatenate the per-chunk document-term matrices vertically.
dtm = sparse.vstack(dtm_chunked)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
In the chunking loop, `getchunks(texts, df['content'])` should be `getchunks(df['content'], chunk_size)` — `texts` is undefined (NameError) and the arguments are passed in the wrong order.