@griesmey
Created September 21, 2015 04:35
Quick bigram gist; you can use the DictVectorizer and the TfidfTransformer to generate your features.
from collections import Counter
import re
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from itertools import islice, tee
from nltk.corpus import stopwords


def tokenize(sentence):
    words = re.findall("[a-zA-Z]+", sentence)
    bigram = []
    for gram in generate_ngrams(words, 2):
        bigram.append('{0} {1}'.format(gram[0], gram[1]))
    # take out stop words; building the set once avoids re-reading the
    # corpus on every iteration. Note the check is case-sensitive, so
    # capitalized stop words such as 'I' slip through; lowercase the
    # tokens first if that matters for your application.
    stop = set(stopwords.words("english"))
    words = [w for w in words if w not in stop]
    words.extend(bigram)
    return words


def generate_ngrams(lst, n):
    # Repeatedly tee the iterable: take the next n items as one n-gram,
    # then advance the duplicate by one so successive n-grams overlap
    # by n - 1 items. Stops when fewer than n items remain.
    ilst = lst
    while True:
        a, b = tee(ilst)
        l = tuple(islice(a, n))
        if len(l) == n:
            yield l
            next(b)
            ilst = b
        else:
            break


print(tokenize('Hello there good guy. I will kill you'))
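The description mentions feeding the tokens into DictVectorizer and TfidfTransformer; a minimal sketch of that pipeline is below. It counts each document's tokens (unigrams plus bigrams) with Counter, vectorizes the count dicts with DictVectorizer, then reweights with TfidfTransformer. The `simple_tokenize` helper is a stand-in for the gist's `tokenize()` (no stop-word removal) so the example runs without the NLTK stopword corpus; the documents are made up for illustration.

```python
import re
from collections import Counter
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

def simple_tokenize(sentence):
    # unigrams plus space-joined bigrams, mirroring tokenize() above
    # but without the NLTK stop-word filtering
    words = re.findall("[a-zA-Z]+", sentence)
    bigrams = ['{0} {1}'.format(a, b) for a, b in zip(words, words[1:])]
    return words + bigrams

docs = ["the cat sat", "the dog sat", "the cat ran"]

# one token-count dict per document
counts = [Counter(simple_tokenize(d)) for d in docs]

vec = DictVectorizer()
X_counts = vec.fit_transform(counts)            # sparse document-term matrix
X_tfidf = TfidfTransformer().fit_transform(X_counts)

print(X_tfidf.shape)  # → (3, 10): 5 distinct unigrams + 5 distinct bigrams
```

CountVectorizer (imported above) can replace the Counter/DictVectorizer pair in one step via its `tokenizer=` parameter, but the two-stage version makes it easy to mix in non-text features alongside the token counts.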
@griesmey (author) commented: Sentiment analysis
