# bertomartin/clean_data.py

## clean_data.py
# Billing-related vocabulary. Each entry is used as a column name on
# `threads` (holding that word's per-thread count) and the whole list is
# passed to the classifier as its feature names.
selected_words = [
    'receipt', 'card', 'refund', 'month', 'monthly', 'plan', 'profit',
    'charged', 'charge', 'digits', 'visa', 'money', 'upgrade', 'upgraded',
    'pay', 'bill', 'billing', 'billed', 'cancel', 'cancelled', 'accounts',
    'credit', 'year', 'bank', 'discount', 'small',
]

import re

# Drop anything that looks like a markup tag ('<' ... '>') from each body.
threads['words'] = threads['body'].apply(
    lambda body: re.sub(r"<[^>]*>", "", body))
# Lowercase and trim the text, then replace every non-word character
# (punctuation, whitespace) and every digit with a space. The pattern is a
# raw string so `\W` and `\d` reach the regex engine untouched.
threads['words'] = threads['words'].apply(
    lambda words: re.sub(r"[\W\d]", " ", words.lower().strip()))
threads_tfidf = graphlab.text_analytics.tf_idf(threads['words'])
# Space-joined ten best-ranked terms per document (presumably each 'docs'
# entry maps term -> tf-idf score; verify against the GraphLab version in
# use). The slice starts at 0 so the best-ranked term is included; a
# `[1:10]` slice would skip it and yield only nine terms.
threads['top10'] = threads_tfidf['docs'].apply(
    lambda t: " ".join(sorted(t, key=t.get, reverse=True)[:10]))
threads['word_count'] = graphlab.text_analytics.count_words(threads['words'])

def selected_word_count(word, counts):
    """Return `counts[word]`, or 0 when `word` is not in `counts`."""
    return counts[word] if word in counts else 0
results = {}  # not used by the code visible here; kept for other readers
# Add one column per selected word, holding that word's count in each
# thread's `word_count` entry (0 when the word does not occur).
for selected_word in selected_words:
    # Bind the current word as a default argument: a plain closure would look
    # `selected_word` up only when the lambda runs, so every column could end
    # up counting the last word.
    # NOTE(review): this only matters if `apply` defers execution - confirm.
    threads[selected_word] = threads['word_count'].apply(
        lambda counts, word=selected_word: selected_word_count(word, counts))

## create_model.py
# Fit a logistic classifier for the 'area_billing' target, using the columns
# named in `selected_words` as features and `test_data` for validation.
billing_model = graphlab.logistic_classifier.create(
    train_data,
    target='area_billing',
    features=selected_words,
    validation_set=test_data
)
# NOTE(review): evaluation runs on `test_threads` while validation above uses
# `test_data` - confirm that both names are intended.
billing_model.evaluate(test_threads)
	# Billing-related vocabulary. Each entry is used as a column name on
	# `threads` (holding that word's per-thread count) and the whole list is
	# passed to the classifier as its feature names.
	selected_words = [
		'receipt', 'card', 'refund', 'month', 'monthly', 'plan', 'profit',
		'charged', 'charge', 'digits', 'visa', 'money', 'upgrade', 'upgraded',
		'pay', 'bill', 'billing', 'billed', 'cancel', 'cancelled', 'accounts',
		'credit', 'year', 'bank', 'discount', 'small',
	]

	import re

	# Drop anything that looks like a markup tag ('<' ... '>') from each body.
	threads['words'] = threads['body'].apply(
		lambda body: re.sub(r"<[^>]*>", "", body))
	# Lowercase and trim the text, then replace every non-word character
	# (punctuation, whitespace) and every digit with a space. The pattern is a
	# raw string so `\W` and `\d` reach the regex engine untouched.
	threads['words'] = threads['words'].apply(
		lambda words: re.sub(r"[\W\d]", " ", words.lower().strip()))
	threads_tfidf = graphlab.text_analytics.tf_idf(threads['words'])
	# Space-joined ten best-ranked terms per document (presumably each 'docs'
	# entry maps term -> tf-idf score; verify against the GraphLab version in
	# use). The slice starts at 0 so the best-ranked term is included; a
	# `[1:10]` slice would skip it and yield only nine terms.
	threads['top10'] = threads_tfidf['docs'].apply(
		lambda t: " ".join(sorted(t, key=t.get, reverse=True)[:10]))
	threads['word_count'] = graphlab.text_analytics.count_words(threads['words'])

	def selected_word_count(word, counts):
		"""Return `counts[word]`, or 0 when `word` is not in `counts`."""
		# The body must be indented under `def`; with the branches flattened
		# to the same column as the header the definition does not parse.
		if word in counts:
			return counts[word]
		return 0
	results = {}  # not used by the code visible here; kept for other readers
	# Add one column per selected word, holding that word's count in each
	# thread's `word_count` entry (0 when the word does not occur).
	for selected_word in selected_words:
		# Bind the current word as a default argument: a plain closure would
		# look `selected_word` up only when the lambda runs, so every column
		# could end up counting the last word.
		# NOTE(review): this only matters if `apply` defers execution - confirm.
		threads[selected_word] = threads['word_count'].apply(
			lambda counts, word=selected_word: selected_word_count(word, counts))
	# Fit a logistic classifier for the 'area_billing' target, using the columns
	# named in `selected_words` as features and `test_data` for validation.
	billing_model = graphlab.logistic_classifier.create(
		train_data,
		target='area_billing',
		features=selected_words,
		validation_set=test_data
	)
	# NOTE(review): evaluation runs on `test_threads` while validation above uses
	# `test_data` - confirm that both names are intended.
	billing_model.evaluate(test_threads)