Skip to content

Instantly share code, notes, and snippets.

@bertomartin
Forked from michael-erasmus/clean_data.py
Created September 9, 2016 19:55
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save bertomartin/c4434610cd5ff8936d658726bd61982f to your computer and use it in GitHub Desktop.
Save bertomartin/c4434610cd5ff8936d658726bd61982f to your computer and use it in GitHub Desktop.
Simple logistic model
# Vocabulary whose per-thread occurrence counts become the feature columns
# of the billing classifier; order is preserved from the original list.
selected_words = [
    'receipt', 'card', 'refund', 'month', 'monthly', 'plan', 'profit',
    'charged', 'charge', 'digits', 'visa', 'money', 'upgrade', 'upgraded',
    'pay', 'bill', 'billing', 'billed', 'cancel', 'cancelled', 'accounts',
    'credit', 'year', 'bank', 'discount', 'small',
]
import re
# Strip HTML tags from each thread body, keeping only the text between them.
# Raw strings are used for every pattern so regex escapes are never
# interpreted (or warned about) as Python string escapes.
threads['words'] = threads['body'].apply(lambda body: re.sub(r"<[^>]*>", "", body))
# Lowercase and trim the text, then replace every non-word character and every
# digit with a space, leaving only letters/underscores as token material.
threads['words'] = threads['words'].apply(lambda words: re.sub(r"[\W\d]", " ", words.lower().strip()))
threads_tfidf = graphlab.text_analytics.tf_idf(threads['words'])
# Each entry of threads_tfidf['docs'] is used as a word -> score mapping.
# Keep the ten highest-scoring words per thread; the previous slice [1:10]
# skipped the top-ranked word and yielded only nine.
threads['top10'] = threads_tfidf['docs'].apply(lambda t: " ".join(sorted(t, key=t.get, reverse=True)[:10]))
# Per-thread word counts, consumed below when building per-word feature columns.
threads['word_count'] = graphlab.text_analytics.count_words(threads['words'])
def selected_word_count(word, counts):
    """Return how many times *word* occurs according to *counts*.

    Args:
        word: The word to look up.
        counts: A mapping of word -> occurrence count.

    Returns:
        The count stored for *word*, or 0 when the word is absent.
    """
    # Single lookup with a default instead of a membership test plus index.
    return counts.get(word, 0)
# NOTE(review): `results` is not used anywhere in the visible script; it is
# kept in case later code relies on it.
results = {}
# Add one column per selected word holding that word's count in each thread.
for selected_word in selected_words:
    # Bind the current word as a default argument: a bare closure would read
    # `selected_word` when the lambda runs rather than when it is created,
    # which is wrong if evaluation happens after the loop has advanced.
    threads[selected_word] = threads['word_count'].apply(
        lambda words, word=selected_word: selected_word_count(word, words))
# Fit a logistic classifier that predicts the 'area_billing' column from the
# per-word count columns named in selected_words.
# NOTE(review): train_data / test_data are not defined in the visible script;
# presumably they come from a train/test split of `threads` - confirm.
billing_model = graphlab.logistic_classifier.create(
    train_data,
    target='area_billing',
    features=selected_words,
    validation_set=test_data,
)
# Evaluate on the same held-out set passed as validation_set above; the
# original referenced `test_threads`, a name that appears nowhere else.
billing_model.evaluate(test_data)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment