Skip to content

Instantly share code, notes, and snippets.

@sughodke
Created December 14, 2016 22:47
Show Gist options
  • Save sughodke/05c0af92516f215347431d642436a7bb to your computer and use it in GitHub Desktop.
Save sughodke/05c0af92516f215347431d642436a7bb to your computer and use it in GitHub Desktop.
Determine if some text is a question
import random
from pprint import pprint

import nltk
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
# Chat-log files from NLTK's NPS Chat corpus that supply the labelled posts.
_NPS_CHAT_FILES = [
    '10-19-20s_706posts.xml',
    '11-08-20s_705posts.xml',
    '11-09-20s_706posts.xml',
]

# Loaded once at import time; consumed by train_classifier().
posts = nltk.corpus.nps_chat.xml_posts(_NPS_CHAT_FILES)
def transform(post):
    """Turn one chat post into a ``(features, is_question)`` training pair.

    The post's text is tokenized and part-of-speech tagged with NLTK; each
    ``(token, tag)`` pair is fused into a single ``token_tag`` string and the
    results are joined with spaces, so a plain text vectorizer sees both the
    word and its tag.

    Args:
        post: Object exposing ``.text`` and ``.get('class')`` -- presumably
            an XML element from the NPS Chat corpus (confirm against caller).

    Returns:
        A 2-tuple of the space-separated ``token_tag`` string and the result
        of ``'Question' in post.get('class')``.
    """
    tagged_pairs = nltk.pos_tag(nltk.word_tokenize(post.text))
    features = ' '.join('_'.join(pair) for pair in tagged_pairs)
    # Substring test: any class value containing 'Question' counts.
    is_question = 'Question' in post.get('class')
    return features, is_question
def train_classifier():
    """Train and evaluate a question / non-question text classifier.

    Every entry of the module-level ``posts`` is converted by ``transform``
    into a ``(text, label)`` pair. The pairs are shuffled, the first 10% are
    held out for evaluation, and the remainder is used to fit a
    ``HashingVectorizer`` -> ``SGDClassifier`` pipeline.

    Side effects:
        Prints the misclassified held-out examples as
        ``(text, predicted, actual)`` tuples, followed by the held-out
        accuracy and a classification report.

    Returns:
        The fitted ``Pipeline``.
    """
    pipeline = Pipeline([
        ('vect', HashingVectorizer()),
        ('clf', SGDClassifier()),
    ])

    featuresets = [transform(post) for post in posts]
    random.shuffle(featuresets)

    # Hold out the first 10% of the shuffled data as the test set.
    size = int(len(featuresets) * .1)
    train_set, test_set = featuresets[size:], featuresets[:size]

    X_train, y_train = zip(*train_set)
    pipeline.fit(X_train, y_train)

    X_test, y_test = zip(*test_set)
    pred = pipeline.predict(X_test)

    # Show only the held-out examples the model got wrong.
    pprint([
        (text, predicted, actual)
        for text, predicted, actual in zip(X_test, pred, y_test)
        if predicted != actual
    ])
    print('accuracy %f' % pipeline.score(X_test, y_test))
    print(classification_report(y_test, pred))
    return pipeline
# Train at import time and keep the fitted pipeline as a module-level global.
classifier = train_classifier()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment