Created
May 14, 2015 20:30
-
-
Save glortho/200d6c05ffadf6e893e9 to your computer and use it in GitHub Desktop.
Predict if a tweet belongs to a category (based on its text) using a pre-built [Support Vector Machine](http://en.wikipedia.org/wiki/Support_vector_machine)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/sh | |
"exec" "twxec" "-e" "predict_tweet" "$0" "$@" | |
{{docstring "Predict if a tweet belongs to a category(based on its text) using a pre-built[Supoort Vector Machine](http://en.wikipedia.org/wiki/Support_vector_machine)"}} | |
from trickle.nner.en.nner import tokens as tokenize | |
import numpy as np | |
import csv | |
import importlib | |
import pickle | |
with open({{string Model_File}}) as f: | |
model_data = pickle.load(f) | |
vocab_coordspace = model_data["vocab_coordspace"] | |
vocab_set = set(vocab_coordspace.keys()) | |
split_modules = model_data["feature_generator"].split(".") | |
tokenization_module = importlib.import_module(".".join(split_modules[:-1])) | |
tokenize = getattr(tokenization_module, split_modules[-1]) | |
class_label = {{string Classification_Label}} | |
classifier = model_data["classifier"] | |
def token_vector(doc):
    """Return a binary vocabulary-indicator vector for *doc*.

    The text is tokenized with the module-level ``tokenize`` callable and
    tokens outside the model vocabulary are discarded. The result is a
    1-D float array of length ``len(vocab_set)`` holding 1 at the
    position ``vocab_coordspace`` assigns to each known token present in
    the text, and 0 elsewhere.
    """
    doc_tokens = set(tokenize(doc))
    known_tokens = doc_tokens & vocab_set
    indicator = np.zeros([len(vocab_set)])
    positions = [vocab_coordspace[tok] for tok in known_tokens]
    indicator[positions] = 1
    return indicator
def predict_tweet(msg):
    """Annotate a tweet with the classifier's verdict for ``class_label``.

    Args:
        msg: mapping for one tweet. Its ``"text"`` entry is vectorized
            with ``token_vector``; an existing ``"classifications"``
            entry, if any, is extended rather than replaced.

    Returns:
        The same ``msg`` object, mutated in place. On success,
        ``msg["classifications"][class_label]`` is ``True`` when the
        classifier predicts ``1`` and ``False`` otherwise. If anything
        goes wrong while classifying, ``msg["classifications"]`` is still
        guaranteed to exist but carries no entry for ``class_label``.
    """
    try:
        tvec = token_vector(msg["text"])
        classifications = msg.get("classifications", {})
        # ndarray.item() replaces np.asscalar, which was deprecated in
        # NumPy 1.16 and removed in 1.23. On newer NumPy the old call
        # raised AttributeError, which was then silently swallowed.
        # NOTE(review): if the classifier is a scikit-learn estimator,
        # recent versions may require 2-D input (tvec.reshape(1, -1));
        # confirm against the pickled model.
        prediction = np.asarray(classifier.predict(tvec)).item()
        classifications[class_label] = prediction == 1
        msg["classifications"] = classifications
    except Exception:
        # Deliberate best-effort: a tweet that cannot be classified is
        # passed through unlabelled. Catching Exception rather than using
        # a bare except lets KeyboardInterrupt and SystemExit propagate.
        msg["classifications"] = msg.get("classifications", {})
    return msg
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment