Skip to content

Instantly share code, notes, and snippets.

@halfak
Last active August 24, 2016 16:15
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save halfak/4b5e856902b5237d88ea49a80c1fb378 to your computer and use it in GitHub Desktop.
Thinking about abstract feature vectors
from revscoring.datasources.meta import gramming, hashing, frequencies, selectors
from revscoring.features.meta import vectorizers
from revscoring.features import wikitext, revision_oriented
# Step 1: Engineer dependency pipeline for selecting text features

# Word-gram shapes to extract: unigrams, bigrams, trigrams and skip-grams.
my_grams = [(0,), (0, 1), (0, 1, 2), (0, 2), (1, 2)]


def _gram_hash_table(words_datasource):
    """Return a frequency table of hashed word-grams (2**20 buckets)."""
    grams = gramming.gram(words_datasource, grams=my_grams)
    return frequencies.table(hashing.hash(grams), n=2**20)


parent_hash_table = _gram_hash_table(wikitext.revision.parent.words)
revision_hash_table = _gram_hash_table(wikitext.revision.words)

# Difference between parent and current revision tables, split into the
# hashes that were added (positive) and removed (negative, absolute-valued).
hashes_delta = frequencies.delta(parent_hash_table, revision_hash_table)
hashes_added = frequencies.positive(hashes_delta)
hashes_removed = frequencies.negative(hashes_delta, abs=True)

# TF-iDF selectors that will keep only the 100 most informative hashes each.
important_hashes_added = selectors.tfidf(
    hashes_added, name="revision.diff.important_hashes_added", max_terms=100)
important_hashes_removed = selectors.tfidf(
    hashes_removed, name="revision.diff.important_hashes_removed", max_terms=100)
# Step 2: Train the TFiDF selector to weight and select only the important hashes
# ...magically load training observations of training_rev_ids/training_labels &
# testing_rev_ids/testing_labels
# NOTE(review): `extractor`, `training_rev_ids` and `training_labels` are not
# defined in this snippet; `extractor` is presumably a revscoring Extractor
# bound to a live API session — confirm before running.
# Extract the raw added/removed hash-frequency tables for the training
# revisions, then fit each TF-iDF selector on (observation, label) pairs.
training_hashes_added = extractor.extract(training_rev_ids, hashes_added)
important_hashes_added.fit(zip(training_hashes_added, training_labels))
training_hashes_removed = extractor.extract(training_rev_ids, hashes_removed)
important_hashes_removed.fit(zip(training_hashes_removed, training_labels))
# Convert the frequency table to a FeatureVector

def _selector_vector(selector):
    """Vectorize *selector* over its fitted terms, producing float values."""
    return vectorizers.vectorize(selector, keys=selector.terms(), returns=float)


important_hashes_added_vector = _selector_vector(important_hashes_added)
important_hashes_removed_vector = _selector_vector(important_hashes_removed)
# Step 3: Train and test the model
# Feature set: the two selected-hash vectors plus a simple boolean feature.
features = [important_hashes_added_vector, important_hashes_removed_vector,
            revision_oriented.revision.user.is_anon]
# NOTE(review): GradientBoosting is never imported in this snippet — it is
# presumably revscoring.scoring.models.GradientBoosting; confirm the import.
model = GradientBoosting(features, version="0.0.1")
training_feature_values = extractor.extract(training_rev_ids, model.features)
model.train(zip(training_feature_values, training_labels))
testing_feature_values = extractor.extract(testing_rev_ids, model.features)
# BUG FIX: the original call was missing its closing parenthesis (syntax error).
model.test(zip(testing_feature_values, testing_labels))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment