Last active
August 24, 2016 16:15
-
-
Save halfak/4b5e856902b5237d88ea49a80c1fb378 to your computer and use it in GitHub Desktop.
Thinking about abstract feature vectors
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from revscoring.datasources.meta import gramming, hashing, frequencies, selectors
from revscoring.features.meta import vectorizers
from revscoring.features import wikitext, revision_oriented

# Step 1: Engineer dependency pipeline for selecting text features
# (Fixed: the original paste had trailing " | |" scrape residue on each
# line, which made this block invalid Python.)

# Word-gram offset patterns: unigrams, bigrams, trigrams, and two
# skip-grams ((0, 2) and (1, 2)).
my_grams = [(0,), (0, 1), (0, 1, 2), (0, 2), (1, 2)]

# Hash the grams of each revision's words into a fixed 2**20-slot space
# (feature hashing) and build per-hash frequency tables for the parent
# revision and the current revision.
parent_hash_table = frequencies.table(
    hashing.hash(gramming.gram(wikitext.revision.parent.words, grams=my_grams)),
    n=2**20)
revision_hash_table = frequencies.table(
    hashing.hash(gramming.gram(wikitext.revision.words, grams=my_grams)),
    n=2**20)

# Per-hash frequency change from parent to revision; split into the
# hashes that were added (positive delta) and removed (negative delta,
# reported as absolute counts).
hashes_delta = frequencies.delta(parent_hash_table, revision_hash_table)
hashes_added = frequencies.positive(hashes_delta)
hashes_removed = frequencies.negative(hashes_delta, abs=True)

# TF-iDF selectors that will keep only the 100 most informative hashes
# from each table once fitted (see Step 2).
important_hashes_added = selectors.tfidf(
    hashes_added, name="revision.diff.important_hashes_added", max_terms=100)
important_hashes_removed = selectors.tfidf(
    hashes_removed, name="revision.diff.important_hashes_removed", max_terms=100)
# Step 2: Train the TF-iDF selectors to weight and select only the
# important hashes.
# ...magically load training observations of training_rev_ids/training_labels &
# testing_rev_ids/testing_labels

# Pull the raw added/removed hash-frequency tables for every training
# revision.
training_hashes_added = extractor.extract(training_rev_ids, hashes_added)
training_hashes_removed = extractor.extract(training_rev_ids, hashes_removed)

# Fit each selector on (observation, label) pairs so it can learn which
# hashes carry signal.
important_hashes_added.fit(zip(training_hashes_added, training_labels))
important_hashes_removed.fit(zip(training_hashes_removed, training_labels))

# Convert each trained selector's frequency table into a FeatureVector
# of floats keyed by the terms the selector kept.
important_hashes_added_vector = vectorizers.vectorize(
    important_hashes_added,
    keys=important_hashes_added.terms(),
    returns=float)
important_hashes_removed_vector = vectorizers.vectorize(
    important_hashes_removed,
    keys=important_hashes_removed.terms(),
    returns=float)
# Step 3: Train and test the model

# Final feature set: the two selected-hash vectors plus a simple
# user-based feature.
features = [important_hashes_added_vector, important_hashes_removed_vector,
            revision_oriented.revision.user.is_anon]
model = GradientBoosting(features, version="0.0.1")

# Train on (feature values, label) pairs for the training revisions...
training_feature_values = extractor.extract(training_rev_ids, model.features)
model.train(zip(training_feature_values, training_labels))

# ...then evaluate on the held-out testing revisions.
testing_feature_values = extractor.extract(testing_rev_ids, model.features)
# Fixed: the original call was missing its closing parenthesis
# (SyntaxError).
model.test(zip(testing_feature_values, testing_labels))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment