Last active
August 24, 2016 16:15
-
-
Save halfak/4b5e856902b5237d88ea49a80c1fb378 to your computer and use it in GitHub Desktop.
Thinking about abstract feature vectors
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from revscoring.datasources.meta import gramming, hashing, frequencies, selectors
from revscoring.features.meta import vectorizers
from revscoring.features import wikitext, revision_oriented

# Step 1: Engineer dependency pipeline for selecting text features
# (Fixed: the original paste had trailing " | |" scrape residue on each
# line, which made this block invalid Python.)

# Word-gram offset patterns: unigrams, bigrams, trigrams, and two
# skip-grams ((0, 2) and (1, 2)).
my_grams = [(0,), (0, 1), (0, 1, 2), (0, 2), (1, 2)]

# Hash the grams of each revision's words into a fixed 2**20-slot space
# (feature hashing) and build per-hash frequency tables for the parent
# revision and the current revision.
parent_hash_table = frequencies.table(
    hashing.hash(gramming.gram(wikitext.revision.parent.words, grams=my_grams)),
    n=2**20)
revision_hash_table = frequencies.table(
    hashing.hash(gramming.gram(wikitext.revision.words, grams=my_grams)),
    n=2**20)

# Per-hash frequency change from parent to revision; split into the
# hashes that were added (positive delta) and removed (negative delta,
# reported as absolute counts).
hashes_delta = frequencies.delta(parent_hash_table, revision_hash_table)
hashes_added = frequencies.positive(hashes_delta)
hashes_removed = frequencies.negative(hashes_delta, abs=True)

# TF-iDF selectors that will keep only the 100 most informative hashes
# from each table once fitted (see Step 2).
important_hashes_added = selectors.tfidf(
    hashes_added, name="revision.diff.important_hashes_added", max_terms=100)
important_hashes_removed = selectors.tfidf(
    hashes_removed, name="revision.diff.important_hashes_removed", max_terms=100)
# Step 2: Train the TF-iDF selectors to weight and select only the
# important hashes.
# ...magically load training observations of training_rev_ids/training_labels &
# testing_rev_ids/testing_labels

# Pull the raw added/removed hash-frequency tables for every training
# revision.
training_hashes_added = extractor.extract(training_rev_ids, hashes_added)
training_hashes_removed = extractor.extract(training_rev_ids, hashes_removed)

# Fit each selector on (observation, label) pairs so it can learn which
# hashes carry signal.
important_hashes_added.fit(zip(training_hashes_added, training_labels))
important_hashes_removed.fit(zip(training_hashes_removed, training_labels))

# Convert each trained selector's frequency table into a FeatureVector
# of floats keyed by the terms the selector kept.
important_hashes_added_vector = vectorizers.vectorize(
    important_hashes_added,
    keys=important_hashes_added.terms(),
    returns=float)
important_hashes_removed_vector = vectorizers.vectorize(
    important_hashes_removed,
    keys=important_hashes_removed.terms(),
    returns=float)
# Step 3: Train and test the model

# Final feature set: the two selected-hash vectors plus a simple
# user-based feature.
features = [important_hashes_added_vector, important_hashes_removed_vector,
            revision_oriented.revision.user.is_anon]
model = GradientBoosting(features, version="0.0.1")

# Train on (feature values, label) pairs for the training revisions...
training_feature_values = extractor.extract(training_rev_ids, model.features)
model.train(zip(training_feature_values, training_labels))

# ...then evaluate on the held-out testing revisions.
testing_feature_values = extractor.extract(testing_rev_ids, model.features)
# Fixed: the original call was missing its closing parenthesis
# (SyntaxError).
model.test(zip(testing_feature_values, testing_labels))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment