Last active
August 29, 2015 14:11
-
-
Save he7d3r/7f2aebb00e18b4963d07 to your computer and use it in GitHub Desktop.
Test the scorer on recent changes
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
== Classification Report ==
             precision    recall  f1-score   support

          0       0.85      0.95      0.90      1617
          1       0.56      0.30      0.39       379

avg / total       0.80      0.82      0.80      1996
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Test the scorer on a dataset obtained from a tsv file | |
""" | |
import csv | |
from sklearn import metrics | |
from sklearn.cross_validation import train_test_split | |
from revscores.scorers import LinearSVC | |
from revscores.features import (added_badwords_ratio, added_misspellings_ratio, | |
badwords_added, bytes_changed, chars_added, | |
day_of_week_in_utc, hour_of_day_in_utc, | |
is_custom_comment, is_mainspace, | |
is_previous_user_same, is_section_comment, | |
longest_repeated_char_added, | |
longest_token_added, misspellings_added, | |
numeric_chars_added, page_age_in_seconds, | |
prev_badwords, prev_misspellings, prev_words, | |
proportion_of_badwords_added, | |
proportion_of_markup_added, | |
proportion_of_misspellings_added, | |
proportion_of_numeric_added, | |
proportion_of_prev_badwords, | |
proportion_of_prev_misspellings, | |
proportion_of_symbolic_added, | |
proportion_of_uppercase_added, | |
seconds_since_last_page_edit, | |
seconds_since_last_user_edit, segments_added, | |
segments_removed, symbolic_chars_added, | |
uppercase_chars_added, user_age_in_seconds, | |
user_is_anon, user_is_bot, words_added, | |
words_removed) | |
# Features handed to the model, one per line so that additions and removals
# show up as single-line diffs. The order matches the import list.
features = [
    added_badwords_ratio,
    added_misspellings_ratio,
    badwords_added,
    bytes_changed,
    chars_added,
    day_of_week_in_utc,
    hour_of_day_in_utc,
    is_custom_comment,
    is_mainspace,
    is_previous_user_same,
    is_section_comment,
    longest_repeated_char_added,
    longest_token_added,
    misspellings_added,
    numeric_chars_added,
    page_age_in_seconds,
    prev_badwords,
    prev_misspellings,
    prev_words,
    proportion_of_badwords_added,
    proportion_of_markup_added,
    proportion_of_misspellings_added,
    proportion_of_numeric_added,
    proportion_of_prev_badwords,
    proportion_of_prev_misspellings,
    proportion_of_symbolic_added,
    proportion_of_uppercase_added,
    seconds_since_last_page_edit,
    seconds_since_last_user_edit,
    segments_added,
    segments_removed,
    symbolic_chars_added,
    uppercase_chars_added,
    user_age_in_seconds,
    user_is_anon,
    user_is_bot,
    words_added,
    words_removed,
]

# Keyword arguments forwarded unchanged to LinearSVC.MODEL.
# NOTE(review): presumably these end up as the underlying SVM's
# hyper-parameters -- confirm against revscores.scorers.
svc_options = {'kernel': 'rbf', 'gamma': 1e-16, 'C': 3}
linear_svc_model = LinearSVC.MODEL(features, **svc_options)
def _parse_cell(cell):
    """Convert one TSV cell to a float.

    The literal strings 'True' and 'False' map to 1.0 and 0.0; anything
    else is passed to float(), which raises ValueError on non-numeric text.
    """
    if cell == 'True':
        return 1.
    if cell == 'False':
        return 0.
    return float(cell)


# Load the dataset as a list of (feature_values, reverted) pairs, where
# feature_values is a list of floats and reverted is the int in the last
# column of each row.
data = []
with open('revscores.tsv', newline='') as tsvfile:
    reader = csv.reader(tsvfile, delimiter='\t')
    headers = next(reader)
    # The extraction script writes a leading '<revid>' identifier column.
    # A revision id is not one of the model's features, so skip it when the
    # header says it is there; files without that column are read as before.
    first_feature = 1 if headers and headers[0] == '<revid>' else 0
    for row in reader:
        if not row:
            # csv.reader yields an empty list for a blank line.
            continue
        values = [_parse_cell(cell) for cell in row[first_feature:-1]]
        reverted = int(row[-1])
        data.append((values, reverted))
# Hold out 40% of the rows for testing; the fixed random_state keeps the
# split reproducible between runs.
training_set, test_set = train_test_split(data, test_size=0.4, random_state=0)
linear_svc_model.train(training_set)

# Separate the held-out pairs into feature rows and expected labels.
x_test = [x for x, _ in test_set]
y_true = [y for _, y in test_set]

# Predict all held-out rows with one call on a 2-D (n_samples, n_features)
# input instead of one call per 1-D sample.
# NOTE(review): assumes linear_svc_model.svc is a scikit-learn estimator,
# whose predict() expects 2-D input -- confirm against revscores.scorers.
y_pred = list(linear_svc_model.svc.predict(x_test))

print('== Classification Report ==')
print(metrics.classification_report(y_true, y_pred))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Get the features of up to 5000 recent changes and save to a tsv file | |
""" | |
import csv | |
from mw.api import Session | |
from mw.lib import reverts | |
from revscores.extractors import APIExtractor | |
from revscores.language import Portuguese | |
from revscores.scorers import LinearSVC | |
from revscores.features import (added_badwords_ratio, added_misspellings_ratio, | |
badwords_added, bytes_changed, chars_added, | |
day_of_week_in_utc, hour_of_day_in_utc, | |
is_custom_comment, is_mainspace, | |
is_previous_user_same, is_section_comment, | |
longest_repeated_char_added, | |
longest_token_added, misspellings_added, | |
numeric_chars_added, page_age_in_seconds, | |
prev_badwords, prev_misspellings, prev_words, | |
proportion_of_badwords_added, | |
proportion_of_markup_added, | |
proportion_of_misspellings_added, | |
proportion_of_numeric_added, | |
proportion_of_prev_badwords, | |
proportion_of_prev_misspellings, | |
proportion_of_symbolic_added, | |
proportion_of_uppercase_added, | |
seconds_since_last_page_edit, | |
seconds_since_last_user_edit, segments_added, | |
segments_removed, symbolic_chars_added, | |
uppercase_chars_added, user_age_in_seconds, | |
user_is_anon, user_is_bot, words_added, | |
words_removed) | |
# Value passed as `limit` to the recent-changes query below.
# NOTE(review): the module docstring says "up to 5000 recent changes";
# whether mw.api treats this as a total or a per-request limit is not
# visible here -- confirm.
batch_size = 5000

# Endpoint of the Portuguese Wikipedia API, shared by both sessions.
wiki_api_url = "https://pt.wikipedia.org/w/api.php"

# Session used for listing recent changes (and later for revert checks).
session = Session(wiki_api_url)
recent_changes_options = {
    'type': {'edit'},
    'properties': {'ids'},
    'direction': "newer",
    'limit': batch_size,
}
revisions = session.recent_changes.query(**recent_changes_options)

# The extractor gets its own, separate Session instance.
api_extractor = APIExtractor(Session(wiki_api_url), language=Portuguese())
# Features to extract for every revision, one per line so that additions and
# removals show up as single-line diffs. The same objects are also written
# as the column headers of the output file.
features = [
    added_badwords_ratio,
    added_misspellings_ratio,
    badwords_added,
    bytes_changed,
    chars_added,
    day_of_week_in_utc,
    hour_of_day_in_utc,
    is_custom_comment,
    is_mainspace,
    is_previous_user_same,
    is_section_comment,
    longest_repeated_char_added,
    longest_token_added,
    misspellings_added,
    numeric_chars_added,
    page_age_in_seconds,
    prev_badwords,
    prev_misspellings,
    prev_words,
    proportion_of_badwords_added,
    proportion_of_markup_added,
    proportion_of_misspellings_added,
    proportion_of_numeric_added,
    proportion_of_prev_badwords,
    proportion_of_prev_misspellings,
    proportion_of_symbolic_added,
    proportion_of_uppercase_added,
    seconds_since_last_page_edit,
    seconds_since_last_user_edit,
    segments_added,
    segments_removed,
    symbolic_chars_added,
    uppercase_chars_added,
    user_age_in_seconds,
    user_is_anon,
    user_is_bot,
    words_added,
    words_removed,
]

# The model is built with default options and is never trained in this
# script; it is only needed to construct the scorer below.
linear_svc_model = LinearSVC.MODEL(features)

# FIXME: Is it correct to build the scorer around an untrained model, given
# that "[the MLScorer] scorer expects the model to already be trained"?
# The scorer is used here only for feature extraction, which has to happen
# before any model can be trained.
linear_svc = LinearSVC(api_extractor, linear_svc_model)
# Extract the feature values of every listed revision and write one row per
# revision to revscores.tsv: the revision id, the feature values, and a
# 0/1 "reverted" label.
p = 0  # progress counter; stays 0 when the query yields no revisions
with open('revscores.tsv', 'w', newline='') as f:
    # The output is consumed as tab-separated values (the evaluation script
    # reads it with delimiter='\t'), so write it with the same delimiter
    # instead of csv's default comma.
    writer = csv.writer(f, delimiter='\t')
    writer.writerow(['<revid>'] + features + ['<reverted>'])
    for p, rev in enumerate(revisions, start=1):
        print(p, 'https://pt.wikipedia.org/w/index.php?diff={0}'.format(rev['revid']), end=' ')
        try:
            values = list(linear_svc.extract([rev['revid']]))[0]
            # FIXME: Does it make any difference to use the values 0/1, or -1/1 or something else?
            reverted = 0 if reverts.api.check_rev(session, rev) is None else 1
        except Exception as error:
            # Best effort: a revision whose features or revert status cannot
            # be obtained is skipped, but the reason is reported and the
            # progress line is terminated. Catching Exception (not a bare
            # except) lets Ctrl-C / SystemExit stop the run.
            print('skipped ({0}: {1})'.format(type(error).__name__, error))
            continue
        print(reverted == 1)
        # list() accepts any iterable of values, not only a list.
        writer.writerow([rev['revid']] + list(values) + [reverted])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment