Skip to content

Instantly share code, notes, and snippets.

@he7d3r
Last active August 29, 2015 14:11
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save he7d3r/7f2aebb00e18b4963d07 to your computer and use it in GitHub Desktop.
Save he7d3r/7f2aebb00e18b4963d07 to your computer and use it in GitHub Desktop.
Test the scorer on recent changes
== Classification Report ==
             precision    recall  f1-score   support

          0       0.85      0.95      0.90      1617
          1       0.56      0.30      0.39       379

avg / total       0.80      0.82      0.80      1996
"""
Test the scorer on a dataset obtained from a tsv file
"""
import csv
from sklearn import metrics
from sklearn.cross_validation import train_test_split
from revscores.scorers import LinearSVC
from revscores.features import (added_badwords_ratio, added_misspellings_ratio,
badwords_added, bytes_changed, chars_added,
day_of_week_in_utc, hour_of_day_in_utc,
is_custom_comment, is_mainspace,
is_previous_user_same, is_section_comment,
longest_repeated_char_added,
longest_token_added, misspellings_added,
numeric_chars_added, page_age_in_seconds,
prev_badwords, prev_misspellings, prev_words,
proportion_of_badwords_added,
proportion_of_markup_added,
proportion_of_misspellings_added,
proportion_of_numeric_added,
proportion_of_prev_badwords,
proportion_of_prev_misspellings,
proportion_of_symbolic_added,
proportion_of_uppercase_added,
seconds_since_last_page_edit,
seconds_since_last_user_edit, segments_added,
segments_removed, symbolic_chars_added,
uppercase_chars_added, user_age_in_seconds,
user_is_anon, user_is_bot, words_added,
words_removed)
# Feature objects (imported from revscores.features above) that are handed to
# the model constructor below.  One name per line, in the original order.
features = [
    added_badwords_ratio,
    added_misspellings_ratio,
    badwords_added,
    bytes_changed,
    chars_added,
    day_of_week_in_utc,
    hour_of_day_in_utc,
    is_custom_comment,
    is_mainspace,
    is_previous_user_same,
    is_section_comment,
    longest_repeated_char_added,
    longest_token_added,
    misspellings_added,
    numeric_chars_added,
    page_age_in_seconds,
    prev_badwords,
    prev_misspellings,
    prev_words,
    proportion_of_badwords_added,
    proportion_of_markup_added,
    proportion_of_misspellings_added,
    proportion_of_numeric_added,
    proportion_of_prev_badwords,
    proportion_of_prev_misspellings,
    proportion_of_symbolic_added,
    proportion_of_uppercase_added,
    seconds_since_last_page_edit,
    seconds_since_last_user_edit,
    segments_added,
    segments_removed,
    symbolic_chars_added,
    uppercase_chars_added,
    user_age_in_seconds,
    user_is_anon,
    user_is_bot,
    words_added,
    words_removed,
]

# Model under test, built from the feature list plus the keyword arguments
# chosen for this experiment.
linear_svc_model = LinearSVC.MODEL(
    features,
    kernel='rbf',
    gamma=1e-16,
    C=3,
)
def _cell_to_float(cell):
    """Convert one TSV cell to a float.

    The literal strings 'True' and 'False' become 1.0 and 0.0; any other
    cell is passed to float(), which raises ValueError for non-numeric text.
    """
    if cell == 'True':
        return 1.
    if cell == 'False':
        return 0.
    return float(cell)


# Load the dataset as a list of (values, reverted) pairs: every column except
# the last one is converted to a float, and the last column is the integer
# label.
# NOTE(review): *every* leading column is used as a value; if the file starts
# with a revision-id column, that id is fed to the model as well -- confirm
# this is intended.
data = []
with open('revscores.tsv', newline='') as tsvfile:
    reader = csv.reader(tsvfile, delimiter='\t')
    headers = next(reader)  # consume the header row so it is not parsed as data
    for row in reader:
        if not row:
            # csv.reader yields [] for blank lines; there is nothing to parse.
            continue
        values = [_cell_to_float(cell) for cell in row[:-1]]
        reverted = int(row[-1])
        data.append((values, reverted))
# Hold out 40% of the rows for evaluation; the fixed random_state keeps the
# split the same from run to run.
training_set, test_set = train_test_split(data, test_size=0.4, random_state=0)
linear_svc_model.train(training_set)

# Separate the held-out pairs into feature rows and expected labels.
x_test = [x for x, y in test_set]
y_true = [y for x, y in test_set]

# Predict the whole test set with a single call.  scikit-learn estimators
# expect a 2-D (n_samples, n_features) input, so passing the list of rows is
# both the supported form and avoids one predict() call per sample.
# NOTE(review): assumes linear_svc_model.svc is a scikit-learn estimator --
# confirm against revscores.
y_pred = list(linear_svc_model.svc.predict(x_test))

print('== Classification Report ==')
print(metrics.classification_report(y_true, y_pred))
"""
Get the features of up to 5000 recent changes and save to a tsv file
"""
import csv
from mw.api import Session
from mw.lib import reverts
from revscores.extractors import APIExtractor
from revscores.language import Portuguese
from revscores.scorers import LinearSVC
from revscores.features import (added_badwords_ratio, added_misspellings_ratio,
badwords_added, bytes_changed, chars_added,
day_of_week_in_utc, hour_of_day_in_utc,
is_custom_comment, is_mainspace,
is_previous_user_same, is_section_comment,
longest_repeated_char_added,
longest_token_added, misspellings_added,
numeric_chars_added, page_age_in_seconds,
prev_badwords, prev_misspellings, prev_words,
proportion_of_badwords_added,
proportion_of_markup_added,
proportion_of_misspellings_added,
proportion_of_numeric_added,
proportion_of_prev_badwords,
proportion_of_prev_misspellings,
proportion_of_symbolic_added,
proportion_of_uppercase_added,
seconds_since_last_page_edit,
seconds_since_last_user_edit, segments_added,
segments_removed, symbolic_chars_added,
uppercase_chars_added, user_age_in_seconds,
user_is_anon, user_is_bot, words_added,
words_removed)
# Value passed to the recent-changes query as ``limit``.
batch_size = 5000

# API endpoint of the wiki the revisions are taken from.
wiki_api_url = "https://pt.wikipedia.org/w/api.php"

# Session used to list recent changes (and, further down, for revert checks).
session = Session(wiki_api_url)
recent_changes_query = {
    'type': {'edit'},
    'properties': {'ids'},
    'direction': "newer",
    'limit': batch_size,
}
revisions = session.recent_changes.query(**recent_changes_query)

# The feature extractor gets its own, separate Session for the same wiki.
extractor_session = Session(wiki_api_url)
api_extractor = APIExtractor(extractor_session, language=Portuguese())
# Feature objects (imported from revscores.features above) to extract for
# every revision.  One name per line, in the original order.
features = [
    added_badwords_ratio,
    added_misspellings_ratio,
    badwords_added,
    bytes_changed,
    chars_added,
    day_of_week_in_utc,
    hour_of_day_in_utc,
    is_custom_comment,
    is_mainspace,
    is_previous_user_same,
    is_section_comment,
    longest_repeated_char_added,
    longest_token_added,
    misspellings_added,
    numeric_chars_added,
    page_age_in_seconds,
    prev_badwords,
    prev_misspellings,
    prev_words,
    proportion_of_badwords_added,
    proportion_of_markup_added,
    proportion_of_misspellings_added,
    proportion_of_numeric_added,
    proportion_of_prev_badwords,
    proportion_of_prev_misspellings,
    proportion_of_symbolic_added,
    proportion_of_uppercase_added,
    seconds_since_last_page_edit,
    seconds_since_last_user_edit,
    segments_added,
    segments_removed,
    symbolic_chars_added,
    uppercase_chars_added,
    user_age_in_seconds,
    user_is_anon,
    user_is_bot,
    words_added,
    words_removed,
]

# Model object created from the feature list only; it is not trained here.
linear_svc_model = LinearSVC.MODEL(features)

# FIXME: Is this correct considering that "[the MLScorer] scorer expects the
# model to already be trained"?  The features have to be extracted first in
# order to train the model, so the scorer is built around an untrained model
# and used only for extraction.
linear_svc = LinearSVC(
    api_extractor,
    linear_svc_model,
)
# Write one row per revision: revision id, extracted feature values, and the
# reverted label (0 or 1).
with open('revscores.tsv', 'w', newline='') as f:
    # The file is named .tsv and the companion test script reads it with
    # csv.reader(..., delimiter='\t'), so the rows must be tab-separated; the
    # csv.writer default delimiter is a comma.
    writer = csv.writer(f, delimiter='\t')
    writer.writerow(['<revid>'] + features + ['<reverted>'])
    for p, rev in enumerate(revisions, start=1):
        # Progress line; it is completed below with either the label or the
        # reason the revision was skipped.
        print(p, 'https://pt.wikipedia.org/w/index.php?diff={0}'.format(rev['revid']), end=' ')
        try:
            values = list(linear_svc.extract([rev['revid']]))[0]
            # FIXME: Does it make any difference to use the values 0/1, or -1/1 or something else?
            reverted = 0 if reverts.api.check_rev(session, rev) is None else 1
        except Exception as error:
            # Best effort: a revision whose features or revert status cannot
            # be obtained is skipped.  Catching Exception (not a bare except)
            # lets Ctrl-C / SystemExit still stop the run, and the reason is
            # reported instead of being silently dropped.
            print('skipped:', repr(error))
            continue
        print(reverted == 1)
        writer.writerow([rev['revid']] + values + [reverted])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment