This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from revscoring.features import * | |
from revscoring.datasources import * | |
features = [added_badwords_ratio, added_misspellings_ratio, badwords_added, | |
bytes_changed, chars_added, day_of_week_in_utc, hour_of_day_in_utc, | |
is_content_namespace, is_custom_comment, is_mainspace, | |
is_previous_user_same, is_section_comment, longest_repeated_char_added, | |
longest_token_added, markup_chars_added, misspellings_added, | |
numeric_chars_added, page_age_in_seconds, prev_badwords, | |
prev_misspellings, prev_words, proportion_of_badwords_added, | |
proportion_of_markup_added, proportion_of_misspellings_added, |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Create some folders | |
mkdir models datasets | |
# Generate a file with a new model | |
./new_model revscores.scorers.LinearSVCModel \ | |
revscores.features.added_badwords_ratio \ | |
revscores.features.added_misspellings_ratio \ | |
revscores.features.day_of_week_in_utc \ | |
revscores.features.hour_of_day_in_utc \ | |
revscores.features.is_custom_comment \ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Test GridSearchCV using a dataset obtained from a tsv file | |
""" | |
import csv | |
from sklearn import svm | |
from sklearn import metrics | |
from sklearn.cross_validation import train_test_split | |
from sklearn.grid_search import GridSearchCV | |
#from revscores.scorers import LinearSVC | |
from revscores.features import (added_badwords_ratio, added_misspellings_ratio, |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
== Classification Report == | |
precision recall f1-score support | |
0 0.85 0.95 0.90 1617 | |
1 0.56 0.30 0.39 379 | |
avg / total 0.80 0.82 0.80 1996 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
$ python demonstrate_rc.py | |
39753618 (0 chars): 9a233f038c5f692efb3f0fbff7f4ced8a8c22cb0 | |
40663045 (0 chars): 1f0c550dceb5c542dfb304e5d6337c063aaa3c48 | |
34693351 (0 chars): 479dc3b4d6397134ce9d53e84c2fea0f451c1ae9 | |
34764900 (0 chars): f23ac498773c3a74037ba7f91e68653fa8fa5809 | |
40663042 (0 chars): 4416ce970a3a1bc1b2e1f1638cb716f9bf91c9fa | |
0 (0 chars): | |
36949986 (0 chars): 0f3919a4a56b1071087a09dce780a75374e216c6 | |
0 (0 chars): | |
40659670 (0 chars): 9c1a70783ec78719d16533106816c76887fd139f |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# -*- coding: UTF-8 -*- | |
# Copyright © 2014 He7d3r | |
# License: http://he7d3r.mit-license.org/ | |
""" | |
For each stem, prints out the most frequent word which is matched by some regex rule and has that stem | |
Example: | |
python stemToMostFrequentWord.py SALEBOT.TXT SALEBOT-STEMS-WORDS-STATS.TXT BADWORDSLIST.TXT | |
""" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python | |
# -*- coding: utf-8 -*- | |
# Copyright © 2014 He7d3r | |
# License: http://he7d3r.mit-license.org/ | |
""" Script to add a basic README.md file to many repositories in a directory""" | |
import os | |
import sh | |
rootDir = '/home/username/GitHub/' | |
myFile = 'README.md' |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Copyright © 2014 He7d3r | |
// License: http://he7d3r.mit-license.org/ | |
var i, link, href, match, | |
links = document.getElementsByTagName( 'a' ); | |
for( i = 0; i < links.length; i++ ){ | |
link = links[i]; | |
href = link.href || ''; | |
match = href.match( /^(?:(?:https?:)?\/\/(?:old-)?bugzilla\.wikimedia\.org\/)?show_bug\.cgi\?id=(\d+)$/ ); | |
if( match ) { | |
link.href = 'https://phabricator.wikimedia.org/T' + ( parseInt( match[1], 10 ) + 2000 ); |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Obtained from | |
# https://gist.github.com/he7d3r/1285f6b52e2782d96b9e#file-salebot-stats-txt | |
# using | |
# https://gist.github.com/he7d3r/82eefda254d416292141/ea2d8f01a9b6530149c056a88da9c47172a91a58 | |
# python StemsToWords.py SALEBOT-STATS.TXT SALEBOT-STEMS-WORDS-STATS.TXT ptwiki-20141015-pages-meta-history1.xml.7z ptwiki-20141015-pages-meta-history2.xml.7z ptwiki-20141015-pages-meta-history3.xml.7z ptwiki-20141015-pages-meta-history4.xml.7z | |
STEM FREQUENCY WORDS WITH THIS STEM, BY FREQUENCY | |
com 666632139 Counter({'com': 462043226, 'como': 197059812, 'comando': 3303334, 'come': 1091918, 'comida': 861471, 'comer': 703099, 'coma': 382041, 'comes': 262435, 'comidas': 189295, 'comeu': 140186, 'comendo': 119056, 'comem': 73333, 'comido': 61933, 'comia': 43458, 'comas': 37185, 'comi': 31208, 'comiam': 28096, 'comerem': 25816, 'comê': 24333, 'comar': 23143, 'comeram': 22629, 'comidos': 20561, 'comemos': 15184, 'comesse': 12021, 'comam': 10678, 'comos': 9780, 'comei': 4283, 'comessem': 3716, 'comares': 3238, 'comé': 2687, |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Adapted from http://utilitymill.com/edit/Regex_inverter | |
# License: GPL/GFDL | |
# Extracted from invRegex.py, at http://pyparsing.wikispaces.com | |
from pyparsing import (Literal, oneOf, printables, ParserElement, Combine, | |
SkipTo, operatorPrecedence, ParseFatalException, Word, nums, opAssoc, | |
Suppress, ParseResults, srange) | |
from nltk.stem.snowball import SnowballStemmer | |
import sys | |
import re |