Helder Geovane Gomes de Lima (he7d3r)

he7d3r / print_dependency_graph.py
Created January 31, 2015 19:44
Prints a graph in graphviz syntax showing the dependencies between features and data sources of revscoring
from revscoring.features import *
from revscoring.datasources import *
features = [added_badwords_ratio, added_misspellings_ratio, badwords_added,
            bytes_changed, chars_added, day_of_week_in_utc, hour_of_day_in_utc,
            is_content_namespace, is_custom_comment, is_mainspace,
            is_previous_user_same, is_section_comment, longest_repeated_char_added,
            longest_token_added, markup_chars_added, misspellings_added,
            numeric_chars_added, page_age_in_seconds, prev_badwords,
            prev_misspellings, prev_words, proportion_of_badwords_added,
            proportion_of_markup_added, proportion_of_misspellings_added,
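
The listing cuts the preview off mid-list. As a hedged sketch of the general approach (not the gist's actual code), the DOT output can be produced by walking each feature's inputs recursively and printing one edge per dependency, called on the feature list above; the `dependencies` attribute used below is an assumption about how revscoring exposed a feature's inputs at the time.

# Minimal sketch: emit graphviz (DOT) edges by walking dependencies recursively.
# The `dependencies` attribute is an assumption, not a confirmed revscoring API.
def print_dependency_graph(roots):
    print('digraph dependencies {')
    seen = set()
    stack = list(roots)
    while stack:
        node = stack.pop()
        if node in seen:
            continue
        seen.add(node)
        for dependency in getattr(node, 'dependencies', []):
            print('    "{0}" -> "{1}";'.format(dependency, node))
            stack.append(dependency)
    print('}')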
he7d3r / commands.sh
Last active August 29, 2015 14:13
Testing ORES
# Create some folders
mkdir models datasets
# Generate a file with a new model
./new_model revscores.scorers.LinearSVCModel \
    revscores.features.added_badwords_ratio \
    revscores.features.added_misspellings_ratio \
    revscores.features.day_of_week_in_utc \
    revscores.features.hour_of_day_in_utc \
    revscores.features.is_custom_comment \
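
As a rough, hedged Python counterpart to the `new_model` step (a sketch, not the actual revscores code): fit a linear SVC on a table of extracted feature values and pickle it. The file names and the TSV layout (feature columns followed by a label column) are assumptions for illustration.

# Sketch only: train a LinearSVC on pre-extracted feature values and save it.
# 'datasets/training.tsv' and its layout (features..., label) are assumptions.
import csv
import pickle

from sklearn import svm

with open('datasets/training.tsv') as f:
    rows = list(csv.reader(f, delimiter='\t'))

X = [[float(value) for value in row[:-1]] for row in rows]
y = [row[-1] for row in rows]

model = svm.LinearSVC()
model.fit(X, y)

with open('models/model.pkl', 'wb') as f:
    pickle.dump(model, f)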
he7d3r / demonstrate_GridSearchCV.py
Created December 24, 2014 20:07
Test GridSearchCV using a dataset obtained from a tsv file
"""
Test GridSearchCV using a dataset obtained from a tsv file
"""
import csv
from sklearn import svm
from sklearn import metrics
from sklearn.cross_validation import train_test_split
from sklearn.grid_search import GridSearchCV
#from revscores.scorers import LinearSVC
from revscores.features import (added_badwords_ratio, added_misspellings_ratio,
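
The preview stops at the imports. A minimal round trip with the same pre-0.18 scikit-learn modules would look roughly like the sketch below; the parameter grid and the TSV layout are assumptions for illustration.

# Sketch of a GridSearchCV round trip on a TSV dataset (features..., label).
# The parameter grid and file name are illustrative assumptions.
import csv

from sklearn import metrics, svm
from sklearn.cross_validation import train_test_split
from sklearn.grid_search import GridSearchCV

with open('training.tsv') as f:
    rows = list(csv.reader(f, delimiter='\t'))
X = [[float(value) for value in row[:-1]] for row in rows]
y = [row[-1] for row in rows]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

search = GridSearchCV(svm.SVC(), {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']})
search.fit(X_train, y_train)
print(search.best_params_)
print(metrics.classification_report(y_test, search.predict(X_test)))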
he7d3r / classification_report.txt
Last active August 29, 2015 14:11
Test the scorer on recent changes
== Classification Report ==
             precision    recall  f1-score   support

          0       0.85      0.95      0.90      1617
          1       0.56      0.30      0.39       379

avg / total       0.80      0.82      0.80      1996

$ python demonstrate_rc.py
39753618 (0 chars): 9a233f038c5f692efb3f0fbff7f4ced8a8c22cb0
40663045 (0 chars): 1f0c550dceb5c542dfb304e5d6337c063aaa3c48
34693351 (0 chars): 479dc3b4d6397134ce9d53e84c2fea0f451c1ae9
34764900 (0 chars): f23ac498773c3a74037ba7f91e68653fa8fa5809
40663042 (0 chars): 4416ce970a3a1bc1b2e1f1638cb716f9bf91c9fa
0 (0 chars):
36949986 (0 chars): 0f3919a4a56b1071087a09dce780a75374e216c6
0 (0 chars):
40659670 (0 chars): 9c1a70783ec78719d16533106816c76887fd139f
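
The table above is the standard scikit-learn text report; for reference, it is produced from true and predicted labels as in the sketch below (the labels here are made-up placeholders, not the gist's data).

# How a report in this format is produced; the labels are placeholders.
from sklearn import metrics

y_true = [0, 0, 0, 1, 1, 1]
y_pred = [0, 0, 1, 1, 0, 0]
print(metrics.classification_report(y_true, y_pred))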
he7d3r / stemToMostFrequentWord.py
Created December 14, 2014 00:54
For each stem, prints out the most frequent word which is matched by some regex rule and has that stem
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
# Copyright © 2014 He7d3r
# License: http://he7d3r.mit-license.org/
"""
For each stem, prints out the most frequent word which is matched by some regex rule and has that stem
Example:
python stemToMostFrequentWord.py SALEBOT.TXT SALEBOT-STEMS-WORDS-STATS.TXT BADWORDSLIST.TXT
"""
he7d3r / AddReadmeToRepos.py
Created December 4, 2014 19:47
Script to add a basic README.md file to many repositories in a directory
#!/usr/bin/python
# -*- coding: utf-8 -*-
# Copyright © 2014 He7d3r
# License: http://he7d3r.mit-license.org/
""" Script to add a basic README.md file to many repositories in a directory"""
import os
import sh
rootDir = '/home/username/GitHub/'
myFile = 'README.md'
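
The preview stops at the configuration. A hedged sketch of the remaining loop, continuing from the `os`, `sh`, `rootDir` and `myFile` names above, is shown below; the README content, the commit message and the exact git calls are assumptions, not the gist's code.

# Sketch of the rest of the script: write a minimal README.md into each git
# repository under rootDir, then stage and commit it via the `sh` module.
# The README text and commit message are assumptions.
for name in sorted(os.listdir(rootDir)):
    repo = os.path.join(rootDir, name)
    if not os.path.isdir(os.path.join(repo, '.git')):
        continue  # skip entries that are not git repositories
    readme = os.path.join(repo, myFile)
    if os.path.exists(readme):
        continue  # leave existing READMEs untouched
    with open(readme, 'w') as f:
        f.write('# {0}\n'.format(name))
    sh.git('add', myFile, _cwd=repo)
    sh.git('commit', '-m', 'Add a basic README.md', _cwd=repo)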
he7d3r / PointBugzillaLinksToPhabricator.js
Last active August 29, 2015 14:10
Update Bugzilla links in the current page to point to the corresponding Phabricator tasks
// Copyright © 2014 He7d3r
// License: http://he7d3r.mit-license.org/
var i, link, href, match,
    links = document.getElementsByTagName( 'a' );
for ( i = 0; i < links.length; i++ ) {
    link = links[i];
    href = link.href || '';
    match = href.match( /^(?:(?:https?:)?\/\/(?:old-)?bugzilla\.wikimedia\.org\/)?show_bug\.cgi\?id=(\d+)$/ );
    if ( match ) {
        link.href = 'https://phabricator.wikimedia.org/T' + ( parseInt( match[1], 10 ) + 2000 );
    }
}
he7d3r / SALEBOT-STEMS-WORDS-STATS.TXT
Last active August 29, 2015 14:08
Words matching each stem in the badwords list from the Salebot config on ptwiki
# Obtained from
# https://gist.github.com/he7d3r/1285f6b52e2782d96b9e#file-salebot-stats-txt
# using
# https://gist.github.com/he7d3r/82eefda254d416292141/ea2d8f01a9b6530149c056a88da9c47172a91a58
# python StemsToWords.py SALEBOT-STATS.TXT SALEBOT-STEMS-WORDS-STATS.TXT ptwiki-20141015-pages-meta-history1.xml.7z ptwiki-20141015-pages-meta-history2.xml.7z ptwiki-20141015-pages-meta-history3.xml.7z ptwiki-20141015-pages-meta-history4.xml.7z
STEM FREQUENCY WORDS WITH THIS STEM, BY FREQUENCY
com 666632139 Counter({'com': 462043226, 'como': 197059812, 'comando': 3303334, 'come': 1091918, 'comida': 861471, 'comer': 703099, 'coma': 382041, 'comes': 262435, 'comidas': 189295, 'comeu': 140186, 'comendo': 119056, 'comem': 73333, 'comido': 61933, 'comia': 43458, 'comas': 37185, 'comi': 31208, 'comiam': 28096, 'comerem': 25816, 'comê': 24333, 'comar': 23143, 'comeram': 22629, 'comidos': 20561, 'comemos': 15184, 'comesse': 12021, 'comam': 10678, 'comos': 9780, 'comei': 4283, 'comessem': 3716, 'comares': 3238, 'comé': 2687,
he7d3r / invertSalebotRegexes.py
Created October 22, 2014 15:11
Create a list of words and a list of stems for each regex in the Salebot config
# Adapted from http://utilitymill.com/edit/Regex_inverter
# License: GPL/GFDL
# Extracted from invRegex.py, at http://pyparsing.wikispaces.com
from pyparsing import (Literal, oneOf, printables, ParserElement, Combine,
                       SkipTo, operatorPrecedence, ParseFatalException, Word, nums, opAssoc,
                       Suppress, ParseResults, srange)
from nltk.stem.snowball import SnowballStemmer
import sys
import re
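
The imports above belong to the pyparsing-based regex inverter adapted from invRegex.py. A hedged sketch of the downstream step, turning one Salebot regex into its word list and stem list, is shown below; `enumerate_matches` is a hypothetical stand-in for that inverter, and the Portuguese stemmer is an assumption based on the ptwiki context.

# Sketch of the per-regex step. `enumerate_matches` is a hypothetical stand-in
# for the pyparsing-based inverter; Portuguese stemming is an assumption.
from nltk.stem.snowball import SnowballStemmer

stemmer = SnowballStemmer('portuguese')

def words_and_stems(regex, enumerate_matches):
    words = sorted(set(enumerate_matches(regex)))
    stems = sorted({stemmer.stem(word) for word in words})
    return words, stems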