Helder Geovane Gomes de Lima (he7d3r)

he7d3r / print_dependency_graph.py
Created January 31, 2015 19:44
Prints a graph in graphviz syntax showing the dependencies between features and data sources of revscoring
from revscoring.features import *
from revscoring.datasources import *
features = [added_badwords_ratio, added_misspellings_ratio, badwords_added,
            bytes_changed, chars_added, day_of_week_in_utc, hour_of_day_in_utc,
            is_content_namespace, is_custom_comment, is_mainspace,
            is_previous_user_same, is_section_comment, longest_repeated_char_added,
            longest_token_added, markup_chars_added, misspellings_added,
            numeric_chars_added, page_age_in_seconds, prev_badwords,
            prev_misspellings, prev_words, proportion_of_badwords_added,
            proportion_of_markup_added, proportion_of_misspellings_added,
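
The listing cuts the preview off mid-list. As a hedged sketch of the general approach (not the gist's actual code), the DOT output can be produced by walking each feature's inputs recursively and printing one edge per dependency, called on the feature list above; the `dependencies` attribute used below is an assumption about how revscoring exposed a feature's inputs at the time.

# Minimal sketch: emit graphviz (DOT) edges by walking dependencies recursively.
# The `dependencies` attribute is an assumption, not a confirmed revscoring API.
def print_dependency_graph(roots):
    print('digraph dependencies {')
    seen = set()
    stack = list(roots)
    while stack:
        node = stack.pop()
        if node in seen:
            continue
        seen.add(node)
        for dependency in getattr(node, 'dependencies', []):
            print('    "{0}" -> "{1}";'.format(dependency, node))
            stack.append(dependency)
    print('}')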
he7d3r / commands.sh
Last active August 29, 2015 14:13
Testing ORES
# Create some folders
mkdir models datasets
# Generate a file with a new model
./new_model revscores.scorers.LinearSVCModel \
    revscores.features.added_badwords_ratio \
    revscores.features.added_misspellings_ratio \
    revscores.features.day_of_week_in_utc \
    revscores.features.hour_of_day_in_utc \
    revscores.features.is_custom_comment \
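
As a rough, hedged Python counterpart to the `new_model` step (a sketch, not the actual revscores code): fit a linear SVC on a table of extracted feature values and pickle it. The file names and the TSV layout (feature columns followed by a label column) are assumptions for illustration.

# Sketch only: train a LinearSVC on pre-extracted feature values and save it.
# 'datasets/training.tsv' and its layout (features..., label) are assumptions.
import csv
import pickle

from sklearn import svm

with open('datasets/training.tsv') as f:
    rows = list(csv.reader(f, delimiter='\t'))

X = [[float(value) for value in row[:-1]] for row in rows]
y = [row[-1] for row in rows]

model = svm.LinearSVC()
model.fit(X, y)

with open('models/model.pkl', 'wb') as f:
    pickle.dump(model, f)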
he7d3r / demonstrate_GridSearchCV.py
Created December 24, 2014 20:07
Test GridSearchCV using a dataset obtained from a tsv file
"""
Test GridSearchCV using a dataset obtained from a tsv file
"""
import csv
from sklearn import svm
from sklearn import metrics
from sklearn.cross_validation import train_test_split
from sklearn.grid_search import GridSearchCV
#from revscores.scorers import LinearSVC
from revscores.features import (added_badwords_ratio, added_misspellings_ratio,
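
The preview stops at the imports. A minimal round trip with the same pre-0.18 scikit-learn modules would look roughly like the sketch below; the parameter grid and the TSV layout are assumptions for illustration.

# Sketch of a GridSearchCV round trip on a TSV dataset (features..., label).
# The parameter grid and file name are illustrative assumptions.
import csv

from sklearn import metrics, svm
from sklearn.cross_validation import train_test_split
from sklearn.grid_search import GridSearchCV

with open('training.tsv') as f:
    rows = list(csv.reader(f, delimiter='\t'))
X = [[float(value) for value in row[:-1]] for row in rows]
y = [row[-1] for row in rows]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

search = GridSearchCV(svm.SVC(), {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']})
search.fit(X_train, y_train)
print(search.best_params_)
print(metrics.classification_report(y_test, search.predict(X_test)))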
he7d3r / classification_report.txt
Last active August 29, 2015 14:11
Test the scorer on recent changes
== Classification Report ==
             precision    recall  f1-score   support

          0       0.85      0.95      0.90      1617
          1       0.56      0.30      0.39       379

avg / total       0.80      0.82      0.80      1996

$ python demonstrate_rc.py
39753618 (0 chars): 9a233f038c5f692efb3f0fbff7f4ced8a8c22cb0
40663045 (0 chars): 1f0c550dceb5c542dfb304e5d6337c063aaa3c48
34693351 (0 chars): 479dc3b4d6397134ce9d53e84c2fea0f451c1ae9
34764900 (0 chars): f23ac498773c3a74037ba7f91e68653fa8fa5809
40663042 (0 chars): 4416ce970a3a1bc1b2e1f1638cb716f9bf91c9fa
0 (0 chars):
36949986 (0 chars): 0f3919a4a56b1071087a09dce780a75374e216c6
0 (0 chars):
40659670 (0 chars): 9c1a70783ec78719d16533106816c76887fd139f
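
The table above is the standard scikit-learn text report; for reference, it is produced from true and predicted labels as in the sketch below (the labels here are made-up placeholders, not the gist's data).

# How a report in this format is produced; the labels are placeholders.
from sklearn import metrics

y_true = [0, 0, 0, 1, 1, 1]
y_pred = [0, 0, 1, 1, 0, 0]
print(metrics.classification_report(y_true, y_pred))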
he7d3r / stemToMostFrequentWord.py
Created December 14, 2014 00:54
For each stem, prints out the most frequent word which is matched by some regex rule and has that stem
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
# Copyright © 2014 He7d3r
# License: http://he7d3r.mit-license.org/
"""
For each stem, prints out the most frequent word which is matched by some regex rule and has that stem
Example:
python stemToMostFrequentWord.py SALEBOT.TXT SALEBOT-STEMS-WORDS-STATS.TXT BADWORDSLIST.TXT
"""
he7d3r / AddReadmeToRepos.py
Created December 4, 2014 19:47
Script to add a basic README.md file to many repositories in a directory
#!/usr/bin/python
# -*- coding: utf-8 -*-
# Copyright © 2014 He7d3r
# License: http://he7d3r.mit-license.org/
""" Script to add a basic README.md file to many repositories in a directory"""
import os
import sh
rootDir = '/home/username/GitHub/'
myFile = 'README.md'
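
The preview stops at the configuration. A hedged sketch of the remaining loop, continuing from the `os`, `sh`, `rootDir` and `myFile` names above, is shown below; the README content, the commit message and the exact git calls are assumptions, not the gist's code.

# Sketch of the rest of the script: write a minimal README.md into each git
# repository under rootDir, then stage and commit it via the `sh` module.
# The README text and commit message are assumptions.
for name in sorted(os.listdir(rootDir)):
    repo = os.path.join(rootDir, name)
    if not os.path.isdir(os.path.join(repo, '.git')):
        continue  # skip entries that are not git repositories
    readme = os.path.join(repo, myFile)
    if os.path.exists(readme):
        continue  # leave existing READMEs untouched
    with open(readme, 'w') as f:
        f.write('# {0}\n'.format(name))
    sh.git('add', myFile, _cwd=repo)
    sh.git('commit', '-m', 'Add a basic README.md', _cwd=repo)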
he7d3r / PointBugzillaLinksToPhabricator.js
Last active August 29, 2015 14:10
Update Bugzilla links in the current page to point to the corresponding Phabricator tasks
// Copyright © 2014 He7d3r
// License: http://he7d3r.mit-license.org/
var i, link, href, match,
    links = document.getElementsByTagName( 'a' );
for ( i = 0; i < links.length; i++ ) {
    link = links[i];
    href = link.href || '';
    match = href.match( /^(?:(?:https?:)?\/\/(?:old-)?bugzilla\.wikimedia\.org\/)?show_bug\.cgi\?id=(\d+)$/ );
    if ( match ) {
        link.href = 'https://phabricator.wikimedia.org/T' + ( parseInt( match[1], 10 ) + 2000 );
    }
}
he7d3r / SALEBOT-STEMS-WORDS-STATS.TXT
Last active August 29, 2015 14:08
Words matching each stem in the badwords list from the Salebot config on ptwiki
# Obtained from
# https://gist.github.com/he7d3r/1285f6b52e2782d96b9e#file-salebot-stats-txt
# using
# https://gist.github.com/he7d3r/82eefda254d416292141/ea2d8f01a9b6530149c056a88da9c47172a91a58
# python StemsToWords.py SALEBOT-STATS.TXT SALEBOT-STEMS-WORDS-STATS.TXT ptwiki-20141015-pages-meta-history1.xml.7z ptwiki-20141015-pages-meta-history2.xml.7z ptwiki-20141015-pages-meta-history3.xml.7z ptwiki-20141015-pages-meta-history4.xml.7z
STEM FREQUENCY WORDS WITH THIS STEM, BY FREQUENCY
com 666632139 Counter({'com': 462043226, 'como': 197059812, 'comando': 3303334, 'come': 1091918, 'comida': 861471, 'comer': 703099, 'coma': 382041, 'comes': 262435, 'comidas': 189295, 'comeu': 140186, 'comendo': 119056, 'comem': 73333, 'comido': 61933, 'comia': 43458, 'comas': 37185, 'comi': 31208, 'comiam': 28096, 'comerem': 25816, 'comê': 24333, 'comar': 23143, 'comeram': 22629, 'comidos': 20561, 'comemos': 15184, 'comesse': 12021, 'comam': 10678, 'comos': 9780, 'comei': 4283, 'comessem': 3716, 'comares': 3238, 'comé': 2687,
he7d3r / invertSalebotRegexes.py
Created October 22, 2014 15:11
Create a list of words and a list of stems for each regex in the Salebot config
# Adapted from http://utilitymill.com/edit/Regex_inverter
# License: GPL/GFDL
# Extracted from invRegex.py, at http://pyparsing.wikispaces.com
from pyparsing import (Literal, oneOf, printables, ParserElement, Combine,
                       SkipTo, operatorPrecedence, ParseFatalException, Word, nums, opAssoc,
                       Suppress, ParseResults, srange)
from nltk.stem.snowball import SnowballStemmer
import sys
import re
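
The imports above belong to the pyparsing-based regex inverter adapted from invRegex.py. A hedged sketch of the downstream step, turning one Salebot regex into its word list and stem list, is shown below; `enumerate_matches` is a hypothetical stand-in for that inverter, and the Portuguese stemmer is an assumption based on the ptwiki context.

# Sketch of the per-regex step. `enumerate_matches` is a hypothetical stand-in
# for the pyparsing-based inverter; Portuguese stemming is an assumption.
from nltk.stem.snowball import SnowballStemmer

stemmer = SnowballStemmer('portuguese')

def words_and_stems(regex, enumerate_matches):
    words = sorted(set(enumerate_matches(regex)))
    stems = sorted({stemmer.stem(word) for word in words})
    return words, stems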