Skip to content

Instantly share code, notes, and snippets.

@Ladsgroup
Created October 30, 2015 22:17
Show Gist options
  • Save Ladsgroup/03caf2bd2971996abab9 to your computer and use it in GitHub Desktop.
Save Ladsgroup/03caf2bd2971996abab9 to your computer and use it in GitHub Desktop.
Extracting features from dumps
import codecs
import itertools
import json
import random
import sys
import time
from ipaddress import ip_address

from mw.lib import reverts
from pywikibot import xmlreader
from revscoring.datasources import parent_revision as parent_rev_d
from revscoring.datasources import revision as rev_d
from revscoring.dependencies import solve
from revscoring.features import diff as revsc_diff_f
from revscoring.features import revision as revsc_rev_f
from wb_vandalism.features import diff as wb_diff_f
from wb_vandalism.features import revision as wb_rev_f
# Sampling thresholds compared against random.random() in page_info():
# a reverted revision is kept when the draw is below REVERTED_RATIO,
# a non-reverted one when it is below NOT_REVERTED_RATIO.
REVERTED_RATIO = 1
NOT_REVERTED_RATIO = 0.07

# Directory that holds the output CSV (the trailing slash is required
# because the file name is appended directly to this string).
base_dir = '/data/project/dexbot/pywikibot-core/'

# Opening in 'w' mode creates/truncates the output file at import time;
# run() later re-opens it in append mode.
with codecs.open(base_dir + 'res_dump_aaron2.csv', 'w', 'utf-8') as w:
    w.write('')
# User names treated as bots when filling the "is bot" column.
# Stored as a frozenset because the only operation performed on it is a
# membership test (``username in bots``) for every revision of the dump.
# The literal below contains some names more than once (it looks like two
# lists were concatenated); the set collapses those repeats.
bots = frozenset([
    u'AGbot', u'AHbot', u'ASammourBot', u'Addbot', u'AinaliBot', u'BinBot',
    u'AkkakkBot', u'AlepfuBot', u'Aplikasi-Bot', u'AudeBot', u'AvocatoBot',
    u'AyackBot', u'BMacZeroBot', u'BaseBot', u'BeneBot*', u'BetaBot',
    u'BotMultichill', u'BotMultichillT', u'BotNinja', u'Botapyb', u'Botik',
    u'BoulaurBot', u'BrackiBot', u'BraunBot', u'BraveBot', u'Byrialbot',
    u'CalakBot', u'CaliburnBOT', u'CennoBot', u'Chembot', u'Chobot',
    u'Citing Bot', u'CommonsDelinker', u'Cyberbot I', u'D2-bot', u'Choboty',
    u'DBpedia-mapper-bot', u'DSisyphBot', u'DangSunBot', u'DangSunBot2',
    u'DanmicholoBot', u'Dcirovicbot', u'Dexbot', u'DidymBot', u'DimaBot',
    u'Dipsacus fullonum bot', u'DixonDBot', u'Docu w. script', u'Dom bot',
    u'DrTrigonBot', u'DæghrefnBot', u'EdinBot', u'EdwardsBot', u'EmausBot',
    u'Escabot', u'Faebot', u'FischBot', u'Flow talk page manager',
    u'GrammarwareBot', u'InwBot', u'Kompakt-bot', u'Frettiebot',
    u'FrigidBot', u'FuzzyBot', u'GPUBot', u'GZWDer (flood)', u'Makecat-bot',
    u'Gzen92Bot', u'Hawk-Eye-Bot', u'HaxpettBot', u'Hazard-Bot', u'Hoo Bot',
    u'HuskyBot', u'HypoBOT', u'InductiveBot', u'InfoRobBot', u'InkoBot',
    u'JAnDbot', u'JVbot', u'JWbot', u'JYBot', u'JackieBot', u'Jeblad (bot)',
    u'JhsBot', u'JoRobot', u'KLBot2', u'KRLS Bot', u'KasparBot', u'MBAreaBot',
    u'KrBot', u'Krdbot', u'L PBot', u'Legobot', u'Liangent-bot', u'MineoBot',
    u'LinkRecoveryBot', u'ListeriaBot', u'Louperibot', u'Luuvabot',
    u'MagulBot', u'MahdiBot', u'Maintenance script', u'MajedBot', u'Revibot',
    u'Makrobot', u'MalarzBOT', u'MatSuBot', u'MatmaBot', u'MedalBot',
    u'MediaWiki message delivery', u'MerlIwBot', u'Miguillen-bot', u'AvicBot',
    u'Mjbmrbot', u'Nicolas1981Bot', u'NikkiBot', u'Nullzerobot', u'OctraBot',
    u'OrlodrimBot', u'PBot', u'PLbot', u'Peter17-Bot', u'Pigsonthewing-bot',
    u'Popcornbot', u'PoulpyBot', u'PreferentialBot', u'ProgvalInfoboxBot',
    u'ProteinBoxBot', u'Ra-bot-nik', u'ReimannBot', u'Reinheitsgebot',
    u'Rezabot', u'RoboViolet', u'RobotMichiel1972', u'Ruud Koot (bot)',
    u'S205643bot', u'SDrewthbot', u'SKbot', u'SLiuBot', u'SamoaBot',
    u'Sanjeev bot', u'SanniBot', u'Sarojbot', u'SaschaBot', u'SbisoloBot',
    u'ShinobiBot', u'ShonagonBot', u'Shuaib-bot', u'Sk!dbot', u'Smbbot',
    u'SourcererBot', u'SpBot', u'StackerBot', u'Steenthbot', u'VIAFbot',
    u'StrynBot', u'SuccuBot', u'SweetBot', u'Symac bot', u'TambonBot',
    u'The Anonybot', u'Thebot', u'ThieolBot', u'TptBot', u'SteinsplitterBot',
    u'Translation Notification Bot', u'Tulsibot', u'UnderlyingBot', u'Wasabot',
    u'ValterVBot', u'ViscoBot', u'VollBot', u'VsBot', u'WYImporterBot',
    u'Whymbot', u'Widar of zolo', u'WikiGrok', u'Wizzo-Bot', u'WylveWidar',
    u'XXN-bot', u'Xaris333Bot', u'YasBot', u'ZaBOTka', u'ZedlikBot', u'ZkBot',
    u'ÖdokBot', u'AHbot', u'Addbot', u'Aibot', u'Alexbot', u'AnankeBot',
    u'AvocatoBot', u'BotMultichill', u'CandalBot', u'CarsracBot', u'Chobot',
    u'DSisyphBot', u'DarafshBot', u'Dexbot', u'DixonDBot', u'DragonBot',
    u'EmausBot', u'Escarbot', u'FoxBot', u'Gerakibot', u'GhalyBot',
    u'HydrizBot', u'Invadibot', u'JAnDbot', u'JYBot', u'JackieBot', u'JhsBot',
    u'Justincheng12345-bot', u'KLBot2', u'KamikazeBot', u'LaaknorBot',
    u'Louperibot', u'Luckas-bot', u'MastiBot', u'MenoBot', u'MerlIwBot',
    u'Movses-bot', u'MystBot', u'NjardarBot', u'Rezabot', u'Rubinbot',
    u'SassoBot', u'Sz-iwbot', u'WarddrBOT', u'Xqbot', u'YiFeiBot',
    u'HerculeBot', u'SamoaBot', u'タチコマ robot',
])
# Skeleton of an empty item document. page_info() serialises it with
# json.dumps() to stand in for the parent of a page's first revision.
# (The misspelled name is kept because other code refers to it.)
defualt_item = {
    "type": "item",
    "labels": {},
    "descriptions": [],
    "aliases": [],
    "claims": [],
    "sitelinks": {},
}
# Features handed to revscoring's solve() for every sampled revision.
# The order matters: it is the column order of the rows page_info() builds
# (feature values first, then the bot / IP / reverted flags).
features = [
    # --- diff features: sitelinks, labels, descriptions, aliases ---
    wb_diff_f.number_added_sitelinks,
    wb_diff_f.number_removed_sitelinks,
    wb_diff_f.number_changed_sitelinks,
    wb_diff_f.number_added_labels,
    wb_diff_f.number_removed_labels,
    wb_diff_f.number_changed_labels,
    wb_diff_f.number_added_descriptions,
    wb_diff_f.number_removed_descriptions,
    wb_diff_f.number_changed_descriptions,
    wb_diff_f.number_added_aliases,
    wb_diff_f.number_removed_aliases,
    # --- diff features: claims, sources, qualifiers, badges ---
    wb_diff_f.number_added_claims,
    wb_diff_f.number_removed_claims,
    wb_diff_f.number_changed_claims,
    wb_diff_f.number_changed_identifiers,
    wb_diff_f.en_label_touched,
    wb_diff_f.number_added_sources,
    wb_diff_f.number_removed_sources,
    wb_diff_f.number_added_qualifiers,
    wb_diff_f.number_removed_qualifiers,
    wb_diff_f.number_added_badges,
    wb_diff_f.number_removed_badges,
    # --- diff features: proportions ---
    wb_diff_f.proportion_of_qid_added,
    wb_diff_f.proportion_of_langauge_added,
    wb_diff_f.proportion_of_links_added,
    # --- diff features: changes to specific properties ---
    wb_diff_f.P21_changed,
    wb_diff_f.P27_changed,
    wb_diff_f.P54_changed,
    wb_diff_f.P569_changed,
    wb_diff_f.P18_changed,
    wb_diff_f.P109_changed,
    wb_diff_f.P373_changed,
    wb_diff_f.P856_changed,
    # --- features of the revision itself ---
    wb_rev_f.number_claims,
    wb_rev_f.number_aliases,
    wb_rev_f.number_sources,
    wb_rev_f.number_qualifiers,
    wb_rev_f.number_badges,
    wb_rev_f.number_labels,
    wb_rev_f.number_sitelinks,
    wb_rev_f.number_descriptions,
    wb_rev_f.is_human,
    wb_rev_f.is_blp,
]
def _is_ip(username):
    """Return True when *username* parses as an IPv4/IPv6 address."""
    try:
        ip_address(username)
    except ValueError:
        return False
    return True


def _outside_sample_window(timestamp):
    """Return True when a revision's date excludes it from sampling.

    The first two '-'-separated fields of *timestamp* are read as year and
    month. Revisions are excluded when they are older than July 2014 or
    were made in 2015 after July.
    """
    year, month = (int(part) for part in timestamp.split('-')[:2])
    return ((year == 2015 and month > 7) or
            (year == 2014 and month < 7) or
            year < 2014)


def _page_history(revisions):
    """Build the ``{rev_id: row}`` table for all revisions of one page.

    *revisions* is the non-empty list of dump entries of a single page, in
    dump order. Each entry's ``revisionid`` is converted to int in place.
    """
    history = {}
    detector = reverts.Detector(radius=5)

    # Pass 1: give every revision a placeholder row and mark the ones
    # that a later revision reverted.
    for revision in revisions:
        revision.revisionid = int(revision.revisionid)
        history[revision.revisionid] = (
            [None] * len(features) +
            [revision.username in bots, False, False])
        revert = detector.process(revision.text,
                                  {'rev_id': revision.revisionid})
        if revert:
            for reverted in revert.reverteds:
                history[int(reverted['rev_id'])][-1] = True

    # Pass 2: compute features for a random sample of the revisions.
    # The first revision's parent is an empty item carrying this page's
    # title as its id. A copy is used so the shared module-level template
    # is never modified.
    parent_revision = dict(defualt_item, id=revisions[0].title)
    for revision in revisions:
        # NOTE(review): from the second revision on, parent_revision is the
        # previous revision's text (already a string), so json.dumps() wraps
        # it in a JSON string literal instead of producing the document
        # itself. Kept as in the original; confirm this is what the
        # consumers of parent_rev_d.text expect.
        parent = json.dumps(parent_revision)
        parent_revision = revision.text

        random_number = random.random()
        reverted = history[revision.revisionid][-1]
        if _outside_sample_window(revision.timestamp):
            continue
        threshold = REVERTED_RATIO if reverted else NOT_REVERTED_RATIO
        if random_number >= threshold:
            continue
        try:
            values = list(solve(
                features,
                cache={rev_d.text: revision.text,
                       parent_rev_d.text: parent}))
        except Exception:
            # Best effort: a revision whose features cannot be computed
            # keeps its placeholder row (first element None).
            continue
        history[revision.revisionid] = values + [
            revision.username in bots, _is_ip(revision.username), reverted]
    return history


def page_info(dump):
    """Yield one feature table per main-namespace page of *dump*.

    Entries whose ``ns`` is not ``'0'`` are ignored. Consecutive entries
    sharing the same ``id`` are treated as the revisions of one page. Every
    page, including the last one in the dump, produces one dict.

    Args:
        dump: object whose ``parse()`` method iterates over revision
            entries exposing ``ns``, ``id``, ``title``, ``revisionid``,
            ``username``, ``timestamp`` and ``text``.

    Yields:
        dict mapping the int revision id to a list: one value per entry of
        ``features`` followed by ``[is_bot, is_ip, reverted]``. Revisions
        that were not sampled (or whose features could not be computed)
        keep a placeholder row of ``None`` feature values followed by
        ``[is_bot, False, reverted]``.
    """
    main_ns_entries = (entry for entry in dump.parse() if entry.ns == '0')
    for _, page_revisions in itertools.groupby(
            main_ns_entries, key=lambda entry: entry.id):
        # Materialised because the revisions are walked twice.
        yield _page_history(list(page_revisions))
def _append_rows(rows):
    """Append already-formatted CSV lines to the output file."""
    with codecs.open('%sres_dump_aaron2.csv' % base_dir,
                     'a', 'utf-8') as w:
        w.write(''.join(rows))


def run(dumps, number=1050):
    """Extract feature rows from *dumps* and append them to the CSV file.

    Each written line holds the values of one sampled revision as produced
    by page_info(), followed by the revision id, separated by commas.
    Placeholder rows (first value ``None``) are skipped.

    Args:
        dumps: iterable of dump file paths, each opened with
            ``xmlreader.XmlDump``.
        number: maximum number of rows to write; a falsy value (0 or None)
            disables the limit.
    """
    counter = 0
    start_time = time.time()
    pending = []
    try:
        for path in dumps:
            dump = xmlreader.XmlDump(path, True)
            for case in page_info(dump):
                for revid, row in case.items():
                    if number and counter >= number:
                        return
                    if row[0] is None:
                        continue
                    pending.append(
                        ','.join(str(value) for value in row + [revid]) +
                        '\n')
                    counter += 1
                    if counter % 1000 == 0:
                        # Periodic progress report and flush to disk.
                        print(counter, time.time() - start_time)
                        _append_rows(pending)
                        pending = []
    finally:
        # Also reached on the early return above, so rows collected since
        # the last periodic flush are never lost.
        if pending:
            _append_rows(pending)
        print(time.time() - start_time)
if __name__ == "__main__":
    # Every command-line argument is passed on as the path of one dump.
    run(sys.argv[1:])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment