Skip to content

Instantly share code, notes, and snippets.

@chtnnh
Forked from codez266/revertslabel
Created October 28, 2020 05:24
Show Gist options
  • Save chtnnh/f03e5b0edb4f37a4c6d50e2a478776f7 to your computer and use it in GitHub Desktop.
Save chtnnh/f03e5b0edb4f37a4c6d50e2a478776f7 to your computer and use it in GitHub Desktop.
Reverts labeling from dump - involves loading revids from db and storing back, but that part is trivial
import mwreverts
from models import RevRevert, Page, Revision
import mwxml
import pdb
from collections import deque
from mwapilib import get_revs_for_revert_labeling
import sys
# This script processes edits from the dump to detect reverts and stores the
# revert status in a revert table. Only edits to pages listed in the page
# table are considered.
# --- Tunables ---------------------------------------------------------------
revisions_done = 0
# Used both as the revert detector's radius and as the size of the per-page
# sliding window in process_dump.
revert_radius = 15
# Threshold on pending rows before they are written to the revert table.
batch_size = 1000
dump_file = '/home/asumit/wikipedia/datasets/enwiki-20200701-stub-meta-history.xml.gz'

# --- Lookup sets loaded from the database -----------------------------------
# Ids of the pages whose revisions should be scanned for reverts.
pageids = {page.page_id for page in Page.select(Page.page_id)}
# Candidate revisions to label.
# NOTE(review): the meaning of the -1 argument is defined in mwapilib -- confirm.
revs = get_revs_for_revert_labeling(-1)
rev_ids = {rev.rev_id for rev in revs}
# Revision ids already present in the revert table; these are skipped.
reverted_revs = {row.rev_id for row in RevRevert.select(RevRevert.rev_id)}

# --- Mutable run state ------------------------------------------------------
# Rows waiting to be inserted, each a 1-tuple ``(rev_id,)``.
batch_revisions = []
# Page ids for which at least one revision has been accepted so far.
pages_done = set()
def process_dump(dump, path):
    """Yield ``(rev_id, page_id)`` for each revision involved in a revert.

    Iterates over the pages of ``dump``, skipping any page whose id is not in
    the module-level ``pageids`` set. Every revision of a kept page is fed to
    a fresh ``mwreverts.Detector``; a revision is reported if it either
    reverted other revisions or was itself reverted.

    Args:
        dump: Iterable of pages; each page exposes ``id`` and iterates over
            revisions that expose ``id`` and ``sha1``.
        path: Path of the dump file being processed. Unused here; presumably
            required by the ``mwxml.map`` callback signature.

    Yields:
        Tuples ``(revision_id, page_id)``.
    """
    for page in dump:
        if page.id not in pageids:
            continue
        detector = mwreverts.Detector(radius=revert_radius)
        # Revisions are held back in a sliding window so that a later
        # revision can still flag them as reverted before they are emitted.
        window = deque(maxlen=revert_radius)
        for revision in page:
            revert = detector.process(revision.sha1, revision)
            revision.reverted = False
            revision.reverting = False
            window.append(revision)
            if revert is not None:
                revision.reverting = True
                for reverted in revert.reverteds:
                    reverted.reverted = True
            if len(window) == revert_radius:
                # Only yield if this revision was reverted or it was
                # reverting some other revision.
                old_revision = window.popleft()
                if old_revision.reverted or old_revision.reverting:
                    yield (old_revision.id, page.id)
        # The page is exhausted, so the flags of the revisions still held in
        # the window are final. Emit them too; otherwise the last
        # ``revert_radius - 1`` revisions of every page (and every revision of
        # pages shorter than the radius) would be silently dropped.
        for old_revision in window:
            if old_revision.reverted or old_revision.reverting:
                yield (old_revision.id, page.id)
# Stream (rev_id, page_id) pairs for revert-related revisions out of the dump
# and record the new ones in the revert table in batches.
for rev_id, pageid in mwxml.map(process_dump, [dump_file]):
    # Skip revisions that are not candidates for labeling, and those that
    # are already recorded (or already queued during this run).
    if rev_id not in rev_ids:
        continue
    if rev_id in reverted_revs:
        continue
    pages_done.add(pageid)
    batch_revisions.append((rev_id,))
    reverted_revs.add(rev_id)
    if len(batch_revisions) > batch_size:
        RevRevert.insert_many(batch_revisions, fields = ['rev_id']).execute()
        batch_revisions = []
    # Progress line, rewritten in place via the carriage return.
    print('Done: {:.3f}%, {} pages'.format(len(pages_done)*100.0/len(pageids),
          len(pages_done)), end = '\r')

# Write whatever is left once the dump is exhausted; without this the final
# partial batch would never reach the database.
if batch_revisions:
    RevRevert.insert_many(batch_revisions, fields = ['rev_id']).execute()
    batch_revisions = []
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment