-
-
Save chtnnh/f03e5b0edb4f37a4c6d50e2a478776f7 to your computer and use it in GitHub Desktop.
Labels reverts from the dump. This involves loading revision IDs from the database and storing the results back, but that part is trivial.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import mwreverts | |
from models import RevRevert, Page, Revision | |
import mwxml | |
import pdb | |
from collections import deque | |
from mwapilib import get_revs_for_revert_labeling | |
import sys | |
# This script processes edits from the dump to detect reverts and stores the
# revert status in a revert table. Only edits to pages listed in the page
# table are considered.
# --- Tunables and bookkeeping -------------------------------------------
revisions_done = 0
revert_radius = 15   # passed to mwreverts.Detector and used as the window size
batch_size = 1000    # pending rows are inserted once the batch exceeds this
dump_file = '/home/asumit/wikipedia/datasets/enwiki-20200701-stub-meta-history.xml.gz'

# --- Lookup sets loaded up front ----------------------------------------
# Pages whose revisions are eligible for revert detection.
pageids = {page.page_id for page in Page.select(Page.page_id)}

# Revisions that still need a revert label.
revs = get_revs_for_revert_labeling(-1)
rev_ids = {rev.rev_id for rev in revs}

# Revisions already present in the revert table (skipped on insert).
reverted_revs = {rev.rev_id for rev in RevRevert.select(RevRevert.rev_id)}

# --- Mutable state used by the insert loop ------------------------------
batch_revisions = []
pages_done = set()
def process_dump(dump, path):
    """Yield ``(rev_id, page_id)`` for revisions that reverted or were reverted.

    Pages whose id is not in the module-level ``pageids`` set are skipped.
    For every other page a fresh ``mwreverts.Detector`` is run over the
    revisions in dump order. Revisions are held in a sliding window so that a
    later revert can still flag them as reverted before they are emitted.

    Args:
        dump: iterable of pages; each page is iterable over its revisions.
        path: not used by this function. It is part of the signature because
            the function is handed to ``mwxml.map`` (presumably the callback
            signature that API expects -- confirm against the mwxml docs).

    Yields:
        ``(revision.id, page.id)`` tuples, in page order, for each revision
        that was flagged as reverting or reverted.
    """
    for page in dump:
        if page.id not in pageids:
            continue

        detector = mwreverts.Detector(radius=revert_radius)
        # NOTE(review): a revision leaves this window after revert_radius - 1
        # newer revisions have been seen; confirm that this covers the oldest
        # revision the Detector can still report as reverted for this radius.
        window = deque(maxlen=revert_radius)

        for revision in page:
            revert = detector.process(revision.sha1, revision)
            revision.reverted = False
            revision.reverting = revert is not None
            window.append(revision)

            if revert is not None:
                # Flag the earlier revisions undone by this one; they are the
                # same objects previously handed to the detector.
                for reverted in revert.reverteds:
                    reverted.reverted = True

            if len(window) == revert_radius:
                # Only yield if this revision was reverted or was reverting
                # some other revision.
                old_revision = window.popleft()
                if old_revision.reverted or old_revision.reverting:
                    yield (old_revision.id, page.id)

        # Drain what is left once the page ends. Without this, the last
        # revert_radius - 1 revisions of every page (and all revisions of
        # pages shorter than the window) would never be reported.
        for old_revision in window:
            if old_revision.reverted or old_revision.reverting:
                yield (old_revision.id, page.id)
# Stream (rev_id, page_id) pairs produced by process_dump and persist the
# revision ids that are both requested for labeling and not yet stored.
for rev_id, pageid in mwxml.map(process_dump, [dump_file]):
    # Skip revisions we were not asked to label or that are already recorded.
    if rev_id not in rev_ids or rev_id in reverted_revs:
        continue

    pages_done.add(pageid)
    batch_revisions.append((rev_id,))
    # Track the id immediately so it is not queued a second time.
    reverted_revs.add(rev_id)

    if len(batch_revisions) > batch_size:
        RevRevert.insert_many(batch_revisions, fields=['rev_id']).execute()
        batch_revisions = []
        print('Done: {:.3f}%, {} pages'.format(len(pages_done)*100.0/len(pageids),
                                              len(pages_done)), end = '\r')

# Write the final partial batch; previously any rows still pending when the
# dump ended were silently dropped.
if batch_revisions:
    RevRevert.insert_many(batch_revisions, fields=['rev_id']).execute()
    batch_revisions = []

# Final progress line, ended with a newline so the shell prompt does not
# overwrite it. max(..., 1) avoids dividing by zero on an empty page table.
print('Done: {:.3f}%, {} pages'.format(len(pages_done)*100.0/max(len(pageids), 1),
                                      len(pages_done)))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment