Last active
May 26, 2016 12:41
-
-
Save jwhitlock/592ea2590f7971598ef46a6cc74e2e5b to your computer and use it in GitHub Desktop.
Report on spam edits from the last 90 days and how long each one stayed published
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
from csv import writer | |
from datetime import datetime, timedelta | |
from os.path import expanduser | |
from pyquery import PyQuery as pq | |
from six.moves.urllib.parse import urlparse | |
from kuma.wiki.models import Document, Revision, RevisionAkismetSubmission | |
def get_homepage_links():
    """Collect the absolute URLs of the en-US links found on the MDN homepage.

    The homepage is loaded through PyQuery and every ``<a>`` element is
    inspected. An ``href`` that starts with ``/en-US/`` is turned into an
    absolute URL by prefixing the site origin; any other ``href`` is
    reported on stdout and ignored.

    Returns:
        A set of absolute URL strings.
    """
    origin = 'https://developer.mozilla.org'
    document = pq(origin)
    links = set()
    for anchor in document('a'):
        target = anchor.get('href', '')
        if not target.startswith('/en-US/'):
            # Anchors without an href arrive here as the empty string.
            print("Skipping %s" % target)
            continue
        links.add(origin + target)
    return links
def _next_revision_created(doc, rev):
    """Return when the revision following ``rev`` on ``doc`` was created.

    The document's revisions are walked from newest to oldest until ``rev``
    is reached; the last one seen before it is the revision that replaced
    it. Only the ``id`` and ``created`` columns are fetched, so no full
    Revision rows are loaded just to read a timestamp.

    Returns:
        The ``created`` value of the following revision, or ``None`` when
        ``rev`` is the newest revision of the document.
    """
    replaced_at = None
    newest_first = doc.revisions.order_by('-created').values_list('id', 'created')
    for other_id, other_created in newest_first:
        if other_id == rev.id:
            break
        replaced_at = other_created
    return replaced_at


def get_spam_edits(hp_links):
    """Gather one report row per revision submitted to Akismet as spam.

    Only submissions of type ``spam`` sent within the last 90 days are
    considered. Revisions or documents that can no longer be loaded are
    skipped, so a single missing record does not abort the whole report.

    Args:
        hp_links: Collection of absolute URLs; each document's full URL is
            tested for membership to fill the "homepage link" flag.

    Returns:
        A list of ``(path, rev_id, hp_link, date, duration, live)`` tuples:
        the URL path of the document, the revision id, whether the document
        URL is in ``hp_links``, the revision's creation date formatted as
        ``MM/DD/YYYY``, the number of whole seconds until the next revision
        was created (or until now when there is none), and whether the
        revision is still the newest one.
    """
    # Read the clock once so the 90-day cutoff and every "still live"
    # duration are measured against the same instant.
    now = datetime.now()
    days90 = now - timedelta(days=90)
    subs = RevisionAkismetSubmission.objects.filter(type='spam', sent__gte=days90)
    revision_ids = set(subs.values_list('revision_id', flat=True))
    # NOTE(review): a submission may have no revision id if its revision
    # was removed - confirm against the model; discarding None is harmless.
    revision_ids.discard(None)

    pages = []
    # Sorted so the report rows come out in a reproducible order.
    for rev_id in sorted(revision_ids):
        try:
            rev = Revision.objects.get(id=rev_id)
        except Revision.DoesNotExist:
            # Same best-effort policy as for missing documents below.
            continue
        try:
            doc = Document.objects.only('id', 'locale', 'slug').get(id=rev.document_id)
        except Document.DoesNotExist:
            continue

        full = doc.get_full_url()
        path = urlparse(full).path
        hp_link = full in hp_links
        date = rev.created.date().strftime('%m/%d/%Y')

        replaced_at = _next_revision_created(doc, rev)
        live = replaced_at is None
        # A revision that was never replaced has been live up to now.
        end = now if live else replaced_at
        duration = int((end - rev.created).total_seconds())
        pages.append((path, rev.id, hp_link, date, duration, live))
    return pages
def write_spam_edits(path, pages):
    """Write the spam-edit report to a CSV file.

    A header row is written first, followed by one row per entry in
    ``pages``. The two boolean fields are emitted as ``1`` or ``0``.

    Args:
        path: Filesystem path of the CSV file to create; an existing file
            is overwritten.
        pages: Iterable of ``(path, rev_id, hp_link, date, duration, live)``
            tuples.
    """
    import sys  # function-scope import: the module does not import sys

    header = ('Path', 'RevID', 'Homepage Link', 'Date',
              'Time Active (sec)', 'Still Active')

    # The csv module wants a binary file on Python 2 but a text file opened
    # with newline='' on Python 3; 'wb' on Python 3 makes writerow() raise
    # TypeError.
    if sys.version_info[0] < 3:
        csvfile = open(path, 'wb')
    else:
        csvfile = open(path, 'w', newline='', encoding='utf-8')

    with csvfile:
        csvwriter = writer(csvfile)
        csvwriter.writerow(header)
        # The loop variable is named page_path so it does not shadow the
        # ``path`` argument.
        for page_path, rev_id, hp_link, date, duration, live in pages:
            csvwriter.writerow((page_path,
                                rev_id,
                                1 if hp_link else 0,
                                date,
                                duration,
                                1 if live else 0))
def run_it(path):
    """Build the spam-edit report and save it as a CSV file at ``path``.

    Runs the three stages in order - gather homepage links, collect the
    spam edits, write the CSV - and prints a progress line before each
    stage and once more when finished.

    Args:
        path: Destination path of the CSV file.
    """
    print("Getting homepage links...")
    homepage_urls = get_homepage_links()

    print("Collecting spam edits...")
    report_rows = get_spam_edits(homepage_urls)

    print("Writing to %s" % path)
    write_spam_edits(path, report_rows)

    print("Done!")
# Generate the report immediately when this script is executed, writing it
# to a CSV file in the current user's home directory.
run_it(path=expanduser('~/spam_edits_90_days.csv'))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment