Skip to content

Instantly share code, notes, and snippets.

@jwhitlock
Last active May 26, 2016 12:41
Show Gist options
  • Save jwhitlock/592ea2590f7971598ef46a6cc74e2e5b to your computer and use it in GitHub Desktop.
Save jwhitlock/592ea2590f7971598ef46a6cc74e2e5b to your computer and use it in GitHub Desktop.
Report on spam edits from the last 90 days and how long each one stayed published
#!/usr/bin/env python
from csv import writer
from datetime import datetime, timedelta
from os.path import expanduser
from pyquery import PyQuery as pq
from six.moves.urllib.parse import urlparse
from kuma.wiki.models import Document, Revision, RevisionAkismetSubmission
def get_homepage_links():
    """Collect absolute URLs for the en-US links found on the MDN homepage.

    Loads https://developer.mozilla.org with PyQuery and walks every ``<a>``
    element. An href beginning with ``/en-US/`` is turned into an absolute
    URL and kept; any other href (including a missing one, which is read as
    an empty string) is reported on stdout and ignored.

    Returns a set of absolute URL strings.
    """
    base_url = 'https://developer.mozilla.org'
    links = set()
    for anchor in pq(base_url)('a'):
        target = anchor.get('href', '')
        if not target.startswith('/en-US/'):
            print("Skipping %s" % target)
            continue
        links.add(base_url + target)
    return links
def get_spam_edits(hp_links):
    """Gather one report row per revision submitted to Akismet as spam.

    Looks at ``RevisionAkismetSubmission`` records of type ``'spam'`` sent in
    the last 90 days, de-duplicated by revision ID. Revisions whose document
    no longer exists are skipped.

    :arg hp_links: Collection of absolute URLs; a row is flagged as a homepage
        link when the document's full URL is a member.

    Returns a list of tuples
    ``(path, revision_id, hp_link, date, duration, live)`` where ``date`` is
    the revision's creation date as ``MM/DD/YYYY``, ``duration`` is whole
    seconds until the following revision was created (or until now when there
    is none), and ``live`` is True when no following revision was found.
    """
    cutoff = datetime.now() - timedelta(days=90)
    spam_subs = RevisionAkismetSubmission.objects.filter(
        type='spam', sent__gte=cutoff)
    spam_rev_ids = set(spam_subs.values_list('revision_id', flat=True))

    rows = []
    for spam_rev_id in spam_rev_ids:
        spam_rev = Revision.objects.get(id=spam_rev_id)
        try:
            doc = Document.objects.only('id', 'locale', 'slug').get(
                id=spam_rev.document_id)
        except Document.DoesNotExist:
            # The document is gone, so there is nothing to report on.
            continue

        full_url = doc.get_full_url()
        url_path = urlparse(full_url).path
        on_homepage = full_url in hp_links
        created_on = spam_rev.created.date().strftime('%m/%d/%Y')

        # Walk the document's revisions newest-first. The last revision seen
        # before reaching the spam revision is the one that followed it.
        successor = None
        for candidate in doc.revisions.order_by('-created').all():
            if candidate == spam_rev:
                break
            successor = candidate

        if successor:
            elapsed = successor.created - spam_rev.created
            still_live = False
        else:
            elapsed = datetime.now() - spam_rev.created
            still_live = True

        rows.append((
            url_path,
            spam_rev.id,
            on_homepage,
            created_on,
            int(elapsed.total_seconds()),
            still_live,
        ))
    return rows
def write_spam_edits(path, pages):
    """Write the spam-edit report rows to a CSV file.

    :arg path: Filesystem path of the CSV file to create or overwrite.
    :arg pages: Iterable of
        ``(path, rev_id, hp_link, date, duration, live)`` tuples, in the
        shape returned by ``get_spam_edits``.

    A header row is always written. The ``hp_link`` and ``live`` fields are
    emitted as ``1`` or ``0`` according to their truthiness.
    """
    import sys

    # The csv module needs a binary file on Python 2 but a text file opened
    # with newline='' on Python 3. Opening with 'wb' on Python 3 makes the
    # first writerow() call fail with TypeError.
    if sys.version_info[0] >= 3:
        csvfile = open(path, 'w', newline='', encoding='utf-8')
    else:
        csvfile = open(path, 'wb')

    with csvfile:
        csvwriter = writer(csvfile)
        csvwriter.writerow((
            'Path',
            'RevID',
            'Homepage Link',
            'Date',
            'Time Active (sec)',
            'Still Active',
        ))
        # Use a distinct loop name so the ``path`` argument is not shadowed.
        for page_path, rev_id, hp_link, date, duration, live in pages:
            csvwriter.writerow((
                page_path,
                rev_id,
                1 if hp_link else 0,
                date,
                duration,
                1 if live else 0,
            ))
def run_it(path):
    """Build the spam-edit report and save it as a CSV file at ``path``.

    Runs the three stages in order (homepage link collection, spam edit
    collection, CSV output) and prints a progress message before each stage
    and once more on completion.
    """
    print("Getting homepage links...")
    homepage_urls = get_homepage_links()

    print("Collecting spam edits...")
    spam_rows = get_spam_edits(homepage_urls)

    print("Writing to %s" % path)
    write_spam_edits(path, spam_rows)

    print("Done!")
# Generate the report as soon as this script is executed, writing the CSV to
# spam_edits_90_days.csv in the current user's home directory.
run_it(expanduser('~/spam_edits_90_days.csv'))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment