Skip to content

Instantly share code, notes, and snippets.

@edsu
Created January 29, 2019 10:25
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save edsu/660818c55941446c30fbaf93c7d0d8ce to your computer and use it in GitHub Desktop.
#!/usr/bin/env python3
# Use the Twitter archive's tweets.csv to download all the revision information
# for edits that were announced.
import csv
import sys
import json
import time
import requests
from urllib.parse import urlparse, parse_qs
# Shared HTTP session: reuses TCP connections across the many Wikipedia
# API calls below instead of opening a fresh connection per request.
http = requests.Session()
def main():
    """Entry point: read the tweets.csv named on the command line and
    print one JSON-encoded diff record per line, pausing half a second
    between API calls to be polite to the Wikipedia servers."""
    if len(sys.argv) != 2:
        sys.exit('usage: diffs.py tweets.csv')
    for record in diffs(sys.argv[1]):
        print(json.dumps(record))
        time.sleep(0.5)
def diffs(csv_file):
    """Yield one diff record per Wikipedia edit announced in *csv_file*.

    Each record is the MediaWiki compare-API payload for the edit's two
    revision ids. When the API returns no ``compare`` payload (revisions
    are sometimes deleted), a minimal stub holding just the two revision
    ids is yielded instead, so the edit is still recorded; the editing
    user and the page title should be recoverable from the tweet text,
    which is attached under the ``tweet`` key in every case.
    """
    for lang, from_id, to_id, tweet in diff_ids(csv_file):
        url = (
            f'https://{lang}.wikipedia.org/w/api.php'
            f'?action=compare&fromrev={from_id}&torev={to_id}'
            '&prop=diff|ids|title|diffsize|user|comment|parsedcomment|size'
            '&format=json'
        )
        record = http.get(url).json().get('compare')
        if record is None:
            # revisions deleted: keep at least the ids as evidence the edit happened
            record = {
                'fromrevid': from_id,
                'torevid': to_id,
            }
        record['tweet'] = tweet
        yield record
def diff_ids(csv_file):
    """Yield (lang, oldid, diff_id, tweet_row) for each Wikipedia diff URL
    found in the tweet archive.

    Scans the ``expanded_urls`` column of the Twitter archive's tweets.csv;
    every URL pointing at a ``*.wikipedia.org`` page whose query string
    carries both ``diff`` and ``oldid`` revision ids is yielded along with
    the full CSV row for the tweet.
    """
    # `with` closes the file when iteration finishes (the original leaked
    # the handle); newline='' is what the csv module requires so quoted
    # fields containing newlines are parsed correctly.
    with open(csv_file, newline='') as fh:
        for tweet in csv.DictReader(fh):
            for url in tweet['expanded_urls'].split(','):
                uri = urlparse(url)
                if 'wikipedia.org' not in uri.netloc:
                    continue
                params = parse_qs(uri.query)
                # A usable diff link needs BOTH revision ids; requiring
                # 'oldid' too avoids a KeyError on malformed links that
                # only carry 'diff'.
                if 'diff' in params and 'oldid' in params:
                    # language code is the first hostname label, e.g. "en"
                    lang = uri.netloc.split('.')[0]
                    yield lang, params['oldid'][0], params['diff'][0], tweet
if __name__ == "__main__":
main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment