Skip to content

Instantly share code, notes, and snippets.

@edsu
Created January 29, 2019 10:25
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save edsu/660818c55941446c30fbaf93c7d0d8ce to your computer and use it in GitHub Desktop.
#!/usr/bin/env python3
# Use the Twitter archive's tweets.csv to download all the revision information
# for edits that were announced.
import csv
import sys
import json
import time
import requests
from urllib.parse import urlparse, parse_qs
# Shared HTTP session: reuses TCP connections across the many Wikipedia
# API calls below instead of opening a fresh connection per request.
http = requests.Session()
def main():
    """Entry point: read the tweets.csv named on the command line and
    print one JSON-encoded diff record per line, pausing half a second
    between API calls to be polite to the Wikipedia servers."""
    if len(sys.argv) != 2:
        sys.exit('usage: diffs.py tweets.csv')
    for record in diffs(sys.argv[1]):
        print(json.dumps(record))
        time.sleep(0.5)
def diffs(csv_file):
    """Yield one diff record per Wikipedia edit announced in *csv_file*.

    Each record is the MediaWiki compare-API payload for the edit's two
    revision ids. When the API returns no ``compare`` payload (revisions
    are sometimes deleted), a minimal stub holding just the two revision
    ids is yielded instead, so the edit is still recorded; the editing
    user and the page title should be recoverable from the tweet text,
    which is attached under the ``tweet`` key in every case.
    """
    for lang, from_id, to_id, tweet in diff_ids(csv_file):
        url = (
            f'https://{lang}.wikipedia.org/w/api.php'
            f'?action=compare&fromrev={from_id}&torev={to_id}'
            '&prop=diff|ids|title|diffsize|user|comment|parsedcomment|size'
            '&format=json'
        )
        record = http.get(url).json().get('compare')
        if record is None:
            # revisions deleted: keep at least the ids as evidence the edit happened
            record = {
                'fromrevid': from_id,
                'torevid': to_id,
            }
        record['tweet'] = tweet
        yield record
def diff_ids(csv_file):
    """Yield (lang, oldid, diff_id, tweet_row) for each Wikipedia diff URL
    found in the tweet archive.

    Scans the ``expanded_urls`` column of the Twitter archive's tweets.csv;
    every URL pointing at a ``*.wikipedia.org`` page whose query string
    carries both ``diff`` and ``oldid`` revision ids is yielded along with
    the full CSV row for the tweet.
    """
    # `with` closes the file when iteration finishes (the original leaked
    # the handle); newline='' is what the csv module requires so quoted
    # fields containing newlines are parsed correctly.
    with open(csv_file, newline='') as fh:
        for tweet in csv.DictReader(fh):
            for url in tweet['expanded_urls'].split(','):
                uri = urlparse(url)
                if 'wikipedia.org' not in uri.netloc:
                    continue
                params = parse_qs(uri.query)
                # A usable diff link needs BOTH revision ids; requiring
                # 'oldid' too avoids a KeyError on malformed links that
                # only carry 'diff'.
                if 'diff' in params and 'oldid' in params:
                    # language code is the first hostname label, e.g. "en"
                    lang = uri.netloc.split('.')[0]
                    yield lang, params['oldid'][0], params['diff'][0], tweet
if __name__ == "__main__":
main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment