Skip to content

Instantly share code, notes, and snippets.

What would you like to do?
#!/usr/bin/env python
"""Extract media URLs from a Twitter archive.

A Twitter archive includes only text data. This simple script parses all media
entities in the Twitter archive and extracts all media URLs. One could feed the
output list to "aria2c -i" and download all the media files to disk.
Optionally, since the filenames of the media files are unique, they could be
uploaded to object storage buckets for backup purposes.
"""
import os
import glob
import argparse

from loader import load_files


def main():
    """Print every unique media URL found in the archive, sorted."""
    ap = argparse.ArgumentParser()
    ap.add_argument('--data-dir', default='./data')
    args = ap.parse_args()

    # Tweets are sharded into monthly JS files under js/tweets/.
    filenames = glob.glob(os.path.join(args.data_dir, 'js/tweets/*.js'))
    data = load_files(filenames)

    # Use a set: the same media entity can appear in multiple tweets
    # (e.g. retweets), and duplicate URLs are useless for downloading.
    media_urls = set()
    for item in data:
        media = item['entities']['media']
        if not media:
            # Tweet carries no media entity; nothing to collect.
            continue
        for m in media:
            media_url = m['media_url_https']
            media_urls.add(media_url)

    # Sort for stable, diff-friendly output.
    media_urls = sorted(media_urls)
    for url in media_urls:
        print(url)


if __name__ == '__main__':
    main()
#!/usr/bin/env python
"""Repair zeroed "created_at" times in a post-mid-2013 Twitter archive.

In my Twitter archives downloaded before mid-2013, all tweets have correct
"created_at" attributes. However, Twitter archives downloaded after mid-2013
not only use a different time format, but also have incorrect "created_at"
attributes for tweets ranging from 2009 to 2010. The "created_at" attributes of
affected tweets (~10000) have correct date portions but their time portions are
reset to "00:00:00" of that day for an unknown reason.

This script is a quick and dirty way to fix this: it iterates over the
JavaScript files, finding out problematic tweets and retrieves the correct
"created_at" from Twitter archives downloaded before mid-2013, and finally
replaces the incorrect "created_at" attributes with correct ones.
"""
import os
import re
import glob
import argparse
from datetime import datetime

from loader import load_files


def main():
    """Rewrite each new-archive JS file, fixing zeroed timestamps in place."""
    # Mon Jun 29 15:46:31 +0000 2009
    old_ts_format = '%a %b %d %H:%M:%S %z %Y'
    # 2017-08-17 12:57:51 +0000
    new_ts_format = '%Y-%m-%d %H:%M:%S %z'

    ap = argparse.ArgumentParser()
    ap.add_argument('--old-data', default='./data2')
    ap.add_argument('--new-data', default='./data')
    args = ap.parse_args()

    # Index the pre-mid-2013 archive by tweet ID for O(1) lookups.
    old_files = glob.glob(os.path.join(args.old_data, 'js/tweets/*.js'))
    old_data = load_files(old_files)
    old_db = {i['id']: i for i in old_data}

    new_files = glob.glob(os.path.join(args.new_data, 'js/tweets/*.js'))
    for js in new_files:
        print('Processing {}'.format(js))
        changed = False
        new_lines = []
        with open(js, 'r') as f:
            lines = f.readlines()
        for lineno, line in enumerate(lines):
            # Only timestamps zeroed to midnight are broken; leave others alone.
            matched = re.match(r' "created_at" : "(\d{4}-\d{2}-\d{2} 00:00:00 \+0000)",', line)
            if not matched:
                new_lines.append(line)
                continue
            changed = True
            # The "id" attribute precedes "created_at"; look behind a few
            # lines to recover which tweet this timestamp belongs to.
            before_lines = lines[lineno - 2:lineno]
            for before_line in before_lines:
                matched_id = re.match(r' "id" : (\d+),', before_line)
                if matched_id:
                    break
            else:
                raise ValueError('Cannot find tweet ID in before lines in {} @ L{}:\n{}'.format(js, lineno, before_lines))
            tweet_id = int(matched_id.group(1))
            try:
                old_tweet = old_db[tweet_id]
            except KeyError:
                raise ValueError('Cannot retrieve old tweet {}.'.format(tweet_id)) from None
            # Re-render the old archive's (correct) timestamp in the new format.
            old_ts = datetime.strptime(old_tweet['created_at'], old_ts_format)
            corrected_ts = old_ts.strftime(new_ts_format)
            new_line = ' "created_at" : "{}",\n'.format(corrected_ts)
            new_lines.append(new_line)
        if changed:
            print('Writing {}'.format(js))
            with open(js, 'w') as f:
                f.writelines(new_lines)


if __name__ == '__main__':
    main()
import json
from collections import OrderedDict
def load_file(filename, ordered_dict=False):
    """Load one Twitter-archive JS file and return its parsed JSON payload.

    The archive's JS files are JSON preceded by a single JavaScript variable
    assignment line, which is stripped before parsing.

    :param filename: path to the .js file to parse
    :param ordered_dict: if True, preserve key order by parsing objects
        into :class:`collections.OrderedDict`
    :return: the decoded JSON data (normally a list of tweet dicts)
    """
    with open(filename, 'r') as f:
        lines = f.readlines()
    # drop the first line (the non-JSON JS variable assignment)
    content = ''.join(lines[1:])
    if ordered_dict:
        data = json.loads(content, object_pairs_hook=OrderedDict)
    else:
        # Fix: without this `else`, the plain parse unconditionally
        # overwrote the OrderedDict parse, making ordered_dict a no-op.
        data = json.loads(content)
    return data
def load_files(filenames, ordered_dict=False):
    """Load and concatenate the JSON payloads of several archive JS files.

    :param filenames: iterable of .js file paths, each parsed via load_file
    :param ordered_dict: forwarded to load_file to preserve key order
    :return: one flat list with the items of every file, in input order
    """
    data = []
    for filename in filenames:
        _data = load_file(filename, ordered_dict)
        # Fix: the parsed items were previously discarded, so the
        # function always returned an empty list.
        data.extend(_data)
    return data
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment