Last active
November 23, 2017 18:36
-
-
Save wzyboy/d98336657e34a36e52bf7d2153300822 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
'''
A Twitter archive includes only text data. This simple script parses all media
entities in the Twitter archive and extracts all media URLs. One could feed the
output list to "aria2c -i" and download all the media files to disk.

Optionally, since the filenames of the media files are unique, they could be
uploaded to object storage buckets for backup purposes.
'''
import os | |
import glob | |
import argparse | |
from loader import load_files | |
def main():
    '''Print every distinct media URL found in a Twitter archive.

    Globs ``js/tweets/*.js`` below ``--data-dir`` (default ``./data``), loads
    the files with ``load_files`` and prints each distinct
    ``media_url_https`` value on its own line, in sorted order.
    '''
    ap = argparse.ArgumentParser()
    ap.add_argument('--data-dir', default='./data')
    args = ap.parse_args()

    filenames = glob.glob(os.path.join(args.data_dir, 'js/tweets/*.js'))
    # presumably a list of tweet dicts -- see loader.load_files
    data = load_files(filenames)

    # A set de-duplicates URLs that show up in more than one tweet.
    media_urls = set()
    for item in data:
        # Tweets without an "entities" object or without a "media" list are
        # treated as having no media instead of aborting with KeyError.
        media = (item.get('entities') or {}).get('media') or []
        for m in media:
            media_urls.add(m['media_url_https'])

    for url in sorted(media_urls):
        print(url)


if __name__ == '__main__':
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
'''
In my Twitter archives downloaded before mid-2013, all tweets have correct
"created_at" attributes. However, Twitter archives downloaded after mid-2013
not only use a different time format, but also have incorrect "created_at"
attributes for tweets ranging from 2009 to 2010. The "created_at" attributes of
affected tweets (~10000) have correct date portions but their time portions are
reset to "00:00:00" of that day for an unknown reason.

This script is a quick and dirty way to fix this: it iterates over the
JavaScript files, finds the problematic tweets, retrieves the correct
"created_at" values from Twitter archives downloaded before mid-2013, and
finally replaces the incorrect "created_at" attributes with the correct ones.
'''
import os | |
import re | |
import glob | |
import argparse | |
from datetime import datetime | |
from loader import load_files | |
def main():
    '''Rewrite zeroed "created_at" lines in the new archive in place.

    Tweets from ``--old-data`` (default ``./data2``) are loaded with
    ``load_files`` and indexed by their ``id``. Every ``js/tweets/*.js`` file
    below ``--new-data`` (default ``./data``) is then scanned line by line.
    For each "created_at" line whose time part is ``00:00:00 +0000``, the
    tweet ID is taken from the two preceding lines and the line is replaced
    with the old archive's timestamp, reformatted to the new layout. A file
    is rewritten only if at least one line matched.

    Raises:
        ValueError: if no "id" line precedes a matching "created_at" line,
            or if the tweet ID is not present in the old archive.
    '''
    # Mon Jun 29 15:46:31 +0000 2009
    # 2017-08-17 12:57:51 +0000
    old_ts_format = '%a %b %d %H:%M:%S %z %Y'
    new_ts_format = '%Y-%m-%d %H:%M:%S %z'

    # Compiled once up front; both patterns are applied to many lines.
    created_at_re = re.compile(r' "created_at" : "(\d{4}-\d{2}-\d{2} 00:00:00 \+0000)",')
    id_re = re.compile(r' "id" : (\d+),')

    ap = argparse.ArgumentParser()
    ap.add_argument('--old-data', default='./data2')
    ap.add_argument('--new-data', default='./data')
    args = ap.parse_args()

    old_files = glob.glob(os.path.join(args.old_data, 'js/tweets/*.js'))
    old_data = load_files(old_files)
    old_db = {tweet['id']: tweet for tweet in old_data}

    new_files = glob.glob(os.path.join(args.new_data, 'js/tweets/*.js'))
    for js in new_files:
        print('Processing {}'.format(js))
        changed = False
        new_lines = []
        # Explicit UTF-8 so tweet text survives the read/rewrite round trip
        # regardless of the platform's locale encoding.
        with open(js, 'r', encoding='utf-8') as f:
            lines = f.readlines()

        for lineno, line in enumerate(lines):
            if not created_at_re.match(line):
                new_lines.append(line)
                continue

            # possible date mismatches, look behind a few lines for id
            changed = True
            # Clamp at 0: a negative start would wrap to the end of the file.
            before_lines = lines[max(lineno - 2, 0):lineno]
            for before_line in before_lines:
                matched_id = id_re.match(before_line)
                if matched_id:
                    break
            else:
                # Report a 1-based line number, as editors display it.
                raise ValueError('Cannot find tweet ID in before lines in {} @ L{}:\n{}'.format(js, lineno + 1, before_lines))

            tweet_id = int(matched_id.group(1))
            try:
                old_tweet = old_db[tweet_id]
            except KeyError:
                raise ValueError('Cannot retrieve old tweet {}.'.format(tweet_id)) from None

            old_ts = datetime.strptime(old_tweet['created_at'], old_ts_format)
            corrected_ts = old_ts.strftime(new_ts_format)
            new_lines.append(' "created_at" : "{}",\n'.format(corrected_ts))

        if changed:
            print('Writing {}'.format(js))
            with open(js, 'w', encoding='utf-8') as f:
                f.writelines(new_lines)


if __name__ == '__main__':
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import json | |
from collections import OrderedDict | |
def load_file(filename, ordered_dict=False):
    '''Parse one archive JavaScript file as JSON, ignoring its first line.

    The first line is discarded (presumably the JavaScript assignment that
    precedes the JSON payload) and the rest of the file is decoded as JSON.

    Args:
        filename: path of the file to read.
        ordered_dict: when true, JSON objects are returned as ``OrderedDict``
            instances instead of plain dicts.

    Returns:
        The decoded JSON value.

    Raises:
        json.JSONDecodeError: if the remainder of the file is not valid JSON.
    '''
    # Read as UTF-8 explicitly; relying on the locale's default encoding can
    # fail or mis-decode non-ASCII tweet text on some platforms.
    with open(filename, 'r', encoding='utf-8') as f:
        f.readline()  # drop the first line
        content = f.read()
    # object_pairs_hook=None is json.loads' default, i.e. plain dicts.
    pairs_hook = OrderedDict if ordered_dict else None
    return json.loads(content, object_pairs_hook=pairs_hook)
def load_files(filenames, ordered_dict=False):
    '''Load several files with ``load_file`` and concatenate the results.

    Each file's decoded content is iterated and its items are appended, in
    the order the filenames are given, to one flat list that is returned.
    ``ordered_dict`` is passed through to ``load_file`` unchanged.
    '''
    return [
        entry
        for path in filenames
        for entry in load_file(path, ordered_dict)
    ]
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment