Twitter Archive Parser, tweaked to generate a page for every tweet
#!/usr/bin/env python3
twitter-archive-parser - Python code to parse a Twitter archive and output in various ways
Copyright (C) 2022 Tim Hutton
Tweaks copyright (C) 2022 Joey Castillo
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>.
import datetime
import glob
import json
import os
import shutil
def read_json_from_js_file(filename):
    """Read a Twitter-produced .js data file and return its parsed JSON content.

    The first line of the file (the JavaScript assignment) is discarded and
    replaced by an opening '[' -- followed by ' {' when that first line also
    contained an opening brace -- and the remaining lines are appended
    unchanged before parsing.

    Args:
        filename: Path of the .js file to read (decoded as UTF-8).

    Returns:
        The value produced by json.loads for the reconstructed text. Because
        the text always starts with '[', this is a list.

    Raises:
        ValueError: If the file is empty. Malformed content raises
            json.JSONDecodeError, which is itself a ValueError.
    """
    with open(filename, 'r', encoding='utf8') as f:
        lines = f.readlines()
    if not lines:
        # Without this guard an empty file fails with an opaque IndexError below.
        raise ValueError(f'{filename} is empty; expected a Twitter archive .js data file')
    # convert js file to JSON: replace first line with just '[' (keeping an
    # opening brace if the first line had one), squash lines into a single string
    prefix = '[ {' if '{' in lines[0] else '['
    # parse the resulting JSON and return it
    return json.loads(prefix + ''.join(lines[1:]))
def extract_username(account_js_filename):
    """Look up the Twitter username stored in the archive's account.js file.

    The file is parsed with read_json_from_js_file; the username is read from
    the 'account' entry of the first element of the parsed content.
    """
    parsed = read_json_from_js_file(account_js_filename)
    first_entry = parsed[0]
    account_info = first_entry['account']
    return account_info['username']
def tweet_json_to_markdown(tweet, username, archive_media_folder, output_media_folder_name):
    """Convert a JSON-format tweet into markdown.

    Args:
        tweet: One record from the archive's tweet file; the tweet data itself
            is read from its 'tweet' key.
        username: Account name used to build the local and original tweet links.
        archive_media_folder: Folder searched for media files named
            '<tweet id>-<media filename>'.
        output_media_folder_name: Prefix (including any trailing separator)
            that is prepended to the media file name to form the copy
            destination and the link target.

    Returns:
        A tuple (timestamp, markdown, tweet_id_str): the creation time as
        integer seconds since the epoch, the markdown body, and the tweet id.

    Side effects:
        Copies each media file found in archive_media_folder to its output
        location. Prints a warning for media that is missing locally.
    """
    tweet = tweet['tweet']
    timestamp_str = tweet['created_at']
    # Example: Tue Mar 19 14:05:17 +0000 2019
    # %H:%M:%S is spelled out because %X depends on the active locale.
    created_at = datetime.datetime.strptime(timestamp_str, '%a %b %d %H:%M:%S %z %Y')
    timestamp = int(round(created_at.timestamp()))
    body = tweet['full_text']
    tweet_id_str = tweet['id_str']
    entities = tweet.get('entities') or {}
    # replace URLs with their original versions
    for url in entities.get('urls', []):
        if 'url' in url and 'expanded_url' in url:
            body = body.replace(url['url'], url['expanded_url'])
    # replace image URLs with markdown image links to local files
    for media in entities.get('media', []):
        if 'url' not in media or 'media_url' not in media:
            continue
        original_url = media['url']
        original_expanded_url = media['media_url']
        original_filename = os.path.split(original_expanded_url)[1]
        media_filename = tweet_id_str + '-' + original_filename
        local_filename = os.path.join(archive_media_folder, media_filename)
        if os.path.isfile(local_filename):
            new_url = output_media_folder_name + media_filename
            shutil.copy(local_filename, new_url)
            # Local media is linked with a leading slash (site-root relative).
            markdown = f'![](/{new_url})'
        else:
            # Nothing to copy: keep the remote link as-is instead of crashing
            # in shutil.copy or emitting a bogus '/https://...' path.
            print(f'Warning: missing local file: {local_filename}. Using original link instead: {original_url} (expands to {original_expanded_url})')
            markdown = f'![]({original_url})'
        body = body.replace(original_url, markdown)
    # append a link to the local page plus the original Twitter URL
    status_path = f'{username}/status/{tweet_id_str}'
    body += f'\n\n[{timestamp_str}](/{status_path}) ([original](https://twitter.com/{status_path}))'
    return timestamp, body, tweet_id_str
def main():
    """Convert the Twitter archive in the current folder into markdown files.

    Reads account.js and the tweet .js files from './data', copies referenced
    media into 'media/', writes one markdown page per tweet to
    '<username>/status/<tweet id>.md', and writes all tweets (newest first)
    into a single combined markdown file.

    Raises:
        SystemExit: With status 1, after printing an error, when account.js,
            the tweet files, or a unique tweet media folder cannot be found.
    """
    input_folder = '.'
    # NOTE(review): the original value was an empty string, which open() cannot
    # write to; 'output.md' is assumed to be the intended name -- confirm.
    output_filename = 'output.md'
    output_media_folder_name = 'media/'

    # Identify the file and folder names - they change slightly depending on the archive size it seems
    data_folder = os.path.join(input_folder, 'data')
    account_js_filename = os.path.join(data_folder, 'account.js')
    if not os.path.isfile(account_js_filename):
        print(f'Error: Failed to load {account_js_filename}. Start this script in the root folder of your Twitter archive.')
        raise SystemExit(1)

    tweet_js_filename_templates = ['tweet.js', 'tweets.js', 'tweets-part*.js']
    input_filenames = []
    for tweet_js_filename_template in tweet_js_filename_templates:
        input_filenames += glob.glob(os.path.join(data_folder, tweet_js_filename_template))
    if not input_filenames:
        print(f'Error: no files matching {tweet_js_filename_templates} in {data_folder}')
        raise SystemExit(1)

    tweet_media_folder_name_templates = ['tweet_media', 'tweets_media']
    tweet_media_folder_names = []
    for tweet_media_folder_name_template in tweet_media_folder_name_templates:
        tweet_media_folder_names += glob.glob(os.path.join(data_folder, tweet_media_folder_name_template))
    if not tweet_media_folder_names:
        print(f'Error: no folders matching {tweet_media_folder_name_templates} in {data_folder}')
        raise SystemExit(1)
    if len(tweet_media_folder_names) > 1:
        print(f'Error: multiple folders matching {tweet_media_folder_name_templates} in {data_folder}')
        raise SystemExit(1)
    archive_media_folder = tweet_media_folder_names[0]
    os.makedirs(output_media_folder_name, exist_ok=True)

    # Parse the tweets
    username = extract_username(account_js_filename)
    tweets_markdown = []
    for tweets_js_filename in input_filenames:
        print(f'Parsing {tweets_js_filename}...')
        # Named tweets_json so the json module is not shadowed.
        tweets_json = read_json_from_js_file(tweets_js_filename)
        tweets_markdown += [
            tweet_json_to_markdown(tweet, username, archive_media_folder, output_media_folder_name)
            for tweet in tweets_json
        ]
    print(f'Parsed {len(tweets_markdown)} tweets and replies by {username}.')

    # Sort tweets with newest first (descending timestamp)
    tweets_markdown.sort(key=lambda tup: tup[0], reverse=True)

    # Write one page per tweet
    os.makedirs(f"{username}/status", exist_ok=True)
    for _, md, tweet_id in tweets_markdown:
        with open(f"{username}/status/{tweet_id}.md", 'w', encoding='utf-8') as f:
            f.write(f'---\ntitle: "@{username}"\n---\n{md}')

    # Save as one large markdown file (timestamps and ids are discarded)
    all_tweets = '\n\n----\n'.join(md for _, md, _ in tweets_markdown)
    with open(output_filename, 'w', encoding='utf-8') as f:
        f.write(all_tweets)
    print(f'Wrote to {output_filename}, which embeds images from {output_media_folder_name}')
# Run the conversion only when executed as a script, not when imported.
if __name__ == "__main__":
    main()