joeycastillo/parser.py

## parser.py
#!/usr/bin/env python3
"""
    twitter-archive-parser - Python code to parse a Twitter archive and output in various ways
    Copyright (C) 2022  Tim Hutton
    Tweaks copyright (C) 2022  Joey Castillo

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program.  If not, see <https://www.gnu.org/licenses/>.
"""

import datetime
import glob
import json
import os
import shutil

def read_json_from_js_file(filename):
    """Parse the JSON payload embedded in a Twitter-produced .js file.

    Archive files wrap their data in a JavaScript assignment such as
    ``window.YTD.tweet.part0 = [ {...}, ... ]``. Everything before the first
    opening bracket is discarded and the remainder is parsed as JSON.

    Args:
        filename: Path of the .js file to read (decoded as UTF-8).

    Returns:
        The decoded JSON value: a list for the usual ``= [ ... ]`` files, or
        a dict when the payload is a bare ``= { ... }`` object.

    Raises:
        ValueError: If the file contains no ``[`` or ``{`` at all (for
            example, an empty file), or if the payload is not valid JSON
            (``json.JSONDecodeError`` is a ``ValueError`` subclass).
    """
    with open(filename, 'r', encoding='utf8') as f:
        text = f.read()
    # The payload starts at whichever bracket comes first. Slicing from there
    # (instead of dropping the whole first line) also handles files whose
    # data begins, or entirely fits, on the same line as the assignment.
    starts = [pos for pos in (text.find('['), text.find('{')) if pos != -1]
    if not starts:
        raise ValueError(f'No JSON payload found in {filename}')
    return json.loads(text[min(starts):])

def extract_username(account_js_filename):
    """Look up the account's Twitter username in the given account.js file."""
    first_entry = read_json_from_js_file(account_js_filename)[0]
    return first_entry['account']['username']

def tweet_json_to_markdown(tweet, username, archive_media_folder, output_media_folder_name):
    """Convert one JSON-format tweet into markdown.

    Shortened t.co links are replaced by their expanded URLs, and media links
    become markdown images. As a side effect, every referenced media file
    that exists in the archive is copied into the output media folder.

    Args:
        tweet: One archive entry; the tweet data is read from its 'tweet' key.
        username: Account name used to build the status links.
        archive_media_folder: Folder holding the archive's media files, named
            '<tweet id>-<original filename>'.
        output_media_folder_name: Existing destination folder, including its
            trailing separator; it is prefixed verbatim to the file name.

    Returns:
        Tuple of (timestamp in whole seconds since the epoch, markdown body,
        tweet id string).
    """
    tweet = tweet['tweet']
    timestamp_str = tweet['created_at']
    # Example: Tue Mar 19 14:05:17 +0000 2019
    # The time is spelled out as %H:%M:%S because %X follows the active
    # locale's time format.
    created = datetime.datetime.strptime(timestamp_str, '%a %b %d %H:%M:%S %z %Y')
    timestamp = int(round(created.timestamp()))
    body = tweet['full_text']
    tweet_id_str = tweet['id_str']
    entities = tweet.get('entities', {})
    # replace t.co URLs with their original versions
    for url in entities.get('urls', []):
        if 'url' in url and 'expanded_url' in url:
            body = body.replace(url['url'], url['expanded_url'])
    # replace image URLs with markdown image links to local files
    for media in entities.get('media', []):
        if 'url' not in media or 'media_url' not in media:
            continue
        original_url = media['url']
        original_expanded_url = media['media_url']
        media_name = tweet_id_str + '-' + os.path.split(original_expanded_url)[1]
        local_filename = os.path.join(archive_media_folder, media_name)
        if os.path.isfile(local_filename):
            new_url = output_media_folder_name + media_name
            shutil.copy(local_filename, new_url)
            # The leading '/' makes the link relative to the site root.
            markdown = f'![](/{new_url})'
        else:
            print(f'Warning: missing local file: {local_filename}. Using original link instead: {original_url} (expands to {original_expanded_url})')
            # The fallback is an absolute URL, so it must not get the
            # leading '/' used for local files.
            markdown = f'![]({original_url})'
        body = body.replace(original_url, markdown)
    # append the original Twitter URL as a link
    body += f'\n\n[{timestamp_str}](/{username}/status/{tweet_id_str}) ([original](https://twitter.com/{username}/status/{tweet_id_str}))'
    return timestamp, body, tweet_id_str

def main():
    """Convert the Twitter archive in the current directory to markdown.

    Reads data/account.js and the tweet .js files, copies referenced media
    into 'media/', writes one '<username>/status/<id>.md' page per tweet and
    a combined '_index.md' holding every tweet, newest first.

    Raises:
        SystemExit: With an error message (and therefore a non-zero exit
            status) when account.js, the tweet files or the media folder
            cannot be found, or when more than one media folder matches.
    """
    input_folder = '.'
    output_filename = '_index.md'
    output_media_folder_name = 'media/'

    # Identify the file and folder names - they seem to vary slightly with the size of the archive
    data_folder = os.path.join(input_folder, 'data')
    account_js_filename = os.path.join(data_folder, 'account.js')
    if not os.path.isfile(account_js_filename):
        raise SystemExit(f'Error: Failed to load {account_js_filename}. Start this script in the root folder of your Twitter archive.')
    tweet_js_filename_templates = ['tweet.js', 'tweets.js', 'tweets-part*.js']
    input_filenames = []
    for template in tweet_js_filename_templates:
        # glob returns matches in arbitrary order; sort for reproducible runs
        input_filenames += sorted(glob.glob(os.path.join(data_folder, template)))
    if not input_filenames:
        raise SystemExit(f'Error: no files matching {tweet_js_filename_templates} in {data_folder}')
    tweet_media_folder_name_templates = ['tweet_media', 'tweets_media']
    tweet_media_folder_names = []
    for template in tweet_media_folder_name_templates:
        tweet_media_folder_names += glob.glob(os.path.join(data_folder, template))
    if not tweet_media_folder_names:
        raise SystemExit(f'Error: no folders matching {tweet_media_folder_name_templates} in {data_folder}')
    if len(tweet_media_folder_names) > 1:
        raise SystemExit(f'Error: multiple folders matching {tweet_media_folder_name_templates} in {data_folder}')
    archive_media_folder = tweet_media_folder_names[0]
    os.makedirs(output_media_folder_name, exist_ok=True)

    # Parse the tweets
    username = extract_username(account_js_filename)
    tweets_markdown = []
    for tweets_js_filename in input_filenames:
        print(f'Parsing {tweets_js_filename}...')
        tweets = read_json_from_js_file(tweets_js_filename)
        tweets_markdown.extend(
            tweet_json_to_markdown(tweet, username, archive_media_folder, output_media_folder_name)
            for tweet in tweets
        )
    print(f'Parsed {len(tweets_markdown)} tweets and replies by {username}.')

    # Sort tweets with newest first (descending timestamp)
    tweets_markdown.sort(key=lambda tup: tup[0], reverse=True)

    # Write one page per tweet, each starting with a '---'-delimited title header
    status_folder = f"{username}/status"
    os.makedirs(status_folder, exist_ok=True)
    for _, md, tweet_id in tweets_markdown:
        with open(f"{status_folder}/{tweet_id}.md", 'w', encoding='utf-8') as f:
            f.write(f'---\ntitle: "@{username}"\n---\n{md}')

    # Save as one large markdown file (timestamps and ids are dropped here)
    all_tweets = '\n\n----\n'.join(md for _, md, _ in tweets_markdown)
    with open(output_filename, 'w', encoding='utf-8') as f:
        f.write(all_tweets)
    print(f'Wrote to {output_filename}, which embeds images from {output_media_folder_name}')

# Run the conversion only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()
	#!/usr/bin/env python3
	"""
	twitter-archive-parser - Python code to parse a Twitter archive and output in various ways
	Copyright (C) 2022 Tim Hutton
	Tweaks copyright (C) 2022 Joey Castillo

	This program is free software: you can redistribute it and/or modify
	it under the terms of the GNU General Public License as published by
	the Free Software Foundation, either version 3 of the License, or
	(at your option) any later version.

	This program is distributed in the hope that it will be useful,
	but WITHOUT ANY WARRANTY; without even the implied warranty of
	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	GNU General Public License for more details.

	You should have received a copy of the GNU General Public License
	along with this program. If not, see <https://www.gnu.org/licenses/>.
	"""

	import datetime
	import glob
	import json
	import os
	import shutil

	def read_json_from_js_file(filename):
		"""Parse the JSON payload embedded in a Twitter-produced .js file.

		Archive files wrap their data in a JavaScript assignment such as
		``window.YTD.tweet.part0 = [ {...}, ... ]``. Everything before the first
		opening bracket is discarded and the remainder is parsed as JSON.

		Args:
			filename: Path of the .js file to read (decoded as UTF-8).

		Returns:
			The decoded JSON value: a list for the usual ``= [ ... ]`` files, or
			a dict when the payload is a bare ``= { ... }`` object.

		Raises:
			ValueError: If the file contains no ``[`` or ``{`` at all (for
				example, an empty file), or if the payload is not valid JSON
				(``json.JSONDecodeError`` is a ``ValueError`` subclass).
		"""
		with open(filename, 'r', encoding='utf8') as f:
			text = f.read()
		# The payload starts at whichever bracket comes first. Slicing from there
		# (instead of dropping the whole first line) also handles files whose
		# data begins, or entirely fits, on the same line as the assignment.
		starts = [pos for pos in (text.find('['), text.find('{')) if pos != -1]
		if not starts:
			raise ValueError(f'No JSON payload found in {filename}')
		return json.loads(text[min(starts):])

	def extract_username(account_js_filename):
		"""Returns the user's Twitter username from account.js."""
		account = read_json_from_js_file(account_js_filename)
		return account[0]['account']['username']

	def tweet_json_to_markdown(tweet, username, archive_media_folder, output_media_folder_name):
		"""Convert one JSON-format tweet into markdown.

		Shortened t.co links are replaced by their expanded URLs, and media links
		become markdown images. As a side effect, every referenced media file
		that exists in the archive is copied into the output media folder.

		Args:
			tweet: One archive entry; the tweet data is read from its 'tweet' key.
			username: Account name used to build the status links.
			archive_media_folder: Folder holding the archive's media files, named
				'<tweet id>-<original filename>'.
			output_media_folder_name: Existing destination folder, including its
				trailing separator; it is prefixed verbatim to the file name.

		Returns:
			Tuple of (timestamp in whole seconds since the epoch, markdown body,
			tweet id string).
		"""
		tweet = tweet['tweet']
		timestamp_str = tweet['created_at']
		# Example: Tue Mar 19 14:05:17 +0000 2019
		# The time is spelled out as %H:%M:%S because %X follows the active
		# locale's time format.
		created = datetime.datetime.strptime(timestamp_str, '%a %b %d %H:%M:%S %z %Y')
		timestamp = int(round(created.timestamp()))
		body = tweet['full_text']
		tweet_id_str = tweet['id_str']
		entities = tweet.get('entities', {})
		# replace t.co URLs with their original versions
		for url in entities.get('urls', []):
			if 'url' in url and 'expanded_url' in url:
				body = body.replace(url['url'], url['expanded_url'])
		# replace image URLs with markdown image links to local files
		for media in entities.get('media', []):
			if 'url' not in media or 'media_url' not in media:
				continue
			original_url = media['url']
			original_expanded_url = media['media_url']
			media_name = tweet_id_str + '-' + os.path.split(original_expanded_url)[1]
			local_filename = os.path.join(archive_media_folder, media_name)
			if os.path.isfile(local_filename):
				new_url = output_media_folder_name + media_name
				shutil.copy(local_filename, new_url)
				# The leading '/' makes the link relative to the site root.
				markdown = f'![](/{new_url})'
			else:
				print(f'Warning: missing local file: {local_filename}. Using original link instead: {original_url} (expands to {original_expanded_url})')
				# The fallback is an absolute URL, so it must not get the
				# leading '/' used for local files.
				markdown = f'![]({original_url})'
			body = body.replace(original_url, markdown)
		# append the original Twitter URL as a link
		body += f'\n\n[{timestamp_str}](/{username}/status/{tweet_id_str}) ([original](https://twitter.com/{username}/status/{tweet_id_str}))'
		return timestamp, body, tweet_id_str

	def main():
		"""Convert the Twitter archive in the current directory to markdown.

		Reads data/account.js and the tweet .js files, copies referenced media
		into 'media/', writes one '<username>/status/<id>.md' page per tweet and
		a combined '_index.md' holding every tweet, newest first.

		Raises:
			SystemExit: With an error message (and therefore a non-zero exit
				status) when account.js, the tweet files or the media folder
				cannot be found, or when more than one media folder matches.
		"""
		input_folder = '.'
		output_filename = '_index.md'
		output_media_folder_name = 'media/'

		# Identify the file and folder names - they seem to vary slightly with the size of the archive
		data_folder = os.path.join(input_folder, 'data')
		account_js_filename = os.path.join(data_folder, 'account.js')
		if not os.path.isfile(account_js_filename):
			raise SystemExit(f'Error: Failed to load {account_js_filename}. Start this script in the root folder of your Twitter archive.')
		tweet_js_filename_templates = ['tweet.js', 'tweets.js', 'tweets-part*.js']
		input_filenames = []
		for template in tweet_js_filename_templates:
			# glob returns matches in arbitrary order; sort for reproducible runs
			input_filenames += sorted(glob.glob(os.path.join(data_folder, template)))
		if not input_filenames:
			raise SystemExit(f'Error: no files matching {tweet_js_filename_templates} in {data_folder}')
		tweet_media_folder_name_templates = ['tweet_media', 'tweets_media']
		tweet_media_folder_names = []
		for template in tweet_media_folder_name_templates:
			tweet_media_folder_names += glob.glob(os.path.join(data_folder, template))
		if not tweet_media_folder_names:
			raise SystemExit(f'Error: no folders matching {tweet_media_folder_name_templates} in {data_folder}')
		if len(tweet_media_folder_names) > 1:
			raise SystemExit(f'Error: multiple folders matching {tweet_media_folder_name_templates} in {data_folder}')
		archive_media_folder = tweet_media_folder_names[0]
		os.makedirs(output_media_folder_name, exist_ok=True)

		# Parse the tweets
		username = extract_username(account_js_filename)
		tweets_markdown = []
		for tweets_js_filename in input_filenames:
			print(f'Parsing {tweets_js_filename}...')
			tweets = read_json_from_js_file(tweets_js_filename)
			tweets_markdown.extend(
				tweet_json_to_markdown(tweet, username, archive_media_folder, output_media_folder_name)
				for tweet in tweets
			)
		print(f'Parsed {len(tweets_markdown)} tweets and replies by {username}.')

		# Sort tweets with newest first (descending timestamp)
		tweets_markdown.sort(key=lambda tup: tup[0], reverse=True)

		# Write one page per tweet, each starting with a '---'-delimited title header
		status_folder = f"{username}/status"
		os.makedirs(status_folder, exist_ok=True)
		for _, md, tweet_id in tweets_markdown:
			with open(f"{status_folder}/{tweet_id}.md", 'w', encoding='utf-8') as f:
				f.write(f'---\ntitle: "@{username}"\n---\n{md}')

		# Save as one large markdown file (timestamps and ids are dropped here)
		all_tweets = '\n\n----\n'.join(md for _, md, _ in tweets_markdown)
		with open(output_filename, 'w', encoding='utf-8') as f:
			f.write(all_tweets)
		print(f'Wrote to {output_filename}, which embeds images from {output_media_folder_name}')

	# Run the conversion only when this file is executed as a script, not when it is imported.
	if __name__ == "__main__":
		main()