Last active
November 11, 2024 16:27
-
-
Save kanzure/4e7bcc58344ceaa1a668e65a434adb2b to your computer and use it in GitHub Desktop.
Use the bitcoin-dev pipermail archive and create individual files for each email, using the pipermail order for counting. This should create an identical mapping between URLs from lists.linuxfoundation.org and the actual emails. I use this mapping on gnusha.org/url for the redirect service. Try using https://gnusha.org/url if you need redirect s…
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash | |
# | |
# Used for processing the mbox files found in: | |
# https://diyhpl.us/~bryan/irc/bitcoin/bitcoin-dev/bitcoin-dev-ml-archive.2024-02-09-004.tar.gz | |
# | |
# Why? | |
# | |
# Linux Foundation has deprecated lists.linuxfoundation.org, and now we need a url rewriting map | |
# for the pipermail archive to numbered email files. | |
# | |
# That remapping app can be found at: https://gnusha.org/url
# | |
# See https://x.com/kanzure/status/1853588622017990667 for more information. | |
# | |
# bitcoin-dev-ml-archive.2024-02-09-004.tar.gz contains bad pipermail .txt.gz
# archives for the bitcoin-dev mailing list: in particular, 2019-February.txt.gz
# and 2019-March.txt.gz are both bad. However, the bitcoin-dev mbox files are
# correct and include the missing content.
# | |
# I believe what happened is that during the migration from Linux Foundation mail | |
# server to OSUOSL mail server in 2019 there was some outage, and while the mbox | |
# file was fixed and the generated mailman pipermail archives were corrected, it | |
# looks like the .txt.gz archives were never updated after that incident. | |
# | |
# Therefore, use the mbox files instead of the pipermail .txt.gz archives in | |
# this script. | |
# Output locations, relative to the current working directory.
processed_dir="processed/"
email_output_dir="$processed_dir/email/"
mapping_output_file="$processed_dir/mapping.bitcoin-dev.txt"

# Scratch file holding the running email number; it is how the count survives
# the separate sh process that formail starts for every email.
tmp_counter="/tmp/email-counter-$(uuidgen).txt"

# Email numbering starts at zero.
email_counter=0

# The mapping file is appended to, so park a mapping left over from an
# earlier run in /tmp instead of mixing old and new entries.
if [[ -f "$mapping_output_file" ]]; then
    backup_file="/tmp/$(basename "$mapping_output_file").$(uuidgen).txt"
    mv "$mapping_output_file" "$backup_file"
    echo "Moved $mapping_output_file to $backup_file"
fi

# Make sure the output tree exists.
mkdir -p "$processed_dir" "$email_output_dir"
# Split one mbox file into numbered .eml files and record their Message-IDs.
#
# Arguments:
#   $1 - basename of an mbox file inside ./mbox/
# Globals read:    email_output_dir, mapping_output_file, tmp_counter
# Globals updated: email_counter (advanced by the number of emails written)
# Output: one "<message-id> | <NNNNNN>.eml" line per email, appended to
#         $mapping_output_file and echoed to stdout as a keepalive.
process_mbox() {
    local filename="mbox/$1" # mbox filename to process
    local process_email_cmd

    # check file existence
    if [[ ! -f "$filename" ]]; then
        echo "File $filename does not exist. Skipping..."
        return
    fi

    # write current counter to temp file before processing
    echo "$email_counter" > "$tmp_counter"

    # Script run by formail once per email, with the email on stdin.
    # It is single-quoted on purpose: nothing is expanded here, and the child
    # sh reads every path from its environment, so paths containing spaces or
    # quote characters cannot break the script.
    process_email_cmd='
        # read current counter from temp file
        email_counter=$(cat "$TMP_COUNTER")

        # new filename is the counter as a 6-digit zero-padded number
        new_filename=$(printf "%06d" "$email_counter")
        new_fullpath="$EMAIL_OUTPUT_DIR/$new_filename.eml"

        # save email content to file
        cat > "$new_fullpath"

        # Extract the Message-ID from the header only. -c joins folded
        # (continued) header lines and -z trims surrounding whitespace, so
        # "Message-ID:" followed by the id on the next line is handled too.
        # The sed call then drops the angle brackets.
        message_id=$(formail -c -z -x Message-ID: < "$new_fullpath" | head -n 1 | sed "s/^[^<]*<//; s/>.*//")

        # store a mapping between the counter and message_id
        # (printf, because echo in sh may interpret backslashes)
        printf "%s | %s.eml\n" "$message_id" "$new_filename" >> "$MAPPING_OUTPUT_FILE"

        # keepalive for user
        printf "%s | %s.eml\n" "$message_id" "$new_filename"

        # persist the incremented counter for the next email
        echo "$((email_counter + 1))" > "$TMP_COUNTER"
    '

    # Use formail to process the emails inside the mbox file
    TMP_COUNTER="$tmp_counter" \
    EMAIL_OUTPUT_DIR="$email_output_dir" \
    MAPPING_OUTPUT_FILE="$mapping_output_file" \
        formail -s sh -c "$process_email_cmd" < "$filename"

    # Update the global counter from the temp file
    email_counter=$(cat "$tmp_counter")
}
# process all mbox files (in shell glob order, which fixes the numbering)
for filename in ./mbox/*.mbox; do
    process_mbox "$(basename "$filename")"
done

# the counter scratch file is only needed while emails are being split
rm -f "$tmp_counter"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash | |
# | |
# Used for processing the lightning-dev pipermail archive found in: | |
# https://diyhpl.us/~bryan/irc/bitcoin/bitcoin-dev/bitcoin-dev-ml-archive.2024-02-09-004.tar.gz | |
# in particular: bitcoin-dev-ml-archive/pipermail-archives/lightning-dev/ | |
# | |
# Why? | |
# | |
# Linux Foundation has deprecated lists.linuxfoundation.org, and now we need a url rewriting map | |
# for the pipermail archive to numbered email files. | |
# | |
# See https://x.com/kanzure/status/1853588622017990667 for more information. | |
# | |
# output: https://diyhpl.us/~bryan/irc/bitcoin/bitcoin-dev/lightning-dev-pipermail.2024-11-07.zip | |
# see: https://diyhpl.us/~bryan/irc/bitcoin/bitcoin-dev/linuxfoundation-pipermail/lightning-dev/ | |
# Output locations, relative to the current working directory.
processed_dir="processed/"
mapping_output_file="$processed_dir/mapping.lightning-dev.txt"

# Scratch file holding the running email number; it is how the count survives
# the separate sh process that formail starts for every email.
tmp_counter="/tmp/email-counter-$(uuidgen).txt"

# Email numbering starts at zero.
email_counter=0

# The mapping file is appended to, so park a mapping left over from an
# earlier run in /tmp instead of mixing old and new entries.
if [[ -f "$mapping_output_file" ]]; then
    backup_file="/tmp/$(basename "$mapping_output_file").$(uuidgen).txt"
    mv "$mapping_output_file" "$backup_file"
    echo "Moved $mapping_output_file to $backup_file"
fi

# Make sure the output directory exists.
mkdir -p "$processed_dir"
# Print the pipermail archive filenames "{year}-{MonthName}.txt.gz" for every
# month from August 2011 through April 2024 (inclusive), separated by spaces,
# in chronological order.
#
# Month names come from a fixed English table rather than from date(1), so the
# result does not depend on the current locale or on GNU date being installed,
# and no process is forked per month.
generate_filename_list() {
    # lightning-dev isn't this old, of course
    local start_year=2011
    local start_month=8
    local end_year=2024
    local end_month=4 # ended earlier than this

    local month_names=(January February March April May June
                       July August September October November December)

    local current_year=$start_year
    local current_month=$start_month
    local filenames=()

    while (( current_year < end_year ||
             (current_year == end_year && current_month <= end_month) )); do
        # month_names is zero-indexed, months are one-indexed
        filenames+=("${current_year}-${month_names[current_month - 1]}.txt.gz")

        # Move to the next month, rolling over into the next year after December
        current_month=$((current_month + 1))
        if (( current_month > 12 )); then
            current_month=1
            current_year=$((current_year + 1))
        fi
    done

    # return a list of filenames
    echo "${filenames[@]}"
}
# Split one pipermail .txt.gz archive into numbered .eml files and record a
# mapping line for each email.
#
# Arguments:
#   $1 - path of the pipermail archive, e.g. "2019-February.txt.gz"
# Globals read:    processed_dir, mapping_output_file, tmp_counter
# Globals updated: email_counter (advanced by the number of emails written)
# Output: emails are written to "$processed_dir/{year}-{month}/NNNNNN.eml";
#         one "<archive> | <NNNNNN> | <message-id> | <path>" line per email is
#         appended to $mapping_output_file and echoed to stdout.
process_pipermail_file() {
    local filename="$1"
    local base_filename year_month_name output_dir process_email_cmd

    # check if the file exists
    if [[ ! -f "$filename" ]]; then
        echo "File $filename does not exist. Skipping..."
        return
    fi

    # "{year}-{month}" directory name: the archive basename without .txt.gz
    base_filename=$(basename "$filename")
    year_month_name=${base_filename%.txt.gz}
    output_dir="$processed_dir/$year_month_name"
    mkdir -p "$output_dir"

    # write current counter to temp file before processing
    echo "$email_counter" > "$tmp_counter"

    # Script run by formail once per email, with the email on stdin.
    # It is single-quoted on purpose: nothing is expanded here, and the child
    # sh reads every path from its environment, so file names containing
    # spaces or quote characters cannot break the script.
    process_email_cmd='
        # read current counter from temp file
        email_counter=$(cat "$TMP_COUNTER")

        # the email ID is the counter as a 6-digit zero-padded number
        email_id=$(printf "%06d" "$email_counter")
        new_fullpath="$OUTPUT_DIR/$email_id.eml"

        # save email content to file
        cat > "$new_fullpath"

        # Extract the Message-ID from the header only. -c joins folded
        # (continued) header lines and -z trims surrounding whitespace.
        # The sed call then drops the angle brackets.
        message_id=$(formail -c -z -x Message-ID: < "$new_fullpath" | head -n 1 | sed "s/^[^<]*<//; s/>.*//")

        mapping_line="$ARCHIVE_FILENAME | $email_id | $message_id | $new_fullpath"

        # store mapping (printf, because echo in sh may interpret backslashes)
        printf "%s\n" "$mapping_line" >> "$MAPPING_OUTPUT_FILE"

        # output the formatted result for logging
        printf "%s\n" "$mapping_line"

        # persist the incremented counter for the next email
        echo "$((email_counter + 1))" > "$TMP_COUNTER"
    '

    # use formail to process the emails inside the .gz file
    zcat "$filename" | \
        TMP_COUNTER="$tmp_counter" \
        OUTPUT_DIR="$output_dir" \
        ARCHIVE_FILENAME="$filename" \
        MAPPING_OUTPUT_FILE="$mapping_output_file" \
        formail -s sh -c "$process_email_cmd"

    # update the global counter from the temp file
    email_counter=$(cat "$tmp_counter")
}
# could just glob *.txt.gz I guess...
filenames_list=$(generate_filename_list)

# process each of the pipermail *.txt.gz files, oldest first; the list is
# deliberately left unquoted so it splits into one filename per word
for filename in $filenames_list; do
    process_pipermail_file "$filename"
done

# the counter scratch file is only needed while emails are being split
rm -f "$tmp_counter"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import json | |
import csv | |
# Inputs: the text mappings written by the two mailing-list splitting scripts.
bitcoin_dev_mapping_file = "processed/mapping.bitcoin-dev.txt"
lightning_dev_mapping_file = "processed/mapping.lightning-dev.txt"

# Output: the combined JSON mapping.
output_file = "processed/mapping.json"
def process_bitcoin_dev_mapping(file):
    """Build the bitcoin-dev lookup table from the text mapping.

    Each useful input line has the form ``<message-id> | <NNNNNN>.eml``.
    Lines without a `` | `` separator are skipped. Because lines are stripped
    first, that includes entries whose message-id is empty.

    Args:
        file: an iterable of text lines (for example an open text file).

    Returns:
        dict mapping the six-digit email number (the filename without
        ``.eml``) to the message-id.
    """
    data = {}
    for line in file:
        # strip any extra whitespace or newline characters
        line = line.strip()

        # Split on the LAST " | " so that a message-id which itself contains
        # the separator cannot break the parse.
        message_id, separator, path = line.rpartition(" | ")
        if not separator:
            continue

        # remove the .eml from the end of the relative path
        sixdigit_id = path[:-len(".eml")] if path.endswith(".eml") else path
        data[sixdigit_id] = message_id

    # note that August-2021/019317 is malformed in the bitcoin-dev archive
    # or rather: the used parser is somehow wrong.
    # its actual message-id is: CAOU__fx0ajVfuEyoYCOkf8nZPOkbuYDctTfuzhCDdoU=jcA0Tg@mail.gmail.com
    data["019317"] = "CAOU__fx0ajVfuEyoYCOkf8nZPOkbuYDctTfuzhCDdoU=jcA0Tg@mail.gmail.com"

    # add the migration email, which was not included in the archive
    # because it was sent after the migration
    data["022327"] = "CABaSBaxDjj6ySBx4v+rmpfrw4pE9b=JZJPzPQj_ZUiBg1HGFyA@mail.gmail.com"

    return data
def process_lightning_dev_mapping(file):
    """Build the lightning-dev lookup table from the text mapping.

    Each input row has four ``|``-separated fields::

        <pipermail archive filename> | <NNNNNN> | <message-id> | <email path>

    NOTE: The format of the returned dictionary is different than the
    bitcoin-dev dictionary: every value is itself a dict of details.

    Args:
        file: an iterable of text lines (for example an open text file).

    Returns:
        dict mapping the six-digit id string to a dict with the keys
        ``pipermail_archive_filename``, ``id``, ``sixdigit_id``,
        ``message_id`` and ``email_file_location``.

    Raises:
        ValueError: if a non-blank row does not have exactly four fields or
            its id field is not a decimal number.
    """
    data = {}
    csv_reader = csv.reader(file, delimiter='|')

    for row in csv_reader:
        # trim whitespace from each field
        row = [field.strip() for field in row]

        # Skip empty and whitespace-only rows
        if not any(row):
            continue

        # unpack fields
        (pipermail_archive_filename, sixdigit_id, message_id, email_file_location) = row

        # email_file_location looks like "processed/{year}-{month}/id.eml"
        # remove the "processed/" prefix
        email_file_location = email_file_location.replace("processed/", "")

        data[sixdigit_id] = {
            'pipermail_archive_filename': "lightning-dev/" + pipermail_archive_filename,
            # int() accepts leading zeros, so "000000" is 0 and "000123" is
            # 123 wherever the row appears in the file.
            'id': int(sixdigit_id),
            'sixdigit_id': sixdigit_id,
            'message_id': message_id,
            'email_file_location': email_file_location,
        }

    return data
# Parse both text mappings, then combine them keyed by mailing list name.
with open(bitcoin_dev_mapping_file, "r") as bitcoin_dev_file:
    bitcoin_dev_data = process_bitcoin_dev_mapping(bitcoin_dev_file)

with open(lightning_dev_mapping_file, "r") as lightning_dev_file:
    lightning_dev_data = process_lightning_dev_mapping(lightning_dev_file)

data = {
    "bitcoin-dev": bitcoin_dev_data,
    "lightning-dev": lightning_dev_data,
}

# Write the combined dictionary to a JSON file
with open(output_file, "w") as json_file:
    json.dump(data, json_file, indent=4)

print(f"JSON file '{output_file}' created successfully.")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Verify broken links and replacement with:
# https://gist.github.com/kouloumos/e2a9c50221bf76e2e2bd4074617357f6
# from https://github.com/bitcointranscripts/bitcointranscripts/pull/566#issuecomment-2462917970

# Find and replace with the gnusha.org/url prefix.
#
# - files under .git are skipped so the repository's own data is never rewritten
# - the first expression removes an existing gnusha.org/url prefix, so running
#   this twice does not stack a second prefix on links that were already fixed
# - http:// and https:// links are both rewritten to the https:// form
find ./ -type f -not -path '*/.git/*' -exec sed -i -E \
    -e 's|https://gnusha\.org/url/https://lists\.linuxfoundation\.org/|https://lists.linuxfoundation.org/|g' \
    -e 's|https?://lists\.linuxfoundation\.org/|https://gnusha.org/url/https://lists.linuxfoundation.org/|g' \
    {} +

# Alternative: resolve each link to its final target, without a gnusha.org/url
# permalink.
#
# For this, resolver.py's __main__ must process the file passed as its argument
# and the resolver in use must be resolve_remotely (this will be slow).
find ./workdir/ -type f -name "*.md" -exec python3 resolver.py {} \;
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import sys | |
import json | |
import datetime | |
import urllib.parse | |
import re | |
import os | |
# for remote resolving | |
# pip3 install requests | |
import requests | |
class ResolverStatusException(Exception):
    """Raised when the remote resolver answers with an HTTP error status."""
# if using remote resolver
RESOLVER_HOST = "http://localhost:5000"
#RESOLVER_HOST = "https://gnusha.org/url"

BITCOIN_DEV_PUBLIC_INBOX_URL = "https://gnusha.org/pi/bitcoindev/"

PIPERMAIL_ARCHIVE_URL = "https://diyhpl.us/~bryan/irc/bitcoin/bitcoin-dev/linuxfoundation-pipermail/"
BITCOIN_DEV_PIPERMAIL_ARCHIVE_URL = f"{PIPERMAIL_ARCHIVE_URL}bitcoin-dev/"
LIGHTNING_DEV_PIPERMAIL_ARCHIVE_URL = f"{PIPERMAIL_ARCHIVE_URL}lightning-dev/"

# mapping file only has these two mailing lists
SUPPORTED_MAILING_LIST_NAMES = [
    "bitcoin-dev",
    "lightning-dev",
]

# various URL prefixes that this redirector can handle
SUPPORTED_URL_PREFIXES = [
    "https://lists.linuxfoundation.org/pipermail/",
    "https:/lists.linuxfoundation.org/pipermail/",
    "http://lists.linuxfoundation.org/pipermail/",
    "lists.linuxfoundation.org/pipermail/",
]

# In-memory copy of mapping.json, filled in by load_and_set_global_mapping().
MAPPING = None

# mapping.json sits next to this script. The path is derived from the script's
# directory, so it stays correct whatever this file is called.
MAPPING_LOCATION = os.path.join(os.path.dirname(os.path.abspath(__file__)), "mapping.json")
def load_mapping(mapping_location=MAPPING_LOCATION):
    """Load mapping.json and apply two known manual corrections.

    Args:
        mapping_location: path of the JSON mapping file.

    Returns:
        The parsed mapping (a dict keyed by mailing list name).

    Exits the process with status 1, after printing the reason to stderr, if
    the file cannot be read, is not valid JSON, or has no usable
    "bitcoin-dev" section.
    """
    try:
        with open(mapping_location, 'r', encoding="utf-8") as f:
            mapping = json.load(f)

        # note that August-2021/019317 is malformed in the bitcoin-dev archive
        # or rather: the used parser is somehow wrong.
        # its actual message-id is: CAOU__fx0ajVfuEyoYCOkf8nZPOkbuYDctTfuzhCDdoU=jcA0Tg@mail.gmail.com
        mapping["bitcoin-dev"]["019317"] = "CAOU__fx0ajVfuEyoYCOkf8nZPOkbuYDctTfuzhCDdoU=jcA0Tg@mail.gmail.com"

        # this one is missing in the mapping.json file
        # the email was sent after the migration!
        # https://github.com/bitcoinops/bitcoinops.github.io/pull/2018#discussion_r1833154513
        mapping["bitcoin-dev"]["022327"] = "CABaSBaxDjj6ySBx4v+rmpfrw4pE9b=JZJPzPQj_ZUiBg1HGFyA@mail.gmail.com"
        # This is fixed in future runs of pipermail-mapper.py
    except (OSError, ValueError, KeyError, TypeError) as e:
        # OSError: unreadable file; ValueError: invalid JSON or encoding;
        # KeyError/TypeError: JSON without a "bitcoin-dev" dictionary.
        # Only these are caught, so Ctrl-C and programming errors still surface.
        print(f"ERROR: Could not load mapping.json from {mapping_location}: {e!r}", file=sys.stderr)
        sys.exit(1)

    return mapping
def load_and_set_global_mapping(mapping_location=MAPPING_LOCATION):
    """Return the module-wide MAPPING, loading it from disk on first use.

    The file at ``mapping_location`` is read only while MAPPING is still
    None or an empty dict; later calls return the cached value.
    """
    global MAPPING

    if MAPPING not in [None, {}]:
        return MAPPING

    # slow! slow! slow!
    MAPPING = load_mapping(mapping_location=mapping_location)
    return MAPPING


# Load the mapping once, at import time.
load_and_set_global_mapping()
def parse_filepart(filepart):
    """Split a '{year}-{month}/{id}.html' filepart into its three pieces.

    Returns:
        A ``(year, month, filename)`` tuple of strings.

    Raises:
        ValueError: if ``filepart`` does not contain exactly one "/" or the
            part before it does not contain exactly one "-".
    """
    directory, some_filename = filepart.split("/")
    year, month = directory.split("-")
    return (year, month, some_filename)
# super slow | |
def resolve_remotely(url, timeout=10):
    """Resolve ``url`` by asking the resolver service at RESOLVER_HOST.

    Sends a HEAD request to ``{RESOLVER_HOST}/{url}``, follows redirects and
    returns the final URL.

    Args:
        url: the original lists.linuxfoundation.org URL.
        timeout: seconds to wait for the resolver before giving up. Without a
            timeout, requests waits indefinitely on an unresponsive server.

    Raises:
        ResolverStatusException: if the final response status is 400 or above.
        requests.exceptions.RequestException: on connection errors or timeout.
    """
    response = requests.head(
        f"{RESOLVER_HOST}/{url}",
        allow_redirects=True,
        timeout=timeout,
    )
    if response.status_code >= 400:
        raise ResolverStatusException(f"Error: Resolver unable to resolve {url}")
    return response.url
# useful for testing file modification in general | |
# also a find/sed replacement? | |
def resolve_simply(url, mapping=None):
    """Return ``url`` prefixed with the gnusha.org/url redirect service.

    ``mapping`` is accepted for signature parity with resolve_locally and is
    not used.
    """
    return f"https://gnusha.org/url/{url}"
def resolve_locally(url, mapping=None):
    """Translate a lists.linuxfoundation.org URL using the local mapping.

    Handled, in order:
      * the bitcoin-dev archive root and list info page (fixed targets)
      * attachment URLs (sent to a Wayback Machine search, with an error print)
      * bitcoin-dev monthly index pages (public-inbox date-range search)
      * bitcoin-dev message pages (public-inbox message URL)
      * lightning-dev message pages (archived .eml file URL)

    Args:
        url: the URL to translate.
        mapping: parsed mapping.json content; when None or empty, the
            module-wide mapping is loaded and used.

    Returns:
        The replacement URL, or None when the URL is not supported, is
        malformed, or refers to a message that is not in the mapping.

    Raises:
        Exception: if no mapping was given and the loaded one is empty.
    """
    # TODO: if you want to actually use wayback machine here, then download
    # their url list and do the redirector in python instead of making users
    # click the date selection page.

    # archive root url
    if re.match(r'^https?://lists\.linuxfoundation\.org/pipermail/bitcoin-dev/?$', url):
        return "https://gnusha.org/pi/bitcoindev/"

    # mailing list info page
    if re.match(r'^https?://lists\.linuxfoundation\.org/mailman/listinfo/bitcoin-dev/?$', url):
        return "https://groups.google.com/group/bitcoindev"

    # like http://lists.linuxfoundation.org/pipermail/bitcoin-dev/attachments/20190225/a27d8837/attachment-0001.pdf
    attachment_paths = ["bitcoin-dev/attachments", "lightning-dev/attachments"]
    if any(x in url for x in attachment_paths):
        print("ERROR: Can't handle attachments yet: ", url)
        return "https://web.archive.org/web/*/" + url

    # check that the url matches one of the known prefixes
    piperarchive = None
    for prefix in SUPPORTED_URL_PREFIXES:
        if url.startswith(prefix):
            piperarchive = url[len(prefix):]
            break
    else:
        print("ERROR: Given url did not match any known prefix: ", url)
        return None

    # nothing left after the prefix
    if not piperarchive:
        print("ERROR: piperarchive is None")
        return None

    for mailing_list_name in SUPPORTED_MAILING_LIST_NAMES:
        if piperarchive.startswith(mailing_list_name + "/"):
            break
    else:
        print("ERROR: Could not find matching mailing list name in piperarchive: ", url)
        return None

    if mapping in [None, {}]:
        # this can be very slow!
        mapping = load_and_set_global_mapping()

    if mapping in [None, {}]:
        raise Exception("ERROR: Mapping is empty")

    index_pages = ("thread.html", "date.html", "subject.html", "author.html")

    # filepart is in format {year}-{month}/{id}.html; a trailing "#fragment"
    # or "?query" is not part of the page name, so drop it first.
    filepart = piperarchive[len(mailing_list_name) + 1:]
    filepart = filepart.split("#", 1)[0].split("?", 1)[0]
    try:
        (year, month, some_filename) = parse_filepart(filepart)
    except ValueError:
        # not of the form {year}-{month}/{file}: nothing to redirect to
        return None

    if mailing_list_name == "bitcoin-dev":
        if some_filename in index_pages:
            try:
                month_num = datetime.datetime.strptime(month, "%B").month
                start_date = datetime.datetime(int(year), month_num, 1)
                # jump past the end of the month, then snap back to day 1:
                # the first day of the next month
                end_date = (start_date + datetime.timedelta(days=32)).replace(day=1)
            except (ValueError, OverflowError):
                # unknown month name or impossible year
                return None

            # represent in git approxdate format
            date_range = f"d:{start_date.year}-{start_date.month:02d}-01..{end_date.year}-{end_date.month:02d}-01"
            url_encoded_date_range = urllib.parse.quote(date_range)
            return f"{BITCOIN_DEV_PUBLIC_INBOX_URL}?q={url_encoded_date_range}"

        if not some_filename.endswith(".html"):
            return None

        # filename without the .html extension
        sixdigit_id = some_filename[:-len(".html")]
        message_id = mapping.get("bitcoin-dev", {}).get(sixdigit_id)
        if not message_id:
            # unknown message number
            return None
        return f"{BITCOIN_DEV_PUBLIC_INBOX_URL}{message_id}/"

    if mailing_list_name == "lightning-dev":
        # no replacement exists for the lightning-dev index pages
        if some_filename in index_pages or not some_filename.endswith(".html"):
            return None

        just_id = some_filename[:-len(".html")]
        message_data = mapping.get("lightning-dev", {}).get(just_id)
        if not message_data:
            # unknown message number
            return None

        # the base URL already ends in "/", so drop any leading slash on the
        # stored location to avoid a "//" in the result
        email_file_location = message_data["email_file_location"].lstrip("/")
        return f"{LIGHTNING_DEV_PIPERMAIL_ARCHIVE_URL}{email_file_location}"

    return None
def resolve_redirect_url(url, resolver=None):
    """Return the replacement for ``url``, or ``url`` itself if there is none.

    Only URLs starting with http(s)://lists.linuxfoundation.org/ are passed to
    the resolver; anything else is returned unchanged.

    Args:
        url: the URL found in a document.
        resolver: callable taking the URL and returning the new URL (or a
            falsy value when it has no answer). Defaults to resolve_remotely;
            pass resolve_locally or resolve_simply to use those instead.

    Returns:
        The resolved URL, or the original ``url`` when the resolver raised an
        exception or returned nothing.
    """
    print(f"Resolving {url}")

    # check if the URL matches the specified pattern
    if not re.match(r'^https?://lists\.linuxfoundation\.org/', url):
        return url

    if resolver is None:
        resolver = resolve_remotely

    try:
        resolved_url = resolver(url)
    except Exception as e:
        # Best effort: a failed lookup (no redirect known, resolver down, ...)
        # must not abort the whole run, so keep the original link and report
        # why. Ctrl-C is not an Exception and still interrupts.
        print(f"Could not resolve {url}: {e}")
        return url

    # resolvers can return None when they have no answer
    if not resolved_url:
        return url

    print(f"Resolved {url} to {resolved_url}")
    return resolved_url
def process_file(file_path):
    """
    Look for URLs in the file and replace them with the resolved URLs.

    Only http(s)://lists.linuxfoundation.org/ URLs are considered. The file is
    rewritten in place, and only when at least one URL actually changed.
    Existing line endings are preserved.
    """
    print(f"Processing {file_path}")

    # newline='' turns off newline translation so that CRLF files stay CRLF
    with open(file_path, 'r', encoding="utf-8", newline='') as file:
        content = file.read()

    # A URL ends at whitespace or at a character that closes the surrounding
    # markup: ")" and "]" in markdown links, ">" in <autolinks> and HTML,
    # quotes in HTML attributes.
    url_pattern = r'https?://lists\.linuxfoundation\.org/[^\s<>()\[\]"\']+'

    def replace_url(match):
        candidate = match.group(0)
        # Sentence punctuation directly after a URL is not part of it; set it
        # aside and put it back after the (possibly replaced) URL.
        stripped = candidate.rstrip(".,;:!?")
        trailing = candidate[len(stripped):]
        return resolve_redirect_url(stripped) + trailing

    new_content = re.sub(url_pattern, replace_url, content)

    # leave files without any replaced URL untouched
    if new_content != content:
        with open(file_path, 'w', encoding="utf-8", newline='') as file:
            file.write(new_content)
def find(workdir="./workdir/"):
    """
    Find all .md files under ``workdir`` and process them with process_file().

    Args:
        workdir: directory to walk recursively; defaults to "./workdir/".
    """
    for root, _dirs, files in os.walk(workdir):
        for filename in files:
            if filename.endswith(".md"):
                process_file(os.path.join(root, filename))
if __name__ == "__main__":
    if len(sys.argv) > 1:
        # slow mode: process only the files named on the command line, e.g.
        #   find ./workdir/ -type f -name "*.md" -exec python3 resolver.py {} \;
        # slow because it loads mapping.json on every invocation
        for file_path in sys.argv[1:]:
            process_file(file_path)
    else:
        # fast mode: walk ./workdir/ once within a single process
        find()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment