kanzure/pipermail-extractor.sh

## pipermail-extractor.sh
#!/bin/bash
#
# Used for processing the mbox files found in:
# https://diyhpl.us/~bryan/irc/bitcoin/bitcoin-dev/bitcoin-dev-ml-archive.2024-02-09-004.tar.gz
#
# Why?
#
# Linux Foundation has deprecated lists.linuxfoundation.org, and now we need a url rewriting map
# for the pipermail archive to numbered email files.
#
# That remapping app can be found at: https://gnusha.org/url
#
# See https://x.com/kanzure/status/1853588622017990667 for more information.
#
# bitcoin-dev-ml-archive.2024-02-09-004.tar.gz has bad pipermail .txt.gz
# archives for the bitcoin-dev mailing list; in particular 2019-February.txt.gz
# and 2019-March.txt.gz are both bad. However, the bitcoin-dev mbox files are
# correct and include the missing content.
#
# I believe what happened is that during the migration from Linux Foundation mail
# server to OSUOSL mail server in 2019 there was some outage, and while the mbox
# file was fixed and the generated mailman pipermail archives were corrected, it
# looks like the .txt.gz archives were never updated after that incident.
#
# Therefore, use the mbox files instead of the pipermail .txt.gz archives in
# this script.

processed_dir="processed/"
email_output_dir="$processed_dir/email/"
mapping_output_file="$processed_dir/mapping.bitcoin-dev.txt"

# Scratch file that carries the running email number between this script and
# the per-email child shells. mktemp creates it atomically under an
# unpredictable name, which also removes the dependency on uuidgen.
tmp_counter="$(mktemp /tmp/email-counter-XXXXXXXX)" || {
    echo "ERROR: could not create the temporary counter file" >&2
    exit 1
}

# the counter file is only needed while the script runs
trap 'rm -f -- "$tmp_counter"' EXIT

# if mapping output file exists, move it to a /tmp backup
if [[ -f "$mapping_output_file" ]]; then
    backup_file="$(mktemp "/tmp/$(basename "$mapping_output_file").XXXXXXXX")" || {
        echo "ERROR: could not create a backup file for $mapping_output_file" >&2
        exit 1
    }
    # mktemp already created an empty placeholder; replace it with the old mapping
    mv -f -- "$mapping_output_file" "$backup_file"
    echo "Moved $mapping_output_file to $backup_file"
fi

# setup
mkdir -p -- "$processed_dir" "$email_output_dir"

# number assigned to the next email that gets extracted
email_counter=0

# Split one mbox file into numbered .eml files and append one
# "message-id | NNNNNN.eml" line per email to the mapping file.
#
# formail -s starts the sh snippet below once per message, with that message
# on stdin. Every run is a separate process, so the running email number is
# handed from one run to the next (and back to this function) through the
# counter file.
#
# Globals:
#   email_counter        (read, written) number for the next email
#   tmp_counter          (read) path of the counter file
#   email_output_dir     (read) directory that receives the .eml files
#   mapping_output_file  (read) file the mapping lines are appended to
# Arguments:
#   $1 - mbox file name, relative to mbox/
# Outputs:
#   one progress line per email on stdout, or a notice if the file is missing
process_mbox() {
    local filename="mbox/$1"  # mbox filename to process

    # check file existence
    if [[ ! -f "$filename" ]]; then
        echo "File $filename does not exist. Skipping..."
        return
    fi

    # write current counter to temp file before processing
    echo "$email_counter" > "$tmp_counter"

    # The per-email command. It is single-quoted so that nothing is expanded
    # here; the paths arrive as positional arguments instead of being pasted
    # into the script text, which keeps paths with spaces or shell
    # metacharacters intact.
    local process_email_cmd='
        counter_file=$1
        output_dir=$2
        mapping_file=$3

        # read current counter from temp file
        email_counter=$(cat "$counter_file")

        # new filename is the counter as a 6-digit zero-padded number
        new_filename=$(printf "%06d" "$email_counter")
        new_fullpath="$output_dir/$new_filename.eml"

        # save email content to file
        cat > "$new_fullpath"

        # get the message id from the email
        message_id=$(grep -m 1 "^Message-ID:" "$new_fullpath" | sed "s/.*Message-ID: <//; s/>.*//")

        # if no message id, then try again ignoring case (Message-Id, message-id, ...)
        if [ -z "$message_id" ]; then
            message_id=$(grep -i -m 1 "^Message-ID:" "$new_fullpath" | sed "s/.*[Mm][Ee][Ss][Ss][Aa][Gg][Ee]-[Ii][Dd]: *<//; s/>.*//")
        fi

        # store a mapping between the counter and message_id
        printf "%s | %s.eml\n" "$message_id" "$new_filename" >> "$mapping_file"

        # keepalive for user
        printf "%s | %s.eml\n" "$message_id" "$new_filename"

        # hand the next number to the next run
        printf "%s\n" "$((email_counter + 1))" > "$counter_file"
    '

    # Use formail to process the emails inside the mbox file. The word after
    # the script becomes $0 of the child shell, the paths become $1..$3.
    formail -s sh -c "$process_email_cmd" sh \
        "$tmp_counter" "$email_output_dir" "$mapping_output_file" < "$filename"

    # Update the global counter from the temp file
    email_counter=$(cat "$tmp_counter")
}

# Hand every ./mbox/*.mbox file to process_mbox, in the order the glob expands.
# Only the bare file name is passed on: process_mbox prepends mbox/ itself.
for filename in ./mbox/*.mbox; do
    process_mbox "${filename##*/}"
done


## pipermail-lightning-dev-counter.sh
#!/bin/bash
#
# Used for processing the lightning-dev pipermail archive found in:
# https://diyhpl.us/~bryan/irc/bitcoin/bitcoin-dev/bitcoin-dev-ml-archive.2024-02-09-004.tar.gz
# in particular: bitcoin-dev-ml-archive/pipermail-archives/lightning-dev/
#
# Why?
#
# Linux Foundation has deprecated lists.linuxfoundation.org, and now we need a url rewriting map
# for the pipermail archive to numbered email files.
#
# See https://x.com/kanzure/status/1853588622017990667 for more information.
#
# output: https://diyhpl.us/~bryan/irc/bitcoin/bitcoin-dev/lightning-dev-pipermail.2024-12-10.zip
# see: https://diyhpl.us/~bryan/irc/bitcoin/bitcoin-dev/linuxfoundation-pipermail/lightning-dev/

processed_dir="processed"
mapping_output_file="$processed_dir/mapping.lightning-dev.txt"

# Scratch file that carries the running email number between this script and
# the per-email child shells. mktemp creates it atomically under an
# unpredictable name, which also removes the dependency on uuidgen.
tmp_counter="$(mktemp /tmp/email-counter-XXXXXXXX)" || {
    echo "ERROR: could not create the temporary counter file" >&2
    exit 1
}

# the counter file is only needed while the script runs
trap 'rm -f -- "$tmp_counter"' EXIT

# if mapping output file exists, move it to a /tmp backup
if [[ -f "$mapping_output_file" ]]; then
    backup_file="$(mktemp "/tmp/$(basename "$mapping_output_file").XXXXXXXX")" || {
        echo "ERROR: could not create a backup file for $mapping_output_file" >&2
        exit 1
    }
    # mktemp already created an empty placeholder; replace it with the old mapping
    mv -f -- "$mapping_output_file" "$backup_file"
    echo "Moved $mapping_output_file to $backup_file"
fi

# setup
mkdir -p -- "$processed_dir/"

# number assigned to the next email that gets extracted
email_counter=0

# Print the pipermail archive file names "{year}-{MonthName}.txt.gz" for every
# month from August 2011 through April 2024, on one line separated by spaces.
#
# The month names come from a fixed English table rather than from `date`, so
# the result does not depend on the current locale or on GNU date being
# installed, and no process is forked per month.
#
# Arguments: none
# Outputs:   the space-separated list of file names on stdout
generate_filename_list() {
    # lightning-dev isn't this old, of course
    local -r start_year=2011
    local -r start_month=8
    local -r end_year=2024
    local -r end_month=4 # ended earlier than this

    local -r month_names=(
        January February March April May June
        July August September October November December
    )

    local filenames=()
    local year month

    for ((year = start_year; year <= end_year; year++)); do
        for ((month = 1; month <= 12; month++)); do
            # the first year starts part-way through ...
            if ((year == start_year && month < start_month)); then
                continue
            fi
            # ... and the last year stops part-way through
            if ((year == end_year && month > end_month)); then
                break
            fi

            # Format the filename as "{year}-{month_name}.txt.gz"
            filenames+=("${year}-${month_names[month - 1]}.txt.gz")
        done
    done

    # return a list of filenames
    echo "${filenames[@]}"
}

# Split a single pipermail .txt.gz archive into one numbered .txt file per
# email and append one mapping line per saved email.
#
# The archive is decompressed with zcat and piped into `formail -s`, which is
# given the `sh -c` snippet stored in process_email_cmd as the command to run
# for the emails. The snippet reads the running email number from the
# $tmp_counter file and writes the next number back to it; this function
# seeds that file before the run and reads the final value afterwards.
#
# The snippet also hard-codes several numbering adjustments for 2021 (emails
# that are dropped without being saved, and jumps of the counter); the
# reasoning is recorded in the comments inside the snippet.
#
# Globals:
#   email_counter        (read, written) number for the next email
#   tmp_counter          (read) path of the counter file
#   processed_dir        (read) root directory for the extracted emails
#   mapping_output_file  (read) file the mapping lines are appended to
#   process_email_cmd    (written) not declared local, so it stays set
# Arguments:
#   $1 - path of the .txt.gz file to process
# Outputs:
#   one "archive | id | message-id | path" line per saved email on stdout,
#   or a notice when the file does not exist
process_pipermail_file() {
    local filename="$1"

    # check if the file exists
    if [[ -f "$filename" ]]; then
        # write current counter to temp file before processing
        echo "$email_counter" > "$tmp_counter"

        # define the command to process each email
        #
        # The string is double-quoted: $tmp_counter, $filename, $processed_dir
        # and $mapping_output_file are expanded right here, while everything
        # written as \$... is left for the child shell to expand on each run.
        process_email_cmd="
            # Read current counter from temp file
            email_counter=\$(cat $tmp_counter)

            # Format the email ID as a 6-digit zero-padded number
            email_id=\$(printf '%06d' \"\$email_counter\")

            # remove .txt.gz from filename
            filename=\"$filename\"
            base_filename=\$(basename \"\$filename\")
            year_month_name=\${base_filename%.txt.gz}

            # if email_id is equal to 003215 and year_month_name is equal to 2021-August, then exit 0.
            # https://gist.github.com/kanzure/4e7bcc58344ceaa1a668e65a434adb2b?permalink_comment_id=5330243#gistcomment-5330243
            if [ \"\$email_id\" = \"003215\" ] && [ \"\$year_month_name\" = \"2021-August\" ]; then
                exit 0
            fi

            # if email id is to equal to 003216 and year_month_name is equal to 2021-August, then exit 0.
            if [ \"\$email_id\" = \"003216\" ] && [ \"\$year_month_name\" = \"2021-August\" ]; then
                exit 0
            fi

            # skip 3257, 3258, 3259 in September, and pick up at 3257 in October
            if [ \"\$email_id\" = \"003257\" ] && [ \"\$year_month_name\" = \"2021-September\" ]; then
                # skip these
                email_counter=3257
                echo \"\$email_counter\" > $tmp_counter
                exit 0
                # 3257, 3258, 3259 are missing from the LF HTML-generated archive
                # and the next email is 003257.html, so we skip these.
                # 3259.html should be this email:
                # https://gnusha.org/pi/bitcoindev/MkPutJpff5rqUxXFQrEyHZl6Iz0DfrJU_-BQD-y0El65GQFnj7igVfmWU79fPCtiFztUYl4ofzrqeaN0HFMB45YPErY9rYY7_h1XkuTMfvc=@wuille.net/
                # but instead 3259.html is by my count 3262:
                # https://web.archive.org/web/20231114193855/https://lists.linuxfoundation.org/pipermail/lightning-dev/2021-October/003259.html
                # and the .txt.gz 3259.txt email does not appear in the LF HTML-generated archive.
            fi

            # Create directory if it doesn't exist
            mkdir_path=\"$processed_dir/\$year_month_name\"
            mkdir -p \"\$mkdir_path\"

            # The new filename will be the email ID printf 6 digits
            new_filename=\$(printf '%06d' \"\$email_counter\")
            new_fullpath=\"$processed_dir/\$year_month_name/\$new_filename.txt\"

            # Save email content to file
            cat > \"\$new_fullpath\"

            # get the message id from the email
            message_id=\$(grep -m 1 \"Message-ID:\" \"\$new_fullpath\" | sed 's/.*Message-ID: <//; s/>.*//')

            # if no message id, then try to parse for Message-Id (case insensitive)
            if [ -z \"\$message_id\" ]; then
                message_id=\$(grep -i -m 1 \"^[Mm][Ee][Ss][Ss][Aa][Gg][Ee]-[Ii][Dd]:\" \"\$new_fullpath\" | sed 's/.*[Mm][Ee][Ss][Ss][Aa][Gg][Ee]-[Ii][Dd]: *<//; s/>.*//')
            fi

            # store mapping
            echo \"$filename | \$email_id | \$message_id | \$new_fullpath\" >> $mapping_output_file

            # Output the formatted result for logging
            echo \"$filename | \$email_id | \$message_id | \$new_fullpath\"

            email_counter=\$((email_counter + 1))
            echo \"\$email_counter\" > $tmp_counter

            if [ \"\$new_filename\" -eq \"003221\" ]; then
                # skip 3222 and 3223
                # 003222 in .txt.gz matches Wayback Machine 003224
                # 032223 in .txt.gz matches Wayback Machine 003225
                # Wayback Machine 003222 from LF HTML-generated archive is 404
                # Wayback Machine 003223 from LF HTML-generated archive is 404
                # speculation: Someone manually deleted 003222.html and 003223.html, and removed 2 emails from .txt.gz too.
                # This must have been after 003224.html or 003225.html was generated.
                email_counter=3224
                echo \"\$email_counter\" > $tmp_counter
            fi

            if [ \"\$new_filename\" -eq \"003265\" ]; then
                # 3266, 3267, 3268 are missing from the LF HTML-generated archive on Wayback Machine
                # 003266.txt can be found at 003269.html
                email_counter=3269
                echo \"\$email_counter\" > $tmp_counter
            fi
        "

        # use formail to process the emails inside the .gz file
        zcat "$filename" | formail -s sh -c "$process_email_cmd"

        # update the global counter from the temp file
        # (the snippet ran in child processes, so the file is the only way
        # their final counter value gets back into this shell)
        email_counter=$(cat "$tmp_counter")
    else
        echo "File $filename does not exist. Skipping..."
    fi
}

# could just glob *.txt.gz I guess...
#
# generate_filename_list prints the names on one line, separated by spaces.
# Read them into an array so that the loop below can quote every expansion
# instead of relying on unquoted word splitting (which also globs and depends
# on the current IFS).
IFS=' ' read -r -a filenames_list <<< "$(generate_filename_list)"

# process each of the pipermail *.txt.gz files
for filename in "${filenames_list[@]}"; do
    process_pipermail_file "$filename"
done


## pipermail-mapper.py
import json
import csv

bitcoin_dev_mapping_file = "processed/mapping.bitcoin-dev.txt"
lightning_dev_mapping_file = "processed/mapping.lightning-dev.txt"
output_file = "processed/mapping.json"

def process_bitcoin_dev_mapping(file):
    """
    Build the bitcoin-dev lookup table from the extractor's mapping file.

    Args:
        file: an iterable of text lines in the form
            "<message-id> | <six-digit-id>.eml".

    Returns:
        dict mapping the six-digit id (without ".eml") to the message-id.
        Lines that do not contain " | " are ignored. The entry for 022327 is
        always added by hand (see below).
    """
    data = {}

    for line in file:
        # strip any extra whitespace or newline characters
        line = line.strip()

        if " | " not in line:
            continue

        # Split on the LAST " | " only: the file name never contains the
        # separator, so a message-id that happens to contain it stays intact
        # instead of breaking the two-way unpacking.
        message_id, path = line.rsplit(" | ", 1)

        # remove the .eml from the end of the relative path
        if path.endswith(".eml"):
            path = path[:-len(".eml")]

        # Add the mapping to the dictionary
        data[path] = message_id

    # note that August-2021/019317 is malformed in the bitcoin-dev archive
    # or rather: the used parser is somehow wrong.
    # its actual message-id is: CAOU__fx0ajVfuEyoYCOkf8nZPOkbuYDctTfuzhCDdoU=jcA0Tg@mail.gmail.com
    #data["019317"] = "CAOU__fx0ajVfuEyoYCOkf8nZPOkbuYDctTfuzhCDdoU=jcA0Tg@mail.gmail.com"
    # That is no longer true.
    # https://web.archive.org/web/20231114131216/https://lists.linuxfoundation.org/pipermail/bitcoin-dev/2021-August/019317.html
    # seems to match email "9403a01d93b3fe2e871517304b552194@riseup.net"
    # "CAOU__fx0ajVfuEyoYCOkf8nZPOkbuYDctTfuzhCDdoU=jcA0Tg@mail.gmail.com" is 2021-August/019330.html

    # add the migration email, which was not included in the archive
    # because it was sent after the migration
    data["022327"] = "CABaSBaxDjj6ySBx4v+rmpfrw4pE9b=JZJPzPQj_ZUiBg1HGFyA@mail.gmail.com"

    return data

def process_lightning_dev_mapping(file):
    """
    Build the lightning-dev lookup table from the extractor's mapping file.

    Args:
        file: an iterable of text lines in the form
            "<archive file> | <six-digit-id> | <message-id> | <email path>".

    Returns:
        dict keyed by the six-digit id. Each value is a dict with the keys
        'pipermail_archive_filename', 'id', 'sixdigit_id', 'message_id' and
        'email_file_location'.

    Raises:
        ValueError: if a non-empty row does not have exactly four fields, or
            if its id field is not a decimal number.
    """
    data = {}

    for row in csv.reader(file, delimiter='|'):
        # Skip empty rows
        if not row:
            continue

        # trim whitespace from each field
        row = [field.strip() for field in row]

        # unpack fields
        (pipermail_archive_filename, sixdigit_id, message_id, email_file_location) = row

        # email_file_location looks like "processed/{year}-{month}/id.txt"
        # remove the "processed/" prefix
        email_file_location = email_file_location.replace("processed/", "")

        # NOTE: The format of this dictionary is different than the bitcoin-dev dictionary.
        data[sixdigit_id] = {
            'pipermail_archive_filename': "lightning-dev/" + pipermail_archive_filename,
            # int() accepts leading zeros, so "000000" becomes 0 and "003215"
            # becomes 3215 no matter where the row sits in the file
            'id': int(sixdigit_id),
            'sixdigit_id': sixdigit_id,
            'message_id': message_id,
            'email_file_location': email_file_location,
        }

    return data

# Combined result: one entry per mailing list, keyed by the list name
data = {}

# (list name, mapping file written by the extractor, parser for that file)
mapping_sources = (
    ("bitcoin-dev", bitcoin_dev_mapping_file, process_bitcoin_dev_mapping),
    ("lightning-dev", lightning_dev_mapping_file, process_lightning_dev_mapping),
)

# Parse each mapping file in turn
for list_name, mapping_path, parse_mapping in mapping_sources:
    with open(mapping_path, "r") as file:
        data[list_name] = parse_mapping(file)

# Serialise everything into a single JSON document
with open(output_file, "w") as json_file:
    json.dump(data, json_file, indent=4)

print(f"JSON file '{output_file}' created successfully.")


## replace-with-prefix.sh
# Verify broken links and replacement with:
# https://gist.github.com/kouloumos/e2a9c50221bf76e2e2bd4074617357f6
# from https://github.com/bitcointranscripts/bitcointranscripts/pull/566#issuecomment-2462917970


# find and replace with gnusha.org/url prefix
#
# Both http:// and https:// links are handled in one sed pass per file:
#   1. strip a prefix that is already there, so that running this again does
#      not stack "https://gnusha.org/url/" in front of itself;
#   2. add the prefix (http:// links are upgraded to https:// on the way).
# The dots are escaped so that they only match literal dots, and .git/ is left
# alone because sed -i rewrites every file it is handed, matching or not.
find ./ -type f ! -path '*/.git/*' -exec sed -i \
    -e 's|https://gnusha\.org/url/https://lists\.linuxfoundation\.org/|https://lists.linuxfoundation.org/|g' \
    -e 's|https\{0,1\}://lists\.linuxfoundation\.org/|https://gnusha.org/url/https://lists.linuxfoundation.org/|g' \
    {} +


# Resolve without a gnusha.org/url permalink
#
# First change the __main__ block of resolver.py to call
# process_file(sys.argv[1]) and make sure resolve_redirect_url uses
# resolve_remotely (this will be slow).
find ./workdir/ -type f -name "*.md" -exec python3 resolver.py {} \;


## resolver.py
import sys
import json
import datetime
import urllib.parse
import re
import os

# for remote resolving
# pip3 install requests
import requests

class ResolverStatusException(Exception):
    pass

# if using remote resolver
RESOLVER_HOST = "http://localhost:5000"
#RESOLVER_HOST = "https://gnusha.org/url"

BITCOIN_DEV_PUBLIC_INBOX_URL = "https://gnusha.org/pi/bitcoindev/"
PIPERMAIL_ARCHIVE_URL = "https://diyhpl.us/~bryan/irc/bitcoin/bitcoin-dev/linuxfoundation-pipermail/"
BITCOIN_DEV_PIPERMAIL_ARCHIVE_URL = f"{PIPERMAIL_ARCHIVE_URL}bitcoin-dev/"
LIGHTNING_DEV_PIPERMAIL_ARCHIVE_URL = f"{PIPERMAIL_ARCHIVE_URL}lightning-dev/"

# mapping file only has these two mailing lists
SUPPORTED_MAILING_LIST_NAMES = [
    "bitcoin-dev",
    "lightning-dev",
]

# various URL prefixes that this redirector can handle
SUPPORTED_URL_PREFIXES = [
    "https://lists.linuxfoundation.org/pipermail/",
    "https:/lists.linuxfoundation.org/pipermail/",
    "http://lists.linuxfoundation.org/pipermail/",
    "lists.linuxfoundation.org/pipermail/",
]

MAPPING = None
MAPPING_LOCATION = __file__.replace("resolver.py", "mapping.json")

def load_mapping(mapping_location=MAPPING_LOCATION):
    """
    Read mapping.json and apply the hand-maintained bitcoin-dev corrections.

    Args:
        mapping_location: path of the JSON file to read.

    Returns:
        The parsed mapping, with the "019317" and "022327" bitcoin-dev
        entries overwritten.

    Exits the process with status 1, after printing the reason to stderr, if
    the file cannot be read, is not valid JSON, or has no "bitcoin-dev" table.
    """
    try:
        with open(mapping_location, 'r') as f:
            mapping = json.load(f)

        # note that August-2021/019317 is malformed in the bitcoin-dev archive
        # or rather: the used parser is somehow wrong.
        # its actual message-id is: CAOU__fx0ajVfuEyoYCOkf8nZPOkbuYDctTfuzhCDdoU=jcA0Tg@mail.gmail.com
        # NOTE(review): the notes in pipermail-mapper.py say this is no longer
        # true (019317 should be 9403a01d93b3fe2e871517304b552194@riseup.net
        # and this message-id belongs to 019330) -- confirm whether this
        # override should be removed.
        mapping["bitcoin-dev"]["019317"] = "CAOU__fx0ajVfuEyoYCOkf8nZPOkbuYDctTfuzhCDdoU=jcA0Tg@mail.gmail.com"

        # this one is missing in the mapping.json file
        # the email was sent after the migration!
        # https://github.com/bitcoinops/bitcoinops.github.io/pull/2018#discussion_r1833154513
        mapping["bitcoin-dev"]["022327"] = "CABaSBaxDjj6ySBx4v+rmpfrw4pE9b=JZJPzPQj_ZUiBg1HGFyA@mail.gmail.com"
        # This is fixed in future runs of pipermail-mapper.py

        return mapping
    except (OSError, ValueError, KeyError, TypeError) as error:
        # OSError: unreadable file; ValueError: invalid JSON (JSONDecodeError
        # is a ValueError); KeyError/TypeError: unexpected document layout.
        # Anything else, Ctrl-C included, is allowed to propagate.
        print(f"ERROR: Could not load mapping.json ({mapping_location}): {error!r}", file=sys.stderr)
        sys.exit(1)

def load_and_set_global_mapping(mapping_location=MAPPING_LOCATION):
    """
    Return the module-wide MAPPING, loading it first if it is still unset.

    The file at mapping_location is only read while MAPPING is None or an
    empty dict; afterwards the cached value is returned as-is.
    """
    global MAPPING

    needs_loading = MAPPING in (None, {})
    if needs_loading:
        # slow! slow! slow!
        MAPPING = load_mapping(mapping_location=mapping_location)

    return MAPPING

load_and_set_global_mapping()

def parse_filepart(filepart):
    """
    Break '{year}-{month}/{file}' into the tuple (year, month, file).

    Raises ValueError unless filepart contains exactly one '/' and the part
    in front of it contains exactly one '-'.
    """
    period, leaf = filepart.split("/")
    year_text, month_text = period.split("-")
    return (year_text, month_text, leaf)

# super slow
def resolve_remotely(url, timeout=30):
    """
    Ask the resolver service at RESOLVER_HOST where url now lives.

    Args:
        url: the original URL; it is appended to RESOLVER_HOST as-is.
        timeout: seconds to wait for the service before giving up. Without
            one, a stalled connection would block the whole run forever.

    Returns:
        The final URL after following redirects.

    Raises:
        ResolverStatusException: if the final response status is 400 or above.
        requests.exceptions.RequestException: on connection errors/timeouts.
    """
    response = requests.head(f"{RESOLVER_HOST}/{url}", allow_redirects=True, timeout=timeout)
    if response.status_code >= 400:
        raise ResolverStatusException(f"Error: Resolver unable to resolve {url}")
    return response.url

# Handy for exercising the file-rewriting code without any lookup at all;
# gives the same result as the find/sed prefix replacement.
def resolve_simply(url, mapping=None):
    """Return url with the gnusha.org/url/ prefix in front; mapping is unused."""
    prefix = "https://gnusha.org/url/"
    return "".join((prefix, url))

def resolve_locally(url, mapping=None):
    """
    Translate a lists.linuxfoundation.org URL into its replacement using the
    local mapping, without any network access.

    Args:
        url: the original URL.
        mapping: parsed mapping.json contents. When None or empty, the global
            mapping is fetched with load_and_set_global_mapping().

    Returns:
        The replacement URL, or None when the URL is not recognised, is not in
        '{list}/{year}-{month}/{file}' form, or has no entry in the mapping.
        Attachment URLs are sent to the Wayback Machine date-selection page.

    Raises:
        Exception: if no mapping could be obtained.
    """
    # TODO: if you want to actually use wayback machine here, then download
    # their url list and do the redirector in python instead of making users
    # click the date selection page.
    #new_url = 'https://web.archive.org/web/*/' + url
    #return new_url

    # per-month index pages generated by pipermail
    index_pages = ["thread.html", "date.html", "subject.html", "author.html"]

    # archive root url
    if re.match(r'^https?://lists\.linuxfoundation\.org/pipermail/bitcoin-dev/?$', url):
        return "https://gnusha.org/pi/bitcoindev/"

    # mailing list info page
    if re.match(r'^https?://lists\.linuxfoundation\.org/mailman/listinfo/bitcoin-dev/?$', url):
        return "https://groups.google.com/group/bitcoindev"

    # like http://lists.linuxfoundation.org/pipermail/bitcoin-dev/attachments/20190225/a27d8837/attachment-0001.pdf
    attachment_paths = ["bitcoin-dev/attachments", "lightning-dev/attachments"]
    if any(x in url for x in attachment_paths):
        print("ERROR: Can't handle attachments yet: ", url)
        return "https://web.archive.org/web/*/" + url

    # check that the url matches one of the known prefixes
    piperarchive = None
    for prefix in SUPPORTED_URL_PREFIXES:
        if url.startswith(prefix):
            piperarchive = url[len(prefix):]
            break
    else:
        print("ERROR: Given url did not match any known prefix: ", url)
        return None

    # nothing after the prefix
    if not piperarchive:
        print("ERROR: piperarchive is None")
        return None

    for mailing_list_name in SUPPORTED_MAILING_LIST_NAMES:
        if piperarchive.startswith(mailing_list_name + "/"):
            break
    else:
        print("ERROR: Could not find matching mailing list name in piperarchive: ", url)
        return None

    if mapping in [None, {}]:
        # this can be very slow!
        mapping = load_and_set_global_mapping()

    if mapping in [None, {}]:
        raise Exception("ERROR: Mapping is empty")

    if piperarchive.startswith("bitcoin-dev/"):
        filepart = piperarchive[len("bitcoin-dev/"):]

        # filepart is in format {year}-{month}/{id}.html
        try:
            (year, month, some_filename) = parse_filepart(filepart)
        except ValueError:
            # some other shape, e.g. a month directory without a file name
            return None

        if some_filename in index_pages:
            month_num = datetime.datetime.strptime(month, "%B").month
            start_date = datetime.datetime(int(year), month_num, 1)
            # 32 days past the 1st always lands in the following month
            end_date = start_date + datetime.timedelta(days=32)
            end_date = end_date.replace(day=1)  # first day of next month

            # represent in git approxdate format
            date_range = f"d:{start_date.year}-{start_date.month:02d}-01..{end_date.year}-{end_date.month:02d}-01"
            url_encoded_date_range = urllib.parse.quote(date_range)

            return f"{BITCOIN_DEV_PUBLIC_INBOX_URL}?q={url_encoded_date_range}"

        if ".html" not in filepart:
            return None

        # filename without the .html extension
        # TODO: simplify and use some_filename here
        locator = filepart[:-len(".html")]

        # everything after the last / in locator
        sixdigit_id = locator.split("/")[-1]

        # somewhat slow lookup, compared to template print id into url
        message_id = mapping.get("bitcoin-dev", {}).get(sixdigit_id)
        if message_id is None:
            return None
        return f"{BITCOIN_DEV_PUBLIC_INBOX_URL}{message_id}/"

    if piperarchive.startswith("lightning-dev/"):
        filepart = piperarchive[len("lightning-dev/"):]

        # This branch used to read some_filename without ever assigning it,
        # so every lightning-dev URL failed with UnboundLocalError.
        try:
            (year, month, some_filename) = parse_filepart(filepart)
        except ValueError:
            return None

        # there is no replacement for the per-month index pages
        if some_filename in index_pages:
            return None

        if ".html" not in filepart:
            return None

        year_month_id = filepart[:-len(".html")]
        just_id = year_month_id.split("/")[-1]

        message_data = mapping.get("lightning-dev", {}).get(just_id)
        if message_data is None:
            return None

        email_file_location = message_data["email_file_location"]
        return f"{LIGHTNING_DEV_PIPERMAIL_ARCHIVE_URL}{email_file_location}"

    return None

def resolve_redirect_url(url):
    """
    Return the replacement for url, or url itself when there is none.

    Only URLs on lists.linuxfoundation.org are passed to the resolver. Any
    ordinary error raised by the resolver (including ResolverStatusException)
    and any empty result leave the URL unchanged. KeyboardInterrupt and
    SystemExit are not swallowed, so a long run can still be interrupted.
    """
    print(f"Resolving {url}")

    # check if the URL matches the specified pattern
    if not re.match(r'^https?://lists\.linuxfoundation\.org/', url):
        return url

    # pick the resolver: swap these two lines to use the local mapping.json
    #resolver = resolve_locally
    resolver = resolve_remotely

    try:
        resolved_url = resolver(url)
    except Exception:
        # no redirect was found, or the lookup itself failed
        return url

    # resolve_locally can return None
    if not resolved_url:
        return url

    print(f"Resolved {url} to {resolved_url}")
    return resolved_url

def process_file(file_path):
    """
    Look for URLs in the file and replace them with the resolved URLs.

    A URL ends at whitespace or at a character that delimits links in
    markdown/HTML (brackets, parentheses, angle brackets, quotes, backticks).
    Sentence punctuation directly after a URL is kept in the text but is not
    sent to the resolver. The file is only written back when something
    actually changed.
    """
    print(f"Processing {file_path}")

    with open(file_path, 'r') as file:
        original_content = file.read()

    def replace_match(match):
        candidate = match.group(0)
        # "see https://.../016687.html." -- the final dot belongs to the sentence
        url = candidate.rstrip(".,;:!?")
        trailing = candidate[len(url):]
        return resolve_redirect_url(url) + trailing

    # Without the excluded delimiters, "[text](url)" would hand "url)" to the
    # resolver, which cannot resolve it.
    content = re.sub(
        r'https?://lists\.linuxfoundation\.org/[^\s<>()\[\]"\'`]+',
        replace_match,
        original_content,
    )

    if content != original_content:
        with open(file_path, 'w') as file:
            file.write(content)

def find():
    """
    Walk ./workdir/ recursively and run process_file() on every file whose
    name ends in .md.
    """
    markdown_paths = (
        os.path.join(folder, name)
        for folder, _subfolders, names in os.walk("./workdir/")
        for name in names
        if name.endswith(".md")
    )

    # the generator is consumed lazily, so files are handled as they are found
    for markdown_path in markdown_paths:
        process_file(markdown_path)

if __name__ == "__main__":
    # slow mode: handle the single file named on the command line
    #process_file(sys.argv[1])
    # slow because it loads mapping.json every time, i.e. once per process
    # when driven by find like this:
    #find ./workdir/ -type f -name "*.md" -exec python3 resolver.py {} \;

    # fast mode: walk ./workdir/ inside this one process
    find()
	#!/bin/bash
	#
	# Used for processing the mbox files found in:
	# https://diyhpl.us/~bryan/irc/bitcoin/bitcoin-dev/bitcoin-dev-ml-archive.2024-02-09-004.tar.gz
	#
	# Why?
	#
	# Linux Foundation has deprecated lists.linuxfoundation.org, and now we need a url rewriting map
	# for the pipermail archive to numbered email files.
	#
	# That remapping app can be found at: https://gnusha.org/url
	#
	# See https://x.com/kanzure/status/1853588622017990667 for more information.
	#
	# bitcoin-dev-ml-archive.2024-02-09-004.tar.gz has bad pipermail .txt.gz
	# archives, in particular for 2019-February.txt.gz and 2019-March.txt.gz are both
	# bad for bitcoin-dev mailing list, however the bitcoin-dev mbox files are
	# correct and include the missing content.
	#
	# I believe what happened is that during the migration from Linux Foundation mail
	# server to OSUOSL mail server in 2019 there was some outage, and while the mbox
	# file was fixed and the generated mailman pipermail archives were corrected, it
	# looks like the .txt.gz archives were never updated after that incident.
	#
	# Therefore, use the mbox files instead of the pipermail .txt.gz archives in
	# this script.

	processed_dir="processed/"
	email_output_dir="$processed_dir/email/"
	mapping_output_file="$processed_dir/mapping.bitcoin-dev.txt"
	tmp_counter="/tmp/email-counter-$(uuidgen).txt"

	# if mapping output file exists, move it to a /tmp backup
	if [[ -f "$mapping_output_file" ]]; then
	backup_file="/tmp/$(basename "$mapping_output_file").$(uuidgen).txt"
	mv "$mapping_output_file" "$backup_file"
	echo "Moved $mapping_output_file to $backup_file"
	fi

	# setup
	mkdir -p "$processed_dir"
	mkdir -p "$email_output_dir"

	email_counter=0

	process_mbox() {
	local filename="mbox/$1" # mbox filename to process

	# check file existence
	if [[ -f "$filename" ]]; then
	# write current counter to temp file before processing
	echo "$email_counter" > "$tmp_counter"

	# define the command to process each email
	process_email_cmd="
	# read current counter from temp file
	email_counter=\$(cat $tmp_counter)

	# format the email ID as a 6-digit zero-padded number
	email_id=\$(printf '%06d' \"\$email_counter\")

	# new filename will be the email ID printf 6 digits
	new_filename=\$(printf '%06d' \"\$email_counter\")
	new_fullpath=\"$email_output_dir/\$new_filename.eml\"

	# save email content to file
	cat > \"\$new_fullpath\"

	# get the message id from the email
	message_id=\$(grep -m 1 \"^Message-ID:\" \"\$new_fullpath\" \| sed 's/.Message-ID: <//; s/>.//')

	# if no message id, then try to parse for Message-Id (case insensitive)
	if [ -z \"\$message_id\" ]; then
	message_id=\$(grep -i -m 1 \"^[Mm][Ee][Ss][Ss][Aa][Gg][Ee]-[Ii][Dd]:\" \"\$new_fullpath\" \| sed 's/.[Mm][Ee][Ss][Ss][Aa][Gg][Ee]-[Ii][Dd]: <//; s/>.*//')
	fi

	# store a mapping between the counter and message_id
	echo \"\$message_id \| \$new_filename.eml\" >> $mapping_output_file

	# keepalive for user
	echo \"\$message_id \| \$new_filename.eml\"

	email_counter=\$((email_counter + 1))
	echo \"\$email_counter\" > $tmp_counter
	"

	# Use formail to process the emails inside the mbox file
	cat "$filename" \| formail -s sh -c "$process_email_cmd"

	# Update the global counter from the temp file
	email_counter=$(cat "$tmp_counter")
	else
	echo "File $filename does not exist. Skipping..."
	fi
	}

	# process all mbox files
	for filename in ./mbox/*.mbox; do
	process_mbox "$(basename "$filename")"
	done
	#!/bin/bash
	#
	# Used for processing the lightning-dev pipermail archive found in:
	# https://diyhpl.us/~bryan/irc/bitcoin/bitcoin-dev/bitcoin-dev-ml-archive.2024-02-09-004.tar.gz
	# in particular: bitcoin-dev-ml-archive/pipermail-archives/lightning-dev/
	#
	# Why?
	#
	# Linux Foundation has deprecated lists.linuxfoundation.org, and now we need a url rewriting map
	# for the pipermail archive to numbered email files.
	#
	# See https://x.com/kanzure/status/1853588622017990667 for more information.
	#
	# output: https://diyhpl.us/~bryan/irc/bitcoin/bitcoin-dev/lightning-dev-pipermail.2024-12-10.zip
	# see: https://diyhpl.us/~bryan/irc/bitcoin/bitcoin-dev/linuxfoundation-pipermail/lightning-dev/

	processed_dir="processed"
	mapping_output_file="$processed_dir/mapping.lightning-dev.txt"
	tmp_counter="/tmp/email-counter-$(uuidgen).txt"

	# if mapping output file exists, move it to a /tmp backup
	if [[ -f "$mapping_output_file" ]]; then
	backup_file="/tmp/$(basename "$mapping_output_file").$(uuidgen).txt"
	mv "$mapping_output_file" "$backup_file"
	echo "Moved $mapping_output_file to $backup_file"
	fi

	# setup
	mkdir -p "$processed_dir/"

	email_counter=0

	generate_filename_list() {
	# lightning-dev isn't this old, of course
	local start_year=2011
	local start_month=8
	local end_year=2024
	local end_month=4 # ended earlier than this

	local current_year=$start_year
	local current_month=$start_month

	local filenames=()

	while [[ $current_year -lt $end_year \|\| ($current_year -eq $end_year && $current_month -le $end_month) ]]; do
	# Get the full month name (e.g., "August")
	local month_name=$(date -d "$current_year-$current_month-01" +"%B")

	# Format the filename as "{year}-{month_name}.txt.gz"
	local filename="${current_year}-${month_name}.txt.gz"
	filenames+=("$filename") # Append filename to the list

	# Move to the next month
	current_month=$((current_month + 1))
	if [[ $current_month -gt 12 ]]; then
	current_month=1
	current_year=$((current_year + 1))
	fi
	done

	# return a list of filenames
	echo "${filenames[@]}"
	}

	# function to process a single pipermail .txt.gz file
	#######################################
	# Split one pipermail .txt.gz archive into one numbered file per email and
	# append a "archive | id | message-id | path" line per email to the mapping.
	# Globals:
	#   email_counter       (read, then updated from the temp counter file)
	#   tmp_counter         (read; file used to carry the counter across the
	#                        per-email sh processes started by formail)
	#   processed_dir       (read; root of the per-month output directories)
	#   mapping_output_file (read; mapping lines are appended to it)
	# Arguments:
	#   $1 - path of the .txt.gz archive to process
	# Outputs:
	#   one log line per stored email on stdout, or a "does not exist" notice
	#######################################
	process_pipermail_file() {
	    local filename="$1"
	    local process_email_cmd

	    # check if the file exists
	    if [[ -f "$filename" ]]; then
	        # write current counter to temp file before processing
	        echo "$email_counter" > "$tmp_counter"

	        # Script run by "sh -c" once per email, with the email on stdin.
	        # Unescaped \$tmp_counter, \$filename, \$processed_dir and
	        # \$mapping_output_file are expanded now; escaped \\\$ expansions are
	        # evaluated later inside that per-email shell.
	        process_email_cmd="
	            # Read current counter from temp file
	            email_counter=\$(cat $tmp_counter)

	            # Format the email ID as a 6-digit zero-padded number
	            email_id=\$(printf '%06d' \"\$email_counter\")

	            # remove .txt.gz from filename
	            filename=\"$filename\"
	            base_filename=\$(basename \"\$filename\")
	            year_month_name=\${base_filename%.txt.gz}

	            # if email_id is equal to 003215 and year_month_name is equal to 2021-August, then exit 0.
	            # https://gist.github.com/kanzure/4e7bcc58344ceaa1a668e65a434adb2b?permalink_comment_id=5330243#gistcomment-5330243
	            if [ \"\$email_id\" = \"003215\" ] && [ \"\$year_month_name\" = \"2021-August\" ]; then
	                exit 0
	            fi

	            # if email_id is equal to 003216 and year_month_name is equal to 2021-August, then exit 0.
	            if [ \"\$email_id\" = \"003216\" ] && [ \"\$year_month_name\" = \"2021-August\" ]; then
	                exit 0
	            fi

	            # skip 3257, 3258, 3259 in September, and pick up at 3257 in October
	            if [ \"\$email_id\" = \"003257\" ] && [ \"\$year_month_name\" = \"2021-September\" ]; then
	                # skip these
	                email_counter=3257
	                echo \"\$email_counter\" > $tmp_counter
	                exit 0
	                # 3257, 3258, 3259 are missing from the LF HTML-generated archive
	                # and the next email is 003257.html, so we skip these.
	                # 3259.html should be this email:
	                # https://gnusha.org/pi/bitcoindev/MkPutJpff5rqUxXFQrEyHZl6Iz0DfrJU_-BQD-y0El65GQFnj7igVfmWU79fPCtiFztUYl4ofzrqeaN0HFMB45YPErY9rYY7_h1XkuTMfvc=@wuille.net/
	                # but instead 3259.html is by my count 3262:
	                # https://web.archive.org/web/20231114193855/https://lists.linuxfoundation.org/pipermail/lightning-dev/2021-October/003259.html
	                # and the .txt.gz 3259.txt email does not appear in the LF HTML-generated archive.
	            fi

	            # Create directory if it doesn't exist
	            mkdir_path=\"$processed_dir/\$year_month_name\"
	            mkdir -p \"\$mkdir_path\"

	            # The new filename will be the email ID printf 6 digits
	            new_filename=\$(printf '%06d' \"\$email_counter\")
	            new_fullpath=\"$processed_dir/\$year_month_name/\$new_filename.txt\"

	            # Save email content (stdin) to file
	            cat > \"\$new_fullpath\"

	            # get the message id from the email: take the first matching line
	            # and keep only what sits between '<' and '>'
	            message_id=\$(grep -m 1 \"Message-ID:\" \"\$new_fullpath\" | sed 's/.*Message-ID: <//; s/>.*//')

	            # if no message id, then try to parse for Message-Id (case insensitive)
	            if [ -z \"\$message_id\" ]; then
	                message_id=\$(grep -i -m 1 \"^[Mm][Ee][Ss][Ss][Aa][Gg][Ee]-[Ii][Dd]:\" \"\$new_fullpath\" | sed 's/.*[Mm][Ee][Ss][Ss][Aa][Gg][Ee]-[Ii][Dd]: <//; s/>.*//')
	            fi

	            # store mapping
	            echo \"$filename | \$email_id | \$message_id | \$new_fullpath\" >> $mapping_output_file

	            # Output the formatted result for logging
	            echo \"$filename | \$email_id | \$message_id | \$new_fullpath\"

	            email_counter=\$((email_counter + 1))
	            echo \"\$email_counter\" > $tmp_counter

	            if [ \"\$new_filename\" -eq \"003221\" ]; then
	                # skip 3222 and 3223
	                # 003222 in .txt.gz matches Wayback Machine 003224
	                # 003223 in .txt.gz matches Wayback Machine 003225
	                # Wayback Machine 003222 from LF HTML-generated archive is 404
	                # Wayback Machine 003223 from LF HTML-generated archive is 404
	                # speculation: Someone manually deleted 003222.html and 003223.html, and removed 2 emails from .txt.gz too.
	                # This must have been after 003224.html or 003225.html was generated.
	                email_counter=3224
	                echo \"\$email_counter\" > $tmp_counter
	            fi

	            if [ \"\$new_filename\" -eq \"003265\" ]; then
	                # 3266, 3267, 3268 are missing from the LF HTML-generated archive on Wayback Machine
	                # 003266.txt can be found at 003269.html
	                email_counter=3269
	                echo \"\$email_counter\" > $tmp_counter
	            fi
	        "

	        # use formail to split the decompressed archive and run the script
	        # above once per email
	        zcat "$filename" | formail -s sh -c "$process_email_cmd"

	        # update the global counter from the temp file
	        email_counter=$(cat "$tmp_counter")
	    else
	        echo "File $filename does not exist. Skipping..."
	    fi
	}

	# The archive names come from generate_filename_list as one
	# whitespace-separated string (globbing *.txt.gz would be an alternative).
	filenames_list="$(generate_filename_list)"

	# Walk the archives in the order they were generated. The expansion is left
	# unquoted on purpose: word splitting is what turns the string into names.
	for filename in ${filenames_list}; do
	    process_pipermail_file "${filename}"
	done
	import json
	import csv

	# Input mapping files, one per mailing list; both are opened for reading below.
	bitcoin_dev_mapping_file = "processed/mapping.bitcoin-dev.txt"
	lightning_dev_mapping_file = "processed/mapping.lightning-dev.txt"
	# Destination of the combined JSON mapping written at the end of this script.
	output_file = "processed/mapping.json"

	def process_bitcoin_dev_mapping(file):
	    """Build the bitcoin-dev lookup table from the lines of a mapping file.

	    Each useful line has the form "<value> | <path>". The path, minus its
	    last four characters (the ".eml" extension), becomes the key and <value>
	    becomes the entry. Lines without the " | " separator are ignored.
	    NOTE(review): <value> is presumably the email's message-id, as in the
	    hard-coded entry below - confirm against the script that writes the file.

	    Args:
	        file: iterable of text lines, e.g. an open file object.

	    Returns:
	        dict mapping the extension-less path to the value from the same line,
	        plus the hard-coded post-migration entry "022327".

	    Raises:
	        ValueError: if a line contains the separator more than once.
	    """
	    data = {}

	    for line in file:
	        # strip any extra whitespace or newline characters
	        line = line.strip()

	        # split the line on " | "
	        if " | " in line:
	            email_id, path = line.split(" | ")

	            # Add the mapping to the dictionary
	            # remove the .eml from the end of the relative path
	            data[path[:-4]] = email_id

	    # note that August-2021/019317 is malformed in the bitcoin-dev archive
	    # or rather: the used parser is somehow wrong.
	    # its actual message-id is: CAOU__fx0ajVfuEyoYCOkf8nZPOkbuYDctTfuzhCDdoU=jcA0Tg@mail.gmail.com
	    #data["019317"] = "CAOU__fx0ajVfuEyoYCOkf8nZPOkbuYDctTfuzhCDdoU=jcA0Tg@mail.gmail.com"
	    # That is no longer true.
	    # https://web.archive.org/web/20231114131216/https://lists.linuxfoundation.org/pipermail/bitcoin-dev/2021-August/019317.html
	    # seems to match email "9403a01d93b3fe2e871517304b552194@riseup.net"
	    # "CAOU__fx0ajVfuEyoYCOkf8nZPOkbuYDctTfuzhCDdoU=jcA0Tg@mail.gmail.com" is 2021-August/019330.html

	    # add the migration email, which was not included in the archive
	    # because it was sent after the migration
	    data["022327"] = "CABaSBaxDjj6ySBx4v+rmpfrw4pE9b=JZJPzPQj_ZUiBg1HGFyA@mail.gmail.com"

	    return data

	def process_lightning_dev_mapping(file):
	    """Build the lightning-dev lookup table from a '|'-separated mapping file.

	    Every non-empty row must hold exactly four fields:
	    archive filename | six-digit id | message-id | stored email path.
	    Surrounding whitespace is trimmed from each field.

	    Args:
	        file: iterable of text lines, e.g. an open file object.

	    Returns:
	        dict keyed by the six-digit id. NOTE: unlike the bitcoin-dev table,
	        each value is itself a dict with the keys
	        'pipermail_archive_filename', 'id', 'sixdigit_id', 'message_id' and
	        'email_file_location'.

	    Raises:
	        ValueError: if a row does not have four fields or its id is not a
	            decimal number.
	    """
	    data = {}

	    # csv requires a single-character delimiter
	    csv_reader = csv.reader(file, delimiter='|')

	    for row in csv_reader:
	        # Skip empty rows
	        if not row:
	            continue

	        # trim whitespace from each field
	        row = [field.strip() for field in row]

	        # unpack fields
	        (pipermail_archive_filename, sixdigit_id, message_id, email_file_location) = row

	        # email_file_location looks like "processed/{year}-{month}/id.txt"
	        # remove the "processed/" prefix
	        email_file_location = email_file_location.replace("processed/", "")

	        # NOTE: The format of this dictionary is different than the bitcoin-dev dictionary.
	        data[sixdigit_id] = {
	            'pipermail_archive_filename': "lightning-dev/" + pipermail_archive_filename,
	            # int() copes with the zero padding, including "000000" -> 0
	            'id': int(sixdigit_id),
	            'sixdigit_id': sixdigit_id,
	            'message_id': message_id,
	            'email_file_location': email_file_location,
	        }

	    return data

	# Combined mapping, one entry per mailing list name
	data = {}

	# Parse each list's mapping file with its dedicated parser
	for list_name, mapping_path, parse in (
	    ("bitcoin-dev", bitcoin_dev_mapping_file, process_bitcoin_dev_mapping),
	    ("lightning-dev", lightning_dev_mapping_file, process_lightning_dev_mapping),
	):
	    with open(mapping_path, "r") as file:
	        data[list_name] = parse(file)

	# Serialize everything into a single JSON document
	with open(output_file, "w") as json_file:
	    json.dump(data, json_file, indent=4)

	print(f"JSON file '{output_file}' created successfully.")
	import sys
	import json
	import datetime
	import urllib.parse
	import re
	import os

	# for remote resolving
	# pip3 install requests
	import requests

	class ResolverStatusException(Exception):
	    """Signals that the remote resolver answered with an HTTP error status."""

	# if using remote resolver
	RESOLVER_HOST = "http://localhost:5000"
	#RESOLVER_HOST = "https://gnusha.org/url"

	# Base URLs used when building replacement links
	BITCOIN_DEV_PUBLIC_INBOX_URL = "https://gnusha.org/pi/bitcoindev/"
	PIPERMAIL_ARCHIVE_URL = "https://diyhpl.us/~bryan/irc/bitcoin/bitcoin-dev/linuxfoundation-pipermail/"
	BITCOIN_DEV_PIPERMAIL_ARCHIVE_URL = f"{PIPERMAIL_ARCHIVE_URL}bitcoin-dev/"
	LIGHTNING_DEV_PIPERMAIL_ARCHIVE_URL = f"{PIPERMAIL_ARCHIVE_URL}lightning-dev/"

	# mapping file only has these two mailing lists
	SUPPORTED_MAILING_LIST_NAMES = [
	    "bitcoin-dev",
	    "lightning-dev",
	]

	# various URL prefixes that this redirector can handle
	SUPPORTED_URL_PREFIXES = [
	    "https://lists.linuxfoundation.org/pipermail/",
	    "https:/lists.linuxfoundation.org/pipermail/",
	    "http://lists.linuxfoundation.org/pipermail/",
	    "lists.linuxfoundation.org/pipermail/",
	]

	# In-memory cache of the parsed mapping, filled on first load
	MAPPING = None
	# mapping.json sits next to this script. Deriving the path from the script's
	# directory keeps it correct whatever the script file is called.
	MAPPING_LOCATION = os.path.join(os.path.dirname(os.path.abspath(__file__)), "mapping.json")

	def load_mapping(mapping_location=MAPPING_LOCATION):
	    """Read mapping.json and apply two hard-coded bitcoin-dev corrections.

	    Args:
	        mapping_location: path of the JSON mapping file.

	    Returns:
	        The parsed mapping (a dict keyed by mailing list name).

	    Exits the process with status 1, after printing an error, when the file
	    cannot be read, is not valid JSON, or has no usable "bitcoin-dev" table.
	    """
	    try:
	        with open(mapping_location, 'r') as f:
	            mapping = json.load(f)

	        # note that August-2021/019317 is malformed in the bitcoin-dev archive
	        # or rather: the used parser is somehow wrong.
	        # its actual message-id is: CAOU__fx0ajVfuEyoYCOkf8nZPOkbuYDctTfuzhCDdoU=jcA0Tg@mail.gmail.com
	        # NOTE(review): comments in the mapping generator elsewhere in this
	        # file say this is no longer true (that id would belong to
	        # 2021-August/019330.html) - confirm whether this override should stay.
	        mapping["bitcoin-dev"]["019317"] = "CAOU__fx0ajVfuEyoYCOkf8nZPOkbuYDctTfuzhCDdoU=jcA0Tg@mail.gmail.com"

	        # this one is missing in the mapping.json file
	        # the email was sent after the migration!
	        # https://github.com/bitcoinops/bitcoinops.github.io/pull/2018#discussion_r1833154513
	        mapping["bitcoin-dev"]["022327"] = "CABaSBaxDjj6ySBx4v+rmpfrw4pE9b=JZJPzPQj_ZUiBg1HGFyA@mail.gmail.com"
	        # This is fixed in future runs of pipermail-mapper.py

	        return mapping
	    except (OSError, ValueError, KeyError, TypeError) as error:
	        # OSError: unreadable file; ValueError: invalid JSON or encoding;
	        # KeyError/TypeError: document without a "bitcoin-dev" table.
	        # Anything else (including Ctrl-C) is deliberately left to propagate.
	        print("ERROR: Could not load mapping.json")
	        print(f"  {mapping_location}: {error!r}", file=sys.stderr)
	        sys.exit(1)

	def load_and_set_global_mapping(mapping_location=MAPPING_LOCATION):
	    """Return the module-level MAPPING, loading it first if it is still empty.

	    The file at mapping_location is only read while MAPPING is None or an
	    empty dict; later calls hand back the cached object.
	    """
	    global MAPPING

	    cache_is_empty = MAPPING in [None, {}]
	    if cache_is_empty:
	        # slow! slow! slow!
	        MAPPING = load_mapping(mapping_location=mapping_location)

	    return MAPPING

	# Fill the MAPPING cache once, when this module is first executed, so later
	# lookups do not reload the file. load_mapping() exits the process if the
	# mapping cannot be loaded.
	load_and_set_global_mapping()

	def parse_filepart(filepart):
	    """Split '{year}-{month}/{filename}' into (year, month, filename).

	    Raises ValueError unless filepart contains exactly one '/' and the part
	    before it contains exactly one '-'.
	    """
	    month_directory, some_filename = filepart.split("/")
	    year, month = month_directory.split("-")
	    return year, month, some_filename

	# super slow
	def resolve_remotely(url):
	    """Ask the resolver service at RESOLVER_HOST where url now lives.

	    Sends a HEAD request for "{RESOLVER_HOST}/{url}", following redirects, and
	    returns the URL of the final response.

	    Raises:
	        ResolverStatusException: if the final status code is 400 or above.
	    """
	    lookup_url = f"{RESOLVER_HOST}/{url}"
	    response = requests.head(lookup_url, allow_redirects=True)

	    if response.status_code < 400:
	        return response.url

	    raise ResolverStatusException(f"Error: Resolver unable to resolve {url}")

	# useful for testing file modification in general
	# also a find/sed replacement?
	def resolve_simply(url, mapping=None):
	    """Prefix url with the gnusha.org redirector address; mapping is unused."""
	    redirector_prefix = "https://gnusha.org/url/"
	    return redirector_prefix + url

	def resolve_locally(url, mapping=None):
	    """Translate a lists.linuxfoundation.org URL using the local mapping.

	    Handled cases:
	      * bitcoin-dev archive root and list info page -> fixed replacement URLs
	      * attachment URLs -> Wayback Machine date-selection page
	      * bitcoin-dev monthly index pages -> public-inbox search for that month
	      * bitcoin-dev message pages -> public-inbox URL of the message-id
	      * lightning-dev message pages -> stored email file in the archive copy

	    Args:
	        url: the URL to translate.
	        mapping: parsed mapping.json content; when None or empty the
	            module-level mapping is loaded and used instead.

	    Returns:
	        The replacement URL, or None when the URL cannot be translated
	        (unknown prefix or list, lightning-dev index page, unknown id, ...).

	    Raises:
	        Exception: if no mapping is available.
	        ValueError: if a bitcoin-dev path is not '{year}-{month}/{file}' or
	            the month of an index page is not a full month name.
	    """
	    # TODO: if you want to actually use wayback machine here, then download
	    # their url list and do the redirector in python instead of making users
	    # click the date selection page.
	    #new_url = 'https://web.archive.org/web/*/' + url
	    #return new_url

	    # archive root url
	    if re.match(r'^https?://lists\.linuxfoundation\.org/pipermail/bitcoin-dev/?$', url):
	        return "https://gnusha.org/pi/bitcoindev/"

	    # mailing list info page
	    if re.match(r'^https?://lists\.linuxfoundation\.org/mailman/listinfo/bitcoin-dev/?$', url):
	        return "https://groups.google.com/group/bitcoindev"

	    attachment_paths = ["bitcoin-dev/attachments", "lightning-dev/attachments"]
	    if any(x in url for x in attachment_paths):
	        print("ERROR: Can't handle attachments yet: ", url)
	        return "https://web.archive.org/web/*/" + url
	        # like http://lists.linuxfoundation.org/pipermail/bitcoin-dev/attachments/20190225/a27d8837/attachment-0001.pdf

	    # check that the url matches one of the known prefixes
	    piperarchive = None
	    for prefix in SUPPORTED_URL_PREFIXES:
	        if url.startswith(prefix):
	            piperarchive = url[len(prefix):]
	            break
	    else:
	        # the loop ended without a break: no prefix matched
	        print("ERROR: Given url did not match any known prefix: ", url)
	        return None

	    # also catches a url that is nothing but the prefix
	    if not piperarchive:
	        print("ERROR: piperarchive is None")
	        return None

	    for mailing_list_name in SUPPORTED_MAILING_LIST_NAMES:
	        if piperarchive.startswith(mailing_list_name + "/"):
	            break
	    else:
	        print("ERROR: Could not find matching mailing list name in piperarchive: ", url)
	        return None

	    if mapping in [None, {}]:
	        # this can be very slow!
	        mapping = load_and_set_global_mapping()

	    if mapping in [None, {}]:
	        raise Exception("ERROR: Mapping is empty")

	    # per-month index pages that pipermail generates
	    index_pages = ["thread.html", "date.html", "subject.html", "author.html"]

	    if piperarchive.startswith("bitcoin-dev/"):
	        filepart = piperarchive[len("bitcoin-dev/"):]

	        # filepart is in format {year}-{month}/{id}.html
	        (year, month, some_filename) = parse_filepart(filepart)

	        if some_filename in index_pages:
	            month_num = datetime.datetime.strptime(month, "%B").month
	            start_date = datetime.datetime(int(year), month_num, 1)
	            # 32 days later is always inside the next month; snap to its first day
	            end_date = (start_date + datetime.timedelta(days=32)).replace(day=1)

	            # represent in git approxdate format
	            date_range = f"d:{start_date.year}-{start_date.month:02d}-01..{end_date.year}-{end_date.month:02d}-01"
	            url_encoded_date_range = urllib.parse.quote(date_range)

	            return f"{BITCOIN_DEV_PUBLIC_INBOX_URL}?q={url_encoded_date_range}"

	        if ".html" in filepart:
	            # filename without the .html extension
	            # TODO: simplify and use some_filename here
	            locator = filepart[:-len(".html")]

	            # everything after the last / in locator
	            sixdigit_id = locator.split("/")[-1]

	            # somewhat slow lookup, compared to template print id into url
	            message_id = mapping.get("bitcoin-dev", {}).get(sixdigit_id)
	            if message_id is None:
	                print("ERROR: No mapping entry for: ", url)
	                return None
	            return f"{BITCOIN_DEV_PUBLIC_INBOX_URL}{message_id}/"

	        return None

	    if piperarchive.startswith("lightning-dev/"):
	        filepart = piperarchive[len("lightning-dev/"):]

	        # last path component, e.g. "thread.html" or "003215.html"
	        some_filename = filepart.split("/")[-1]

	        # there is no replacement for the lightning-dev index pages
	        if some_filename in index_pages:
	            return None

	        if ".html" in filepart:
	            year_month_id = filepart[:-len(".html")]
	            just_id = year_month_id.split("/")[-1]

	            message_data = mapping.get("lightning-dev", {}).get(just_id)
	            if message_data is None:
	                print("ERROR: No mapping entry for: ", url)
	                return None

	            email_file_location = message_data["email_file_location"]
	            return f"{LIGHTNING_DEV_PIPERMAIL_ARCHIVE_URL}{email_file_location}"

	        return None

	    return None

	def resolve_redirect_url(url):
	    """Return the replacement for url, or url itself when none is found.

	    Only URLs on lists.linuxfoundation.org are sent to the resolver. Any
	    resolver error, or an empty/None answer, leaves the URL unchanged.
	    KeyboardInterrupt and SystemExit are not swallowed, so a long batch run
	    can still be interrupted.

	    Args:
	        url: URL found in a processed file.

	    Returns:
	        The resolved URL, or the original url.
	    """
	    print(f"Resolving {url}")

	    # The remote resolver is the one in use; resolve_locally accepts the same
	    # single url argument and can be assigned here instead.
	    resolver = resolve_remotely

	    # only URLs on the old list server are candidates for rewriting
	    if not re.match(r'^https?://lists\.linuxfoundation\.org/', url):
	        return url

	    try:
	        resolved_url = resolver(url)
	    except Exception:
	        # covers ResolverStatusException (no redirect found by the remote
	        # server) as well as any other resolver failure
	        return url

	    # the resolver may return None or an empty string
	    if not resolved_url:
	        return url

	    print(f"Resolved {url} to {resolved_url}")
	    return resolved_url

	def process_file(file_path):
	    """
	    Rewrite file_path in place, replacing every lists.linuxfoundation.org URL
	    with whatever resolve_redirect_url() returns for it.
	    """
	    print(f"Processing {file_path}")

	    with open(file_path, 'r') as source:
	        original_text = source.read()

	    def _replacement(match):
	        # group 1 is the whole URL, up to the next space or newline
	        return resolve_redirect_url(match.group(1))

	    rewritten_text = re.sub(
	        r'(https?://lists\.linuxfoundation\.org/[^ \n]+)',
	        _replacement,
	        original_text,
	    )

	    with open(file_path, 'w') as sink:
	        sink.write(rewritten_text)

	def find():
	    """
	    Walk ./workdir/ recursively and hand every file whose name ends in ".md"
	    to process_file().
	    """
	    for directory, _subdirectories, names in os.walk("./workdir/"):
	        for name in names:
	            if not name.endswith(".md"):
	                continue
	            process_file(os.path.join(directory, name))

	if __name__ == "__main__":
	    # slow mode: one file per invocation
	    #process_file(sys.argv[1])
	    # slow because it loads mapping.json every time
	    #find ./workdir/ -type f -name "*.md" -exec python3 resolve-urls.py {} \;

	    # fast mode: walk the whole workdir inside a single process
	    find()