Dedupe a Maildir with X-TUID headers
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env bash
set -euo pipefail

# Dedupe a Maildir whose duplicate messages carry X-TUID headers.
#
# After some strange combination of OfflineIMAP & mbsync/isync created a whole
# bunch of duplicate messages in my Fastmail account I had to start cleaning things
# up. I discovered that the duplicates all had X-TUID headers and were showing up
# totally out of order in the Fastmail UI. I now run mbsync with `CopyArrivalDate yes`
# which appears to prevent this from occurring again.
#
# I found that every message with an X-TUID header had a matching message without
# the header, so I can simply delete those.
#
# Additionally I had some really old emails from ~1998 that barely had any headers
# and I needed to add a Date header which I construct based on either the Delivery-date
# or Message-Id header. You probably don't want this and should disable it.
#
# Requires fdupes, rename (the variant that takes a Perl s/// expression), ag,
# gsed, ack & dedupe_maildir.py (another gist, expected in the current
# directory). The Date-faking step also relies on BSD `date -j -f`.
#
# MAKE A BACKUP OF ${MAILDIR} before you begin!
MAILDIR="$HOME/Maildir"

# Fail fast, before any file is renamed or edited, if something the script
# needs is missing; aborting halfway would leave the Maildir half-processed.
if [[ ! -d "${MAILDIR}" ]]; then
  echo "error: Maildir \"${MAILDIR}\" does not exist" >&2
  exit 1
fi
for tool in fdupes rename ag gsed ack; do
  if ! command -v "${tool}" > /dev/null; then
    echo "error: required tool \"${tool}\" not found in PATH" >&2
    exit 1
  fi
done
if [[ ! -x ./dedupe_maildir.py ]]; then
  echo "error: ./dedupe_maildir.py not found or not executable" >&2
  exit 1
fi

# Scratch directory for the intermediate file lists. It is intentionally not
# named TMPDIR so the conventional environment variable of that name is left
# untouched for child processes. It is not removed on exit, so the lists stay
# available for inspection.
WORK_DIR="$(mktemp -d)"
echo "== Working on \"${MAILDIR}\" in \"${WORK_DIR}\""

XTUIDS="${WORK_DIR}/xtuids"               # files that contained an X-TUID line
DUPES0="${WORK_DIR}/dupes0"               # fdupes output before stripping X-TUID
DUPES1="${WORK_DIR}/dupes1"               # fdupes output after stripping X-TUID
DUPES="${WORK_DIR}/xtuid_dupes"           # passed to dedupe_maildir.py; later fed to rm
ORIGS="${WORK_DIR}/origs"                 # passed to dedupe_maildir.py
MISSING_DATES="${WORK_DIR}/missing_dates" # files with no Date: line
# Rewrite every file name under the Maildir so that ",U=<digits>" and whatever
# follows it up to the last ":" collapse into a single ":".
printf '%s' "== Stripping UIDs from filenames... "
find "${MAILDIR}" \
  -type f \
  -exec rename 's/,U=[1-9][0-9]*.*:/:/' {} +
# NOTE(review): the reason for this pause is not documented; kept as-is.
sleep 10
printf '%s\n' "done"
# First duplicate scan, taken before the X-TUID lines are removed. fdupes is
# run once per directory and its output is accumulated in ${DUPES0}.
# Per the fdupes manual: -p also compares ownership/permissions, -H treats
# hard links as duplicates, -1 prints each duplicate set on a single line.
printf '%s\n' "== Finding duplicate messages"
find "${MAILDIR}" -type d -exec fdupes -pH1 {} \; >> "${DUPES0}"
printf '%s\n' "== Found $(wc -l < "${DUPES0}" | tr -d '[:space:]') duplicated files"
# List every file under the Maildir that has a line starting with "X-TUID:",
# one path per line, in ${XTUIDS}.
echo -n "== Finding messages with X-TUID header... "
# ag exits 1 when nothing matched. That is not an error here, so only other
# exit codes are allowed to abort the script under `set -e`.
ag -l '^X-TUID:' "${MAILDIR}" > "${XTUIDS}" || [[ $? -eq 1 ]]
echo "$(wc -l "${XTUIDS}" | awk '{print $1}') found"

# Delete those lines in place, 4 files per gsed invocation, 16 invocations in
# parallel.
echo -n "== Stripping X-TUID header from messages... "
# Skip entirely when the list is empty so gsed is never started without files.
if [[ -s "${XTUIDS}" ]]; then
  # Convert the newline-separated list to NUL-separated so that xargs passes
  # each path through verbatim; plain xargs would split on spaces and
  # interpret quotes and backslashes (e.g. a folder named "Sent Items").
  tr '\n' '\0' < "${XTUIDS}" \
    | xargs -0 -n 4 -P 16 gsed -i '/^X-TUID:.*/d'
fi
echo "done"
# Second duplicate scan, taken after the X-TUID lines were removed, so files
# that differed only by that line can now be reported by fdupes. Output is
# accumulated in ${DUPES1}, one fdupes run per directory.
printf '%s\n' "== Finding duplicate messages"
find "${MAILDIR}" -type d -exec fdupes -pH1 {} \; >> "${DUPES1}"
printf '%s\n' "== Found $(wc -l < "${DUPES1}" | tr -d '[:space:]') duplicated files"
printf '%s' "== Finding dupes that previously had X-TUID header... "
# Delegated to a Python helper because a pure Bash implementation was too
# slow. It receives the X-TUID file list and the second fdupes report, plus
# two output paths.
# NOTE(review): presumably it writes the paths to delete into ${DUPES} and the
# kept originals into ${ORIGS} - confirm against dedupe_maildir.py.
./dedupe_maildir.py "${XTUIDS}" "${DUPES1}" "${DUPES}" "${ORIGS}"

# Remove everything listed in ${DUPES}: 64 paths per rm, 16 rm processes at
# a time. Default xargs tokenisation is kept on purpose because the list
# format is defined by the helper script.
printf '%s' "== Deleting X-TUID dupes... "
xargs -n 64 -P 16 rm -f < "${DUPES}"
printf '%s\n' "done"
# Collect the files that contain no line starting with "Date:" (any case).
printf '%s' "== Finding messages with no Date header... "
# Alternatives that were tried, with the number of files each one reported:
#   ag -iL '^Date:' "${MAILDIR}"      -> 295 (disagrees with the others; TODO: report upstream)
#   grep -irL '^Date:' "${MAILDIR}"   -> 51
#   ack -iL '^Date:' "${MAILDIR}"     -> 51 (the one in use)
# A non-zero exit status from ack is deliberately ignored.
ack -iL '^Date:' "${MAILDIR}" > "${MISSING_DATES}" || true
printf '%s\n' "$(wc -l < "${MISSING_DATES}" | tr -d '[:space:]') found"
# For every file listed in ${MISSING_DATES}, synthesise a Date: header from
# the Delivery-date header (preferred) or from a timestamp embedded in the
# Message-Id, insert it before the first From: line, and set the file's
# mtime to match. Files for which no date can be derived are skipped with a
# warning on stderr instead of aborting the whole run.
echo -n "== Faking up Date header based on Delivery-date or Message-Id... "
# Read one path per line; a `for f in $(cat ...)` loop would split paths on
# spaces and expand glob characters.
while IFS= read -r f; do
  [[ -n "${f}" ]] || continue

  if grep -qi '^Delivery-date:' "${f}"; then
    # Use the first match only, and strip the header name case-insensitively
    # (GNU sed `I` flag) to agree with the case-insensitive grep above.
    msg_date="$(grep -i -m 1 '^Delivery-date:' "${f}" | gsed 's/^Delivery-date://I')"
  else
    # This likely won't work for many as it expects a specific Message-Id
    # format: the third dot-separated field must be a YYYYmmddHHMMSS stamp.
    stamp="$(awk -F\. '/^Message-Id:.*<Version.32/ {print $3; exit}' "${f}")"
    if [[ -z "${stamp}" ]] \
      || ! msg_date="$(TZ=GMT date -j -R -f "%Y%m%d%H%M%S" "${stamp}")"; then
      echo "warning: cannot derive a date for \"${f}\"; skipping" >&2
      continue
    fi
  fi

  # Collapse whitespace runs and trim both ends (CR, tabs, leading blank).
  msg_date="$(printf '%s' "${msg_date}" \
    | tr -s '[:space:]' ' ' \
    | gsed -e 's/^ //' -e 's/ $//')"
  if [[ -z "${msg_date}" ]]; then
    echo "warning: empty date derived for \"${f}\"; skipping" >&2
    continue
  fi

  # Insert before the FIRST From: line only. The GNU `0,/re/` range ends at
  # the first match, so From: lines later in the message (e.g. in a quoted
  # body) are left alone. The closing brace needs its own -e because the
  # one-line `i` command consumes the rest of its line as text.
  gsed -i -e "0,/^From:/{/^From:/i Date: ${msg_date}" -e '}' "${f}"

  # Mirror the date onto the file's mtime; an unparsable date only loses the
  # mtime update rather than stopping the loop.
  if touch_date="$(date -j -f "%a, %d %b %Y %T %z" "${msg_date}" +%Y%m%d%H%M)"; then
    touch -t "${touch_date}" "${f}"
  else
    echo "warning: cannot parse \"${msg_date}\" for \"${f}\"; mtime left unchanged" >&2
  fi
done < "${MISSING_DATES}"
echo "done"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment