Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Crawl CommuniGate mail
#!/usr/bin/env zsh
# Runs under zsh: the {$a..$b} brace range used for the message IDs is not
# expanded by bash when its bounds are variables.
#
# Every setting below except DIRS can be overridden from the environment.

# Mail folders to crawl, written URL-encoded as they appear in the request path.
DIRS=(
  INBOX
  INBOX%2FARCHIVE
  Sent%20Items
)

# First and last message ID to try in each folder.
: "${RANGE_MIN:=1}"
: "${RANGE_MAX:=150}"

# Session cookie (environment, else first argument).
# To obtain it: log in and look up the cookie in the browser devtools.
: "${SESSION_COOKIE:=$1}"

# Session path (environment, else second argument).
# To obtain it: right-click a mail in the list, choose "message in internet
# format", and grab the part of the URL path that follows "Session".
: "${SESSION_PATH:=$2}"

# Folder that receives the downloaded mails.
: "${DEST:=mails}"

# Scratch folder for wget; it is removed once the crawl is finished.
: "${TMP:=tmp}"
# --- Crawl -------------------------------------------------------------------
# Downloads message IDs RANGE_MIN..RANGE_MAX of every folder in DIRS into DEST.
#
# wget --mirror stores files under <host>/<url path>. A scratch tree is built
# in which <host>/Session/<SESSION_PATH> is a symlink to DEST, so everything
# fetched below that URL prefix is written straight into DEST.

# Fail before touching the filesystem when the credentials are missing: an
# empty SESSION_PATH would break both the symlink and the request URL.
: "${SESSION_COOKIE:?SESSION_COOKIE is required (environment or first argument)}"
: "${SESSION_PATH:?SESSION_PATH is required (environment or second argument)}"

mail_host=student.bmstu.ru

mkdir -p -- "$DEST" || exit 1
dest_abs=$(realpath -- "$DEST") || exit 1

# TMP is a name many environments already export (often pointing at /tmp), so
# the directory itself is never wiped. A private subdirectory is created
# inside it and only that subdirectory is deleted afterwards.
mkdir -p -- "$TMP" || exit 1
work_dir=$(mktemp -d "${TMP%/}/crawl.XXXXXX") || exit 1

# Remove the scratch tree; safe to call more than once.
cleanup() {
  rm -rf -- "${work_dir:?}"
  # Best effort: also drop TMP when nothing else lives in it (the default
  # ./tmp case). rmdir refuses non-empty directories, which is intended.
  rmdir -- "$TMP" 2>/dev/null
  return 0
}
trap cleanup EXIT
trap 'exit 130' INT
trap 'exit 143' TERM

session_link="${work_dir}/${mail_host}/Session/${SESSION_PATH}"
mkdir -p -- "$(dirname -- "$session_link")" || exit 1
ln -s -- "$dest_abs" "$session_link" || exit 1

# --mirror already implies --recursive and --timestamping, and
# --adjust-extension is the current name of the older --html-extension, so
# those duplicates are not repeated here.
# NOTE: the cookie is passed on wget's command line and is therefore visible
# in the process list while wget runs.
wget_opts=(
  --mirror
  --execute robots=off
  --header "Cookie: CGateProWebUser=${SESSION_COOKIE}"
  --page-requisites
  --adjust-extension
  --restrict-file-names=unix
  --convert-links
)

# The subshell keeps the directory change local and guarantees wget never
# runs in the caller's directory if the cd fails.
(
  cd -- "$work_dir" || exit 1
  for folder in "${DIRS[@]}"; do
    # One wget call per folder; the zsh brace range expands to one URL per ID.
    # wget's exit status is deliberately not checked so that one failing
    # folder does not stop the remaining ones.
    wget "${wget_opts[@]}" \
      "https://${mail_host}/Session/${SESSION_PATH}/FORMAT/hPronto-/${folder}-MM-1/"{$RANGE_MIN..$RANGE_MAX}
  done
)

# Remove the scratch tree now so it is gone before any later step runs.
cleanup
trap - EXIT INT TERM
# Add missing encoding header

#######################################
# Insert a DOCTYPE line with a UTF-8 charset declaration as the first line of
# every *html file under a directory that does not contain the string
# "DOCTYPE" anywhere.
# Arguments: $1 - directory to scan recursively
# Returns:   status of the xargs/sed stage (0 when nothing needed patching)
# Requires:  GNU grep (-Z), GNU sed (-i, one-line "1i"), xargs with -0 and -r
#######################################
add_missing_doctype() {
  # File names stay NUL-delimited from grep -Z to xargs -0, so paths with
  # spaces or newlines survive intact. find -exec never starts grep without
  # file operands and xargs -r never starts sed without them, so a tree with
  # no matching files is a silent no-op.
  find "$1" -type f -name '*html' -exec grep -LZ -e 'DOCTYPE' -- {} + |
    xargs -0 -r sed -i \
      -e '1i <!DOCTYPE html><head><meta http-equiv="Content-Type" content="text/html;charset=UTF-8"></head>' --
}

add_missing_doctype "${DEST}"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment