Created
April 27, 2020 20:08
-
-
Save stek29/1171faa72c6b57e4d2ae87cbfafc76c3 to your computer and use it in GitHub Desktop.
Crawl CommuniGate mail
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env zsh
# zsh because bash can't handle {$a..$b} expansion
#
# Crawl a CommuniGate webmail session with wget and store the mirrored
# message pages in a local folder.
#
# Usage:
#   SESSION_COOKIE=<cookie> SESSION_PATH=<path> zsh <this-script>
#   zsh <this-script> <cookie> <path>
#
# Optional environment: RANGE_MIN, RANGE_MAX, DEST, TMP, MAIL_HOST.

# Print a message to stderr and abort.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

# Directories to crawl (URL-encoded, exactly as they appear in the mail URLs)
DIRS=(INBOX INBOX%2FARCHIVE Sent%20Items)

# ID range to try
RANGE_MIN=${RANGE_MIN:-1}
RANGE_MAX=${RANGE_MAX:-150}

# Session cookie: login and see cookies in devtools
SESSION_COOKIE=${SESSION_COOKIE:-${1:-}}

# right-click on mail in list, click "message in internet format", grab path part after Session
SESSION_PATH=${SESSION_PATH:-${2:-}}

# Destination folder
DEST="${DEST:-mails}"

# Temp folder for wget, will be removed.
# NOTE: it is wiped with rm -rf both before and after the crawl, so point it
# at a directory that holds nothing else.
TMP="${TMP:-tmp}"

# Mail server host name
MAIL_HOST="${MAIL_HOST:-student.bmstu.ru}"

# Validate everything before touching the filesystem.
[ -n "${SESSION_COOKIE}" ] || die "SESSION_COOKIE (or first argument) is required"
[ -n "${SESSION_PATH}" ] || die "SESSION_PATH (or second argument) is required"
for bound in "${RANGE_MIN}" "${RANGE_MAX}"; do
  case "${bound}" in
    '' | *[!0-9]*) die "RANGE_MIN and RANGE_MAX must be non-negative integers" ;;
  esac
done

mkdir -p -- "${DEST}" || die "cannot create ${DEST}"
dest_abs="$(realpath -- "${DEST}")" || die "cannot resolve ${DEST}"

rm -rf -- "${TMP:?}"
mkdir -p -- "${TMP}/${MAIL_HOST}/Session" || die "cannot create ${TMP}"

# wget stores mirrored files under <host>/<url path>; linking the session
# directory to DEST makes the downloaded files land directly in DEST.
ln -s -- "${dest_abs}" "${TMP}/${MAIL_HOST}/Session/${SESSION_PATH}" \
  || die "cannot link session directory to ${DEST}"

pushd "${TMP}" || die "cannot enter ${TMP}"

for d in "${DIRS[@]}"; do
  # wget's exit status is not checked: RANGE_MIN..RANGE_MAX is only a range of
  # IDs "to try", so presumably some of the requested IDs do not exist.
  wget \
    --mirror \
    --recursive \
    --execute robots=off \
    --header "Cookie: CGateProWebUser=${SESSION_COOKIE}" \
    --timestamping \
    --page-requisites \
    --html-extension \
    --adjust-extension \
    --restrict-file-names=unix \
    --convert-links \
    "https://${MAIL_HOST}/Session/${SESSION_PATH}/FORMAT/hPronto-/${d}-MM-1/"{$RANGE_MIN..$RANGE_MAX}
done

popd || die "cannot return to the original directory"
rm -rf -- "${TMP:?}"

# Add missing encoding header
# grep -L lists the files that do NOT contain DOCTYPE; --null keeps the list
# NUL-delimited end to end, and "xargs -r" skips sed when nothing needs fixing.
find "${DEST}" -type f -name '*html' -exec grep -L --null -- 'DOCTYPE' {} + |
  xargs -0 -r \
    sed -i '1i <!DOCTYPE html><head><meta http-equiv="Content-Type" content="text/html;charset=UTF-8"></head>'
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment