Skip to content

Instantly share code, notes, and snippets.

@egg82
Created February 21, 2023 16:30
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save egg82/47f42fc611e9afcb9d4562c224d8648e to your computer and use it in GitHub Desktop.
Save egg82/47f42fc611e9afcb9d4562c224d8648e to your computer and use it in GitHub Desktop.
#!/usr/bin/env bash
# Work out the directory that contains this script and store it in DIR.
# Symlinks are followed one hop at a time, so DIR ends up pointing at the
# directory of the real file rather than the directory of a link to it.
SOURCE=${BASH_SOURCE[0]}
until [[ ! -L "$SOURCE" ]]; do
  # Physical directory of the current link; needed to anchor relative targets.
  DIR=$(cd -P "$(dirname "$SOURCE")" > /dev/null 2>&1 && pwd)
  SOURCE=$(readlink "$SOURCE")
  case "$SOURCE" in
    /*) ;;                      # absolute link target: use as-is
    *) SOURCE=$DIR/$SOURCE ;;   # relative link target: resolve against the link's directory
  esac
done
DIR=$(cd -P "$(dirname "$SOURCE")" > /dev/null 2>&1 && pwd)
echo
echo "Downloading to ${DIR}"
echo
echo "Checking authors.."
# Authors section: (re)download the compressed authors dump when it is missing
# or when find reports it as older than 60 days (-ctime +60), then unpack it
# when the unpacked file is missing or a fresh archive was just fetched.
downloaded=false
authors_archive="${DIR}/authors.ndjson.zst"
if [[ ! -f "${authors_archive}" ]] || [[ -n "$(find "${authors_archive}" -ctime +60 -print)" ]]; then
  echo
  echo "Downloading authors to ${authors_archive}"
  # Download into a .tmp file and only move it over the real archive once wget
  # succeeds. The previous archive is NOT deleted up front, so a failed
  # download no longer destroys a usable (if stale) copy.
  if wget -q --show-progress -c https://files.pushshift.io/reddit/authors/authors.ndjson.zst -O "${authors_archive}.tmp" \
    && mv -f "${authors_archive}.tmp" "${authors_archive}"; then
    # Flag a refresh only when the new archive is actually in place.
    downloaded=true
  else
    echo "Failed to download authors; leaving any existing ${authors_archive} untouched" >&2
  fi
fi
if [[ -f "${authors_archive}" ]]; then
  if [[ ! -f "${DIR}/authors.ndjson" ]] || [[ "${downloaded}" == true ]]; then
    echo
    echo "Unpacking authors to ${DIR}/authors.ndjson"
    # Remove any previous output first so zstd does not stop on an existing file.
    rm -f "${DIR}/authors.ndjson"
    zstd -q --progress --memory=2048MB -d "${authors_archive}" --output-dir-flat "${DIR}"
  fi
fi
echo
echo "Checking comments.."
mkdir -p "${DIR}/comments"
# Comments section: fetch every monthly RC_<year>-<month>.zst dump that is not
# already present in ${DIR}/comments.
#
# Only years up to the current one are probed. If the year cannot be read as a
# number, fall back to the original fixed upper bound.
current_year=$(date +%Y)
[[ "${current_year}" =~ ^[0-9]+$ ]] || current_year=2105
for ((year = 2005; year <= current_year; year++)); do
  for month in {1..12}; do
    printf -v month_formatted '%02d' "${month}"
    comments_url="https://files.pushshift.io/reddit/comments/RC_${year}-${month_formatted}.zst"
    comments_file="${DIR}/comments/RC_${year}-${month_formatted}.zst"
    # Check the local file first so months already on disk cost no network
    # request; only then ask the server (--spider) whether the dump exists.
    if [[ ! -f "${comments_file}" ]] && wget --spider "${comments_url}" 2> /dev/null; then
      echo
      echo "Downloading ${comments_url}"
      # Download to .tmp and rename on success, so an interrupted transfer is
      # never mistaken for a complete file; -c lets a later run resume it.
      if ! { wget -q --show-progress -c "${comments_url}" -O "${comments_file}.tmp" \
        && mv "${comments_file}.tmp" "${comments_file}"; }; then
        echo "Failed to download ${comments_url}" >&2
      fi
    fi
    # Unpacking is currently disabled (commented out); kept for reference.
    #if [[ -f "${DIR}/comments/RC_${year}-${month_formatted}.zst" ]]; then
    #  if [[ ! -f "${DIR}/comments/RC_${year}-${month_formatted}" ]] && [[ ! -f "${DIR}/comments/RC_${year}-${month_formatted}.json" ]]; then
    #    echo
    #    echo "Unpacking comments to ${DIR}/comments/RC_${year}-${month_formatted}.json"
    #    rm -f "${DIR}/comments/RC_${year}-${month_formatted}"
    #    rm -f "${DIR}/comments/RC_${year}-${month_formatted}.json"
    #    zstd -q --progress --memory=2048MB -d "${DIR}/comments/RC_${year}-${month_formatted}.zst" --output-dir-flat "${DIR}/comments"
    #    mv "${DIR}/comments/RC_${year}-${month_formatted}" "${DIR}/comments/RC_${year}-${month_formatted}.json"
    #  fi
    #fi
  done
done
echo
echo "Checking submissions.."
mkdir -p "${DIR}/submissions"
# Submissions section: fetch every monthly RS_<year>-<month>.zst dump that is
# not already present in ${DIR}/submissions.
#
# Only years up to the current one are probed. If the year cannot be read as a
# number, fall back to the original fixed upper bound.
current_year=$(date +%Y)
[[ "${current_year}" =~ ^[0-9]+$ ]] || current_year=2105
for ((year = 2005; year <= current_year; year++)); do
  for month in {1..12}; do
    printf -v month_formatted '%02d' "${month}"
    submissions_url="https://files.pushshift.io/reddit/submissions/RS_${year}-${month_formatted}.zst"
    submissions_file="${DIR}/submissions/RS_${year}-${month_formatted}.zst"
    # Check the local file first so months already on disk cost no network
    # request; only then ask the server (--spider) whether the dump exists.
    if [[ ! -f "${submissions_file}" ]] && wget --spider "${submissions_url}" 2> /dev/null; then
      echo
      echo "Downloading ${submissions_url}"
      # Download to .tmp and rename on success, so an interrupted transfer is
      # never mistaken for a complete file; -c lets a later run resume it.
      if ! { wget -q --show-progress -c "${submissions_url}" -O "${submissions_file}.tmp" \
        && mv "${submissions_file}.tmp" "${submissions_file}"; }; then
        echo "Failed to download ${submissions_url}" >&2
      fi
    fi
    # Unpacking is currently disabled (commented out); kept for reference.
    #if [[ -f "${DIR}/submissions/RS_${year}-${month_formatted}.zst" ]]; then
    #  if [[ ! -f "${DIR}/submissions/RS_${year}-${month_formatted}" ]] && [[ ! -f "${DIR}/submissions/RS_${year}-${month_formatted}.json" ]]; then
    #    echo
    #    echo "Unpacking submissions to ${DIR}/submissions/RS_${year}-${month_formatted}.json"
    #    rm -f "${DIR}/submissions/RS_${year}-${month_formatted}"
    #    rm -f "${DIR}/submissions/RS_${year}-${month_formatted}.json"
    #    zstd -q --progress --memory=2048MB -d "${DIR}/submissions/RS_${year}-${month_formatted}.zst" --output-dir-flat "${DIR}/submissions"
    #    mv "${DIR}/submissions/RS_${year}-${month_formatted}" "${DIR}/submissions/RS_${year}-${month_formatted}.json"
    #  fi
    #fi
  done
done
echo
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment