Created
February 21, 2023 16:30
-
-
Save egg82/47f42fc611e9afcb9d4562c224d8648e to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env bash

# Resolve DIR to the physical directory that contains this script, following
# any chain of symlinks, so that downloads are stored next to the real file.
SOURCE=${BASH_SOURCE[0]}
while [[ -L "$SOURCE" ]]; do # resolve $SOURCE until the file is no longer a symlink
  DIR=$(cd -P "$(dirname "$SOURCE")" >/dev/null 2>&1 && pwd)
  SOURCE=$(readlink "$SOURCE")
  # A relative symlink target is relative to the directory holding the symlink.
  [[ $SOURCE != /* ]] && SOURCE=$DIR/$SOURCE
done
DIR=$(cd -P "$(dirname "$SOURCE")" >/dev/null 2>&1 && pwd)

# If the directory could not be entered, DIR is empty and every "${DIR}/..."
# path below would point at the filesystem root; stop instead.
if [[ -z "$DIR" ]]; then
  echo "Unable to determine the directory of ${SOURCE}" >&2
  exit 1
fi
echo
echo "Downloading to ${DIR}"
echo
echo "Checking authors.."

# Refresh the authors archive when it is missing or when find reports it as
# older than the -ctime +60 threshold, then unpack it if it was refreshed or
# has never been unpacked.
authors_archive="${DIR}/authors.ndjson.zst"
downloaded=false
if [[ ! -f "$authors_archive" ]] || [[ $(find "$authors_archive" -ctime +60 -print) ]]; then
  echo
  echo "Downloading authors to ${authors_archive}"
  # Download under a temporary name (-c resumes a partial .tmp file) and move
  # it into place only after wget succeeds, so a failed refresh keeps the
  # previous archive instead of deleting it up front.
  if wget -q --show-progress -c https://files.pushshift.io/reddit/authors/authors.ndjson.zst -O "${authors_archive}.tmp" \
    && mv -f "${authors_archive}.tmp" "$authors_archive"; then
    downloaded=true
  else
    echo "Failed to download ${authors_archive}" >&2
  fi
fi

if [[ -f "$authors_archive" ]]; then
  if [[ ! -f "${DIR}/authors.ndjson" ]] || $downloaded; then
    echo
    echo "Unpacking authors to ${DIR}/authors.ndjson"
    # Remove any previous output first so zstd writes a fresh file.
    rm -f "${DIR}/authors.ndjson"
    zstd -q --progress --memory=2048MB -d "$authors_archive" --output-dir-flat "${DIR}"
  fi
fi
echo
echo "Checking comments.."
mkdir -p "${DIR}/comments"

# Download every monthly comments archive (RC_YYYY-MM.zst) that the server
# offers and that is not already present in ${DIR}/comments.
# Archives for future years cannot exist, so probing stops at the current
# calendar year.
current_year=$(date +%Y)
for ((year = 2005; year <= current_year; year++)); do
  for month in {1..12}; do
    printf -v month_formatted '%02d' "${month}"
    url="https://files.pushshift.io/reddit/comments/RC_${year}-${month_formatted}.zst"
    archive="${DIR}/comments/RC_${year}-${month_formatted}.zst"

    # Test for the local copy first: the --spider probe costs a network round
    # trip and only matters for archives that still need to be fetched.
    if [[ ! -f "$archive" ]] && wget --spider "$url" 2>/dev/null; then
      echo
      echo "Downloading ${url}"
      # Fetch under a temporary name (-c resumes a partial .tmp file) and move
      # it into place only once the transfer has succeeded.
      wget -q --show-progress -c "$url" -O "${archive}.tmp" && mv -f "${archive}.tmp" "$archive"
    fi

    # Optional unpack step, left disabled.
    #if [[ -f "${DIR}/comments/RC_${year}-${month_formatted}.zst" ]]; then
    #  if [[ ! -f "${DIR}/comments/RC_${year}-${month_formatted}" ]] && [[ ! -f "${DIR}/comments/RC_${year}-${month_formatted}.json" ]]; then
    #    echo
    #    echo "Unpacking comments to ${DIR}/comments/RC_${year}-${month_formatted}.json"
    #    rm -f "${DIR}/comments/RC_${year}-${month_formatted}"
    #    rm -f "${DIR}/comments/RC_${year}-${month_formatted}.json"
    #    zstd -q --progress --memory=2048MB -d "${DIR}/comments/RC_${year}-${month_formatted}.zst" --output-dir-flat "${DIR}/comments"
    #    mv "${DIR}/comments/RC_${year}-${month_formatted}" "${DIR}/comments/RC_${year}-${month_formatted}.json"
    #  fi
    #fi
  done
done
echo
echo "Checking submissions.."
mkdir -p "${DIR}/submissions"

# Download every monthly submissions archive (RS_YYYY-MM.zst) that the server
# offers and that is not already present in ${DIR}/submissions.
# Archives for future years cannot exist, so probing stops at the current
# calendar year.
current_year=$(date +%Y)
for ((year = 2005; year <= current_year; year++)); do
  for month in {1..12}; do
    printf -v month_formatted '%02d' "${month}"
    url="https://files.pushshift.io/reddit/submissions/RS_${year}-${month_formatted}.zst"
    archive="${DIR}/submissions/RS_${year}-${month_formatted}.zst"

    # Test for the local copy first: the --spider probe costs a network round
    # trip and only matters for archives that still need to be fetched.
    if [[ ! -f "$archive" ]] && wget --spider "$url" 2>/dev/null; then
      echo
      echo "Downloading ${url}"
      # Fetch under a temporary name (-c resumes a partial .tmp file) and move
      # it into place only once the transfer has succeeded.
      wget -q --show-progress -c "$url" -O "${archive}.tmp" && mv -f "${archive}.tmp" "$archive"
    fi

    # Optional unpack step, left disabled.
    #if [[ -f "${DIR}/submissions/RS_${year}-${month_formatted}.zst" ]]; then
    #  if [[ ! -f "${DIR}/submissions/RS_${year}-${month_formatted}" ]] && [[ ! -f "${DIR}/submissions/RS_${year}-${month_formatted}.json" ]]; then
    #    echo
    #    echo "Unpacking submissions to ${DIR}/submissions/RS_${year}-${month_formatted}.json"
    #    rm -f "${DIR}/submissions/RS_${year}-${month_formatted}"
    #    rm -f "${DIR}/submissions/RS_${year}-${month_formatted}.json"
    #    zstd -q --progress --memory=2048MB -d "${DIR}/submissions/RS_${year}-${month_formatted}.zst" --output-dir-flat "${DIR}/submissions"
    #    mv "${DIR}/submissions/RS_${year}-${month_formatted}" "${DIR}/submissions/RS_${year}-${month_formatted}.json"
    #  fi
    #fi
  done
done
echo
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment