Created
September 25, 2021 16:35
-
-
Save marco-schmidt/06fb9a47a0a1fce35cdb1e1f77b22a91 to your computer and use it in GitHub Desktop.
Download Internet Archive Twitter stream archives
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
# Purpose: Download a month of archive.org Twitter stream tar files.
# Author: Marco Schmidt
# Usage: ./download_twitter.sh [DEST_DIR [YEAR [MONTH [PATHYEAR [PATHMONTH]]]]]
#   DEST_DIR   root directory for downloads (default: current directory)
#   YEAR       year used in the tar file names (default: 2020)
#   MONTH      month used in the tar file names, 1-12, a leading zero is
#              accepted (default: 1)
#   PATHYEAR   year used in the archive.org item name (default: YEAR)
#   PATHMONTH  month used in the archive.org item name (default: MONTH)
# Requirements: (1) tools bash, date (with -d support), mkdir, nohup, sleep, wget
#               (2) network connectivity to archive.org via https
#               (3) enough free disk space
# Examples: (1)
#   regular case, the address contains the same month/year in path and file name
#   see https://archive.org/details/archiveteam-twitter-stream-2020-01
#   ./download_twitter.sh /mnt/vol3/twitter 2020 1 2020 1
#           (2)
#   the files for November 2019 were put into the directory for November 2018
#   see https://archive.org/details/archiveteam-twitter-stream-2018-11
#   ./download_twitter.sh /mnt/vol3/twitter 2019 11 2018 11
# Remarks: Why wget and not curl?
#   The script is supposed to support continuing downloads of partially
#   downloaded files.
#   When running the script on a directory where all downloads are complete,
#   curl -c - would append html code with an HTTP 416 error message to the
#   downloaded tar files, see https://github.com/curl/curl/issues/1163

# e=exit on error, u=exit on unset variable, x=print expanded command before executing it
set -eux

#######################################
# Validate a month number and print it zero-padded to two digits.
# Arguments: $1 - month as 1..12, optionally with a leading zero ("08")
# Outputs:   two-digit month on stdout; error message on stderr
# Returns:   0 on success, 1 if the value is not a month number
#######################################
normalize_month() {
  local raw=$1
  # 10# forces base 10; without it "08" and "09" are rejected as invalid octal
  if [[ ! "${raw}" =~ ^[0-9]{1,2}$ ]] || ((10#${raw} < 1 || 10#${raw} > 12)); then
    printf 'error: invalid month: %s\n' "${raw}" >&2
    return 1
  fi
  printf '%02d' "$((10#${raw}))"
}

# get script parameters or default values
rootdir=${1-.}
year=${2-2020}
month=$(normalize_month "${3-1}")
# the archive.org item normally carries the same year/month as the file names,
# so fall back to those when the path arguments are omitted
pathyear=${4-${year}}
pathmonth=$(normalize_month "${5-${month}}")

# show which version of wget is installed, fail otherwise
wget --version

# determine directory to hold downloaded tar files
destdir="$rootdir/$year/$month"
# if necessary create that directory
mkdir -p "${destdir}"
# change to that directory
pushd "${destdir}"

# determine number of days in given month
days=$(date -d "$year/$month/1 + 1 month - 1 day" "+%d")

# start one download per day in the month in the background
firstday=1
lastday=$((10#${days}))
urlprefix="https://archive.org/download/archiveteam-twitter-stream-${pathyear}-${pathmonth}/twitter_stream_"
for ((i = firstday; i <= lastday; i++)); do
  # add a leading zero character if day number smaller 10
  printf -v day '%02d' "${i}"
  # start download in the background so that closing the current shell will not terminate it;
  # a failing download does not abort the script because the job is not waited for
  nohup wget -c -q "${urlprefix}${year}_${month}_${day}.tar" &>/dev/null &
  # wait a little until next loop iteration
  sleep 10
done

# go back to directory where script was started
popd
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment