@phette23
Last active Mar 19, 2021
bulk download Archive-It WARC files


https://support.archive-it.org/hc/en-us/articles/360015225051-Find-and-download-your-WARC-files-with-WASAPI
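For context, the WASAPI endpoint linked above returns JSON describing the collection's web data files. A rough sketch of the shape the script relies on (the filenames and URLs here are made up; only the `files`/`locations` field names are taken from the script's own jq filter, and jq must be installed, as the script already assumes):

```shell
# Fabricated sample of the WASAPI response structure.
cat > sample.json <<'EOF'
{
  "files": [
    {"filename": "EXAMPLE-1.warc.gz",
     "locations": ["https://warcs.archive-it.org/webdatafile/EXAMPLE-1.warc.gz"]},
    {"filename": "EXAMPLE-2.warc.gz",
     "locations": ["https://warcs.archive-it.org/webdatafile/EXAMPLE-2.warc.gz"]}
  ]
}
EOF
# The same jq expression the script uses: one download URL per file,
# taking only the first mirror from each "locations" array.
jq -r '.files[].locations[0]' sample.json
```
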

If you fill in the correct credentials at the top of the download.fish script, it will: write the Archive-It API data to a JSON file, write the WARC URLs to a separate text file, download the first batch of WARCs (the count is set by the LIMIT variable, 8 below), rename them (removing a long ".tmp" suffix), and record each downloaded URL in a separate text file.
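The "cut the first line of the URLs file to the done file" bookkeeping is the part that makes re-runs resumable. A minimal sketch of that pattern in isolation (illustrative file names, not the script's real data):

```shell
# A text file acts as a work queue: head takes the next item, and after a
# successful "download" sed drops it from the queue, so the file always
# holds only the remaining work.
printf 'url-one\nurl-two\nurl-three\n' > urls.txt

URL=$(head -n 1 urls.txt)   # next item to process
echo "$URL" >> done.txt     # record it as finished
sed -i.bak '1d' urls.txt    # remove it from the queue (GNU sed syntax)

cat urls.txt                # url-two and url-three remain
```

Running this repeatedly walks through the queue one line per run, which is why the script can be safely restarted after a failed download.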

I wanted to use the (deprecated) gdrive tool https://github.com/prasmussen/gdrive to upload the files afterwards, but when I run it for the first time Google blocks it from accessing my account.

#!/usr/bin/env fish
set USER username
set PASS password
set COLLECTION 123456
set JSONFILE data.json
set URLSFILE urls.txt
set DONEFILE done.txt
set LIMIT 8
# download JSON data from WASAPI then write all WARC URLs to file
if test ! -f $JSONFILE
    curl -u $USER:$PASS "https://warcs.archive-it.org/wasapi/v1/webdata?collection=$COLLECTION" > $JSONFILE
    jq -r '.files[].locations[0]' $JSONFILE > $URLSFILE
end
# go through them $LIMIT at a time, save finished URLs to done.txt
for INDEX in (seq 1 $LIMIT)
    set_color --bold red
    echo "Downloading file $INDEX out of $LIMIT"
    set URL (head -n 1 $URLSFILE)
    echo -e $URL '\n'
    set_color normal
    wget --http-user=$USER --http-password=$PASS --accept txt,gz $URL
    if test $status -eq 0
        # "cut" first line of URLSFILE to DONEFILE
        echo $URL >> $DONEFILE
        # perl rename: strip the "?...tmp" suffix wget leaves on the file
        rename -v 's/\?.*tmp//' *.tmp
        # BSD/macOS sed syntax; with GNU sed use `sed -i.bak '1d' $URLSFILE`
        sed -i '.bak' '1d' $URLSFILE
    else
        echo -e "Error downloading\n$URL"
        exit 1
    end
end
set_color --bold red
echo -e "\nProgress:"
wc -l $URLSFILE && wc -l $DONEFILE
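One portability note: the `rename` in the script is the perl rename utility, which is not installed everywhere. The same cleanup (dropping the `?...tmp` query-string suffix wget leaves on the downloaded file) can be done with a plain loop; the file name below is illustrative:

```shell
# Simulate a download artifact like the ones wget produces here.
touch 'ARCHIVEIT-123-EXAMPLE.warc.gz?version=1.tmp'

# Strip everything from the first '?' onward, mirroring s/\?.*tmp//.
for f in *.tmp; do
    mv "$f" "${f%%\?*}"
done

ls ARCHIVEIT-123-EXAMPLE.warc.gz
```
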