-
-
Save modeco80/963187c38b8f40a4e3f074d5e2c903d4 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python3 | |
# Script to download a collection from archive.org using aria2c. | |
# Useful for if torrents aren't an option, but can be used on collections | |
# which provide them as well. | |
import xml.etree.ElementTree as ElementTree | |
import tempfile | |
import requests | |
import subprocess | |
import argparse | |
import urllib.parse | |
class IACollection():
    """An archive.org item whose files are fetched with aria2c.

    Typical use: construct with the item identifier, call
    PopulateFileList() to read the item's file listing, then call
    Download() to hand that listing to aria2c.
    """

    def __init__(self, collection: str):
        """Store the item identifier and start with an empty file list.

        Args:
            collection: Identifier used to build archive.org download URLs.
        """
        # Each entry is {'url': <download URL>, 'path': <name from the listing>}.
        self.files = []
        self.collection = collection

    def IADownloadUrl(self, path):
        """Return the archive.org download URL for `path` inside this item.

        The path is percent-encoded with urllib.parse.quote, whose default
        safe set keeps '/' so sub-directories remain part of the URL path.
        """
        return f'https://archive.org/download/{self.collection}/{urllib.parse.quote(path)}'

    def PopulateFileList(self):
        """Fetch `<collection>_files.xml` and append its entries to self.files.

        Raises:
            requests.HTTPError: if the listing request returns an error status.
            xml.etree.ElementTree.ParseError: if the response is not valid XML.
            KeyError: if a <file> element has no 'name' attribute.
        """
        xmlRes = requests.get(self.IADownloadUrl(f'{self.collection}_files.xml'))
        # Fail on 4xx/5xx here instead of trying to parse an error page as XML.
        xmlRes.raise_for_status()
        # Parse the raw bytes so the XML parser applies the document's own
        # encoding declaration rather than the charset guessed for `.text`.
        xmlRoot = ElementTree.fromstring(xmlRes.content)
        # findall('file') matches direct children only, like the tag check it replaces.
        for xmlChild in xmlRoot.findall('file'):
            name = xmlChild.attrib['name']
            self.files.append({
                'url': self.IADownloadUrl(name),
                'path': name
            })

    def _BuildAria2Input(self):
        """Return the aria2c input-file contents for self.files as bytes.

        Each entry is the URL on one line followed by an indented
        ` out=<path>` option line, so aria2c saves it under the listed path.
        """
        return ''.join(
            f'{file["url"]}\n out={file["path"]}\n' for file in self.files
        ).encode()

    def Download(self):
        """Write the file list to a temporary file and run aria2c on it.

        A failure to start aria2c, or a non-zero exit status from it, is
        reported with a message on stdout rather than raised. The temporary
        list file is removed afterwards in either case.
        """
        # Function-scope import: `os` is only needed here to remove the list file.
        import os

        # delete=False lets us close the file before aria2c opens it; a
        # NamedTemporaryFile that is still open cannot be reopened by name on
        # Windows. We remove it ourselves in the `finally` below.
        tmpList = tempfile.NamedTemporaryFile(delete=False)
        try:
            with tmpList:
                tmpList.write(self._BuildAria2Input())

            # "--follow-torrent=false" is intended to stop aria2 from using the
            # download list to start downloading from the torrent, and
            # "--continue=true" allows continuation of partial downloads.
            aria2Args = [
                'aria2c',
                '-x', '6',
                '--follow-torrent=false',
                '--continue=true',
                f'--input-file={tmpList.name}',
            ]
            aria2CommandLine = ' '.join(aria2Args)
            print(f'Executing aria2c with "{aria2CommandLine}"')
            try:
                # An argument list (no shell) keeps a temp path containing
                # spaces as a single argument.
                subprocess.run(aria2Args, capture_output=False, check=True)
            except (OSError, subprocess.CalledProcessError):
                # OSError: aria2c missing or not executable.
                # CalledProcessError: aria2c exited with a non-zero status.
                print('aria2c failed to run :(')
        finally:
            os.unlink(tmpList.name)
def main():
    """Parse the command line, then list and download the requested collection."""
    argParser = argparse.ArgumentParser(
        description='Download a Internet Archive collection using aria2c.'
    )
    argParser.add_argument(
        '--collection',
        dest='collection',
        required=True,
        type=str,
        help='Collection to download from. Duh!',
    )
    options = argParser.parse_args()

    # Fetch the item's file listing first, then hand it to aria2c.
    target = IACollection(options.collection)
    target.PopulateFileList()
    target.Download()
# Run the downloader only when executed as a script, not when imported.
if '__main__' == __name__:
    main()
Modify line 40 to add --continue=true
... oh, so that's how you do that 😅 (I've actually been wondering... oops)
You would think downloading using the torrent would work, but due to the way archive.org is set up with torrents, any files that are added after the initial upload will not be added to the torrent, so you will often be missing substantial numbers of files if you download through the torrent.
Wow, seriously? I wrote this for the opposite reason (I wanted to use torrents, but a specific collection I wanted to download explicitly barred it, and a manual procedure I had written down was getting annoying, so I decided to write a quick and dirty Python script to do what I was doing by hand automatically (without requiring an API key, like some things seem to.. for no reason), with some improvements, like preserving folder structure), but.. considering that, I guess this script is more useful than I thought it would be?
Great work.
No problem! It's something I released because I thought it might be useful for other people, so it's really nice to actually see people using it.
So I'll admit, I am pretty ignorant when it comes to Python, but after returning from a vacation during which my PC was off, I found that this script had stopped working properly for me, with both the original version and the new one. I tracked the error down to the system either creating and deleting the temp file before it could be used, or not creating it at all. I fixed this by making the following modification on line 33:
with tempfile.NamedTemporaryFile(delete=False) as tmpList:
It was originally:
with tempfile.NamedTemporaryFile() as tmpList:
So if anyone gets an error saying that there are no files, check the temp file directory/file that it lists to see if it exists. If it does not, it might be indicative that this could be your solution.
Thank you very much for this, I just wanted to add this possible revision/update/option for people. Modify line 40 to add --continue=true:
aria2CommandLine = f'aria2c -x 6 --follow-torrent=false --continue=true --input-file={tmpList.name}'
This will allow aria2c to resume your files where you left off if you are downloading a large number of files. It will skip the completed files (those without .aria2 files) and will resume those with .aria2 files. If you don't have this on, and your downloads fail and you need to resume, this script will start from the beginning and save the new copies with a .1 suffix.
I haven't tried this modification with starting a new download, but I would assume it would work fine.
Thanks again, Modeco80, for the base script; it's working great!
For those who are unaware, downloading from archive.org is a challenge, especially with large numbers of files. You would think downloading using the torrent would work, but due to the way archive.org is set up with torrents, any files that are added after the initial upload will not be added to the torrent, so you will often be missing substantial numbers of files if you download through the torrent. This script lets you get all files directly and use aria2c's multiple connections to download. Great work.