-
-
Save modeco80/963187c38b8f40a4e3f074d5e2c903d4 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python3 | |
# Script to download a collection from archive.org using aria2c.
# Useful when torrents aren't an option, but it also works on collections
# that provide them.
import xml.etree.ElementTree as ElementTree | |
import tempfile | |
import requests | |
import subprocess | |
import argparse | |
import urllib.parse | |
class IACollection():
    """One archive.org collection (item) and the list of files it contains.

    Typical use: construct with the collection identifier, call
    PopulateFileList() to fetch the file listing, then Download() to hand
    the list to aria2c.
    """

    def __init__(self, collection: str):
        # Each entry is a dict with 'url' (download URL) and 'path'
        # (file name relative to the collection); filled by PopulateFileList().
        self.files = []
        self.collection = collection

    def IADownloadUrl(self, path):
        """Return the archive.org download URL for `path` inside this collection.

        `path` is percent-encoded with urllib.parse.quote, which leaves '/'
        intact so sub-directory structure survives.
        """
        return f'https://archive.org/download/{self.collection}/{urllib.parse.quote(path)}'

    def PopulateFileList(self):
        """Fetch `<collection>_files.xml` and append every listed file to self.files.

        Raises:
            requests.RequestException: on network failure, timeout, or a
                non-success HTTP status.
            xml.etree.ElementTree.ParseError: if the response is not valid XML.
        """
        xmlRes = requests.get(self.IADownloadUrl(f'{self.collection}_files.xml'), timeout=60)
        # Fail loudly on 404/5xx instead of trying to parse an error page as XML.
        xmlRes.raise_for_status()
        # Parse the raw bytes so the XML declaration decides the encoding,
        # rather than whatever charset requests guessed for `.text`.
        xmlRoot = ElementTree.fromstring(xmlRes.content)
        # findall('file') matches direct children of the root only.
        for xmlChild in xmlRoot.findall('file'):
            name = xmlChild.attrib.get('name')
            if not name:
                # An entry without a name cannot be turned into a URL; skip it.
                continue
            self.files.append({
                'url': self.IADownloadUrl(name),
                'path': name
            })

    def Download(self):
        """Download every file in self.files by running aria2c on a generated input list.

        Does nothing (besides printing a notice) when the file list is empty.
        Failures to start aria2c, or a non-zero exit from it, are reported on
        stdout and not re-raised.
        """
        if not self.files:
            print('No files to download.')
            return

        # The list is written into a temporary directory and closed before
        # aria2c starts. A still-open NamedTemporaryFile cannot be reopened by
        # another process on some platforms (notably Windows), which made
        # aria2c report a missing or empty input file.
        with tempfile.TemporaryDirectory() as tmpDir:
            listPath = f'{tmpDir}/aria2c-input.txt'
            with open(listPath, 'w', encoding='utf-8', newline='\n') as tmpList:
                for file in self.files:
                    # aria2c input-file format: the URI on one line, then its
                    # options on following lines indented by whitespace.
                    tmpList.write(f'{file["url"]}\n out={file["path"]}\n')

            # "--follow-torrent=false" is intended to stop aria2 from using the
            # download list to start downloading from the torrent, and
            # "--continue=true" allows continuation of partial downloads.
            # An argument list (no shell) keeps temp paths containing spaces
            # or shell metacharacters intact.
            aria2Args = [
                'aria2c',
                '-x', '6',
                '--follow-torrent=false',
                '--continue=true',
                f'--input-file={listPath}',
            ]
            aria2CommandLine = ' '.join(aria2Args)
            print(f'Executing aria2c with "{aria2CommandLine}"')
            try:
                subprocess.run(aria2Args, check=True)
            except (subprocess.CalledProcessError, OSError):
                # CalledProcessError: aria2c exited non-zero.
                # OSError: aria2c is missing or could not be executed.
                print('aria2c failed to run :(')
def main():
    """Parse the command line, then list and download the requested collection."""
    cli = argparse.ArgumentParser(
        description='Download a Internet Archive collection using aria2c.'
    )
    cli.add_argument(
        '--collection',
        dest='collection',
        required=True,
        type=str,
        help='Collection to download from. Duh!',
    )
    options = cli.parse_args()

    # Fetch the file listing first, then hand everything to aria2c.
    target = IACollection(options.collection)
    target.PopulateFileList()
    target.Download()
# Run the downloader only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
So I'll admit, I am pretty ignorant when it comes to Python, but after returning from a vacation during which my PC was off, I found that this script had stopped working properly for me, with both the original version and the new one. I tracked the error down to the system either creating the temp file and deleting it before it could be used, or not creating it at all. I fixed this by making the following modification on line 33:
with tempfile.NamedTemporaryFile(delete=False) as tmpList:
It was originally:
with tempfile.NamedTemporaryFile() as tmpList:
So if anyone gets an error saying that there are no files, check whether the temp directory/file that it lists actually exists. If it does not, this modification may be your solution.
... oh, so that's how you do that 😅 (I've actually been wondering... oops)
Wow, seriously? I wrote this for the opposite reason: I wanted to use torrents, but a specific collection I wanted to download explicitly barred them, and the manual procedure I had written down was getting annoying. So I decided to write a quick and dirty Python script to automate what I was doing by hand (without requiring an API key, like some things seem to... for no reason), with some improvements, like preserving folder structure. Considering that, I guess this script is more useful than I thought it would be?
No problem! It's something I released because I thought it might be useful for other people, so it's really nice to actually see people using it.