Last active
July 13, 2023 00:57
-
-
Save modeco80/963187c38b8f40a4e3f074d5e2c903d4 to your computer and use it in GitHub Desktop.
Script to download an Internet Archive collection using aria2c.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
# Script to download a collection from archive.org using aria2c. | |
# Useful for if torrents aren't an option, but can be used on collections | |
# which provide them as well. | |
import xml.etree.ElementTree as ElementTree | |
import tempfile | |
import requests | |
import subprocess | |
import argparse | |
import urllib.parse | |
class IACollection():
    """An archive.org collection (item) and the list of files it contains.

    Typical use is ``PopulateFileList()`` followed by ``Download()``.

    Attributes:
        collection: The archive.org identifier given to the constructor.
        files: A list of ``{'url': ..., 'path': ...}`` dicts, one per file,
            filled in by ``PopulateFileList()``.
    """

    def __init__(self, collection: str):
        """Create an empty file list for the given collection identifier."""
        self.files = []
        self.collection = collection

    def IADownloadUrl(self, path):
        """Return the archive.org download URL for `path` inside this collection.

        The path is percent-encoded; '/' is left as-is so sub-directories work.
        """
        return f'https://archive.org/download/{self.collection}/{urllib.parse.quote(path)}'

    def PopulateFileList(self):
        """Fetch `<collection>_files.xml` and append every listed file to `self.files`.

        Raises:
            requests.RequestException: if the request fails, times out, or the
                server answers with an HTTP error status.
            xml.etree.ElementTree.ParseError: if the response is not valid XML.
        """
        xmlRes = requests.get(self.IADownloadUrl(f'{self.collection}_files.xml'), timeout=60)
        # Fail loudly on 404/5xx instead of feeding an error page to the XML parser.
        xmlRes.raise_for_status()
        # Hand the parser the raw bytes so it honours the document's own encoding
        # declaration rather than the charset guessed from the HTTP headers.
        self._AddFilesFromXml(xmlRes.content)

    def _AddFilesFromXml(self, xmlData):
        """Parse a `_files.xml` document and append its <file> entries to `self.files`."""
        xmlRoot = ElementTree.fromstring(xmlData)
        # Only direct <file> children of the root are considered.
        for xmlChild in xmlRoot.iterfind('file'):
            name = xmlChild.attrib['name']
            self.files.append({
                'url': self.IADownloadUrl(name),
                'path': name,
            })

    def _Aria2InputText(self):
        """Return the aria2c input-file text: a URI line, then an indented `out=` option line."""
        return ''.join(f'{file["url"]}\n out={file["path"]}\n' for file in self.files)

    @staticmethod
    def _Aria2Command(listPath):
        """Return the aria2c argument list that downloads everything listed in `listPath`.

        "--follow-torrent=false" is intended to stop aria2 from using the download
        list to start downloading from the torrent, and "--continue=true" allows
        continuation of partial downloads.
        """
        return [
            'aria2c',
            '-x', '6',
            '--follow-torrent=false',
            '--continue=true',
            f'--input-file={listPath}',
        ]

    def Download(self):
        """Download every file in `self.files` by running aria2c on a temporary list file.

        A failure to start aria2c, or a non-zero exit status, is reported on
        stdout and otherwise ignored (best effort, no exception is raised for it).
        """
        import os

        # delete=False plus an explicit unlink: the list file must be closed
        # before aria2c opens it, because an open NamedTemporaryFile cannot be
        # reopened by another process on Windows.
        tmpList = tempfile.NamedTemporaryFile(delete=False)
        try:
            with tmpList:
                tmpList.write(self._Aria2InputText().encode('utf-8'))

            aria2Command = self._Aria2Command(tmpList.name)
            aria2CommandLine = ' '.join(aria2Command)
            print(f'Executing aria2c with "{aria2CommandLine}"')
            try:
                # Argument list, no shell: the temp path is never re-parsed by a shell.
                subprocess.run(aria2Command, check=True)
            except OSError as err:
                # aria2c is missing or not executable.
                print(f'aria2c failed to run :( ({err})')
            except subprocess.CalledProcessError as err:
                print(f'aria2c failed to run :( (exit status {err.returncode})')
        finally:
            os.unlink(tmpList.name)
def main():
    """Parse the command line, then list and download the requested collection."""
    argParser = argparse.ArgumentParser(
        description='Download a Internet Archive collection using aria2c.',
    )
    argParser.add_argument(
        '--collection',
        dest='collection',
        required=True,
        type=str,
        help='Collection to download from. Duh!',
    )
    parsedArgs = argParser.parse_args()

    # Build the file list first; Download() only acts on what was populated.
    iaCollection = IACollection(parsedArgs.collection)
    iaCollection.PopulateFileList()
    iaCollection.Download()


# Run only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
I'll admit I am fairly ignorant when it comes to Python, but after returning from a vacation during which my PC was off, I found that this script had stopped working properly for me, with both the original version and the new one. I tracked the error down to the system either creating and then deleting the temp file before it could be used, or not creating it at all. I fixed this by making the following modification on line 33:
with tempfile.NamedTemporaryFile(delete=False) as tmpList:
It was originally:
with tempfile.NamedTemporaryFile() as tmpList:
So if anyone gets an error saying that there are no files, check whether the temp file it lists actually exists in the temp directory. If it does not, that is a sign this modification may be your solution.