pdf downloading utils
Gist by @asehmi, forked from pszemraj/download_URLs_in_file.py (created January 23, 2024)
import os
import argparse

import requests
from urllib.parse import urlparse
from tqdm import tqdm
from joblib import Parallel, delayed
from tenacity import retry, stop_after_attempt, wait_fixed


@retry(stop=stop_after_attempt(5), wait=wait_fixed(2))
def download_url(url, target_dir, force_redownload):
    """Download a single URL into target_dir, retrying up to 5 times with a 2 s pause."""
    filename = os.path.basename(urlparse(url).path)
    target_path = os.path.join(target_dir, filename)
    if os.path.exists(target_path) and not force_redownload:
        return  # File already exists and redownload not forced
    response = requests.get(url, stream=True, timeout=30)  # timeout so a stalled server cannot hang a worker
    response.raise_for_status()  # Raise an error for bad responses
    # Stream the body to disk in 8 KiB chunks to keep memory use flat
    with open(target_path, "wb") as target_file:
        for chunk in response.iter_content(chunk_size=8192):
            target_file.write(chunk)
def download_files_from_list(file_path, target_dir, n_jobs, force_redownload):
    os.makedirs(target_dir, exist_ok=True)
    # Load all URLs into a list, skipping blank lines
    with open(file_path, "r") as file:
        urls = [line.strip() for line in file if line.strip()]
    # Parallel download using joblib threads and tqdm for progress
    Parallel(n_jobs=n_jobs, prefer="threads")(
        delayed(download_url)(url, target_dir, force_redownload)
        for url in tqdm(urls, desc="Downloading files")
    )
def main():
    parser = argparse.ArgumentParser(
        description="Download files from a list of URLs in parallel."
    )
    parser.add_argument(
        "file_path", help="Path to a text file containing one URL per line."
    )
    parser.add_argument(
        "target_dir", help="Directory where the files should be downloaded."
    )
    parser.add_argument(
        "--n_jobs",
        type=int,
        default=-1,
        help="Number of parallel download workers. Default (-1) uses one per CPU core.",
    )
    parser.add_argument(
        "--force_redownload",
        action="store_true",
        help="Force re-download of files even if they already exist.",
    )
    args = parser.parse_args()
    download_files_from_list(
        args.file_path, args.target_dir, args.n_jobs, args.force_redownload
    )
    print(f"Files downloaded to: {args.target_dir}")


if __name__ == "__main__":
    main()
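
Example usage, assuming the script is saved as download_URLs_in_file.py (the filename of the fork source) and urls.txt holds one URL per line; both names are illustrative, not fixed by the gist:

    pip install requests tqdm joblib tenacity
    python download_URLs_in_file.py urls.txt ./pdfs --n_jobs 8
    python download_URLs_in_file.py urls.txt ./pdfs --force_redownload

The gist also includes a minimal bash alternative that downloads the same list sequentially with wget:
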
#!/bin/bash
FILE_PATH="$1"
TARGET_DIR="$2"

if [ -z "$FILE_PATH" ] || [ -z "$TARGET_DIR" ]; then
    echo "Usage: $0 <file_path> <target_dir>"
    exit 1
fi

# Create the target directory if it doesn't exist
mkdir -p "$TARGET_DIR"

# Download each URL in the list, one at a time
while IFS= read -r url; do
    wget -P "$TARGET_DIR" "$url"
done < "$FILE_PATH"

echo "Files downloaded to: $TARGET_DIR"