pdf downloading utils
Gist by @asehmi, forked from pszemraj/download_URLs_in_file.py (created January 23, 2024)
import os
import argparse

import requests
from urllib.parse import urlparse
from tqdm import tqdm
from joblib import Parallel, delayed
from tenacity import retry, stop_after_attempt, wait_fixed


@retry(stop=stop_after_attempt(5), wait=wait_fixed(2))
def download_url(url, target_dir, force_redownload):
    """Download a single URL into target_dir, retrying up to 5 times with a 2 s pause."""
    filename = os.path.basename(urlparse(url).path)
    target_path = os.path.join(target_dir, filename)
    if os.path.exists(target_path) and not force_redownload:
        return  # File already exists and redownload not forced
    response = requests.get(url, stream=True, timeout=30)  # timeout so a stalled server cannot hang a worker
    response.raise_for_status()  # Raise an error for bad responses
    # Stream the body to disk in 8 KiB chunks to keep memory use flat
    with open(target_path, "wb") as target_file:
        for chunk in response.iter_content(chunk_size=8192):
            target_file.write(chunk)
def download_files_from_list(file_path, target_dir, n_jobs, force_redownload):
    os.makedirs(target_dir, exist_ok=True)
    # Load all URLs into a list, skipping blank lines
    with open(file_path, "r") as file:
        urls = [line.strip() for line in file if line.strip()]
    # Parallel download using joblib threads and tqdm for progress
    Parallel(n_jobs=n_jobs, prefer="threads")(
        delayed(download_url)(url, target_dir, force_redownload)
        for url in tqdm(urls, desc="Downloading files")
    )
def main():
    parser = argparse.ArgumentParser(
        description="Download files from a list of URLs in parallel."
    )
    parser.add_argument(
        "file_path", help="Path to a text file containing one URL per line."
    )
    parser.add_argument(
        "target_dir", help="Directory where the files should be downloaded."
    )
    parser.add_argument(
        "--n_jobs",
        type=int,
        default=-1,
        help="Number of parallel download workers. Default (-1) uses one per CPU core.",
    )
    parser.add_argument(
        "--force_redownload",
        action="store_true",
        help="Force re-download of files even if they already exist.",
    )
    args = parser.parse_args()
    download_files_from_list(
        args.file_path, args.target_dir, args.n_jobs, args.force_redownload
    )
    print(f"Files downloaded to: {args.target_dir}")


if __name__ == "__main__":
    main()
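
Example usage, assuming the script is saved as download_URLs_in_file.py (the filename of the fork source) and urls.txt holds one URL per line; both names are illustrative, not fixed by the gist:

    pip install requests tqdm joblib tenacity
    python download_URLs_in_file.py urls.txt ./pdfs --n_jobs 8
    python download_URLs_in_file.py urls.txt ./pdfs --force_redownload

The gist also includes a minimal bash alternative that downloads the same list sequentially with wget:
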
#!/bin/bash
FILE_PATH="$1"
TARGET_DIR="$2"

if [ -z "$FILE_PATH" ] || [ -z "$TARGET_DIR" ]; then
    echo "Usage: $0 <file_path> <target_dir>"
    exit 1
fi

# Create the target directory if it doesn't exist
mkdir -p "$TARGET_DIR"

# Download each URL in the list, one at a time
while IFS= read -r url; do
    wget -P "$TARGET_DIR" "$url"
done < "$FILE_PATH"

echo "Files downloaded to: $TARGET_DIR"