Created
April 3, 2023 08:12
-
-
Save trdthg/908d3d5003835945f24ae635e5a7ade3 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from bs4 import BeautifulSoup | |
import requests | |
import os | |
import wget | |
import json | |
from urllib.parse import unquote | |
import urllib.request | |
# Root of the remote directory listing that gets mirrored.
url = "https://c.rcex.live:8/SF/HiresMusic/"

# URLs of directories already downloaded in full; mirrored to completed.json
# by the helper functions below so progress can be picked up again.
completedList = []
# download mp3 files recursively from the url | |
# and save them to the local directory | |
def download(url, path):
    """Recursively mirror the directory listing at *url* into *path*.

    The listing page is fetched and every anchor matched by the CSS selector
    below is treated as an entry: hrefs ending in ``/`` are sub-directories
    (recursed into), everything else is a file (downloaded).

    Both *url* and *path* are combined with the entry name by plain string
    concatenation, so both are expected to end with a separator.

    Args:
        url: URL of the directory listing page.
        path: Local directory prefix that receives the mirrored content.

    Returns:
        True when the listing and all of its sub-directories were processed;
        False when this listing (or one below it) could not be fetched, so
        that the caller does not record the directory as completed.
    """
    fork = requests.get(url)
    if not fork.ok:
        # An error page would otherwise parse as an "empty" listing and the
        # directory would wrongly be marked as completely downloaded.
        print(f'failed to list {url}: HTTP {fork.status_code}')
        return False
    fork.encoding = 'utf-8'
    soup = BeautifulSoup(fork.text, 'html.parser')

    # Anchors in the second cell of every table row from the 4th onwards,
    # excluding the last row (presumably header/parent/footer rows are
    # skipped this way -- confirm against the server's listing markup).
    selector = 'tr:nth-child(n+4):not(:last-child) > td:nth-child(2) > a'

    succeeded = True
    for link in soup.select(selector):
        href = link.get('href')
        if not href:
            # Anchor without a target: nothing to download.
            continue
        # The href is percent-encoded; the local name uses the decoded form.
        text = unquote(href)

        if href.endswith('/'):
            # Sub-directory: create it locally, then recurse unless it was
            # already recorded as complete.
            os.makedirs(path + text, exist_ok=True)
            if testDirectoryNeedsDownloading(url + href):
                if download(url + href, path + text):
                    handleDirectoryDownloadComplete(url + href)
                else:
                    # Propagate so that this directory is not marked
                    # complete while one of its children is missing.
                    succeeded = False
            else:
                print(f'directory {url + href} already downloaded, skipping')
            continue

        # Regular file.
        fullURL = url + href
        fullPath = path + text
        if os.path.exists(fullPath):
            print(f'file {fullPath} exists, skipping')
            continue
        print(fullURL, fullPath)

        # Download to a temporary name and rename afterwards, so that an
        # interrupted transfer never leaves a partial file under the final
        # name (which the existence check above would then skip).
        tmpPath = fullPath + ".tmp"
        if os.path.exists(tmpPath):
            os.remove(tmpPath)
        urllib.request.urlretrieve(fullURL, tmpPath)
        os.rename(tmpPath, fullPath)

    return succeeded
def handleDirectoryDownloadComplete(s):
    """Record directory URL *s* as completed and persist the list.

    Appends *s* to the module-level ``completedList`` and rewrites
    ``completed.json`` (in the current working directory) with the whole
    list.

    Args:
        s: URL of the directory that has been downloaded.
    """
    global completedList
    completedList.append(s)
    # Use a context manager so the file is flushed and closed immediately
    # instead of whenever the file object happens to be garbage-collected.
    with open('completed.json', 'w') as f:
        json.dump(completedList, f)
def testDirectoryNeedsDownloading(s): | |
global completedList | |
if len(completedList) == 0: | |
completedList = json.load(open('completed.json')) | |
if s in completedList: | |
return False | |
return True | |
if __name__ == '__main__':
    # Script entry point: mirror the remote tree rooted at `url` into the
    # local download folder.
    download(url, '/mnt/c/Users/trdth/Downloads/tmp/')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment