Skip to content

Instantly share code, notes, and snippets.

@trdthg
Created April 3, 2023 08:12
Show Gist options
  • Save trdthg/908d3d5003835945f24ae635e5a7ade3 to your computer and use it in GitHub Desktop.
Save trdthg/908d3d5003835945f24ae635e5a7ade3 to your computer and use it in GitHub Desktop.
from bs4 import BeautifulSoup
import requests
import os
import wget
import json
from urllib.parse import unquote
import urllib.request
# Directory URLs already mirrored completely; saved to and loaded from
# completed.json by the helper functions below.
completedList = []
# Listing URL handed to download() when the file is run as a script.
url = "https://c.rcex.live:8/SF/HiresMusic/"
def _is_child_entry(href):
    """Return True if ``href`` is a relative link to an entry of the current listing."""
    if not href or href.startswith(('/', '?', '#')) or '://' in href:
        return False
    # Reject parent-directory hops: the names come from the remote server and
    # are appended to both the URL and the local path, so ".." could make the
    # crawl loop upward or write outside the destination directory.
    return '..' not in unquote(href).split('/')


def _fetch_file(file_url, dest):
    """Download ``file_url`` to ``dest`` via a ``.tmp`` file that is renamed on success."""
    partial = dest + ".tmp"
    # A leftover .tmp is presumably from an interrupted earlier run; discard
    # it so the download starts from scratch.
    if os.path.exists(partial):
        os.remove(partial)
    urllib.request.urlretrieve(file_url, partial)
    # Only a finished download ever gets the final name, which is what makes
    # the "file exists, skipping" check in download() safe.
    os.rename(partial, dest)


def download(url, path):
    """Recursively mirror the directory listing at ``url`` into ``path``.

    Each link selected from the listing page is handled by its href:

    * href ends with ``/``: treated as a sub-directory. The local directory is
      created, and unless the directory URL is already recorded as complete
      it is downloaded recursively and then recorded.
    * otherwise: treated as a file, downloaded unless it already exists
      locally.

    Entry names are appended to ``url`` and ``path`` by plain string
    concatenation, so both are expected to end with a separator.

    Args:
        url: URL of the HTML directory listing to crawl.
        path: Local directory prefix that receives the entries.

    Returns:
        True once every entry of the listing has been processed.

    Raises:
        requests.HTTPError: If the listing request returns an error status.
        Errors raised by the file download itself propagate unchanged.
    """
    # Without a timeout a stalled server would hang the crawl forever.
    response = requests.get(url, timeout=60)
    # An error page must not be parsed as an "empty" listing: the caller
    # would then record this directory as complete and never retry it.
    response.raise_for_status()
    response.encoding = 'utf-8'
    soup = BeautifulSoup(response.text, 'html.parser')

    # Rows from the 4th up to (but excluding) the last one; the entry link
    # sits in the second cell of each row.
    selector = 'tr:nth-child(n+4):not(:last-child) > td:nth-child(2) > a'

    # Make sure the destination exists before any file is written into it.
    if path:
        os.makedirs(path, exist_ok=True)

    for link in soup.select(selector):
        href = link.get('href')
        if not _is_child_entry(href):
            print(f'skipping unexpected link {href!r} in {url}')
            continue
        # hrefs are percent-encoded; the local name uses the decoded form.
        text = unquote(href)

        if href.endswith('/'):
            # Sub-directory: create it locally, then descend unless done.
            os.makedirs(path + text, exist_ok=True)
            if not testDirectoryNeedsDownloading(url + href):
                print(f'directory {url + href} already downloaded, skipping')
            elif download(url + href, path + text):
                handleDirectoryDownloadComplete(url + href)
            continue

        # Plain file.
        fullURL = url + href
        fullPath = path + text
        if os.path.exists(fullPath):
            print(f'file {fullPath} exists, skipping')
            continue
        print(fullURL, fullPath)
        _fetch_file(fullURL, fullPath)
    return True
def handleDirectoryDownloadComplete(s):
    """Record directory URL ``s`` as completely downloaded.

    Appends ``s`` to the module-level ``completedList`` and rewrites
    ``completed.json`` (in the current working directory) with the whole list.

    Args:
        s: URL of the directory that has just been downloaded.
    """
    global completedList
    completedList.append(s)
    # The context manager closes the file, so the JSON is flushed to disk
    # before returning instead of whenever the handle is garbage-collected.
    with open('completed.json', 'w') as fh:
        json.dump(completedList, fh)
def testDirectoryNeedsDownloading(s):
global completedList
if len(completedList) == 0:
completedList = json.load(open('completed.json'))
if s in completedList:
return False
return True
# Script entry point: mirror the listing at `url` into the local folder below.
if __name__ == '__main__':
    download(url, path='/mnt/c/Users/trdth/Downloads/tmp/')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment