Last active
October 17, 2020 13:37
-
-
Save vietvudanh/244356ef90b174d1068f29ae73d91599 to your computer and use it in GitHub Desktop.
download album from chiasenhac
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""script for downloading album from chiasenhac | |
""" | |
import time | |
import sys | |
from pathlib import Path | |
from bs4 import BeautifulSoup | |
import requests | |
from multiprocessing import Pool | |
# Root directory under which the ``<artist>/<album>`` folders are created.
BASE_OUTPUT = 'albums'
# Number of worker processes that download songs in parallel.
NUMBER_PROCESS = 8
# Substring a download link must contain to be selected (bitrate marker).
MUSIC_QUALITY = '320'
def download_file(params):
    """Download a single song into the album directory.

    Args:
        params: ``(url, album_path)`` tuple, where ``url`` is the song's
            download page and ``album_path`` is the destination directory.
            Both values are packed into one argument because ``Pool.map``
            passes exactly one argument per call.

    Returns:
        None. Progress and failures are reported on stdout; a failure for
        one song does not raise, so the remaining songs still download.
    """
    url, album_path = params
    timeout = 30  # seconds; without it a stalled connection blocks a worker forever

    try:
        page = requests.get(url, timeout=timeout)
        page.raise_for_status()
    except requests.RequestException as err:
        print(f"failed: {url}: {err}")
        return

    down_soup = BeautifulSoup(page.text, 'html.parser')
    if down_soup.title is None:
        print(f"failed: {url}: page has no title")
        return

    # The song name is derived from the page title.
    filename = down_soup.title.text.split('Download: ')[-1].split(" - ")[0] + ".mp3"
    filename = filename.replace('Tải nhạc ', '')
    # A path separator in the title would otherwise point outside the album dir.
    filename = filename.replace('/', '_').replace('\\', '_')

    path = Path(album_path) / filename
    if path.exists():
        print(f"file {filename} exists")
        return

    start_time = time.time()
    print(f"start: {filename}")

    # Write to a temporary name first so an interrupted download never leaves
    # a truncated file that a later run would skip as "exists".
    part_path = path.with_name(path.name + '.part')
    for link in down_soup.find_all('a'):
        href = link.get('href')
        if not (href and '.mp3' in href and MUSIC_QUALITY in href):
            continue
        try:
            response = requests.get(href, timeout=timeout)
            response.raise_for_status()
        except requests.RequestException as err:
            print(f"failed: {filename}: {err}")
            continue  # try the next matching link, if any
        part_path.write_bytes(response.content)
        part_path.replace(path)
        print(f"done : {filename}::{time.time() - start_time}s")
        return  # one successful download is enough

    print(f"no {MUSIC_QUALITY} link downloaded: {filename}")
def _read_meta(soup, label):
    """Return the text of the first link next to the text node *label*."""
    node = soup.find(string=label)
    container = node.parent.parent if node is not None else None
    link = container.find('a') if container is not None else None
    if link is None:
        raise ValueError(f"cannot find {label!r} on the album page")
    return link.text


def main(url):
    """Download every song of the album found at *url*.

    Collects the song page links from the album page, reads the artist and
    album names, creates ``BASE_OUTPUT/<artist>/<album>`` and downloads the
    songs in parallel with ``NUMBER_PROCESS`` worker processes.

    Args:
        url: Address of the album page.

    Raises:
        requests.RequestException: If the album page cannot be fetched.
        ValueError: If the song table or the artist/album metadata is
            missing from the page.
    """
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    org_soup = BeautifulSoup(response.text, 'html.parser')

    d_table = org_soup.find('div', class_='d-table')
    if d_table is None:
        raise ValueError("cannot find the song table on the album page")

    list_url = set()  # a link might appear twice, so use a set
    for cell in d_table.select('div.name.d-table-cell'):
        a_tag = cell.find('a')
        href = a_tag.get('href') if a_tag else None
        if href:
            list_url.add(href)
    print(f'songs: {len(list_url)}')

    # Metadata used to build the output directory.
    artist = _read_meta(org_soup, "Ca sĩ: ")
    album = _read_meta(org_soup, "Album: ")

    album_path = Path(BASE_OUTPUT) / artist / album
    album_path.mkdir(parents=True, exist_ok=True)

    params = [(song_url, str(album_path)) for song_url in list_url]
    # The context manager tears the pool down even if a worker raises;
    # ``map`` blocks until every download has finished.
    with Pool(processes=NUMBER_PROCESS) as pool:
        pool.map(download_file, params)
if __name__ == '__main__':
    # Usage: python <script> <album url>
    if len(sys.argv) == 1:
        print("Missing url")
        sys.exit(1)
    main(sys.argv[1])
Is there a good site (or) resource for learning to do such stuff using Python ?
@himanshuxd: For me, I learned by just starting to code and googling "how to ... in Python" (mostly reading the docs and Stack Overflow). I already had basic programming skills in C/C++ and Java from college, so this part was really easy; Python is easy to learn.
There are some good books: Learn Python the Hard Way and Dive Into Python. As for courses, I do not know many; Codecademy seems like a good place to start.
Once you have the basics, just read the code of popular projects and you will learn a lot: Flask, Django, reddit... there are many.
Update, #chiasenhac now requires login to download high quality (>= 320Kbps) music. So this script will only work for 128Kbps.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Thank U @vietvudanh