Created
December 6, 2022 07:28
-
-
Save manhtai/c06e7f0341eb4d4340c1ead3c0f862cf to your computer and use it in GitHub Desktop.
Get audio files from learninglink.oup.com
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import requests | |
import bs4 | |
import os | |
# Base URL of the site; page and audio paths found while scraping are
# relative to it.
root = "https://learninglink.oup.com"
# Index page that get_page_links() fetches and scans for audio example pages.
url = root + "/access/laitz-4e-student-resources"
def get_page_links():
    """Collect the links of all audio example pages from the index page.

    Fetches the module-level ``url``, parses the HTML and keeps every anchor
    whose ``href`` starts with the audio-example path prefix.

    Returns:
        list[str]: Site-relative paths in document order (duplicates kept).
    """
    prefix = '/access/content/laitz-4e-student-resources/laitz-audio-text-example'
    res = requests.get(url)
    soup = bs4.BeautifulSoup(res.content, 'html.parser')
    # href=True restricts the search to anchors that actually carry an href.
    # Without it, a.get('href') yields None for bare <a> tags and calling
    # .startswith on None raises AttributeError.
    return [
        a['href']
        for a in soup.find_all('a', href=True)
        if a['href'].startswith(prefix)
    ]
def get_audio_link(link):
    """Turn an example page path into a ``"<filename>|<src>"`` entry.

    The filename is the last path segment of ``link`` plus ``.wav``; the
    source is the ``src`` attribute of the first ``<source>`` element found
    on the page at ``root + link``.
    """
    # Last segment of the page path doubles as the local file name.
    wav_name = link.rsplit('/', 1)[-1] + '.wav'
    page = requests.get(root + link)
    source_tag = bs4.BeautifulSoup(page.content, 'html.parser').find('source')
    audio_src = source_tag.get('src')
    return f"{wav_name}|{audio_src}"
def get_links():
    """Return the ``"<filename>|<src>"`` entries for every audio file.

    ``list.txt`` in the current directory acts as a cache: when it exists,
    its non-blank lines are returned and nothing is fetched. Otherwise the
    entries are built from get_page_links() / get_audio_link(), written to
    ``list.txt`` one per line, and returned.

    Returns:
        list[str]: One ``"<filename>|<src>"`` string per audio file.
    """
    if os.path.isfile('list.txt'):
        with open('list.txt', encoding='utf-8') as f:
            # Drop blank / whitespace-only lines (e.g. a hand-edited cache):
            # an empty entry cannot be split into filename and link later.
            stripped = (line.strip() for line in f)
            return [entry for entry in stripped if entry]
    links = [get_audio_link(li) for li in get_page_links()]
    with open('list.txt', 'w', encoding='utf-8') as f:
        f.writelines(li + "\n" for li in links)
    return links
def get_audio(filename, link):
    """Download one audio file and save it as ``filename``.

    Args:
        filename: Path of the local file to write (opened in binary mode).
        link: Site-relative path of the audio resource; appended to ``root``.

    On HTTP 200 the body is written to ``filename`` and a success message is
    printed. Any other status is reported with a printed message and no file
    is written.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:107.0) Gecko/20100101 Firefox/107.0",
        "Accept": "audio/webm,audio/ogg,audio/wav,audio/*;q=0.9,application/ogg;q=0.7,video/*;q=0.6,*/*;q=0.5",
        # NOTE(review): "bytes=0" has no "first-last" part (an open-ended
        # range would be "bytes=0-"). Kept unchanged because the 200 check
        # below depends on the server's current answer -- confirm before
        # altering it.
        "Range": "bytes=0",
    }
    # With stream=True the body is not downloaded up front, so the response
    # must be closed explicitly; the with-block releases the connection even
    # on the non-200 path where the body is never read.
    with requests.get(root + link, headers=headers, stream=True) as res:
        if res.status_code != 200:
            print(f"Status {res.status_code} while getting {filename}")
            return
        with open(filename, 'wb') as f:
            # Write in chunks instead of loading the whole file into memory.
            for chunk in res.iter_content(chunk_size=64 * 1024):
                f.write(chunk)
    print(f"Get {filename} success!")
def get_all_audio():
    """Download every audio file listed by get_links().

    Each entry has the form ``"<filename>|<link>"``; the two halves are
    handed to get_audio().
    """
    entries = get_links()
    for entry in entries:
        # Unpacking requires exactly one '|' separator in the entry.
        filename, link = entry.split('|')
        get_audio(filename, link)
# Script entry point: start the downloads only when run directly, not on import.
if __name__ == "__main__":
    get_all_audio()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment