Skip to content

Instantly share code, notes, and snippets.

@manhtai
Created December 6, 2022 07:28
Show Gist options
  • Save manhtai/c06e7f0341eb4d4340c1ead3c0f862cf to your computer and use it in GitHub Desktop.
Save manhtai/c06e7f0341eb4d4340c1ead3c0f862cf to your computer and use it in GitHub Desktop.
Get audio files from learninglink.oup.com
import requests
import bs4
import os
# Scheme and host of the learninglink.oup.com site; site-relative links found
# while scraping are appended to this to form absolute URLs.
root = "https://learninglink.oup.com"
# Index page that get_page_links() scans for links to the audio example pages.
url = root + "/access/laitz-4e-student-resources"
def get_page_links():
    """Collect the site-relative links to the audio example pages.

    Fetches the resource index page at ``url`` and returns, in document
    order, the ``href`` of every anchor whose target starts with the
    audio-example path prefix.

    Returns:
        list[str]: Site-relative paths (they start with ``/access/...``).

    Raises:
        requests.HTTPError: If the index page responds with an error status.
            Failing here keeps an error page from being parsed as an empty
            result.
        requests.Timeout: If the server does not answer within 30 seconds.
    """
    prefix = '/access/content/laitz-4e-student-resources/laitz-audio-text-example'
    res = requests.get(url, timeout=30)
    res.raise_for_status()
    soup = bs4.BeautifulSoup(res.content, 'html.parser')
    links = []
    for a in soup.find_all('a'):
        link = a.get('href')
        # Anchors without an href attribute yield None; skip them instead of
        # crashing on None.startswith(...).
        if link and link.startswith(prefix):
            links.append(link)
    return links
def get_audio_link(link):
    """Resolve one audio example page to a ``"<filename>|<src>"`` entry.

    The filename is the last path segment of ``link`` with ``.wav`` appended;
    ``src`` is the ``src`` attribute of the first ``<source>`` element found
    on the page at ``root + link``.

    Args:
        link: Site-relative path of an audio example page.

    Returns:
        str: ``"<filename>|<src>"``, the format stored in ``list.txt``.

    Raises:
        requests.HTTPError: If the page responds with an error status.
        requests.Timeout: If the server does not answer within 30 seconds.
        ValueError: If the page has no ``<source>`` element with a ``src``.
    """
    filename = link.split('/')[-1] + '.wav'
    page_url = root + link
    res = requests.get(page_url, timeout=30)
    res.raise_for_status()
    soup = bs4.BeautifulSoup(res.content, 'html.parser')
    source = soup.find('source')
    # find() returns None when the page has no <source>; report that clearly
    # rather than failing with an AttributeError or emitting "…|None".
    src = source.get('src') if source is not None else None
    if not src:
        raise ValueError(f"No <source> element with a src found on {page_url}")
    return f"{filename}|{src}"
def get_links():
    """Return the ``"<filename>|<src>"`` entries, using ``list.txt`` as a cache.

    If ``list.txt`` exists in the current directory, its non-blank lines are
    returned (whitespace-stripped) and nothing is fetched. Otherwise every
    page from ``get_page_links()`` is resolved with ``get_audio_link()``, the
    results are written to ``list.txt`` one per line, and then returned.

    Returns:
        list[str]: Entries of the form ``"<filename>|<src>"``.
    """
    if os.path.isfile('list.txt'):
        with open('list.txt', encoding='utf-8') as f:
            stripped = (line.strip() for line in f)
            # Drop blank lines (e.g. a trailing newline added by an editor);
            # an empty entry cannot be split into filename and link later.
            return [entry for entry in stripped if entry]
    links = [get_audio_link(li) for li in get_page_links()]
    with open('list.txt', 'w', encoding='utf-8') as f:
        f.writelines(li + "\n" for li in links)
    return links
def get_audio(filename, link):
    """Download one audio file from ``root + link`` into ``filename``.

    The request is sent with browser-like headers. On HTTP 200 the body is
    streamed to disk in chunks and a success message is printed; on any other
    status only a message with the status code is printed and no file is
    written.

    Args:
        filename: Local path to write the audio data to.
        link: Site-relative path of the audio file.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:107.0) Gecko/20100101 Firefox/107.0",
        "Accept": "audio/webm,audio/ogg,audio/wav,audio/*;q=0.9,application/ogg;q=0.7,video/*;q=0.6,*/*;q=0.5",
        # NOTE(review): "bytes=0" is not a well-formed range (the open-ended
        # form is "bytes=0-"). Left unchanged on purpose: a valid range could
        # make the server answer 206, which the status check below rejects.
        "Range": "bytes=0",
    }
    # The context manager releases the streamed connection on every path,
    # including the non-200 branch.
    with requests.get(root + link, headers=headers, stream=True) as res:
        if res.status_code != 200:
            print(f"Status {res.status_code} while getting {filename}")
            return
        with open(filename, 'wb') as f:
            # Write chunk by chunk so the whole file is never held in memory
            # (res.content would defeat stream=True).
            for chunk in res.iter_content(chunk_size=64 * 1024):
                if chunk:
                    f.write(chunk)
    print(f"Get {filename} success!")
def get_all_audio():
    """Download every audio file listed by ``get_links()``.

    Each entry has the form ``"<filename>|<link>"``; it is split into its two
    parts and handed to ``get_audio()``.
    """
    for entry in get_links():
        # Skip empty entries (e.g. from a blank line in list.txt).
        if not entry:
            continue
        # Split only on the first '|' so a link that itself contains '|'
        # still unpacks into exactly two parts.
        filename, link = entry.split('|', 1)
        get_audio(filename, link)
# Start the download only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == '__main__':
    get_all_audio()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment