Skip to content

Instantly share code, notes, and snippets.

@nhtranngoc
Created May 8, 2024 17:00
Show Gist options
  • Save nhtranngoc/f77345353b1fb5c5013ba72e20688d07 to your computer and use it in GitHub Desktop.
Save nhtranngoc/f77345353b1fb5c5013ba72e20688d07 to your computer and use it in GitHub Desktop.
Automated Scraper for Cambridge Listening Tests for various books (Preliminary 1, Preliminary 1 For School, New KET 1, New KET 1 For School, FIRST 4, FIRST 5, and C1 Advanced 4)
import requests
# Headers sent with every request: a desktop Chrome User-Agent string so the
# request looks like it comes from a regular browser.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/67.0.3396.99 Safari/537.36'
    ),
}

# File-name prefix of each book whose audio files are generated.
book_prefixes = [
    "Pre1",
    "Prefs1",
    "Key1",
    "Kfs1",
    "ELT_First4",
    "ELT_FIRST5",
    "ELT_Adv4",
]

# Number of tests generated per book.
test_count = 4

# Listening sections per test, by exam:
#   PET has 4 sections
#   KET has 5 sections
#   FCE has 4 sections
#   CAE has 4 sections
# Generate a list of names for each audio section in each test in each book. (yea)
def generate_urls():
urls = []
for prefix in book_prefixes:
for test in range(1, test_count + 1):
section_count = 4
if (prefix == "Key1") or (prefix == "Kfs1"):
section_count = 5
for section in range (1, section_count + 1):
urls.append(prefix + "_t" + str(test) + "_audio" + str(section))
return urls
if __name__ == "__main__":
    # Fetch every generated file name and save each response body as
    # "<name>.mp3" in the current working directory.
    urls = generate_urls()
    for url in urls:
        print("REQUESTING: File " + url)
        try:
            # A timeout keeps one stalled connection from hanging the whole run.
            doc = requests.get(
                'http://cambridge.org/' + url,
                allow_redirects=True,
                headers=headers,
                timeout=30,
            )
        except requests.RequestException as err:
            # A network-level failure skips this file, like a bad status code,
            # instead of aborting the remaining downloads.
            print("REQUEST FAILED: File " + url + " unable to download, error " + str(err))
            continue
        if doc.status_code != 200:
            # status_code is an int; it must be converted before concatenation,
            # otherwise this line raises TypeError.
            print("REQUEST FAILED: File " + url + " unable to download, status code " + str(doc.status_code))
            continue
        with open(url + '.mp3', 'wb') as f:
            f.write(doc.content)
        print("REQUESTED: File " + url + " successfully")
        print("----")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment