Skip to content

Instantly share code, notes, and snippets.

@hengstchon
Created June 7, 2019 19:56
Show Gist options
  • Save hengstchon/1b54cd3a8a1fc82c4772f5e3249b0f0e to your computer and use it in GitHub Desktop.
Save hengstchon/1b54cd3a8a1fc82c4772f5e3249b0f0e to your computer and use it in GitHub Desktop.
import os
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
def _href_of(label, description, page_url):
    """Return the href of the tag enclosing *label*, or raise LookupError."""
    href = label.parent.get('href') if label is not None else None
    if not href:
        raise LookupError('%s not found on %s' % (description, page_url))
    return href


def get_episode(url):
    """Download the MP3 and the German transcript PDF linked from an episode page.

    The page at *url* is fetched and parsed. The MP3 is the link whose text
    is exactly 'Herunterladen'; the PDF is the link whose text is exactly
    'deutsch' inside the element with id 'podcastSprachenDesScripts'.
    Both files are written to the current working directory, and a file
    that already exists there is not downloaded again. Link targets are
    resolved against the module-level ``base_url``.

    Args:
        url: Address of the episode page.

    Raises:
        LookupError: If either link (or a usable file name) cannot be found.
        requests.HTTPError: If the episode page answers with an error status.
    """
    # A timeout keeps a stalled connection from hanging the whole run, and
    # the status check stops us from parsing an error page as an episode.
    page = requests.get(url, timeout=30)
    page.raise_for_status()
    soup = BeautifulSoup(page.text, 'lxml')

    mp3_href = _href_of(soup.find(string='Herunterladen'),
                        'MP3 download link', url)
    mp3_name = mp3_href.split('/')[-1]
    if not mp3_name:
        raise LookupError('MP3 link on %s has no file name: %r' % (url, mp3_href))
    # Relative paths are resolved against the working directory already, so
    # no explicit os.getcwd() prefix is needed.
    if not os.path.exists(mp3_name):
        # urljoin copes with root-relative ('/x') and absolute hrefs, where
        # plain string concatenation would build a malformed URL.
        get_content(urljoin(base_url, mp3_href), mp3_name)

    scripts = soup.find(id='podcastSprachenDesScripts')
    pdf_label = scripts.find(string='deutsch') if scripts is not None else None
    pdf_href = _href_of(pdf_label, 'German transcript link', url)
    # Swap only the extension. A blanket str.replace('mp3', 'pdf') would also
    # rewrite 'mp3' inside the stem, and would leave the name unchanged (so
    # the PDF was skipped as "already present") when the suffix is not a
    # lowercase 'mp3'.
    pdf_name = os.path.splitext(mp3_name)[0] + '.pdf'
    if not os.path.exists(pdf_name):
        get_content(urljoin(base_url, pdf_href), pdf_name)
def get_content(url, name):
    """Download *url* and store the response body in the file *name*.

    The body is streamed to ``name + '.part'`` and only renamed to *name*
    once the transfer has finished. Callers treat the existence of *name*
    as "already downloaded", so a failed or interrupted transfer must never
    leave a file under the final name.

    Args:
        url: Address of the resource to download.
        name: Destination path (relative paths resolve against the current
            working directory).

    Raises:
        requests.HTTPError: If the server answers with an error status;
            nothing is written under *name* in that case.
    """
    partial = name + '.part'
    try:
        # stream=True avoids holding a whole audio file in memory; the
        # timeout applies to connecting and to each read, not the total time.
        with requests.get(url, stream=True, timeout=30) as res:
            # Without this check an HTML error page would be saved as the
            # .mp3/.pdf and then never retried.
            res.raise_for_status()
            with open(partial, 'wb') as f:
                for chunk in res.iter_content(chunk_size=65536):
                    f.write(chunk)
        os.replace(partial, name)
    finally:
        # After a successful rename the partial file no longer exists; this
        # only cleans up after a failure.
        if os.path.exists(partial):
            os.remove(partial)
# Script body: download the first ten episodes listed on the index page, each
# into its own sub-directory of ./studentstories.

# Every download lives below this directory. get_episode() writes into the
# current working directory, hence the chdir calls here and in the loop.
root_dir = os.path.join(os.getcwd(), 'studentstories')
os.makedirs(root_dir, exist_ok=True)  # no exists()/mkdir() race
os.chdir(root_dir)

# Module-level on purpose: get_episode() reads this name to resolve links.
base_url = 'http://www.studentstories.de/'

res = requests.get(urljoin(base_url, 'alle-folgen.html'), timeout=30)
res.raise_for_status()  # do not parse an error page as the episode index
res.encoding = 'utf-8'
soup = BeautifulSoup(res.text, 'lxml')
episodes = soup.find_all('div', class_='views-field-title')

for e in episodes[:10]:
    link = e.find('a')
    href = link.get('href') if link is not None else None
    # get_text() also works when the anchor wraps nested tags, where
    # `.string` would be None; stripping avoids stray whitespace in the name.
    dir_name = link.get_text(strip=True) if link is not None else ''
    # A path separator inside a title would otherwise be taken as a nested
    # directory and make the directory creation fail.
    for sep in filter(None, (os.sep, os.altsep)):
        dir_name = dir_name.replace(sep, '-')
    if not href or not dir_name:
        print('Skipping index entry without a usable link or title')
        continue

    os.makedirs(dir_name, exist_ok=True)
    os.chdir(dir_name)
    try:
        url = urljoin(base_url, href)
        get_episode(url)
    finally:
        # Always return to the root so the next relative directory name
        # resolves correctly, even if this episode failed.
        os.chdir(root_dir)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment