Created
June 7, 2019 19:56
-
-
Save hengstchon/1b54cd3a8a1fc82c4772f5e3249b0f0e to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os
import sys
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
def get_episode(url):
    """Download the audio file and the German transcript of one episode.

    The episode page is searched for the link labelled 'Herunterladen'
    (the mp3) and, inside the element with id 'podcastSprachenDesScripts',
    for the link labelled 'deutsch' (the transcript PDF). Both files are
    written to the current working directory; a file that already exists
    there is not downloaded again.

    Args:
        url: Address of the episode page.

    Raises:
        requests.RequestException: The episode page could not be fetched.
        ValueError: The page contains no usable mp3 download link.
    """
    page = requests.get(url, timeout=30)
    page.raise_for_status()
    soup = BeautifulSoup(page.text, 'lxml')

    # find(string=...) yields the text node; the enclosing tag carries the href.
    mp3_label = soup.find(string='Herunterladen')
    mp3_href = mp3_label.parent.get('href') if mp3_label is not None else None
    if not mp3_href:
        raise ValueError('no mp3 download link found on {}'.format(url))

    # urljoin copes with root-relative and absolute hrefs alike, where plain
    # concatenation would yield 'http://host//path' or a broken URL.
    mp3_url = urljoin(base_url, mp3_href)
    mp3_name = mp3_href.split('/')[-1]
    if not os.path.exists(mp3_name):
        get_content(mp3_url, mp3_name)

    scripts = soup.find(id='podcastSprachenDesScripts')
    pdf_label = scripts.find(string='deutsch') if scripts is not None else None
    pdf_href = pdf_label.parent.get('href') if pdf_label is not None else None
    if not pdf_href:
        # No German transcript offered for this episode; the mp3 is enough.
        return

    pdf_url = urljoin(base_url, pdf_href)
    # Swap only the extension; str.replace('mp3', 'pdf') would also rewrite
    # any 'mp3' occurring inside the base name.
    pdf_name = os.path.splitext(mp3_name)[0] + '.pdf'
    if not os.path.exists(pdf_name):
        get_content(pdf_url, pdf_name)
def get_content(url, name, timeout=30):
    """Download ``url`` and store the response body in the file ``name``.

    The body is streamed into ``name + '.part'`` and renamed to ``name``
    only once the transfer has finished, so a failed or interrupted
    download never leaves a truncated file under the final name.

    Args:
        url: Address of the resource to fetch.
        name: Path of the file to write.
        timeout: Seconds to wait for the server before giving up.

    Raises:
        requests.RequestException: The request failed, timed out, or the
            server answered with an HTTP error status.
        OSError: The file could not be written or renamed.
    """
    tmp_name = name + '.part'
    try:
        with requests.get(url, stream=True, timeout=timeout) as res:
            # Fail loudly instead of saving an HTML error page as mp3/pdf.
            res.raise_for_status()
            with open(tmp_name, 'wb') as f:
                # Stream in chunks so large audio files are not held in memory.
                for chunk in res.iter_content(chunk_size=64 * 1024):
                    f.write(chunk)
        os.replace(tmp_name, name)
    finally:
        # After a successful rename the temporary file no longer exists;
        # otherwise remove the partial download.
        if os.path.exists(tmp_name):
            os.remove(tmp_name)
# Everything is stored below ./studentstories, one sub-directory per episode.
root_dir = os.path.join(os.getcwd(), 'studentstories')
os.makedirs(root_dir, exist_ok=True)
os.chdir(root_dir)

base_url = 'http://www.studentstories.de/'
res = requests.get(urljoin(base_url, 'alle-folgen.html'), timeout=30)
res.raise_for_status()
res.encoding = 'utf-8'
soup = BeautifulSoup(res.text, 'lxml')
episodes = soup.find_all('div', class_='views-field-title')

# Only the first ten entries of the episode list are processed.
for e in episodes[:10]:
    link = e.find('a')
    if link is None or not link.get('href'):
        continue

    # get_text() also works when the anchor wraps nested markup, where
    # .string would be None. Path separators in a title would otherwise be
    # interpreted as sub-directories.
    dir_name = link.get_text().strip().replace('/', '-').replace(os.sep, '-')
    if not dir_name:
        continue

    os.makedirs(dir_name, exist_ok=True)
    # The download helpers write into the current working directory.
    os.chdir(dir_name)
    try:
        get_episode(urljoin(base_url, link['href']))
    except requests.RequestException as err:
        # A network failure on one episode should not abort the others.
        sys.stderr.write('skipping {!r}: {}\n'.format(dir_name, err))
    finally:
        os.chdir(root_dir)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment