Created
February 23, 2018 00:43
-
-
Save cassiotbatista/d89f99a1ad6d8ef9451fcffbe89d45f8 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
# | |
# A quite simple script to download the German lessons from DW's Audiotrainer | |
# webpage: http://www.dw.com/en/learn-german/audiotrainer/s-9677 | |
# It basically 'crawls' a specific page in order to download all 100 mp3 audio | |
# files with their respective PDF worksheets (200 files in total)
# | |
# Disclaimer: I had trouble with unicode values, so I created two lists to | |
# perform the conversion. Not sure it is the best way to do that, but it works. | |
# :) | |
# | |
# Author: Feb 2018 | |
# Cassio Batista - cassio.batista.13@gmail.com | |
# | |
# "References": | |
# suppress curl output: https://stackoverflow.com/questions/7373752/how-do-i-get-curl-to-not-show-the-progress-bar
# printf in bash: https://stackoverflow.com/questions/4409399/padding-characters-in-printf | |
# remainder of divis.: http://tldp.org/LDP/abs/html/ops.html | |
# char to unicode/hex: http://www.utf8-chartable.de/ | |
# dealing with arrays: http://tldp.org/LDP/Bash-Beginners-Guide/html/sect_10_02.html | |
# get array length: https://www.cyberciti.biz/faq/finding-bash-shell-array-length-elements/ | |
# grep after href html: https://stackoverflow.com/questions/21264626/how-to-strip-out-all-of-the-links-of-an-html-file-in-bash-or-grep-or-batch-and-s | |
# Abort on use of an unset variable. 'set -e' is deliberately NOT enabled:
# a single failed download must not stop the remaining lessons.
set -u

# When "true", print one trace line per lesson (directory, lesson number,
# mp3 file name and PDF path) before downloading.
readonly DEBUG=true

# Latin-1 supplement characters (U+00C0..U+00FF) that may show up in lesson
# URLs, and, at the same index in 'hexs', their percent-encoded UTF-8 form.
readonly -a chars=(
  'À' 'Á' 'Â' 'Ã' 'Ä' 'Å' 'Æ' 'Ç'
  'È' 'É' 'Ê' 'Ë' 'Ì' 'Í' 'Î' 'Ï'
  'Ð' 'Ñ' 'Ò' 'Ó' 'Ô' 'Õ' 'Ö' '×'
  'Ø' 'Ù' 'Ú' 'Û' 'Ü' 'Ý' 'Þ' 'ß'
  'à' 'á' 'â' 'ã' 'ä' 'å' 'æ' 'ç'
  'è' 'é' 'ê' 'ë' 'ì' 'í' 'î' 'ï'
  'ð' 'ñ' 'ò' 'ó' 'ô' 'õ' 'ö' '÷'
  'ø' 'ù' 'ú' 'û' 'ü' 'ý' 'þ' 'ÿ'
)
readonly -a hexs=(
  '%C3%80' '%C3%81' '%C3%82' '%C3%83' '%C3%84' '%C3%85' '%C3%86' '%C3%87'
  '%C3%88' '%C3%89' '%C3%8A' '%C3%8B' '%C3%8C' '%C3%8D' '%C3%8E' '%C3%8F'
  '%C3%90' '%C3%91' '%C3%92' '%C3%93' '%C3%94' '%C3%95' '%C3%96' '%C3%97'
  '%C3%98' '%C3%99' '%C3%9A' '%C3%9B' '%C3%9C' '%C3%9D' '%C3%9E' '%C3%9F'
  '%C3%A0' '%C3%A1' '%C3%A2' '%C3%A3' '%C3%A4' '%C3%A5' '%C3%A6' '%C3%A7'
  '%C3%A8' '%C3%A9' '%C3%AA' '%C3%AB' '%C3%AC' '%C3%AD' '%C3%AE' '%C3%AF'
  '%C3%B0' '%C3%B1' '%C3%B2' '%C3%B3' '%C3%B4' '%C3%B5' '%C3%B6' '%C3%B7'
  '%C3%B8' '%C3%B9' '%C3%BA' '%C3%BB' '%C3%BC' '%C3%BD' '%C3%BE' '%C3%BF'
)

readonly baselink="http://www.dw.com"
readonly mp3_baselink="http://radio-download.dw.com/Events/dwelle/deutschkurse/audiotrainer/eng/"
readonly index_path="/en/learn-german/audiotrainer-lessons/s-9678"

#######################################
# Print a warning on stderr.
# Arguments: $* - message text
#######################################
warn() {
  printf 'warning: %s\n' "$*" >&2
}

#######################################
# Replace every character listed in 'chars' with its percent-encoded
# counterpart from 'hexs'.
# Globals:   chars (read), hexs (read)
# Arguments: $1 - text (a URL path) to encode
# Outputs:   the encoded text on stdout
#######################################
percent_encode() {
  local text=$1
  local from to pos
  for pos in "${!chars[@]}"; do
    # Copy into plain variables first: the table entries contain neither
    # glob characters nor '&', so the unquoted pattern/replacement below is
    # a literal substitution on every bash version.
    from=${chars[pos]}
    to=${hexs[pos]}
    text=${text//$from/$to}
  done
  printf '%s\n' "$text"
}

#######################################
# Download the 100 Audiotrainer lessons (mp3 + PDF worksheet), grouped five
# per directory: lesson_01-05, lesson_06-10, ...
# Globals:   baselink, mp3_baselink, index_path, DEBUG (all read)
# Outputs:   optional trace lines on stdout, warnings/errors on stderr,
#            downloaded files under ./lesson_NN-MM/
# Returns:   exits 1 if the lesson index cannot be fetched or a directory
#            cannot be created; individual download failures only warn.
#######################################
main() {
  local index_html link pdf mp3 i
  local dir=""

  # The index page is the same for every lesson: fetch it once instead of
  # once per loop iteration. -f: treat HTTP errors as failures,
  # -L: follow redirects, -sS: no progress bar but still report errors.
  if ! index_html=$(curl -fsSL "${baselink}${index_path}"); then
    printf 'error: cannot fetch lesson index %s\n' \
      "${baselink}${index_path}" >&2
    exit 1
  fi

  for ((i = 1; i <= 100; i++)); do
    # Lessons 1, 6, 11, ... open a new directory holding five lessons.
    if ((i % 5 == 1)); then
      printf -v dir 'lesson_%02d-%02d' "$i" "$((i + 4))"
      if ! mkdir -p -- "$dir"; then
        printf 'error: cannot create directory %s\n' "$dir" >&2
        exit 1
      fi
    fi

    # First non-empty double-quoted string on an index line that mentions
    # "lesson-<i>" as a whole word; used as the lesson page path.
    link=$(grep -w -- "lesson-${i}" <<<"$index_html" \
      | awk -F'"' '$2 != "" { print $2; exit }')

    # First href on a lesson-page line containing "pdf".
    pdf=""
    if [[ -n "$link" ]]; then
      link=$(percent_encode "$link")
      pdf=$(curl -fsSL "${baselink}${link}" \
        | sed -n '/pdf/ s/.*href="\([^"]*\).*/\1/p' \
        | head -n 1)
    else
      warn "lesson ${i}: no entry found on the index page"
    fi

    printf -v mp3 'Audiotrainer_Englisch_Lektion%03d_dwdownload.mp3' "$i"

    if [[ "$DEBUG" == true ]]; then
      printf 'dir:%s, lesson:%03d (<%s> <%s>)\n' "$dir" "$i" "$mp3" "$pdf"
    fi

    # First download the PDF, then the MP3. The MP3 URL is derived from the
    # lesson number alone, so it is attempted even when no PDF was found.
    if [[ -n "$pdf" ]]; then
      wget -q -P "$dir" "${baselink}${pdf}" \
        || warn "lesson ${i}: PDF download failed (${baselink}${pdf})"
    else
      warn "lesson ${i}: no PDF link found, skipping worksheet"
    fi
    wget -q -P "$dir" "${mp3_baselink}${mp3}" \
      || warn "lesson ${i}: MP3 download failed (${mp3_baselink}${mp3})"
  done
}

main "$@"
# Exs.: | |
# http://radio-download.dw.com/Events/dwelle/deutschkurse/audiotrainer/eng/Audiotrainer_Englisch_Lektion001_dwdownload.mp3 | |
# http://www.dw.com/downloads/26607509/lektion-1pdf.pdf | |
# http://radio-download.dw.com/Events/dwelle/deutschkurse/audiotrainer/eng/Audiotrainer_Englisch_Lektion002_dwdownload.mp3 | |
# http://www.dw.com/downloads/26607560/lektion-2pdf.pdf | |
# http://radio-download.dw.com/Events/dwelle/deutschkurse/audiotrainer/eng/Audiotrainer_Englisch_Lektion051_dwdownload.mp3 | |
# http://www.dw.com/downloads/26608357/lektion-51pdf.pdf | |
### EOF ### |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
http://www.dw.com/en/learn-german/audiotrainer-lessons/s-9678