cassiotbatista/get_dw_audiotrainer.sh

## get_dw_audiotrainer.sh
#!/bin/bash
#
# A quite simple script to download the German lessons from DW's Audiotrainer
# webpage: http://www.dw.com/en/learn-german/audiotrainer/s-9677
# It basically 'crawls' a specific page in order to download all 100 mp3 audio
# files with their respective PDF worksheets (200 files in total)
#
# Disclaimer: I had trouble with unicode values, so I created two lists to
# perform the conversion. Not sure it is the best way to do that, but it works.
# :)
#
# Author: Feb 2018
# Cassio Batista - cassio.batista.13@gmail.com
#
# "References":
# suppress curl output: https://stackoverflow.com/questions/7373752/how-do-i-get-curl-to-not-show-the-progress-bar
# printf in bash:       https://stackoverflow.com/questions/4409399/padding-characters-in-printf
# remainder of divis.:  http://tldp.org/LDP/abs/html/ops.html
# char to unicode/hex:  http://www.utf8-chartable.de/
# dealing with arrays:  http://tldp.org/LDP/Bash-Beginners-Guide/html/sect_10_02.html
# get array length:     https://www.cyberciti.biz/faq/finding-bash-shell-array-length-elements/
# grep after href html: https://stackoverflow.com/questions/21264626/how-to-strip-out-all-of-the-links-of-an-html-file-in-bash-or-grep-or-batch-and-s

# Debug switch: when it holds the string "true", the download loop prints one
# summary line (directory, lesson number, mp3 name, pdf path) per lesson.
DEGUB="true"

# Non-ASCII letters that get percent-encoded in lesson links.
# Entry N of this list is replaced by entry N of `hexs`, so the order of the
# two lists must stay in sync.
chars=(
        'À' 'Á' 'Â' 'Ã' 'Ä' 'Å' 'Æ' 'Ç'
        'È' 'É' 'Ê' 'Ë' 'Ì' 'Í' 'Î' 'Ï'
        'Ð' 'Ñ' 'Ò' 'Ó' 'Ô' 'Õ' 'Ö' '×'
        'Ø' 'Ù' 'Ú' 'Û' 'Ü' 'Ý' 'Þ' 'ß'
        'à' 'á' 'â' 'ã' 'ä' 'å' 'æ' 'ç'
        'è' 'é' 'ê' 'ë' 'ì' 'í' 'î' 'ï'
        'ð' 'ñ' 'ò' 'ó' 'ô' 'õ' 'ö' '÷'
        'ø' 'ù' 'ú' 'û' 'ü' 'ý' 'þ' 'ÿ'
)

# Percent-encoded replacements for the entries of `chars`, in the same order:
# entry N here is substituted for entry N of `chars`.
hexs=(
        '%C3%80' '%C3%81' '%C3%82' '%C3%83' '%C3%84' '%C3%85' '%C3%86' '%C3%87'
        '%C3%88' '%C3%89' '%C3%8A' '%C3%8B' '%C3%8C' '%C3%8D' '%C3%8E' '%C3%8F'
        '%C3%90' '%C3%91' '%C3%92' '%C3%93' '%C3%94' '%C3%95' '%C3%96' '%C3%97'
        '%C3%98' '%C3%99' '%C3%9A' '%C3%9B' '%C3%9C' '%C3%9D' '%C3%9E' '%C3%9F'
        '%C3%A0' '%C3%A1' '%C3%A2' '%C3%A3' '%C3%A4' '%C3%A5' '%C3%A6' '%C3%A7'
        '%C3%A8' '%C3%A9' '%C3%AA' '%C3%AB' '%C3%AC' '%C3%AD' '%C3%AE' '%C3%AF'
        '%C3%B0' '%C3%B1' '%C3%B2' '%C3%B3' '%C3%B4' '%C3%B5' '%C3%B6' '%C3%B7'
        '%C3%B8' '%C3%B9' '%C3%BA' '%C3%BB' '%C3%BC' '%C3%BD' '%C3%BE' '%C3%BF'
)

# Download loop: for each of the 100 lessons, find the lesson page on the
# index, extract the worksheet (PDF) path from it, and download the PDF and the
# MP3 into a directory shared by five consecutive lessons.
#
# Reads:  chars, hexs (percent-encoding tables), DEGUB (debug switch)
# Writes: lesson_NN-MM/ directories in the current working directory

# Number of (char, hex) pairs used to percent-encode lesson links.
arr_len=${#chars[@]}
baselink="http://www.dw.com"
mp3_baselink="http://radio-download.dw.com/Events/dwelle/deutschkurse/audiotrainer/eng/"

# The index page is the same for every lesson, so fetch it once here instead
# of once per loop iteration.
index_html=$(curl -s "${baselink}/en/learn-german/audiotrainer-lessons/s-9678")
if [[ -z "${index_html}" ]]
then
        printf 'warning: could not fetch the lesson index; PDFs will be skipped\n' >&2
fi

for ((i = 1; i <= 100; i++))
do
        # Five lessons per directory: lesson_01-05, lesson_06-10, ...
        if (( i % 5 == 1 ))
        then
                printf -v dir 'lesson_%02d-%02d' "${i}" "$((i + 4))"
                mkdir -p "${dir}"
        fi

        # Lesson page path: the first double-quoted value on the index line that
        # contains "lesson-<i>" as a whole word (-w keeps lesson-1 from matching
        # lesson-10).
        link=$(printf '%s\n' "${index_html}" |
                grep -w -- "lesson-${i}" |
                awk -F'"' '{print $2}')
        link=${link%%$'\n'*}    # keep only the first match

        # Percent-encode the letters listed in chars[] with their hexs[] value.
        # The pattern is quoted so each character is matched literally.
        for ((pos = 0; pos < arr_len; pos++))
        do
                link=${link//"${chars[pos]}"/${hexs[pos]}}
        done

        # Worksheet path: an href value taken from the first line of the lesson
        # page that contains "pdf". Skipped when no lesson page was found.
        pdf=""
        if [[ -n "${link}" ]]
        then
                pdf=$(curl -s "${baselink}${link}" |
                        grep 'pdf' |
                        sed -n 's/.*href="\([^"]*\).*/\1/p')
                pdf=${pdf%%$'\n'*}
        fi

        printf -v mp3 'Audiotrainer_Englisch_Lektion%03d_dwdownload.mp3' "${i}"

        if [[ ${DEGUB} == true ]]
        then
                printf 'dir:%s, lesson:%03d (<%s> <%s>)\n' \
                        "${dir}" "${i}" "${mp3}" "${pdf}"
        fi

        # first, download PDF, then the MP3 after
        if [[ -n "${pdf}" ]]
        then
                wget -q -P "${dir}" "${baselink}${pdf}" ||
                        printf 'warning: failed to download PDF for lesson %d\n' "${i}" >&2
        else
                # Without this guard wget would fetch the bare site root instead.
                printf 'warning: no PDF link found for lesson %d\n' "${i}" >&2
        fi
        wget -q -P "${dir}" "${mp3_baselink}${mp3}" ||
                printf 'warning: failed to download %s\n' "${mp3}" >&2
done

# Exs.:
# http://radio-download.dw.com/Events/dwelle/deutschkurse/audiotrainer/eng/Audiotrainer_Englisch_Lektion001_dwdownload.mp3
# http://www.dw.com/downloads/26607509/lektion-1pdf.pdf

# http://radio-download.dw.com/Events/dwelle/deutschkurse/audiotrainer/eng/Audiotrainer_Englisch_Lektion002_dwdownload.mp3
# http://www.dw.com/downloads/26607560/lektion-2pdf.pdf

# http://radio-download.dw.com/Events/dwelle/deutschkurse/audiotrainer/eng/Audiotrainer_Englisch_Lektion051_dwdownload.mp3
# http://www.dw.com/downloads/26608357/lektion-51pdf.pdf

### EOF ###
	#!/bin/bash
	#
	# A quite simple script to download the German lessons from DW's Audiotrainer
	# webpage: http://www.dw.com/en/learn-german/audiotrainer/s-9677
	# It basically 'crawls' a specific page in order to download all 100 mp3 audio
	# files with their respective PDF worksheets (200 files in total)
	#
	# Disclaimer: I had trouble with unicode values, so I created two lists to
	# perform the conversion. Not sure it is the best way to do that, but it works.
	# :)
	#
	# Author: Feb 2018
	# Cassio Batista - cassio.batista.13@gmail.com
	#
	# "References":
	# suppress curl output: https://stackoverflow.com/questions/7373752/how-do-i-get-curl-to-not-show-the-progress-bar
	# printf in bash: https://stackoverflow.com/questions/4409399/padding-characters-in-printf
	# remainder of divis.: http://tldp.org/LDP/abs/html/ops.html
	# char to unicode/hex: http://www.utf8-chartable.de/
	# dealing with arrays: http://tldp.org/LDP/Bash-Beginners-Guide/html/sect_10_02.html
	# get array length: https://www.cyberciti.biz/faq/finding-bash-shell-array-length-elements/
	# grep after href html: https://stackoverflow.com/questions/21264626/how-to-strip-out-all-of-the-links-of-an-html-file-in-bash-or-grep-or-batch-and-s

	# Debug switch: when it holds the string "true", the download loop prints one
	# summary line (directory, lesson number, mp3 name, pdf path) per lesson.
	DEGUB="true"

	# Non-ASCII letters that get percent-encoded in lesson links.
	# Entry N of this list is replaced by entry N of `hexs`, so the order of the
	# two lists must stay in sync.
	chars=(
		'À' 'Á' 'Â' 'Ã' 'Ä' 'Å' 'Æ' 'Ç'
		'È' 'É' 'Ê' 'Ë' 'Ì' 'Í' 'Î' 'Ï'
		'Ð' 'Ñ' 'Ò' 'Ó' 'Ô' 'Õ' 'Ö' '×'
		'Ø' 'Ù' 'Ú' 'Û' 'Ü' 'Ý' 'Þ' 'ß'
		'à' 'á' 'â' 'ã' 'ä' 'å' 'æ' 'ç'
		'è' 'é' 'ê' 'ë' 'ì' 'í' 'î' 'ï'
		'ð' 'ñ' 'ò' 'ó' 'ô' 'õ' 'ö' '÷'
		'ø' 'ù' 'ú' 'û' 'ü' 'ý' 'þ' 'ÿ'
	)

	# Percent-encoded replacements for the entries of `chars`, in the same order:
	# entry N here is substituted for entry N of `chars`.
	hexs=(
		'%C3%80' '%C3%81' '%C3%82' '%C3%83' '%C3%84' '%C3%85' '%C3%86' '%C3%87'
		'%C3%88' '%C3%89' '%C3%8A' '%C3%8B' '%C3%8C' '%C3%8D' '%C3%8E' '%C3%8F'
		'%C3%90' '%C3%91' '%C3%92' '%C3%93' '%C3%94' '%C3%95' '%C3%96' '%C3%97'
		'%C3%98' '%C3%99' '%C3%9A' '%C3%9B' '%C3%9C' '%C3%9D' '%C3%9E' '%C3%9F'
		'%C3%A0' '%C3%A1' '%C3%A2' '%C3%A3' '%C3%A4' '%C3%A5' '%C3%A6' '%C3%A7'
		'%C3%A8' '%C3%A9' '%C3%AA' '%C3%AB' '%C3%AC' '%C3%AD' '%C3%AE' '%C3%AF'
		'%C3%B0' '%C3%B1' '%C3%B2' '%C3%B3' '%C3%B4' '%C3%B5' '%C3%B6' '%C3%B7'
		'%C3%B8' '%C3%B9' '%C3%BA' '%C3%BB' '%C3%BC' '%C3%BD' '%C3%BE' '%C3%BF'
	)

	# Download loop: for each of the 100 lessons, find the lesson page on the
	# index, extract the worksheet (PDF) path from it, and download the PDF and the
	# MP3 into a directory shared by five consecutive lessons.
	#
	# Reads:  chars, hexs (percent-encoding tables), DEGUB (debug switch)
	# Writes: lesson_NN-MM/ directories in the current working directory

	# Number of (char, hex) pairs used to percent-encode lesson links.
	arr_len=${#chars[@]}
	baselink="http://www.dw.com"
	mp3_baselink="http://radio-download.dw.com/Events/dwelle/deutschkurse/audiotrainer/eng/"

	# The index page is the same for every lesson, so fetch it once here instead
	# of once per loop iteration.
	index_html=$(curl -s "${baselink}/en/learn-german/audiotrainer-lessons/s-9678")
	if [[ -z "${index_html}" ]]
	then
		printf 'warning: could not fetch the lesson index; PDFs will be skipped\n' >&2
	fi

	for ((i = 1; i <= 100; i++))
	do
		# Five lessons per directory: lesson_01-05, lesson_06-10, ...
		if (( i % 5 == 1 ))
		then
			printf -v dir 'lesson_%02d-%02d' "${i}" "$((i + 4))"
			mkdir -p "${dir}"
		fi

		# Lesson page path: the first double-quoted value on the index line that
		# contains "lesson-<i>" as a whole word (-w keeps lesson-1 from matching
		# lesson-10). The pipes must be real, unescaped pipes: an escaped one
		# would be handed to the command as a literal "|" argument.
		link=$(printf '%s\n' "${index_html}" |
			grep -w -- "lesson-${i}" |
			awk -F'"' '{print $2}')
		link=${link%%$'\n'*}    # keep only the first match

		# Percent-encode the letters listed in chars[] with their hexs[] value.
		# The pattern is quoted so each character is matched literally.
		for ((pos = 0; pos < arr_len; pos++))
		do
			link=${link//"${chars[pos]}"/${hexs[pos]}}
		done

		# Worksheet path: an href value taken from the first line of the lesson
		# page that contains "pdf". Skipped when no lesson page was found.
		# The "*" quantifiers are required so the whole attribute value is
		# captured rather than a single character.
		pdf=""
		if [[ -n "${link}" ]]
		then
			pdf=$(curl -s "${baselink}${link}" |
				grep 'pdf' |
				sed -n 's/.*href="\([^"]*\).*/\1/p')
			pdf=${pdf%%$'\n'*}
		fi

		printf -v mp3 'Audiotrainer_Englisch_Lektion%03d_dwdownload.mp3' "${i}"

		if [[ ${DEGUB} == true ]]
		then
			printf 'dir:%s, lesson:%03d (<%s> <%s>)\n' \
				"${dir}" "${i}" "${mp3}" "${pdf}"
		fi

		# first, download PDF, then the MP3 after
		if [[ -n "${pdf}" ]]
		then
			wget -q -P "${dir}" "${baselink}${pdf}" ||
				printf 'warning: failed to download PDF for lesson %d\n' "${i}" >&2
		else
			# Without this guard wget would fetch the bare site root instead.
			printf 'warning: no PDF link found for lesson %d\n' "${i}" >&2
		fi
		wget -q -P "${dir}" "${mp3_baselink}${mp3}" ||
			printf 'warning: failed to download %s\n' "${mp3}" >&2
	done

	# Exs.:
	# http://radio-download.dw.com/Events/dwelle/deutschkurse/audiotrainer/eng/Audiotrainer_Englisch_Lektion001_dwdownload.mp3
	# http://www.dw.com/downloads/26607509/lektion-1pdf.pdf

	# http://radio-download.dw.com/Events/dwelle/deutschkurse/audiotrainer/eng/Audiotrainer_Englisch_Lektion002_dwdownload.mp3
	# http://www.dw.com/downloads/26607560/lektion-2pdf.pdf

	# http://radio-download.dw.com/Events/dwelle/deutschkurse/audiotrainer/eng/Audiotrainer_Englisch_Lektion051_dwdownload.mp3
	# http://www.dw.com/downloads/26608357/lektion-51pdf.pdf

	### EOF ###