Skip to content

Instantly share code, notes, and snippets.

@whatvn
Last active July 6, 2016 05:35
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save whatvn/9be2d2894a95db93ed0853647dca24c2 to your computer and use it in GitHub Desktop.
Save whatvn/9be2d2894a95db93ed0853647dca24c2 to your computer and use it in GitHub Desktop.
Download all radio episodes from theoyeucau.com in one shot
# -*- coding: utf-8 -*-
import re
from collections import OrderedDict
import os
import urllib2
# Base URL of the paginated episode listing.
ESPISODE_LINK = "http://www.theoyeucau.com/episode"

# Matches links to individual episode pages (slug made of ASCII letters and
# hyphens) inside a listing page.
ESPISODE_LINK_REGEX = re.compile(
    r"http\:\/\/www\.theoyeucau\.com\/episode\/[a-zA-Z-]+\/")

# Sample of a dated upload URL:
# http://www.theoyeucau.com/wp-content/uploads/sites/3/2015/12/Tai-sao-phai-song-nhu-mot-con-nguoi.mp3

# Accented Vietnamese letters accepted in mp3 file names.  Kept as a plain
# literal: decoding it to unicode and re-encoding every character as UTF-8
# (as was done before) produces exactly the same value under Python 2.
VIETNAMESE = "ạảãàáâậầấẩẫăắằặẳẵóòọõỏôộổỗồốơờớợởỡéèẻẹẽêếềệểễúùụủũưựữửừứíìịỉĩýỳỷỵỹđ"

# mp3 stored under .../sites/<n>/<digits>/<digits>/<file>.mp3
# NOTE: inside the character class " -." is a range (space through "."), so
# punctuation such as quotes, parentheses and commas is accepted as well.
# It is left as-is so that no previously matching URL stops matching.
ESPISODE_MP3_LINK_REGEX = re.compile(
    r"http\:\/\/www\.theoyeucau\.com\/wp-content\/uploads\/sites\/\d+\/\d+\/\d+\/[a-zA-Z -._\d"
    + VIETNAMESE
    + r"]+\.mp3")

# mp3 stored under .../sites/<n>/Shows/<show>/<...>/<file>.mp3
# NOTE: in the second directory class " -_" is a range (space through "_")
# which includes "/".  That is what allows this pattern to span the extra
# year directory present in the sample URL below, so it must not be
# "tidied up".
FALLBACK_MP3_REGEX = re.compile(
    r"http\:\/\/www\.theoyeucau\.com\/wp-content\/uploads\/sites\/\d+\/Shows\/[a-zA-Z -._\d]+\/[a-zA-Z -_.\d]+\/[a-zA-Z -._\d"
    + VIETNAMESE
    + r"]+\.mp3")
# Sample of a "Shows" URL:
# http://www.theoyeucau.com/wp-content/uploads/sites/3/Shows/Ket noi yeu thuong/2012/2012-07-05 Ta muon hoc cach quen nguoi/Ket noi yeu thuong - Ta muon hoc cach quen nguoi - episode_81622 - track_0_81623 - Ta muon hoc cach quen nguoi.mp3

# Episode page links recorded in links.txt by earlier runs (one per line);
# empty when the file does not exist yet.
PROCESSED_LINKS = []
if os.path.exists("links.txt"):
    # Context manager so the handle is closed deterministically.
    with open("links.txt") as links_file:
        PROCESSED_LINKS = [line.strip() for line in links_file]
def getLinkContent(link):
    """Fetch ``link`` over HTTP and return the raw response body.

    The request carries a fixed set of browser-like headers.  Any exception
    raised by ``urllib2.urlopen`` (for example ``urllib2.HTTPError``)
    propagates to the caller.

    :param link: URL to download.
    :return: the complete response body as returned by ``read()``.
    """
    req = urllib2.Request(link)
    req.add_header("User-Agent", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36")
    req.add_header("Accept", "*/*")
    req.add_header("Connection", "keep-alive")
    req.add_header("Accept-Encoding", "identity;q=1, *;q=0, val")
    response = urllib2.urlopen(req)
    try:
        return response.read()
    finally:
        # Release the connection even if read() fails part-way through.
        response.close()
def getPageLink(pageNum):
    """Return the URL of listing page ``pageNum``: ESPISODE_LINK + "/page/<pageNum>/"."""
    # The trailing empty component yields the final "/".
    pieces = [ESPISODE_LINK, "page", str(pageNum), ""]
    return "/".join(pieces)
def getEspisodeLink(pageNum):
    """Return the episode page links found on listing page ``pageNum``.

    Links are de-duplicated while keeping first-seen order, and any link
    containing "page" or "feed" is dropped.
    """
    print("Processing page num: " + str(pageNum))
    html = getLinkContent(getPageLink(pageNum))
    # OrderedDict keys give an order-preserving de-duplication.
    uniqueLinks = OrderedDict.fromkeys(ESPISODE_LINK_REGEX.findall(html))
    return [link for link in uniqueLinks
            if not ("page" in link or "feed" in link)]
def getMp3Link(espisodeLink):
    """Return the first mp3 URL found on the episode page, or None.

    The page is searched with ESPISODE_MP3_LINK_REGEX first and, when that
    finds nothing, with FALLBACK_MP3_REGEX.  Each pattern is applied at most
    once and the search stops at the first hit.

    :param espisodeLink: URL of the episode page to inspect.
    :return: the matched mp3 URL, or None when neither pattern matches.
    """
    print("Get mp3 link on page " + espisodeLink)
    pageContent = getLinkContent(espisodeLink)
    for pattern in (ESPISODE_MP3_LINK_REGEX, FALLBACK_MP3_REGEX):
        found = pattern.search(pageContent)
        if found is not None:
            # The patterns have no groups, so group(0) is the whole URL.
            return found.group(0)
    # page without mp3 file available
    print("They lost its content")
    return None
def downloadMp3(mp3Link, espisodeLink):
    """Download ``mp3Link`` into a sub-directory and record ``espisodeLink``.

    The target directory is the component at index 7 of the "/"-split URL.
    For dated uploads (.../sites/<n>/<year>/<month>/<file>.mp3) that is the
    year; for URLs of the ".../sites/<n>/Shows/..." form it is the literal
    "Shows".  The file keeps the last URL component as its name.

    After the download, ``espisodeLink`` is appended to links.txt so that a
    later run can skip it.

    :param mp3Link: URL of the mp3 file, or None (in which case nothing is
        done and nothing is recorded).
    :param espisodeLink: URL of the episode page the mp3 belongs to.
    :raises OSError: if the directory cannot be created for a reason other
        than it already existing.
    """
    import errno

    if mp3Link is None:
        return
    print("Download " + mp3Link)
    urlParts = mp3Link.split("/")
    year = urlParts[7]
    target = os.path.join(year, urlParts[-1])
    try:
        os.mkdir(year)
    except OSError as err:
        # Only "already exists" is expected; surface anything else.
        if err.errno != errno.EEXIST:
            raise
    try:
        # Fetch first so the output file is not created/truncated when the
        # request fails.
        content = getLinkContent(mp3Link)
    except urllib2.HTTPError:
        # Retry through the legacy urllib downloader.
        import urllib
        urllib.urlretrieve(mp3Link, target)
    else:
        with open(target, 'wb') as mp3:
            mp3.write(content)
    # not to download link already downloaded
    with open("links.txt", 'a+') as f:
        f.write(espisodeLink + "\n")
def downThemAll(pageNum):
    """Download the mp3 of every episode on listing page ``pageNum`` that is
    not already listed in PROCESSED_LINKS."""
    for episode in getEspisodeLink(pageNum):
        if episode in PROCESSED_LINKS:
            # Recorded by an earlier run; skip it.
            continue
        downloadMp3(getMp3Link(episode), episode)
if __name__ == '__main__':
    # Walk listing pages 1 through 116 in order.
    first_page, page_after_last = 1, 117
    for page_number in range(first_page, page_after_last):
        downThemAll(page_number)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment