whatvn/theoyeucau.py

## theoyeucau.py
# -*- coding: utf-8 -*-

import re
from collections import OrderedDict
import os
import urllib2

# Base URL of the episode listing; listing pages and episode pages both live under it.
ESPISODE_LINK = "http://www.theoyeucau.com/episode"
# Matches "<ESPISODE_LINK>/<slug>/" where the slug consists of ASCII letters and
# hyphens only. Built from ESPISODE_LINK so the URL is spelled in one place only.
ESPISODE_LINK_REGEX = re.compile(re.escape(ESPISODE_LINK) + r"/[a-zA-Z-]+/")

# http://www.theoyeucau.com/wp-content/uploads/sites/3/2015/12/Tai-sao-phai-song-nhu-mot-con-nguoi.mp3
# Accented Vietnamese lower-case letters accepted inside mp3 file names.
# The literal is used as-is: decoding it and re-encoding it character by
# character would only reproduce the very same string.
VIETNAMESE = "ạảãàáâậầấẩẫăắằặẳẵóòọõỏôộổỗồốơờớợởỡéèẻẹẽêếềệểễúùụủũưựữửừứíìịỉĩýỳỷỵỹđ"

# Common prefix of every mp3 URL; the dots of the host name are escaped so
# they only match a literal ".".
_MP3_URL_PREFIX = r"http://www\.theoyeucau\.com/wp-content/uploads/sites/\d+/"

# NOTE: inside the character classes below " -." is a *range* (space through
# "."), and " -_" is a range from space through "_" which also covers "/" and
# the digits. The second one lets the middle group of FALLBACK_MP3_REGEX span
# several path components, so these ranges are kept exactly as they were.

# Primary layout: .../sites/<n>/<digits>/<digits>/<file>.mp3
ESPISODE_MP3_LINK_REGEX = re.compile(
	_MP3_URL_PREFIX + r"\d+/\d+/[a-zA-Z -._\d" + VIETNAMESE + r"]+\.mp3")
# Fallback layout: .../sites/<n>/Shows/<show>/<sub-path>/<file>.mp3
FALLBACK_MP3_REGEX = re.compile(
	_MP3_URL_PREFIX + r"Shows/[a-zA-Z -._\d]+/[a-zA-Z -_.\d]+/[a-zA-Z -._\d" + VIETNAMESE + r"]+\.mp3")


# http://www.theoyeucau.com/wp-content/uploads/sites/3/Shows/Ket noi yeu thuong/2012/2012-07-05 Ta muon hoc cach quen nguoi/Ket noi yeu thuong - Ta muon hoc cach quen nguoi - episode_81622 - track_0_81623 - Ta muon hoc cach quen nguoi.mp3

# Episode links recorded in links.txt (one per line, surrounding whitespace
# stripped); empty when the file does not exist yet. The file is read inside a
# `with` block so the handle is closed as soon as the list has been built.
PROCESSED_LINKS = []
if os.path.exists("links.txt"):
	with open("links.txt") as _linksFile:
		PROCESSED_LINKS = [line.strip() for line in _linksFile]

def getLinkContent(link):
	"""Fetch `link` over HTTP and return the raw response body.

	The request carries a fixed set of browser-like headers. Errors raised
	by ``urllib2.urlopen`` are not caught here and propagate to the caller.
	"""
	req = urllib2.Request(link)
	headers = (
		("User-Agent", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36"),
		("Accept", "*/*"),
		("Connection", "keep-alive"),
		("Accept-Encoding", "identity;q=1, *;q=0, val"),
	)
	for name, value in headers:
		req.add_header(name, value)
	response = urllib2.urlopen(req)
	try:
		return response.read()
	finally:
		# Close explicitly instead of relying on garbage collection.
		response.close()

def getPageLink(pageNum):
	"""Return the URL of listing page `pageNum`: ESPISODE_LINK + "/page/<pageNum>/"."""
	pieces = (ESPISODE_LINK, "/page/", str(pageNum), "/")
	return "".join(pieces)

def getEspisodeLink(pageNum):
	"""Return the episode links found on listing page `pageNum`.

	Links are de-duplicated while keeping the order of first appearance.
	Any link containing the substring "page" or "feed" is dropped; this is a
	plain substring test on the whole link.
	"""
	print("Processing page num: " + str(pageNum))
	html = getLinkContent(getPageLink(pageNum))
	# OrderedDict keys give order-preserving de-duplication.
	uniqueLinks = OrderedDict.fromkeys(ESPISODE_LINK_REGEX.findall(html))
	return [link for link in uniqueLinks if not ("page" in link or "feed" in link)]

def getMp3Link(espisodeLink):
	"""Return the first mp3 URL found on the episode page, or None.

	The page is searched with ESPISODE_MP3_LINK_REGEX first and with
	FALLBACK_MP3_REGEX only when the first pattern finds nothing. Each
	pattern scans the page at most once.
	"""
	print("Get mp3 link on page " + espisodeLink)
	pageContent = getLinkContent(espisodeLink)
	for pattern in (ESPISODE_MP3_LINK_REGEX, FALLBACK_MP3_REGEX):
		found = pattern.search(pageContent)
		if found is not None:
			# The patterns have no capture groups, so group(0) is exactly
			# what findall()[0] would return.
			return found.group(0)
	# page without mp3 file available
	print("They lost its content")
	return None

def _mp3Year(mp3Link):
	"""Return the directory name to store `mp3Link` under.

	This is the first four-digit path component at or after index 7 of the
	"/"-split URL. Component 7 itself is returned when none is found, which
	is what was used unconditionally before; for ".../sites/<n>/Shows/..."
	links that component is "Shows" rather than a year.
	"""
	parts = mp3Link.split("/")
	for part in parts[7:]:
		if len(part) == 4 and part.isdigit():
			return part
	return parts[7]

def downloadMp3(mp3Link, espisodeLink):
	"""Download `mp3Link` into a per-year directory and record `espisodeLink`.

	Does nothing when `mp3Link` is None. Otherwise the file is saved as
	"<year>/<last URL component>" relative to the current directory, and
	`espisodeLink` is appended to links.txt so it is skipped on later runs.

	Raises OSError when the year directory cannot be created. Download
	errors other than urllib2.HTTPError propagate, and in that case nothing
	is appended to links.txt.
	"""
	if mp3Link is None:
		return
	print("Download " + mp3Link)
	year = _mp3Year(mp3Link)
	try:
		os.mkdir(year)
	except OSError:
		# An already existing directory is fine; anything else is a real error.
		if not os.path.isdir(year):
			raise
	target = os.path.join(year, mp3Link.split("/")[-1])
	try:
		# Download before opening the target so a failed request does not
		# leave an empty file behind.
		audio = getLinkContent(mp3Link)
	except urllib2.HTTPError:
		# Retry with a plain urllib request (no custom headers).
		import urllib
		urllib.urlretrieve(mp3Link, target)
	else:
		with open(target, 'wb') as mp3:
			mp3.write(audio)
	# not to download link already downloaded
	with open("links.txt", 'a') as f:
		f.write(espisodeLink + "\n")

def downThemAll(pageNum):
	"""Download the mp3 of every episode on listing page `pageNum` that is not in PROCESSED_LINKS."""
	for episode in getEspisodeLink(pageNum):
		if episode in PROCESSED_LINKS:
			# Already recorded in links.txt when the script started.
			continue
		downloadMp3(getMp3Link(episode), episode)


if __name__ == '__main__':
	# Process listing pages 1 through 116 in order.
	# A whitespace-mangled second copy of this module used to follow the loop
	# inside this suite; it made the file fail with an IndentationError and
	# defined nothing that is not already defined above, so it was removed.
	for i in range(1, 117):
		downThemAll(i)