shede333/extract.py

## extract.py
#_*_ coding:UTF-8 _*_
__author__ = 'shaowei'

import requests
import os
import re

# Script setup: locate the saved WWDC page in the current working directory,
# read it, and collect the PDF / SD-video / HD-video links it contains.
#
# print() is always called with a single pre-formatted string so the line
# parses as a print statement or as a print function call with the same output.
print("\nAll files will be downloaded here: %s" % os.getcwd())

# Each pattern captures the href target of an anchor that points at the
# WWDC 2013 streaming host; they differ only in the required file suffix.
re_video_HD = re.compile(r'<a href="(http://devstreaming.apple.com/videos/wwdc/2013/[^"]*-HD\.mov)')
re_video_SD = re.compile(r'<a href="(http://devstreaming.apple.com/videos/wwdc/2013/[^"]*-SD\.mov)')
re_pdf = re.compile(r'<a href="(http://devstreaming.apple.com/videos/wwdc/2013/[^"]*\.pdf)')

html_file_path = os.path.join(os.getcwd(), "wwdc_video.html")

if os.path.exists(html_file_path):
	print("file is exist")
else:
	print("no exist file: %s" % html_file_path)
	# Nothing can be extracted without the page; stop with a non-zero status
	# right after the message instead of falling through to a failing open().
	raise SystemExit(1)

# The with-block closes the handle as soon as the page has been read.
with open(html_file_path) as html_file:
	html_content = html_file.read()

pdf_url_list = re_pdf.findall(html_content)
sd_url_list = re_video_SD.findall(html_content)
hd_url_list = re_video_HD.findall(html_content)

print("pdf file count: %d" % len(pdf_url_list))
print("video-sd file count: %d" % len(sd_url_list))
print("video-hd file count: %d" % len(hd_url_list))

# Download helper used for every file category.


def down_files(url_list, save_folder):
	"""Download each URL in url_list into save_folder, skipping existing files.

	The local file name is the part of the URL after its last "/". A URL
	whose target file already exists in save_folder is reported and skipped,
	so no request is made for it.

	Args:
		url_list: sequence of URL strings (len() is taken for the progress label).
		save_folder: path of the directory that receives the files.

	Returns:
		None. Progress is reported on standard output.

	Raises:
		ValueError: if a URL contains no "/" (from str.rindex).
		Errors raised by requests.get() or open() are not caught here.
	"""
	dl_count = len(url_list)
	for dl_finish_count, file_url in enumerate(url_list, 1):
		# The label is printed as "<total>/<current position>".
		dl_progress = "%d/%d" % (dl_count, dl_finish_count)
		file_path = os.path.join(save_folder, file_url[file_url.rindex("/") + 1:])

		if os.path.exists(file_path):
			print("file %s has exist,don`t downLoad  %s" % (dl_progress, file_path))
			continue  # already on disk, nothing to fetch

		print("\nstart download:  %s %s" % (dl_progress, file_url))
		# .content holds the complete response body in memory before it is written.
		file_content = requests.get(file_url).content
		# The with-block closes (and therefore flushes) the file even if the
		# write fails, instead of leaving the handle open.
		with open(file_path, "wb") as file_obj:
			file_obj.write(file_content)
		print("downLoad finish %s, file saved in: %s \n" % (dl_progress, file_path))


# Driver: make sure the three output folders exist under the current working
# directory, then download every PDF that was found in the page.
pdf_folder_path = os.path.join(os.getcwd(), "pdf")
sd_folder_path = os.path.join(os.getcwd(), "video-sd")
hd_folder_path = os.path.join(os.getcwd(), "video-hd")

# One loop instead of three copies of the same exists/mkdir stanza.
for folder_path in (pdf_folder_path, sd_folder_path, hd_folder_path):
	if not os.path.exists(folder_path):
		os.mkdir(folder_path)

# Single-argument print() calls emit the same text whether print is a
# statement or a function.
print("start downLoad pdf file: %d \n" % len(pdf_url_list))
down_files(pdf_url_list, pdf_folder_path)
print(" all pdf file download finish.")

# print "start downLoad video-sd file:", len(sd_url_list), "\n"
# down_files(sd_url_list, sd_folder_path)
# print " all video-sd file download finish."
#
# print "start downLoad video-hd file:", len(hd_url_list), "\n"
# down_files(hd_url_list, hd_folder_path)
# print " all video-hd file download finish."
	#_*_ coding:UTF-8 _*_
	__author__ = 'shaowei'

	import requests
	import os
	import re

	# Script setup: locate the saved WWDC page in the current working directory,
	# read it, and collect the PDF / SD-video / HD-video links it contains.
	#
	# print() is always called with a single pre-formatted string so the line
	# parses as a print statement or as a print function call with the same output.
	print("\nAll files will be downloaded here: %s" % os.getcwd())

	# Each pattern captures the href target of an anchor that points at the
	# WWDC 2013 streaming host; they differ only in the required file suffix.
	re_video_HD = re.compile(r'<a href="(http://devstreaming.apple.com/videos/wwdc/2013/[^"]*-HD\.mov)')
	re_video_SD = re.compile(r'<a href="(http://devstreaming.apple.com/videos/wwdc/2013/[^"]*-SD\.mov)')
	re_pdf = re.compile(r'<a href="(http://devstreaming.apple.com/videos/wwdc/2013/[^"]*\.pdf)')

	html_file_path = os.path.join(os.getcwd(), "wwdc_video.html")

	if os.path.exists(html_file_path):
		print("file is exist")
	else:
		print("no exist file: %s" % html_file_path)
		# Nothing can be extracted without the page; stop with a non-zero status
		# right after the message instead of falling through to a failing open().
		raise SystemExit(1)

	# The with-block closes the handle as soon as the page has been read.
	with open(html_file_path) as html_file:
		html_content = html_file.read()

	pdf_url_list = re_pdf.findall(html_content)
	sd_url_list = re_video_SD.findall(html_content)
	hd_url_list = re_video_HD.findall(html_content)

	print("pdf file count: %d" % len(pdf_url_list))
	print("video-sd file count: %d" % len(sd_url_list))
	print("video-hd file count: %d" % len(hd_url_list))

	# Download helper used for every file category.


	def down_files(url_list, save_folder):
		"""Download each URL in url_list into save_folder, skipping existing files.

		The local file name is the part of the URL after its last "/". A URL
		whose target file already exists in save_folder is reported and skipped,
		so no request is made for it.

		Args:
			url_list: sequence of URL strings (len() is taken for the progress label).
			save_folder: path of the directory that receives the files.

		Returns:
			None. Progress is reported on standard output.

		Raises:
			ValueError: if a URL contains no "/" (from str.rindex).
			Errors raised by requests.get() or open() are not caught here.
		"""
		dl_count = len(url_list)
		for dl_finish_count, file_url in enumerate(url_list, 1):
			# The label is printed as "<total>/<current position>".
			dl_progress = "%d/%d" % (dl_count, dl_finish_count)
			file_path = os.path.join(save_folder, file_url[file_url.rindex("/") + 1:])

			if os.path.exists(file_path):
				print("file %s has exist,don`t downLoad  %s" % (dl_progress, file_path))
				continue  # already on disk, nothing to fetch

			print("\nstart download:  %s %s" % (dl_progress, file_url))
			# .content holds the complete response body in memory before it is written.
			file_content = requests.get(file_url).content
			# The with-block closes (and therefore flushes) the file even if the
			# write fails, instead of leaving the handle open.
			with open(file_path, "wb") as file_obj:
				file_obj.write(file_content)
			print("downLoad finish %s, file saved in: %s \n" % (dl_progress, file_path))


	# Driver: make sure the three output folders exist under the current working
	# directory, then download every PDF that was found in the page.
	pdf_folder_path = os.path.join(os.getcwd(), "pdf")
	sd_folder_path = os.path.join(os.getcwd(), "video-sd")
	hd_folder_path = os.path.join(os.getcwd(), "video-hd")

	# One loop instead of three copies of the same exists/mkdir stanza.
	for folder_path in (pdf_folder_path, sd_folder_path, hd_folder_path):
		if not os.path.exists(folder_path):
			os.mkdir(folder_path)

	# Single-argument print() calls emit the same text whether print is a
	# statement or a function.
	print("start downLoad pdf file: %d \n" % len(pdf_url_list))
	down_files(pdf_url_list, pdf_folder_path)
	print(" all pdf file download finish.")

	# print "start downLoad video-sd file:", len(sd_url_list), "\n"
	# down_files(sd_url_list, sd_folder_path)
	# print " all video-sd file download finish."
	#
	# print "start downLoad video-hd file:", len(hd_url_list), "\n"
	# down_files(hd_url_list, hd_folder_path)
	# print " all video-hd file download finish."