Skip to content

Instantly share code, notes, and snippets.

@klb3713
Last active December 20, 2015 22:59
Show Gist options
  • Save klb3713/6209281 to your computer and use it in GitHub Desktop.
python 爬取优酷视频信息,并存储到mongodb (Python: crawl Youku video metadata and store it in MongoDB)
# -*- coding: utf-8 -*-
__author__ = 'klb3713'
import re
import json
import urllib2
from lxml import etree
from multiprocessing import Process
from pymongo import Connection
# Shared MongoDB handle for the whole script: scraped records are inserted
# into the `weibo.youkuVideos` collection on a local mongod instance.
con = Connection('127.0.0.1:27017')
youku_videos = con.weibo.youkuVideos
def get_html(url, coding='utf-8'):
    """Fetch *url* and return the decoded page body, or "" on any failure.

    url    -- absolute URL to fetch
    coding -- charset used to decode the response body (default utf-8)

    Best-effort by design: callers treat an empty string as "skip this page".
    """
    try:
        res = urllib2.urlopen(url)
        try:
            # Decode here so every caller works with unicode text.
            return res.read().decode(coding)
        finally:
            res.close()  # always release the socket
    except Exception:
        # Narrowed from a bare `except:`, which also swallowed
        # KeyboardInterrupt/SystemExit and made the crawler unkillable.
        return ""
def getVideoInfo(urls, channel_name):
videos = []
for url in urls:
try:
html = get_html(url)
vid = re.search(r"videoId = '(\d+)'", html).groups()[0]
tree = etree.HTML(html)
video = {}
video['url'] = url
video["navigation"] = tree.xpath('//div[@class="guide"]/div/a/text()')
title = tree.xpath('/html/head/title/text()')[0]
video["title"] = title[:title.find(u'—在线播放')]
video["upVideoTimes"] = int(tree.xpath('//*[@id="upVideoTimes"]/text()')[0].replace(',', ''))
video["downVideoTimes"] = int(tree.xpath('//*[@id="downVideoTimes"]/text()')[0].replace(',', ''))
video_info = json.loads(get_html("http://v.youku.com/QVideo/~ajax/getVideoPlayInfo?__rt=1&__ro=&id=%s&sid=0&type=vv" % vid))
video["playTimes"] = video_info['vv']
if tree.xpath('//*[@id="text_long"]'):
video["description"] = tree.xpath('//*[@id="text_long"]/text()')[0]
else:
video["description"] = ""
video["channel"] = channel_name
videos.append(video)
except Exception, e:
print url
print e
continue
return videos
def getVideos(channel_url, channel_name):
base_url = "http://www.youku.com"
html = get_html(channel_url)
tree = etree.HTML(html)
page_url = base_url + tree.xpath('//ul[@class="pages"]/li[last()]/a/@href')[0]
page_url_pre = re.split(r'\d+\.html$', page_url)[0]
max_page = int(re.search(r'(\d+)\.html$', page_url).groups()[0])
#urls = re.findall(r'http://v.youku.com/v_show/id_[^?"]+', html)
urls = []
for page in range(1, max_page+1):
try:
page_url = page_url_pre + str(page) + ".html"
page_html = get_html(page_url)
# urls.extend(re.findall(r'http://v.youku.com/v_show/id_[^?"]+', html))
urls.extend(re.findall(r'http://v.youku.com/v_show/id_[^?"]+', page_html))
except Exception, e:
print page_url
print e
continue
urls = list(set(urls))
length = 0
if len(urls) > 1000:
urls = urls[0:1000]
length = 1000
else:
length = len(urls)
index = 0
while index < length:
videos_info = getVideoInfo(urls[index:index+100], channel_name)
youku_videos.insert(videos_info)
index += 100
def main():
    """Crawl every configured Youku channel, one after another."""
    # (channel listing URL, channel label) pairs; the label is stored on
    # each scraped record so channels can be told apart in MongoDB.
    channels = [
        ("http://www.youku.com/v_showlist/c91.html", "news"),
        ("http://www.youku.com/v_showlist/c105.html", "tech"),
        ("http://www.youku.com/v_showlist/c86.html", "ent"),
        ("http://www.youku.com/v_showlist/c94.html", "fun"),
        ("http://www.youku.com/v_showlist/c88.html", "travel"),
        ("http://www.youku.com/v_showlist/c90.html", "baby"),
        ("http://www.youku.com/v_olist/c_87.html", "edu"),
        ("http://www.youku.com/v_olist/c_84.html", "jilupian"),
        ("http://www.youku.com/v_showlist/c92.html", "dv"),
        ("http://www.youku.com/v_showlist/c98.html", "sports"),
        ("http://www.youku.com/v_showlist/c89.html", "fashion"),
        ("http://www.youku.com/v_showlist/c99.html", "game"),
        ("http://www.youku.com/v_showlist/c104.html", "auto"),
    ]
    # Sequential on purpose; a Process-per-channel variant existed but was
    # disabled, so each channel is crawled in turn.
    for channel_url, channel_name in channels:
        getVideos(channel_url, channel_name)
# Script entry point; the commented calls below are handy one-offs for
# manually testing a single channel or a single video page.
if __name__ == "__main__":
    main()
    # getVideos("http://www.youku.com/v_showlist/c91.html", "news")
    # getVideoInfo(["http://v.youku.com/v_show/id_XNTkzNjc3MzA0.html"])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment