Skip to content

Instantly share code, notes, and snippets.

@klb3713
Last active December 20, 2015 22:59
Show Gist options
  • Save klb3713/6209281 to your computer and use it in GitHub Desktop.
python 爬取优酷视频信息,并存储到mongodb (Python: crawl Youku video metadata and store it in MongoDB)
# -*- coding: utf-8 -*-
__author__ = 'klb3713'
import re
import json
import urllib2
from lxml import etree
from multiprocessing import Process
from pymongo import Connection
# Shared MongoDB handle for the whole script: scraped records are inserted
# into the `weibo.youkuVideos` collection on a local mongod instance.
con = Connection('127.0.0.1:27017')
youku_videos = con.weibo.youkuVideos
def get_html(url, coding='utf-8'):
    """Fetch *url* and return the decoded page body, or "" on any failure.

    url    -- absolute URL to fetch
    coding -- charset used to decode the response body (default utf-8)

    Best-effort by design: callers treat an empty string as "skip this page".
    """
    try:
        res = urllib2.urlopen(url)
        try:
            # Decode here so every caller works with unicode text.
            return res.read().decode(coding)
        finally:
            res.close()  # always release the socket
    except Exception:
        # Narrowed from a bare `except:`, which also swallowed
        # KeyboardInterrupt/SystemExit and made the crawler unkillable.
        return ""
def getVideoInfo(urls, channel_name):
videos = []
for url in urls:
try:
html = get_html(url)
vid = re.search(r"videoId = '(\d+)'", html).groups()[0]
tree = etree.HTML(html)
video = {}
video['url'] = url
video["navigation"] = tree.xpath('//div[@class="guide"]/div/a/text()')
title = tree.xpath('/html/head/title/text()')[0]
video["title"] = title[:title.find(u'—在线播放')]
video["upVideoTimes"] = int(tree.xpath('//*[@id="upVideoTimes"]/text()')[0].replace(',', ''))
video["downVideoTimes"] = int(tree.xpath('//*[@id="downVideoTimes"]/text()')[0].replace(',', ''))
video_info = json.loads(get_html("http://v.youku.com/QVideo/~ajax/getVideoPlayInfo?__rt=1&__ro=&id=%s&sid=0&type=vv" % vid))
video["playTimes"] = video_info['vv']
if tree.xpath('//*[@id="text_long"]'):
video["description"] = tree.xpath('//*[@id="text_long"]/text()')[0]
else:
video["description"] = ""
video["channel"] = channel_name
videos.append(video)
except Exception, e:
print url
print e
continue
return videos
def getVideos(channel_url, channel_name):
base_url = "http://www.youku.com"
html = get_html(channel_url)
tree = etree.HTML(html)
page_url = base_url + tree.xpath('//ul[@class="pages"]/li[last()]/a/@href')[0]
page_url_pre = re.split(r'\d+\.html$', page_url)[0]
max_page = int(re.search(r'(\d+)\.html$', page_url).groups()[0])
#urls = re.findall(r'http://v.youku.com/v_show/id_[^?"]+', html)
urls = []
for page in range(1, max_page+1):
try:
page_url = page_url_pre + str(page) + ".html"
page_html = get_html(page_url)
# urls.extend(re.findall(r'http://v.youku.com/v_show/id_[^?"]+', html))
urls.extend(re.findall(r'http://v.youku.com/v_show/id_[^?"]+', page_html))
except Exception, e:
print page_url
print e
continue
urls = list(set(urls))
length = 0
if len(urls) > 1000:
urls = urls[0:1000]
length = 1000
else:
length = len(urls)
index = 0
while index < length:
videos_info = getVideoInfo(urls[index:index+100], channel_name)
youku_videos.insert(videos_info)
index += 100
def main():
    """Crawl every configured Youku channel, one after another."""
    # (channel listing URL, channel label) pairs; the label is stored on
    # each scraped record so channels can be told apart in MongoDB.
    channels = [
        ("http://www.youku.com/v_showlist/c91.html", "news"),
        ("http://www.youku.com/v_showlist/c105.html", "tech"),
        ("http://www.youku.com/v_showlist/c86.html", "ent"),
        ("http://www.youku.com/v_showlist/c94.html", "fun"),
        ("http://www.youku.com/v_showlist/c88.html", "travel"),
        ("http://www.youku.com/v_showlist/c90.html", "baby"),
        ("http://www.youku.com/v_olist/c_87.html", "edu"),
        ("http://www.youku.com/v_olist/c_84.html", "jilupian"),
        ("http://www.youku.com/v_showlist/c92.html", "dv"),
        ("http://www.youku.com/v_showlist/c98.html", "sports"),
        ("http://www.youku.com/v_showlist/c89.html", "fashion"),
        ("http://www.youku.com/v_showlist/c99.html", "game"),
        ("http://www.youku.com/v_showlist/c104.html", "auto"),
    ]
    # Sequential on purpose; a Process-per-channel variant existed but was
    # disabled, so each channel is crawled in turn.
    for channel_url, channel_name in channels:
        getVideos(channel_url, channel_name)
# Script entry point; the commented calls below are handy one-offs for
# manually testing a single channel or a single video page.
if __name__ == "__main__":
    main()
    # getVideos("http://www.youku.com/v_showlist/c91.html", "news")
    # getVideoInfo(["http://v.youku.com/v_show/id_XNTkzNjc3MzA0.html"])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment