@xcaptain
Created April 23, 2014 15:52
A crawler for Sohu video pages that collects the URLs of video playback pages.
create table tv_info(
    id int auto_increment primary key,
    url varchar(150),      -- playback page URL found by the crawler
    type tinyint(2),       -- 1 = regular playback page, 2 = user-uploaded video, 3 = paid video
    title varchar(50),
    comment varchar(100),
    status tinyint(2)
);
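Only url and type are populated by the script below; title, comment and status stay empty. After a run, a quick check of how many URLs were stored per type might look like:

select type, count(*) as n from tv_info group by type;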
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import re
import urllib2
import MySQLdb as mdb
from bs4 import BeautifulSoup
def get_url(root, max_urls=1000):
    '''
    Starting from the root URL, crawl pages and collect playback-page URLs
    that match each of the three patterns below.
    '''
    urls = [root]
    pt1 = 'http://tv.sohu.com/[0-9]*/n[0-9]*\.shtml$'           # regular video playback pages
    pt2 = 'http://my.tv.sohu.com/[a-zA-Z]*/(\d+)/(\d+).shtml$'  # user-uploaded (original) video playback pages
    pt3 = 'http://tv.sohu.com/[0-9]*/n[0-9]*\.shtml\?spayid\S'  # paid video playback pages
    url_pt1 = set()
    url_pt2 = set()
    url_pt3 = set()
    header = {"Referer": "http://www.baidu.com",
              "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:28.0) Gecko/20100101 Firefox/28.0"}
    while urls and len(url_pt1) < max_urls:
        page = urls.pop(0)
        res = urllib2.urlopen(urllib2.Request(page, headers=header)).read()
        soup = BeautifulSoup(res)
        for items in soup.find_all('a'):
            h = items.get('href')
            if h is not None and h not in urls:
                if re.search(pt1, h):
                    url_pt1.add(h)
                    urls.append(h)
                    print h
                elif re.search(pt2, h):
                    url_pt2.add(h)
                    urls.append(h)
                    print h
                elif re.search(pt3, h):
                    url_pt3.add(h)
                    urls.append(h)
                    print h
    return list(url_pt1), list(url_pt2), list(url_pt3)
def write_into_db(l, types):
    '''
    Write each URL in the list l into the database, tagged with the given type.
    '''
    rows = [(url, types) for url in l]
    conn = mdb.connect('localhost', 'username', 'password', 'sohu')
    cur = conn.cursor()
    cur.executemany("insert into tv_info (url, type) values (%s, %s)", rows)
    conn.commit()
    conn.close()
if __name__ == '__main__':
    root = 'http://tv.sohu.com'
    h1, h2, h3 = get_url(root, 100)
    write_into_db(h1, 1)  # regular playback pages
    write_into_db(h2, 2)  # user-uploaded videos
    write_into_db(h3, 3)  # paid videos
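The script targets Python 2 (urllib2 and the print statement) and expects BeautifulSoup 4 plus the MySQL-python driver to be installed, e.g. pip install beautifulsoup4 MySQL-python. The MySQL credentials ('username', 'password') and the 'sohu' database name in write_into_db are placeholders to adjust for your own setup, and the tv_info table above must exist before running.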