@xcaptain
Created April 23, 2014 15:52
A crawler for Sohu video pages that collects the URLs of video playback pages.
create table tv_info(
    id int auto_increment primary key,
    url varchar(150),      -- playback page URL found by the crawler
    type tinyint(2),       -- 1 = regular playback page, 2 = user-uploaded video, 3 = paid video
    title varchar(50),
    comment varchar(100),
    status tinyint(2)
);
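Only url and type are populated by the script below; title, comment and status stay empty. After a run, a quick check of how many URLs were stored per type might look like:

select type, count(*) as n from tv_info group by type;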
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import re
import urllib2
import MySQLdb as mdb
from bs4 import BeautifulSoup
def get_url(root, max_urls=1000):
    '''
    Starting from the root URL, crawl pages and collect playback-page URLs
    that match each of the three patterns below.
    '''
    urls = [root]
    pt1 = 'http://tv.sohu.com/[0-9]*/n[0-9]*\.shtml$'           # regular video playback pages
    pt2 = 'http://my.tv.sohu.com/[a-zA-Z]*/(\d+)/(\d+).shtml$'  # user-uploaded (original) video playback pages
    pt3 = 'http://tv.sohu.com/[0-9]*/n[0-9]*\.shtml\?spayid\S'  # paid video playback pages
    url_pt1 = set()
    url_pt2 = set()
    url_pt3 = set()
    header = {"Referer": "http://www.baidu.com",
              "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:28.0) Gecko/20100101 Firefox/28.0"}
    while urls and len(url_pt1) < max_urls:
        page = urls.pop(0)
        res = urllib2.urlopen(urllib2.Request(page, headers=header)).read()
        soup = BeautifulSoup(res)
        for items in soup.find_all('a'):
            h = items.get('href')
            if h is not None and h not in urls:
                if re.search(pt1, h):
                    url_pt1.add(h)
                    urls.append(h)
                    print h
                elif re.search(pt2, h):
                    url_pt2.add(h)
                    urls.append(h)
                    print h
                elif re.search(pt3, h):
                    url_pt3.add(h)
                    urls.append(h)
                    print h
    return list(url_pt1), list(url_pt2), list(url_pt3)
def write_into_db(l, types):
    '''
    Write each URL in the list l into the database, tagged with the given type.
    '''
    rows = [(url, types) for url in l]
    conn = mdb.connect('localhost', 'username', 'password', 'sohu')
    cur = conn.cursor()
    cur.executemany("insert into tv_info (url, type) values (%s, %s)", rows)
    conn.commit()
    conn.close()
if __name__ == '__main__':
    root = 'http://tv.sohu.com'
    h1, h2, h3 = get_url(root, 100)
    write_into_db(h1, 1)  # regular playback pages
    write_into_db(h2, 2)  # user-uploaded videos
    write_into_db(h3, 3)  # paid videos
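The script targets Python 2 (urllib2 and the print statement) and expects BeautifulSoup 4 plus the MySQL-python driver to be installed, e.g. pip install beautifulsoup4 MySQL-python. The MySQL credentials ('username', 'password') and the 'sohu' database name in write_into_db are placeholders to adjust for your own setup, and the tv_info table above must exist before running.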