Last active
August 29, 2015 14:02
-
-
Save guoylyy/7a776db486ff40bb156c to your computer and use it in GitHub Desktop.
SSE News Getter
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from urllib.request import urlopen | |
from bs4 import BeautifulSoup | |
import pymysql | |
# Database arrangement: one shared MySQL connection reused by every helper
# in this script (each function opens its own cursor on it).
# NOTE(review): credentials are hard-coded; move to config/env before real use.
conn = pymysql.connect(host='127.0.0.1', port=3306, user='root', passwd='root', db='sse', charset='utf8')
cur = conn.cursor()  # module-level cursor; appears unused — functions below make their own
# for all news | |
def parse_news(url): | |
''' | |
Use BeaufitulSoup to separate each part of the news HTML. | |
Return title, content, date, attach_links | |
''' | |
res = urlopen(url) | |
html = res.read().decode('utf-8') | |
soup = BeautifulSoup(''.join(html)) | |
title = soup.find('span', {'id':'title'}).text.strip() | |
date = soup.find('span', {'id':'date'}).text.strip() | |
content = soup.find('div', {'id':'content'}).text.strip() | |
attachs = soup.find('div', {'id':'attachment'}).findAll('a') | |
attach_links = '' | |
for attach in attachs: | |
if attach_links != '': | |
attach_links += ',' | |
link = attach.attrs['href'].strip() | |
download_index = link.find('Download') | |
if download_index <0 or download_index >= len(link): | |
continue | |
link = 'http://sse.tongji.edu.cn/' + link[download_index:] | |
attach_links += link | |
return title, content, date, attach_links | |
def title_exist(title):
    '''
    Return True if a news row with this exact title is already stored.

    Parameters
    ----------
    title : str

    Returns
    -------
    bool
    '''
    cur = conn.cursor()
    # Parameterized query: the original interpolated the title straight into
    # the SQL text ('... = "%s"' % title), which breaks on any title
    # containing a double quote and is SQL-injectable. The driver now does
    # the quoting; execute() returns the matched row count.
    count = cur.execute('select title from news where title = %s', (title,))
    return count > 0
def insert_news(title, content, date, attach_links, url):
    '''
    Insert one news row into the `news` table.

    Parameters
    ----------
    title, content, date, attach_links, url : str
        Column values; `url` is stored as origin_page_url.

    Returns
    -------
    bool
        True if exactly one row was inserted, otherwise False.
    '''
    cur = conn.cursor()
    # Parameterized query: the original %-interpolated all five values into
    # the SQL text, which breaks on quotes in title/content and is
    # SQL-injectable. Placeholders let the driver escape the values.
    insert_sql = ('insert into news '
                  '(title, content, date, attach_links, origin_page_url) '
                  'values (%s, %s, %s, %s, %s)')
    count = cur.execute(insert_sql, (title, content, date, attach_links, url))
    conn.commit()
    return count == 1
def get_news(url):
    '''Scrape one notice page and store it, skipping titles already saved.'''
    title, content, date, attach_links = parse_news(url)
    # Deduplicate on title: bail out early if this notice was stored before.
    if title_exist(title):
        print('Already exist: %s.' % title)
        return
    if insert_news(title, content, date, attach_links, url):
        print("Successfully inserted: %s." % title)
    else:
        print("Fail to insert: %s" % title)
if __name__ == "__main__": | |
for i in range(870, 2923): | |
index = 1000000 + i | |
url = "http://sse.tongji.edu.cn/Notice/%d" % index | |
get_news(url) | |
a = input("click to continue") |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment