Last active
August 29, 2015 14:02
-
-
Save guoylyy/7a776db486ff40bb156c to your computer and use it in GitHub Desktop.
SSE News Getter
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from urllib.request import urlopen | |
from bs4 import BeautifulSoup | |
import pymysql | |
# Database arrangement: one shared MySQL connection reused by every helper
# in this script (each function opens its own cursor on it).
# NOTE(review): credentials are hard-coded; move to config/env before real use.
conn = pymysql.connect(host='127.0.0.1', port=3306, user='root', passwd='root', db='sse', charset='utf8')
cur = conn.cursor()  # module-level cursor; appears unused — functions below make their own
# for all news | |
def parse_news(url): | |
''' | |
Use BeaufitulSoup to separate each part of the news HTML. | |
Return title, content, date, attach_links | |
''' | |
res = urlopen(url) | |
html = res.read().decode('utf-8') | |
soup = BeautifulSoup(''.join(html)) | |
title = soup.find('span', {'id':'title'}).text.strip() | |
date = soup.find('span', {'id':'date'}).text.strip() | |
content = soup.find('div', {'id':'content'}).text.strip() | |
attachs = soup.find('div', {'id':'attachment'}).findAll('a') | |
attach_links = '' | |
for attach in attachs: | |
if attach_links != '': | |
attach_links += ',' | |
link = attach.attrs['href'].strip() | |
download_index = link.find('Download') | |
if download_index <0 or download_index >= len(link): | |
continue | |
link = 'http://sse.tongji.edu.cn/' + link[download_index:] | |
attach_links += link | |
return title, content, date, attach_links | |
def title_exist(title):
    '''
    Return True if a news row with this exact title is already stored.

    Parameters
    ----------
    title : str

    Returns
    -------
    bool
    '''
    cur = conn.cursor()
    # Parameterized query: the original interpolated the title straight into
    # the SQL text ('... = "%s"' % title), which breaks on any title
    # containing a double quote and is SQL-injectable. The driver now does
    # the quoting; execute() returns the matched row count.
    count = cur.execute('select title from news where title = %s', (title,))
    return count > 0
def insert_news(title, content, date, attach_links, url):
    '''
    Insert one news row into the `news` table.

    Parameters
    ----------
    title, content, date, attach_links, url : str
        Column values; `url` is stored as origin_page_url.

    Returns
    -------
    bool
        True if exactly one row was inserted, otherwise False.
    '''
    cur = conn.cursor()
    # Parameterized query: the original %-interpolated all five values into
    # the SQL text, which breaks on quotes in title/content and is
    # SQL-injectable. Placeholders let the driver escape the values.
    insert_sql = ('insert into news '
                  '(title, content, date, attach_links, origin_page_url) '
                  'values (%s, %s, %s, %s, %s)')
    count = cur.execute(insert_sql, (title, content, date, attach_links, url))
    conn.commit()
    return count == 1
def get_news(url):
    '''Scrape one notice page and store it, skipping titles already saved.'''
    title, content, date, attach_links = parse_news(url)
    # Deduplicate on title: bail out early if this notice was stored before.
    if title_exist(title):
        print('Already exist: %s.' % title)
        return
    if insert_news(title, content, date, attach_links, url):
        print("Successfully inserted: %s." % title)
    else:
        print("Fail to insert: %s" % title)
if __name__ == "__main__": | |
for i in range(870, 2923): | |
index = 1000000 + i | |
url = "http://sse.tongji.edu.cn/Notice/%d" % index | |
get_news(url) | |
a = input("click to continue") |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment