SSE News Getter
from urllib.request import urlopen
from bs4 import BeautifulSoup
import pymysql

# Database connection: local MySQL, database 'sse', default root credentials.
conn = pymysql.connect(host='127.0.0.1', port=3306, user='root', passwd='root', db='sse', charset='utf8')
# for all news
def parse_news(url):
    '''
    Use BeautifulSoup to pick apart the news HTML.
    Return title, content, date, attach_links.
    '''
    res = urlopen(url)
    html = res.read().decode('utf-8')
    soup = BeautifulSoup(html, 'html.parser')
    title = soup.find('span', {'id': 'title'}).text.strip()
    date = soup.find('span', {'id': 'date'}).text.strip()
    content = soup.find('div', {'id': 'content'}).text.strip()
    attachs = soup.find('div', {'id': 'attachment'}).findAll('a')
    # Keep only attachment hrefs that contain a Download path and
    # join them into a single comma-separated string.
    links = []
    for attach in attachs:
        link = attach.attrs['href'].strip()
        download_index = link.find('Download')
        if download_index < 0:
            continue
        links.append('http://sse.tongji.edu.cn/' + link[download_index:])
    attach_links = ','.join(links)
    return title, content, date, attach_links
def title_exist(title):
    '''Return True if a news item with this title is already stored.'''
    cur = conn.cursor()
    # Parameterized query avoids breaking on quotes in the title.
    count = cur.execute('select title from news where title = %s', (title,))
    return count > 0
def insert_news(title, content, date, attach_links, url):
    '''
    Return True if the row was inserted successfully, otherwise False.
    '''
    cur = conn.cursor()
    insert_sql = ('insert into news (title, content, date, attach_links, origin_page_url) '
                  'values (%s, %s, %s, %s, %s)')
    count = cur.execute(insert_sql, (title, content, date, attach_links, url))
    conn.commit()
    return count == 1
def get_news(url):
    '''Fetch one news page, skip it if already stored, otherwise insert it.'''
    title, content, date, attach_links = parse_news(url)
    if title_exist(title):
        print('Already exists: %s.' % title)
        return
    if insert_news(title, content, date, attach_links, url):
        print('Successfully inserted: %s.' % title)
    else:
        print('Failed to insert: %s.' % title)
if __name__ == "__main__":
    # Notice pages on the SSE site are numbered 1000870 .. 1002922.
    for i in range(870, 2923):
        index = 1000000 + i
        url = "http://sse.tongji.edu.cn/Notice/%d" % index
        get_news(url)
    input("click to continue")
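
The script assumes a news table already exists in the sse database. The gist does not include that schema, so the snippet below is only a sketch reconstructed from the columns insert_news() writes; the id column, types, and lengths are assumptions, not the original definition.

import pymysql

# Hypothetical schema matching the columns used above; adjust types as needed.
conn = pymysql.connect(host='127.0.0.1', port=3306, user='root',
                       passwd='root', db='sse', charset='utf8')
with conn.cursor() as cur:
    cur.execute('''
        create table if not exists news (
            id int auto_increment primary key,
            title varchar(255) not null,
            content text,
            date varchar(32),
            attach_links text,
            origin_page_url varchar(255)
        )
    ''')
conn.commit()
conn.close()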