Created
April 14, 2013 07:42
-
-
Save monsterxx03/5381822 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#coding=utf-8 | |
#!/usr/bin/python | |
import re | |
import json | |
from bs4 import BeautifulSoup | |
# All weibo posts are embedded in inline JavaScript; the feed list lives in
# the 8th <script> tag counted from the end of the page.
FEED_SCRIPT_INDEX = -8

# Greedy match from the first "{" to the last "}" on a line, used to pull the
# JSON-formatted data out of the script source.
JSON_PATTERN = re.compile(r'({.*})')

# Name the parser explicitly so the parse result does not depend on which
# optional parsers happen to be installed alongside bs4.
HTML_PARSER = 'html.parser'


def extract_json_payload(script_text):
    """Return the JSON object embedded in *script_text*, decoded to a dict.

    Args:
        script_text: Source text of a <script> tag that wraps a JSON object
            somewhere on a single line.

    Returns:
        The decoded JSON object.

    Raises:
        ValueError: If no ``{...}`` span is present. Decoding errors raised
            by ``json.loads`` for a malformed span propagate unchanged.
    """
    match = JSON_PATTERN.search(script_text)
    if match is None:
        raise ValueError('no JSON object found in script text')
    return json.loads(match.group())


def load_feed_list(path='t1.txt', script_index=FEED_SCRIPT_INDEX):
    """Return every feed-content <p> tag found in the saved page at *path*.

    Args:
        path: File holding the saved page HTML.
        script_index: Position of the <script> tag that carries the feed
            list; defaults to the 8th tag from the end.

    Returns:
        The result of ``find_all`` for <p> tags whose ``node-type`` attribute
        is ``feed_list_content`` (possibly empty).

    Raises:
        ValueError: If the page has too few <script> tags, the chosen script
            holds no JSON object, or the JSON has no usable "html" field.
    """
    # Read raw bytes so BeautifulSoup detects the page encoding itself rather
    # than relying on the platform's default text encoding.
    with open(path, 'rb') as f:
        doc = f.read()

    scripts = BeautifulSoup(doc, HTML_PARSER).find_all('script')
    try:
        feed_list_script = scripts[script_index].text
    except IndexError:
        raise ValueError(
            'expected a <script> tag at index %d, found only %d tags'
            % (script_index, len(scripts)))

    html = extract_json_payload(feed_list_script).get('html')
    if not html:
        raise ValueError('feed script JSON has no "html" field')

    # The "html" field is itself an HTML fragment holding the rendered feeds.
    fragment = BeautifulSoup(html, HTML_PARSER)
    return fragment.find_all('p', {'node-type': 'feed_list_content'})


def main():
    """Print the publisher and the content of the first feed in t1.txt."""
    feed_list = load_feed_list()
    if not feed_list:
        raise SystemExit('no feed_list_content entries found')
    first_feed = feed_list[0]
    # Single-argument print() emits the same text on Python 2 and Python 3.
    print("Title of first feed ---> %s" % first_feed.a.text)  # publisher of the first feed
    print("Content of first feed ----> %s" % first_feed.em.text)  # content of the first feed


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment