Skip to content

Instantly share code, notes, and snippets.

@monsterxx03
Created April 14, 2013 07:42
Show Gist options
  • Save monsterxx03/5381822 to your computer and use it in GitHub Desktop.
Save monsterxx03/5381822 to your computer and use it in GitHub Desktop.
#coding=utf-8
#!/usr/bin/python
import re
import json
from bs4 import BeautifulSoup
# All weibo posts on the saved page are embedded in JavaScript; the feed list
# lives in the 8th <script> tag counting from the end of the document.
FEED_SCRIPT_INDEX = -8

# Regex that pulls the JSON object literal (the real data) out of the
# JavaScript call that wraps it.
JSON_PAYLOAD_PATTERN = re.compile(r'({.*})')


def extract_json_payload(script_text):
    """Return the JSON object embedded in *script_text* as a dict.

    The match runs from the first ``{`` to the last ``}`` on the same line,
    and that span is decoded with ``json.loads``.

    Raises:
        ValueError: if no ``{...}`` span is found, or if the span is not
            valid JSON (``json.loads`` raises ``ValueError`` or a subclass
            of it).
    """
    match = JSON_PAYLOAD_PATTERN.search(script_text)
    if match is None:
        raise ValueError('no JSON object found in script text')
    return json.loads(match.group())


def load_feed_list(path='t1.txt', script_index=FEED_SCRIPT_INDEX):
    """Parse the saved page at *path* and return its feed-content tags.

    Args:
        path: file containing the saved HTML page.
        script_index: index into the page's <script> tags of the one that
            carries the feed list payload.

    Returns:
        The list of ``<p node-type="feed_list_content">`` tags found in the
        HTML fragment stored under the payload's ``html`` key.

    Raises:
        ValueError: if the page has too few <script> tags, or the payload
            has no usable ``html`` entry.
    """
    # Read raw bytes and let BeautifulSoup detect the encoding, instead of
    # depending on the platform's default text encoding.
    with open(path, 'rb') as f:
        doc = f.read()

    # Name the parser explicitly so results do not vary with whichever
    # optional parser library happens to be installed.
    scripts = BeautifulSoup(doc, 'html.parser').find_all('script')
    try:
        feed_list_script = scripts[script_index].text
    except IndexError:
        raise ValueError(
            'expected a <script> tag at index %d but the page has only %d'
            % (script_index, len(scripts)))

    html = extract_json_payload(feed_list_script).get('html')
    if not html:
        raise ValueError("feed list payload has no 'html' entry")

    # Get all feed list entries.
    return BeautifulSoup(html, 'html.parser').find_all(
        'p', {'node-type': 'feed_list_content'})


def main():
    """Print the publisher and the content of the first feed in t1.txt."""
    feed_list = load_feed_list()
    if not feed_list:
        print('No feeds found')
        return

    first_feed = feed_list[0]
    # <a> holds the publisher of the feed, <em> holds its content; either
    # may be missing, in which case an empty string is printed.
    title = first_feed.a.text if first_feed.a is not None else ''
    content = first_feed.em.text if first_feed.em is not None else ''

    # Single-argument print() produces the same output on Python 2 and 3.
    print('Title of first feed ---> %s' % title)
    print('Content of first feed ----> %s' % content)


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment