Skip to content

Instantly share code, notes, and snippets.

@ntuaha
Created December 30, 2014 14:54
Show Gist options
  • Save ntuaha/9defe6fdb3041517d68d to your computer and use it in GitHub Desktop.
Save ntuaha/9defe6fdb3041517d68d to your computer and use it in GitHub Desktop.
抓連結的爬蟲
# -*- coding: utf-8 -*-
import re
# Work around unicode/str ASCII encoding problems (see the sys.setdefaultencoding call below)
import sys
import os
import psycopg2
import cookielib, urllib2,urllib
from lxml import html,etree
import StringIO
reload(sys)
sys.setdefaultencoding('utf8')
if __name__ == "__main__":
    # Script entry point: fetch one page, parse it, and print the text of a
    # single link selected by a fixed XPath.
    print("GG")

    # Download the page.
    url = 'http://news.cnyes.com/rollnews/list.shtml'
    response = urllib2.build_opener().open(url)
    try:
        the_page = response.read()
    finally:
        # Close the connection even if the read fails part-way through.
        response.close()

    # Turn the raw HTML into an element tree that can be queried with XPath.
    parser = etree.HTMLParser()
    root = etree.parse(StringIO.StringIO(the_page), parser)

    # Grab the link at the specified position in the document.
    link_xpath = '//*[@id="container"]/div[4]/div[1]/div[2]/ul[2]/li[1]/a'
    links = root.xpath(link_xpath)
    if not links:
        # The XPath is purely positional, so an empty result means the
        # document no longer matches it; report that instead of failing with
        # an unexplained IndexError.
        sys.exit("no link found at %s" % link_xpath)
    print(links[0].text)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment