A crawler that scrapes Douban diary (日志) listings; for each entry it records the title, URL, comment count, and publication date.
#encoding:utf-8
# Tkinter front-end for a small Douban diary crawler: paste the first page of a
# diary listing, click 提取, and the title/date/comment-count/URL of every entry
# is written to info.txt next to the script.
from Tkinter import *
import urllib2
from lxml import html
import os, sys

class App:
    def __init__(self, master):
        self.label = Label(master, text='输入日记首页地址:')
        self.label.pack(side='left', padx=5)
        self.button1 = Button(master, text='清除', command=self.delete, bd=3)
        self.button1.pack(side='right', padx=5)
        self.button2 = Button(master, text='提取', command=self.info, bd=3)
        self.button2.pack(side='right', padx=5)
        self.e = StringVar()
        self.entry = Entry(master, textvariable=self.e, width=350, bd=3)
        self.entry.pack(side='left', padx=5)

    def delete(self):
        self.entry.delete(0, last='end')

    def get_dir(self):  # directory the output file is saved in
        path = sys.path[0]
        if os.path.isdir(path):
            return path
        elif os.path.isfile(path):
            return os.path.dirname(path)

    def info(self):  # scrape the diary information and write it out
        urls = self.e.get()  # URL typed into the entry box
        info = self.fetch(urls)  # records from every listing page the URL leads to
        dir_ = self.get_dir()  # directory to save the output in
        info_txt = open(os.path.join(dir_, 'info.txt'), 'w')  # write the results to a file
        for i in info:
            info_txt.write(i + '\n')
        info_txt.close()

    def fetch(self, url):  # the XPath expressions here mirror the page's HTML structure
        headers = {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}
        info = []
        reguler = {'re': "http://exslt.org/regular-expressions"}
        while True:
            req = urllib2.Request(url, headers=headers)
            html_source = urllib2.urlopen(req).read()
            parser = html.fromstring(html_source.decode('utf-8', 'ignore'))
            # each diary entry sits in a <div id="note-NNN"> block
            diary = parser.xpath('body//div[re:test(@id,"note-\d+")]', namespaces=reguler)
            for d in diary:
                title_info = d.xpath('.//a[@title and re:test(@href,".*\d+")]', namespaces=reguler)[0]
                title = title_info.attrib['title']
                url = title_info.attrib['href']
                date = d.xpath('.//span[@class="pl"]')[0].text
                try:
                    comments = d.xpath('.//a[re:test(@href,".*comments")]', namespaces=reguler)[0].text[1:-3]
                except IndexError:
                    comments = '0'  # no comment link means no comments yet
                info.append([title, date, comments, url])
            # walk forward from the last entry until the paginator block is reached
            for i in range(10):
                if d.tag == "div" and "class" in d.attrib and d.attrib["class"] == "paginator":
                    break
                d = d.getnext()
            try:
                # follow the "next page" link; when there is none, we are done
                url = d.xpath('.//link[@rel="next"]')[0].attrib['href']
            except IndexError:
                break
        # flatten each record into a tab-separated, UTF-8 encoded line
        data_str = []
        for i in info:
            data = ''.join([d.encode('utf-8', 'ignore') + '\t' for d in i])
            data_str.append(data)
        return data_str

root = Tk()
root.title('fetch diary')
root.geometry('475x300+0+0')
app = App(root)
root.mainloop()
@tinylamb (Author):

How should the output path be obtained so that it works across platforms?
Answer: take the path of the currently running script and save the output in that directory.
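Roughly the idea behind get_dir() above, written as a standalone sketch; output_dir is an illustrative name, not something from the gist:

import os
import sys

def output_dir():
    path = sys.path[0]               # directory of the script that was launched
    if os.path.isfile(path):         # e.g. when the code runs from a zip archive
        path = os.path.dirname(path)
    return os.path.abspath(path)

print(output_dir())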

@tinylamb (Author):

On macOS the path looks like /x/x/info.txt while on Windows it looks like x:\x\info.txt; how can the code handle both?
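A minimal sketch of the usual standard-library answer: os.path.join inserts the separator appropriate for the current platform, so the same line works on macOS and Windows without any branching.

import os
import sys

# Joins with "/" on macOS/Linux and "\" on Windows.
out_path = os.path.join(sys.path[0], 'info.txt')
print(out_path)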

@tinylamb (Author):

Key takeaway: how to use XPath.
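A self-contained sketch of that XPath usage, in particular registering the EXSLT regular-expressions namespace so re:test() can appear inside an expression. The HTML snippet below is made up for illustration and is not Douban's real markup:

from lxml import html

snippet = '''
<html><body>
  <div id="note-101"><a title="first note" href="/note/101/">first</a></div>
  <div id="note-202"><a title="second note" href="/note/202/">second</a></div>
</body></html>
'''

ns = {'re': 'http://exslt.org/regular-expressions'}   # EXSLT regex namespace
tree = html.fromstring(snippet)
# re:test(@id, "note-\d+") keeps only the divs whose id matches the pattern
for div in tree.xpath('//div[re:test(@id, "note-\d+")]', namespaces=ns):
    link = div.xpath('.//a[@title]')[0]
    print(link.attrib['title'] + '\t' + link.attrib['href'])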
