Last active
December 28, 2015 18:29
-
-
Save tinylamb/7543881 to your computer and use it in GitHub Desktop.
抓取豆瓣日志信息的爬虫,信息包括 日志名称,URL,评论数,发布时间
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#encoding:utf-8 | |
from Tkinter import * | |
import urllib2 | |
from lxml import html | |
import os,sys | |
class App:
    """Tkinter GUI that scrapes a Douban diary listing and saves, for each
    entry, its title, publish date, comment count and URL into info.txt
    (tab-separated, one entry per line, UTF-8).

    Fixes over the original gist:
      * removed `print pagenumber` (undefined name -> NameError on page 1)
      * build the output path with os.path.join instead of a hard-coded
        '//' separator, so it works on Windows and POSIX alike
      * replaced deprecated `dict.has_key` with the `in` operator
      * close the output file even if a write fails
      * bail out gracefully when a page contains no diary entries
    """

    def __init__(self, master):
        # URL label + entry on the left, the two action buttons on the right.
        self.label = Label(master, text='输入日记首页地址:')
        self.label.pack(side='left', padx=5)
        self.button1 = Button(master, text='清除', command=self.delete, bd=3)
        self.button1.pack(side='right', padx=5)
        self.button2 = Button(master, text='提取', command=self.info, bd=3)
        self.button2.pack(side='right', padx=5)
        self.e = StringVar()
        self.entry = Entry(master, textvariable=self.e, width=350, bd=3)
        self.entry.pack(side='left', padx=5)

    def delete(self):
        """Clear the URL entry field."""
        self.entry.delete(0, last='end')

    def get_dir(self):
        """Return the directory to save output into.

        Uses sys.path[0] (the directory of the running script), which is
        already cross-platform; falls back to its dirname if it happens to
        be a file (e.g. when running from a zipped archive).
        """
        path = sys.path[0]
        if os.path.isdir(path):
            return path
        elif os.path.isfile(path):
            return os.path.dirname(path)

    def info(self):
        """Fetch diary info for the URL in the entry box and write it to
        info.txt next to the script."""
        urls = self.e.get()  # listing first-page URL typed by the user
        info = self.fetch(urls)
        dir_ = self.get_dir()
        # os.path.join is portable; the original's dir_+'//info.txt'
        # hard-coded a POSIX-style separator.
        info_txt = open(os.path.join(dir_, 'info.txt'), 'w')
        try:
            for i in info:
                info_txt.write(i + '\n')
        finally:
            info_txt.close()

    def fetch(self, url):
        """Crawl the diary listing starting at *url*, following rel="next"
        links until the last page, and return a list of tab-separated
        UTF-8 strings: title, date, comment count, entry URL."""
        headers = {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}
        info = []
        # Register the EXSLT regular-expressions namespace for re:test().
        reguler = {'re': "http://exslt.org/regular-expressions"}
        while True:
            req = urllib2.Request(url, headers=headers)
            html_source = urllib2.urlopen(req).read()
            parser = html.fromstring(html_source.decode('utf-8', 'ignore'))
            # Each diary entry lives in a <div id="note-NNN"> container.
            diary = parser.xpath('body//div[re:test(@id,"note-\d+")]', namespaces=reguler)
            if not diary:
                # No entries on this page: nothing to scrape and no anchor
                # node to walk from (the original crashed with NameError).
                break
            for d in diary:
                title_info = d.xpath('.//a[@title and re:test(@href,".*\d+")]', namespaces=reguler)[0]
                title = title_info.attrib['title']
                url = title_info.attrib['href']
                date = d.xpath('.//span[@class="pl"]')[0].text
                try:
                    # Strip the surrounding decoration from the comment
                    # link text, e.g. '(12回应)' -> '12'.
                    comments = d.xpath('.//a[re:test(@href,".*comments")]', namespaces=reguler)[0].text[1:-3]
                except IndexError:
                    comments = '0'  # no comment link means zero comments
                info.append([title, date, comments, url])
            # NOTE(review): the original had `print pagenumber` here, but
            # `pagenumber` was never defined — removed the stray debug print.
            # Walk forward (at most 10 siblings) from the last diary div to
            # find the paginator block holding the "next page" link.
            for i in range(10):
                if d.tag == "div" and "class" in d.attrib and d.attrib["class"] == "paginator":
                    break
                d = d.getnext()
            try:
                url = d.xpath('.//link[@rel="next"]')[0].attrib['href']
            except IndexError:
                break  # no next link: last page reached
        # Serialize each record as tab-terminated UTF-8 fields.
        data_str = []
        for i in info:
            data = ''.join([field.encode('utf-8', 'ignore') + '\t' for field in i])
            data_str.append(data)
        return data_str
# Build the main window, attach the scraper UI, and enter the event loop.
main_window = Tk()
main_window.geometry('475x300+0+0')
main_window.title('fetch diary')
application = App(main_window)
main_window.mainloop()
mac 路径是 /x/x/info.txt,Windows 路径是 x:\x\info.txt,如何做到跨平台?答:不要手写分隔符,用 os.path.join(dir_, 'info.txt') 拼接路径,由标准库按当前平台选择分隔符。
key: Xpath的使用
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
输出路径的获取,如何做到跨平台?
答:用 sys.path[0] 获取当前执行脚本所在目录(本身即跨平台),再用 os.path.join 把目录和文件名拼接成完整路径,将输出保存在该目录下。