Skip to content

Instantly share code, notes, and snippets.

@jintao-zero
Created April 4, 2017 09:09
Show Gist options
  • Save jintao-zero/f395c8a9699ccae8ea427a0dfa708109 to your computer and use it in GitHub Desktop.
Save jintao-zero/f395c8a9699ccae8ea427a0dfa708109 to your computer and use it in GitHub Desktop.
将马蜂窝旅游网站中关于某个旅游景点的旅游攻略下载到本地
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import os
import random
import sys
import time
import urllib
import urllib2

from lxml import etree
def downloadpage(weibourl):
    """Download ``weibourl`` and return the raw response body.

    The request carries a desktop-browser User-Agent header.  Up to five
    attempts are made; every attempt is preceded by a random pause of 1-3
    seconds and uses a 12 second timeout.

    Args:
        weibourl: URL of the page to fetch.

    Returns:
        The bytes read from the response, or None when all five attempts
        failed.
    """
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.87 Safari/537.36'
    }
    req = urllib2.Request(url=weibourl, headers=header)
    data = None
    for _ in range(5):
        # Random pause before each attempt so requests are not fired
        # back-to-back.
        time.sleep(random.randint(1, 3))
        try:
            response = urllib2.urlopen(req, timeout=12)
            try:
                data = response.read()
            finally:
                # Always release the connection, even if read() fails.
                response.close()
            break
        except Exception:
            # Deliberately not a bare ``except``: KeyboardInterrupt and
            # SystemExit must still be able to stop the retry loop.
            data = None
            print('down load page fail: %s' % weibourl)
    return data
class OneTravelPage:
    """A single travel-note page and the text extracted from it.

    Instance fields:
      url  -- address of the page to download.
      text -- UTF-8 encoded text, one stripped non-empty line per row; it
              stays '' until parsePage() manages to extract something.
    """

    # XPath expressions for the element holding the article body, tried in
    # order; the first one that matches anything is used.
    _CONTENT_XPATHS = (
        '//div[@class="a_con_text cont"]',
        '//div[@class="va_con _j_master_content"]',
        '//div[@class="post_con"]',
    )

    def __init__(self, url):
        self.url = url
        self.text = ''

    @staticmethod
    def _clean_text(raw):
        """Return ``raw`` as UTF-8 with blank lines and edge whitespace removed.

        Every line is stripped of spaces, tabs, CR and LF; lines that end up
        empty are dropped and each remaining line is terminated with '\\n'.
        """
        stripped = (line.strip(' \t\r\n') for line in raw.splitlines())
        return u''.join(line + u'\n' for line in stripped if line).encode('utf-8')

    def parsePage(self):
        """Download the page and append its article text to ``self.text``.

        When the download fails, the document cannot be parsed, or none of
        the known content containers is present, ``self.text`` is left
        unchanged so callers can detect the failure by checking for an empty
        text and retry.
        """
        data = downloadpage(self.url)
        if not data:
            print('%s downloadpage fail' % self.url)
            # Nothing to parse; handing None to the parser would raise.
            return

        html = etree.HTML(data)
        if html is None:
            # NOTE(review): lxml is reported to return None for input with
            # no parseable content -- guard kept so .xpath() is never called
            # on None.
            return

        for path in self._CONTENT_XPATHS:
            post = html.xpath(path)
            if post:
                # string(.) concatenates all text nodes below the container.
                self.text += self._clean_text(post[0].xpath('string(.)'))
                return
class MaSearchResultPage:
    """One page of mafengwo search results.

    Instance fields:
      url -- address of the search result page.
    """

    # Directory that receives one file per downloaded article.
    _OUTPUT_DIR = 'mafengwo-fzm'
    # How often an article is fetched again when no text could be extracted.
    _MAX_ATTEMPTS = 5

    def __init__(self, url):
        self.url = url

    @staticmethod
    def _filename_for(title, href):
        """Return the output path for an article.

        The file name is the link title with spaces and slashes removed.  If
        nothing is left of the title, the last path segment of ``href`` is
        used instead so the result never names the directory itself.
        """
        name = title.replace(' ', '').replace('/', '').strip()
        if not name:
            name = href.rstrip('/').split('/')[-1]
        return os.path.join(MaSearchResultPage._OUTPUT_DIR, name)

    def parseResultPage(self):
        """Download the result page and save every linked article to disk.

        The raw result page is dumped to ``mafengwo.html``.  For each link
        found in the result list the article is fetched through
        OneTravelPage, retrying up to five times (3 seconds apart) while the
        extracted text is empty, and written to a file named after the link
        title inside ``mafengwo-fzm/``.  A 3 second pause follows each
        article.

        Raises:
            IOError: if the search result page could not be downloaded.
        """
        data = downloadpage(self.url)
        if not data:
            # Checked before anything is written: an explicit error instead
            # of an assert (stripped under -O) or a TypeError from write().
            raise IOError('failed to download search result page: %s' % self.url)

        # Keep a copy of the most recent result page for inspection.
        with open('mafengwo.html', 'w') as dump:
            dump.write(data)

        html = etree.HTML(data)
        if html is None:
            return
        links = html.xpath('//div[@class="att-list"]/ul/li/div/div/h3/a')
        print(len(links))

        if links and not os.path.isdir(self._OUTPUT_DIR):
            os.makedirs(self._OUTPUT_DIR)

        for a in links:
            href = a.get('href')
            if not href:
                # An anchor without a target cannot be downloaded.
                continue
            title = a.xpath('string(.)')
            title_utf8 = title.encode('utf-8')
            print('%s header %s %s' % (
                href.encode('utf-8'), title_utf8, title_utf8.replace(' ', '')))

            for _ in range(self._MAX_ATTEMPTS):
                travelPage = OneTravelPage(href)
                travelPage.parsePage()
                if travelPage.text:
                    with open(self._filename_for(title, href), 'w') as out:
                        out.write(travelPage.text)
                    break
                # Empty text means the download or extraction failed; wait
                # before trying the same article again.
                time.sleep(3)
                print('sleep 3s')
            time.sleep(3)
if __name__ == '__main__':
    # Script entry point: walk search result pages 1 through 50 for the
    # URL-encoded keyword in ``q=`` and process each one, pausing a random
    # 1-10 seconds after every page.
    #
    # To try the extractor on a single article instead, build
    # OneTravelPage("http://www.mafengwo.cn/i/3395834.html"), call
    # parsePage() on it and print its ``text``.
    search_base = 'http://www.mafengwo.cn/group/s.php?q=%E5%A4%AB%E5%AD%90%E5%BA%99&t=info&kt=1&p='
    page_no = 1
    while page_no <= 50:
        page_url = search_base + str(page_no)
        print(' '.join(('search page', page_url)))
        MaSearchResultPage(page_url).parseResultPage()
        time.sleep(random.randint(1, 10))
        page_no += 1
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment