Created
April 4, 2017 09:09
-
-
Save jintao-zero/f395c8a9699ccae8ea427a0dfa708109 to your computer and use it in GitHub Desktop.
将马蜂窝旅游网站中关于某个旅游景点的旅游攻略下载到本地
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# -*- coding: utf-8 -*- | |
import sys | |
import urllib | |
import urllib2 | |
from lxml import etree | |
import time | |
import random | |
def downloadpage(weibourl):
    """Fetch *weibourl* and return the raw response body, or None on failure.

    Up to five attempts are made. Every attempt is preceded by a random
    pause of 1-3 seconds and uses a 12 second timeout. A failed attempt
    prints a message and the next attempt follows.

    Args:
        weibourl: URL of the page to download.

    Returns:
        The body as returned by ``read()`` on the response, or None when
        all five attempts failed.
    """
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.87 Safari/537.36'
    }
    req = urllib2.Request(url=weibourl, headers=header)
    for _ in range(5):
        # Pause before every request, the first one included.
        time.sleep(random.randint(1, 3))
        try:
            response = urllib2.urlopen(req, timeout=12)
            try:
                return response.read()
            finally:
                # Release the connection whether or not read() succeeded.
                response.close()
        except Exception:
            # Best-effort retry: any request error leads to another attempt,
            # but KeyboardInterrupt/SystemExit are no longer swallowed, so
            # the crawl can be interrupted.
            print('down load page fail: %s' % weibourl)
    return None
class OneTravelPage:
    """One travel article page; parsePage() fills ``text`` with its body text."""

    # XPath queries for the article body container, tried in this order
    # until one of them matches.
    _CONTENT_XPATHS = (
        '//div[@class="a_con_text cont"]',
        '//div[@class="va_con _j_master_content"]',
        '//div[@class="post_con"]',
    )

    def __init__(self, url):
        self.url = url   # address of the article page
        self.text = ''   # extracted text (UTF-8 encoded); stays empty on failure

    def parsePage(self):
        """Download the page and append its cleaned body text to ``self.text``.

        The first element matched by one of ``_CONTENT_XPATHS`` supplies the
        text. Each line is stripped of surrounding spaces, tabs and line
        breaks, empty lines are dropped, and the remaining lines are
        appended, each terminated by a newline, as UTF-8.

        Returns:
            None. When the download fails or no body container is found,
            ``self.text`` is left unchanged, so callers can test it for
            emptiness.
        """
        data = downloadpage(self.url)
        if data is None:
            print('%s downloadpage fail' % self.url)
            # Nothing to parse; handing None to the parser would raise.
            return

        html = etree.HTML(data)
        if html is None:
            # Defensive: assumes the parser may yield no root for an empty
            # document -- TODO confirm against the lxml version in use.
            return

        post = []
        for query in self._CONTENT_XPATHS:
            post = html.xpath(query)
            if post:
                break
        if not post:
            return

        raw = post[0].xpath('string(.)')
        stripped = (line.strip(' \t\r\n') for line in raw.splitlines())
        kept = [line for line in stripped if line]
        if kept:
            # Join once and encode once instead of growing the string per line.
            body = u'\n'.join(kept) + u'\n'
            self.text = self.text + body.encode('utf-8')
class MaSearchResultPage:
    """One page of Mafengwo search results."""

    def __init__(self, url):
        self.url = url  # address of the search-result page

    def parseResultPage(self):
        """Download the result page and save every linked article to disk.

        The raw page is dumped to ``mafengwo.html``. For each result link,
        the article is fetched through ``OneTravelPage`` (up to five tries,
        3 seconds apart) and its text is written to ``mafengwo-fzm/<title>``,
        where the title has spaces and slashes removed. A 3 second pause
        follows every result.

        Raises:
            RuntimeError: if the result page itself could not be downloaded.
        """
        import os  # local import: only needed to prepare the output directory

        data = downloadpage(self.url)
        if data is None:
            # Check before touching the dump file; an explicit raise also
            # survives ``python -O``, unlike an assert.
            raise RuntimeError('download failed: %s' % self.url)
        with open('mafengwo.html', 'w') as dump:
            dump.write(data)

        html = etree.HTML(data)
        aTag = html.xpath('//div[@class="att-list"]/ul/li/div/div/h3/a')
        print(len(aTag))

        # Articles are written below this directory; create it on first use
        # so the first write does not fail on a fresh checkout.
        if aTag and not os.path.isdir('mafengwo-fzm'):
            os.makedirs('mafengwo-fzm')

        for a in aTag:
            href = a.attrib['href']
            header = a.xpath('string(.)')
            title_utf8 = header.encode('utf-8')
            # One log line per result: link, then the title with and
            # without spaces.
            print(' '.join((href.encode('utf-8'), 'header', title_utf8,
                            title_utf8.replace(' ', ''))))

            for _ in range(5):
                travelPage = OneTravelPage(href)
                travelPage.parsePage()
                if travelPage.text:
                    filename = 'mafengwo-fzm/' + header.replace(' ', '').replace('/', '').strip()
                    with open(filename, 'w') as f:
                        f.write(travelPage.text)
                    break
                # Empty text means the fetch or extraction failed; back off
                # and try again.
                time.sleep(3)
                print('sleep 3s')
            time.sleep(3)
if __name__ == '__main__':
    # To debug a single article instead of crawling, build
    # OneTravelPage('http://www.mafengwo.cn/i/3395834.html'), call
    # parsePage() and print its .text.

    # Search URL with the page number left off the end; the q parameter is a
    # percent-encoded UTF-8 search term.
    urlPrefix = 'http://www.mafengwo.cn/group/s.php?q=%E5%A4%AB%E5%AD%90%E5%BA%99&t=info&kt=1&p='

    # Walk result pages 1 through 50, pausing 1-10 seconds after each one.
    page_no = 1
    while page_no <= 50:
        url = ''.join((urlPrefix, str(page_no)))
        print(' '.join(('search page', url)))
        MaSearchResultPage(url).parseResultPage()
        time.sleep(random.randint(1, 10))
        page_no += 1
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment