Last active
August 3, 2016 06:52
-
-
Save cjoushua/a6cc889d8e728c80297322479bbd53d4 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*-
"""
Mobile01 crawler.

Created on Wed Jun 08 08:43:41 2016
@author: Joushua

Walks the topic list pages of Mobile01 forum ``f=291``, opens every topic
linked from them, and indexes each post of every topic into the ``fintech``
index of the Elasticsearch node on localhost:9200.
"""
# Third-party HTTP client used for every page download.
import requests
# BeautifulSoup (beautifulsoup4) parses the downloaded HTML.
from bs4 import BeautifulSoup
# NOTE(review): urllib2 and urllib are not used by this script; they are kept
# only so the file's import set is unchanged. urllib2 exists on Python 2 only.
import urllib2
import urllib
import json
import time
from elasticsearch import Elasticsearch

BASE_URL = 'http://www.mobile01.com/'
LIST_URL = BASE_URL + 'topiclist.php?f=291&p='
# List pages FIRST_PAGE .. LAST_PAGE_EXCLUSIVE - 1 are crawled.
# NOTE(review): 2400 as an exclusive bound means page 2400 itself is never
# fetched -- confirm that is intended.
FIRST_PAGE = 1
LAST_PAGE_EXCLUSIVE = 2400
# Seconds to wait for the server before giving up on a request.
REQUEST_TIMEOUT = 30
# Seconds to pause between topic requests so the site is not hammered.
REQUEST_DELAY = 2.5

es = Elasticsearch([{'host': 'localhost', 'port': 9200}])


def _fetch_soup(url):
    """Download *url* and return the parsed page, or None if the request fails.

    A timeout, connection error or HTTP error status is reported on stdout
    and turned into ``None`` so that one bad page cannot abort the crawl.
    """
    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
    except requests.RequestException as exc:
        print('request failed: %s (%r)' % (url, exc))
        return None
    # Name the parser explicitly so results do not depend on which optional
    # parsers happen to be installed.
    return BeautifulSoup(response.text, 'html.parser')


# Elasticsearch document ids are assigned sequentially, starting at 1.
doc_id = 1
for page in range(FIRST_PAGE, LAST_PAGE_EXCLUSIVE):
    print('第' + str(page) + '頁')
    list_soup = _fetch_soup(LIST_URL + str(page))
    if list_soup is None:
        continue

    # Collect the topic URLs shown on this list page.
    for topic in list_soup.find_all('span', {'class': 'subject-text'}):
        link = topic.a
        href = link.get('href') if link is not None else None
        if not href:
            # A subject span without a usable link cannot be followed.
            continue
        topic_url = BASE_URL + href
        print(topic.text)
        print(topic_url)

        topic_soup = _fetch_soup(topic_url)
        # NOTE(review): the source this was recovered from had lost its
        # indentation, so the original nesting of this pause is ambiguous;
        # throttling once per topic request is assumed -- confirm.
        time.sleep(REQUEST_DELAY)
        if topic_soup is None:
            continue

        for post in topic_soup.find_all('div', {'class': 'single-post-content'}):
            print('id = ' + str(doc_id))
            print(post.text)
            # Push the post into Elasticsearch. json.dumps accepts the text
            # values directly, so no manual UTF-8 encoding is needed.
            es.index(
                index='fintech',
                doc_type='main',
                id=doc_id,
                body=json.dumps({
                    'Topic': topic.text,
                    'url': topic_url,
                    'Content': post.text,
                }),
            )
            doc_id = doc_id + 1
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment