Skip to content

Instantly share code, notes, and snippets.

@cjoushua
Last active August 3, 2016 06:52
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save cjoushua/a6cc889d8e728c80297322479bbd53d4 to your computer and use it in GitHub Desktop.
Save cjoushua/a6cc889d8e728c80297322479bbd53d4 to your computer and use it in GitHub Desktop.
# -*- coding: utf-8 -*-
"""
Created on Wed Jun 08 08:43:41 2016
@author: Joushua
"""
#Mobile01 Crawler
#在此程式匯入requests套件
import requests
#import beautifulsoup4
from bs4 import BeautifulSoup
import urllib2, urllib, json,time
from elasticsearch import Elasticsearch
# Mobile01 crawler script body.
#
# Walks the topic-list pages of Mobile01 forum 291, opens every topic linked
# from each list page, and indexes every post body found on the topic page
# into the local Elasticsearch index 'fintech' (doc_type 'main').
#
# NOTE(review): the original paste had lost all indentation; the nesting below
# is reconstructed from the data flow (each loop consumes names bound by the
# statements directly above it) -- confirm against the author's original.

# Elasticsearch client that receives every scraped post.
es = Elasticsearch([{'host': 'localhost', 'port': 9200}])

# Running document id: each indexed post gets the next integer, starting at 1.
i = 1

# Topic-list pages 1..2399 (range() excludes the upper bound).
for Page in range(1, 2400):
    list_res = requests.get('http://www.mobile01.com/topiclist.php?f=291&p=' + str(Page))
    # Name the parser explicitly so the parse tree does not depend on which
    # optional parsers happen to be installed (and to avoid bs4's warning).
    list_soup = BeautifulSoup(list_res.text, 'html.parser')
    # Single-argument parenthesised print: same output under Python 2's print
    # statement, and also valid as the Python 3 print function.
    print('第' + str(Page) + '頁')

    # Visit every topic listed on this page.
    for ListTopic in list_soup.findAll("span", {'class': 'subject-text'}):
        # Skip subject spans without a usable link instead of aborting the
        # whole crawl with AttributeError/TypeError.
        link = ListTopic.a
        href = link.get('href') if link is not None else None
        if not href:
            continue

        ListUrl = 'http://www.mobile01.com/' + href  # absolute topic URL
        print(ListTopic.text)
        print(ListUrl)

        # Separate names for the topic page: the original rebound `res` and
        # `soup` here while the outer loop was still walking the list page.
        topic_res = requests.get(ListUrl)
        topic_soup = BeautifulSoup(topic_res.text, 'html.parser')

        # One Elasticsearch document per post body found on the topic page.
        for Content in topic_soup.findAll("div", {'class': 'single-post-content'}):
            print('id = ' + str(i))
            print(Content.text)
            # Push the post into Elasticsearch. The text values are passed to
            # json.dumps directly: it already escapes non-ASCII characters, so
            # pre-encoding to UTF-8 bytes is unnecessary (and bytes are not
            # JSON-serialisable on Python 3).
            es.index(
                index='fintech',
                doc_type='main',
                id=i,
                body=json.dumps({
                    'Topic': ListTopic.text,
                    'url': ListUrl,
                    'Content': Content.text,
                }),
            )
            i = i + 1

        # Throttle: pause after each topic page.
        # NOTE(review): the sleep's original nesting level is not recoverable
        # from the paste; once per topic request is assumed -- confirm.
        time.sleep(2.5)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment