#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Author: Tsukasa
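
# Scrapes second-hand housing listings from esf.<city>.fang.com (房天下):
# builds the listing-page URLs, collects the detail-page links, parses each
# listing into a dict, then appends the rows to 房天下.csv and upserts them
# into MongoDB.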

import json
import time
from multiprocessing import Pool

import pymongo
import requests
import pandas as pd
from bs4 import BeautifulSoup
from requests.exceptions import RequestException
from fake_useragent import UserAgent

# MongoDB connection settings
Mongo_Url = 'localhost'
Mongo_DB = 'fangtianxia'
Mongo_TABLE = 'fangtianxia_fs'

ua = UserAgent()
headers1 = {'User-Agent': ua.random}  # random User-Agent for this run

client = pymongo.MongoClient(Mongo_Url)
db = client[Mongo_DB]

def get_url(user_in_city, user_in_nub):  # build the listing-page URLs for the chosen city and page count
    url_home = 'http://esf.' + user_in_city + '.fang.com/house/i3{}/'
    for url_next in range(1, int(user_in_nub)):
        yield url_home.format(url_next)
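
# For example, get_url('gz', 4) yields the first three listing pages:
#   http://esf.gz.fang.com/house/i31/
#   http://esf.gz.fang.com/house/i32/
#   http://esf.gz.fang.com/house/i33/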

def open_url(url, user_in_city):  # collect the detail-page links from one listing page
    try:
        res = requests.get(url, headers=headers1)
        if res.status_code == 200:
            soup = BeautifulSoup(res.text, 'html5lib')
            url_start = 'http://esf.' + user_in_city + '.fang.com'
            for title in soup.select('.title'):  # each .title element links to one listing
                url_end = title.select('a')[0]['href']
                yield url_start + url_end
    except RequestException:
        print('open_url failed, check the request')

def one_page(house_url):  # parse one listing detail page into a dict
    try:
        res = requests.get(house_url, headers=headers1)
        if res.status_code == 200:
            soup = BeautifulSoup(res.text, 'html5lib')
            info = {}
            info['网页'] = house_url
            info['标题'] = soup.select('h1')[0].text.strip()       # listing title
            info['总价'] = soup.select('.red20b')[0].text + '万'    # total price (10k CNY)
            info['联系电话'] = soup.select('#mobilecode')[0].text   # contact phone number
            # now_time = time.strftime('%Y-%m-%d\t%H:%M', time.localtime(time.time()))
            # info['Obj更新时间'] = now_time
            for sl in soup.select('span'):  # publication time
                if '发布时间' in sl.text.lstrip('<span>'):
                    key, value = sl.text.strip().rstrip('(').split(':')
                    info[key] = value + '*' + soup.select('#Time')[0].text
            for dd in soup.select('dd'):  # remaining detail fields, one "key:value" per <dd>
                if ':' in dd.text.strip():
                    key, value = dd.text.strip().split(':')
                    info[key] = value
            print(info)
            return info
    except RequestException:
        print('one_page failed, check the request')
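
# A parsed listing comes back roughly as (illustrative values):
#   {'网页': 'http://esf.gz.fang.com/...', '标题': '...', '总价': '120万',
#    '联系电话': '...', '发布时间': '...*...', plus one entry per "key:value" <dd> field}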

def writer_to_text(text):  # append one record per line as JSON
    with open('房天下.text', 'a', encoding='utf-8') as f:
        f.write(json.dumps(text, ensure_ascii=False) + '\n')

def pandas_to_xlsx(pd_list):  # dump the collected records to an Excel file
    pd_look = pd.DataFrame(pd_list)
    pd_look.to_excel('房天下.xlsx', sheet_name='房天下')

def pandas_to_csv(pd_list):  # append the collected records to a CSV file
    pd_look = pd.DataFrame(pd_list)
    pd_look.to_csv('房天下.csv', mode='a+', header=False)

def save_to_MongoDB(one_page):  # insert one record into MongoDB
    if db[Mongo_TABLE].insert_one(one_page):
        print('saved to MongoDB OK!', one_page)
        return True
    return False

def update_to_MongoDB(one_page):  # upsert one record into MongoDB, keyed on the listing URL
    if db[Mongo_TABLE].update_one({'网页': one_page['网页']}, {'$set': one_page}, upsert=True):
        print('saved to MongoDB OK!')
        return True
    return False
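
# Because update_to_MongoDB upserts on the '网页' (URL) field, re-running the
# crawl refreshes existing documents instead of inserting duplicates.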

def main(url):
    data = []
    save = one_page(url)
    if save is None:  # skip pages that failed to download or parse
        return
    data.append(save)
    pandas_to_csv(data)
    update_to_MongoDB(save)
    # writer_to_text(one_page(url))

if __name__ == '__main__':
    user_in_city = input('Enter the city abbreviation, e.g. zs for 中山, gz for 广州\n'
                         '(an unknown abbreviation will not work): ')
    user_in_nub = 1 + int(input('Enter the number of pages to crawl: '))
    pool = Pool()
    for url in get_url(user_in_city, user_in_nub):
        pool.map(main, [url_open for url_open in open_url(url, user_in_city)])

'''
The stinky little piggy in the room is snoring so loudly (゚ー゚)
'''
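
# Usage sketch, assuming the script is saved as fangtianxia.py (hypothetical name),
# the dependencies (requests, bs4, html5lib, pandas, pymongo, fake_useragent) are
# installed, and a MongoDB instance is listening on localhost:
#   $ python3 fangtianxia.py
#   Enter the city abbreviation, e.g. zs for 中山, gz for 广州: gz
#   Enter the number of pages to crawl: 3
# Each parsed listing is printed, appended to 房天下.csv, and upserted into the
# fangtianxia.fangtianxia_fs collection.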