Created
December 23, 2018 04:04
-
-
Save zxyle/0911fe81981afd0322a98e082a8d6758 to your computer and use it in GitHub Desktop.
异步采集手机号归属地等信息
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
# -*- coding: utf-8 -*- | |
# Author: Zheng <me@zxyle.cn> | |
# 采集网站名称: 异步手机号归属地采集 | |
# 主页url: http://m.ip138.com/mobile.asp/ | |
# 使用步骤: | |
# Step0: python 3.5+ | |
# Step1: 安装依赖 pip install aiohttp==3.5.0 parsel==1.5.1 motor==2.0.0 | |
# Step2: 创建唯一索引 db.phone.createIndex({"phone": 1}, {"unique": true})
# Step3: 修改START、END变量 | |
import asyncio | |
import time | |
import aiohttp | |
from motor import motor_asyncio | |
from parsel import Selector | |
from pymongo.errors import DuplicateKeyError | |
# --- MongoDB connection -----------------------------------------------------
# Connection string of the MongoDB server that receives the scraped records.
MONGO_URI = 'mongodb://localhost:27017'
client = motor_asyncio.AsyncIOMotorClient(MONGO_URI)
# Database "spider", collection "phone" (attribute access is equivalent to
# the subscript form client["spider"]["phone"]).
db = client.spider
post = db.phone

# --- Crawl range ------------------------------------------------------------
# First and last value (both inclusive) substituted into the lookup URL.
START = 1300000
END = 1399999
# Default HTTP request headers sent with every lookup request; they mimic a
# desktop Chrome browser visiting m.ip138.com.
headers = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
    "Accept-Encoding": "gzip, deflate",
    "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
    "Cache-Control": "max-age=0",
    "Connection": "keep-alive",
    # NOTE(review): hard-coded session cookie, presumably captured from a
    # browser session and possibly stale -- confirm the site still needs it.
    "Cookie": "pgv_pvi=1990615040; ASPSESSIONIDQCDBTCAQ=JGMDDLKDMGJMLMDNEPAAMHFK",
    "DNT": "1",
    "Host": "m.ip138.com",
    "Upgrade-Insecure-Requests": "1",
    # Built with implicit literal concatenation instead of a backslash
    # continuation inside the string, so exactly one space separates
    # "Gecko)" from "Chrome/..." regardless of source indentation.
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/69.0.3497.81 Safari/537.36"
    ),
}
async def do_insert(document):
    """Insert one document into the ``post`` collection and print the outcome.

    :param document: mapping to store; its ``"phone"`` value is echoed in
        the printed status line.
    :return: None
    """
    try:
        await post.insert_one(document)
    except DuplicateKeyError:
        # The insert was rejected as a duplicate key; report and carry on.
        template = "重复:{}"
    else:
        template = "成功:{}"
    print(template.format(document.get("phone")))
async def fetch(session, url):
    """Issue a GET request for *url* and return the response body as text.

    :param session: object whose ``get(url)`` returns an async context
        manager yielding a response with an awaitable ``text()`` method
        (the caller in this file passes an ``aiohttp.ClientSession``).
    :param url: address to request.
    :return: the value produced by awaiting ``response.text()``.
    """
    async with session.get(url) as resp:
        body = await resp.text()
    return body
async def main():
    """Crawl every number from START through END and store one record each.

    For each integer ``i`` in the inclusive range ``START..END`` the lookup
    page ``http://m.ip138.com/mobile.asp?mobile=<i>`` is fetched, four table
    cells are read from the HTML, and the resulting document is handed to
    ``do_insert``.

    Pages are processed one at a time: each page is fetched, parsed and
    stored before the next one is requested.

    A request that fails with an aiohttp client error or a timeout is
    reported and skipped, so one failure does not abort the rest of the
    crawl.

    :return: None
    """
    # (document key, XPath of the table cell holding its value); built once
    # because the expressions do not depend on the number being looked up.
    field_xpaths = (
        ("area", '//table/tr[2]/td[2]/span/text()'),       # home location
        ("card_type", '//table/tr[3]/td[2]/span/text()'),  # card type
        ("area_code", '//table/tr[4]/td[2]/span/text()'),  # area code
        ("post_code", '//table/tr[5]/td[2]/span/text()'),  # postal code
    )
    async with aiohttp.ClientSession(headers=headers) as session:
        for i in range(START, END + 1):
            url = "http://m.ip138.com/mobile.asp?mobile={}".format(i)
            try:
                html = await fetch(session, url)
            except (aiohttp.ClientError, asyncio.TimeoutError) as exc:
                # Report the failure and move on to the next number.
                print("失败:{} ({!r})".format(i, exc))
                continue
            sel = Selector(html)
            data = {"phone": i}
            for field, xpath in field_xpaths:
                # extract_first() yields None when the cell is missing.
                data[field] = sel.xpath(xpath).extract_first()
            await do_insert(data)
if __name__ == "__main__":
    # Run the crawl only when this file is executed as a script, so that
    # importing the module does not start network and database traffic.
    start_time = time.time()
    # NOTE(review): the Motor client is created at import time and presumably
    # binds to the default event loop; keep get_event_loop() here (rather
    # than a fresh loop / asyncio.run) unless that is confirmed otherwise.
    loop = asyncio.get_event_loop()
    loop.run_until_complete(main())
    # Total wall-clock duration of the crawl, in seconds.
    print(time.time() - start_time)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment