Skip to content

Instantly share code, notes, and snippets.

@zxyle
Created December 23, 2018 04:04
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save zxyle/0911fe81981afd0322a98e082a8d6758 to your computer and use it in GitHub Desktop.
Save zxyle/0911fe81981afd0322a98e082a8d6758 to your computer and use it in GitHub Desktop.
异步采集手机号归属地等信息
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Author: Zheng <me@zxyle.cn>
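# Crawler name: asynchronous mobile-number attribution (home location) scraper
# Home page URL: http://m.ip138.com/mobile.asp/
# Usage:
# Step0: Python 3.5+
# Step1: install dependencies: pip install aiohttp==3.5.0 parsel==1.5.1 motor==2.0.0
# Step2: create the unique index: db.phone.createIndex({"phone":1},{"unique":true})
#        (createIndex replaces the deprecated ensureIndex shell helper)
# Step3: adjust the START and END variables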
import asyncio
import time
import aiohttp
from motor import motor_asyncio
from parsel import Selector
from pymongo.errors import DuplicateKeyError
# Inclusive bounds of the numbers substituted into the lookup URL by main().
START = 1300000
END = 1399999

# MongoDB wiring: one Motor client, the "spider" database and its "phone"
# collection, which do_insert() writes to.
MONGO_URI = "mongodb://localhost:27017"
client = motor_asyncio.AsyncIOMotorClient(MONGO_URI)
db = client["spider"]
post = db["phone"]
# Request headers passed to aiohttp.ClientSession in main(); they mimic a
# desktop Chrome browser talking to m.ip138.com.
headers = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
    "Accept-Encoding": "gzip, deflate",
    "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
    "Cache-Control": "max-age=0",
    "Connection": "keep-alive",
    # NOTE(review): hard-coded cookie values; presumably captured from a
    # browser session and possibly stale - confirm they are still required.
    "Cookie": "pgv_pvi=1990615040; ASPSESSIONIDQCDBTCAQ=JGMDDLKDMGJMLMDNEPAAMHFK",
    "DNT": "1",
    "Host": "m.ip138.com",
    "Upgrade-Insecure-Requests": "1",
    # Built from adjacent string literals rather than a backslash
    # continuation inside the literal: the continuation glued the next
    # source line (including any leading whitespace) straight onto
    # "Gecko)", producing a malformed User-Agent. Each piece now ends with
    # exactly one separating space.
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/69.0.3497.81 Safari/537.36"
    ),
}
async def do_insert(document):
    """
    Insert one document into the ``post`` collection and report the outcome.

    Prints a "duplicate" message when the insert raises DuplicateKeyError
    and a "success" message otherwise; any other exception propagates.

    :param document: mapping to store; its "phone" entry is echoed in the
        printed message.
    :return: None
    """
    try:
        await post.insert_one(document)
    except DuplicateKeyError:
        # Insert rejected as a duplicate key.
        template = "重复:{}"
    else:
        template = "成功:{}"
    print(template.format(document.get("phone")))
async def fetch(session, url):
    """
    Issue a GET request for ``url`` and return the response body as text.

    :param session: object whose ``get(url)`` returns an async context
        manager yielding a response with an awaitable ``text()``
        (main() passes an aiohttp.ClientSession).
    :param url: address to request.
    :return: whatever ``await response.text()`` yields.
    """
    async with session.get(url) as response:
        body = await response.text()
    # The response context has been exited by the time the body is returned.
    return body
async def main():
    """
    Walk every number from START to END (inclusive), look it up and store it.

    For each number the lookup page is fetched, four table cells are
    extracted with XPath, and the resulting record is handed to
    do_insert(). Requests are awaited one at a time, in order.

    :return: None
    """
    # (record key, XPath) pairs, in the order the keys appear in the record.
    field_xpaths = (
        ("area", '//table/tr[2]/td[2]/span/text()'),       # number's home area
        ("card_type", '//table/tr[3]/td[2]/span/text()'),  # card type
        ("area_code", '//table/tr[4]/td[2]/span/text()'),  # area code
        ("post_code", '//table/tr[5]/td[2]/span/text()'),  # postal code
    )
    async with aiohttp.ClientSession(headers=headers) as session:
        for number in range(START, END + 1):
            page = await fetch(
                session,
                "http://m.ip138.com/mobile.asp?mobile={}".format(number),
            )
            selector = Selector(page)
            record = {"phone": number}
            for key, xpath in field_xpaths:
                # extract_first() gives None when the XPath matches nothing.
                record[key] = selector.xpath(xpath).extract_first()
            await do_insert(record)
# Script entry: runs at module level (there is no __main__ guard), so
# importing this file starts the crawl.
start_time = time.time()
# Run main() to completion on the event loop returned by get_event_loop().
# NOTE(review): the Motor client above is created at import time and may be
# tied to this same loop - confirm before switching to asyncio.run().
loop = asyncio.get_event_loop()
loop.run_until_complete(main())
# Elapsed wall-clock time for the whole run, in seconds.
print(time.time() - start_time)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment