Created
September 25, 2019 15:09
-
-
Save Gowee/0a19c54e68b5f9493daa7d9eeb555dfe to your computer and use it in GitHub Desktop.
A Python script to validate HTTP proxies with aiohttp, a side project of https://github.com/Gowee/NEMUserCrawler
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
import asyncio | |
import aiohttp | |
import re | |
import sys | |
import time | |
from collections import namedtuple | |
from functools import wraps | |
# program-independent functions start | |
def coroutine(func):
    """Decorator that returns generators already advanced to their first `yield`.

    The wrapper calls `func` with the given arguments, steps the resulting
    generator once with `next()` and returns it, so the caller can `send()`
    to it right away. The value produced by that first step is discarded.
    """
    def primer(*args, **kwargs):
        primed = func(*args, **kwargs)
        # Advance to the first `yield` so the generator can accept `send()`.
        next(primed)
        return primed

    # Copy the name, docstring, etc. of `func` onto the wrapper.
    return wraps(func)(primer)
def eprint(*args, **kwargs):
    """Print to standard error; accepts the same arguments as `print`.

    `file` is supplied by this function, so it must not be passed in
    `kwargs`.
    """
    stream = sys.stderr
    print(*args, file=stream, **kwargs)
def timer():
    """Endless generator that yields the seconds elapsed since a reference time.

    The reference time starts as the moment the generator is first run.
    After each yield it is replaced by the value passed to `send()` when
    that value is truthy; otherwise (plain `next()`, or a falsy value) it
    becomes the time at which the previous value was computed.
    """
    mark = time.time()
    while True:
        current = time.time()
        sent = yield current - mark
        # A truthy sent value overrides the reference; otherwise restart
        # from the reading that was just reported.
        mark = sent if sent else current
# end | |
# One proxy endpoint. Both fields hold the strings captured by `PROXY_REGEX`.
ProxyServer = namedtuple("ProxyServer", ["hostname", "port"])

# Matches `hostname:port` at the start of a string. The named groups are
# passed as keyword arguments to `ProxyServer`, so their names must match
# its fields.
PROXY_REGEX = re.compile(
    r"(?P<hostname>[^:]+):(?P<port>\d+)")

# A URL to fetch through a proxy and a keyword expected in the response body.
TargetSite = namedtuple("TargetSite", ["url", "keyword"])

# Each entry is "URL KEYWORD". Split only at the first run of whitespace so
# that a keyword containing spaces still produces exactly two fields
# (`maxsplit=2` could produce three and make `TargetSite(*...)` raise).
# The same site is listed twice, so it is checked twice per proxy.
TARGET_SITES = [TargetSite(*site.split(maxsplit=1)) for site in
                ["http://music.163.com 网易云音乐",
                 "http://music.163.com 网易云音乐"]]

# Fraction of the target-site checks expected to succeed for a proxy to be
# reported as working.
EXPECTED_SUCC_RATIO = 0.5
def proxies_in(file):
    """Yield a `ProxyServer` for each line of `file` that holds a proxy.

    `file` is any iterable of text lines. Each line is stripped and matched
    against `PROXY_REGEX`; lines that do not match, or whose `hostname` or
    `port` group is empty, are silently skipped. The named groups of the
    pattern are passed to `ProxyServer` as keyword arguments, so the captured
    values are strings.
    """
    for raw in file:
        found = PROXY_REGEX.match(raw.strip())
        # Keep only lines that matched with both fields non-empty.
        if found is not None and "" not in (found.group("hostname"),
                                            found.group("port")):
            yield ProxyServer(**found.groupdict())
@coroutine
def proxies_out(file):
    """Sink generator that writes each received proxy to `file` as `hostname:port`.

    Every value passed to `send()` must be a sequence whose first element is
    a namedtuple with `hostname` and `port` fields; any further elements are
    ignored. The `@coroutine` decorator primes the generator, so it can be
    sent to immediately.
    """
    template = "{hostname}:{port}"
    while True:
        item = yield
        fields = item[0]._asdict()
        print(template.format(**fields), file=file)
async def check_proxy(in_gen, out_coro):
    """Validate proxies drawn from `in_gen` and send the working ones to `out_coro`.

    Each proxy taken from `in_gen` is used to fetch every entry of
    `TARGET_SITES`. A fetch succeeds when the response status is 200 and the
    site's keyword occurs in the response text. Client errors, timeouts and
    decoding errors are reported on stderr and count as failures.

    A proxy is reported when its success ratio reaches
    `EXPECTED_SUCC_RATIO`. The value sent to `out_coro` is the tuple
    `(proxy, seconds)`, where `seconds` is the measured time for the proxy
    plus `timeout_secs` for every failed site.

    Several instances may share one `in_gen`; each proxy is consumed by
    exactly one of them because the generator is advanced synchronously.

    Args:
        in_gen: iterable of namedtuples with `hostname` and `port` fields.
        out_coro: primed generator that accepts `(proxy, seconds)` tuples.
    """
    # Used both as the session read timeout and as the penalty added per
    # failed site (the value 15 was previously hard-coded in both places).
    timeout_secs = 15
    stopwatch = timer()
    for proxy in in_gen:
        proxy_url = "http://{hostname}:{port}".format(**proxy._asdict())
        succ = 0
        total = 0
        next(stopwatch)  # restart the measurement for this proxy
        for site in TARGET_SITES:
            # A fresh session per site, so no connection is reused.
            # NOTE(review): `read_timeout` is deprecated in newer aiohttp in
            # favour of `timeout=aiohttp.ClientTimeout(...)` — confirm the
            # installed version before switching.
            async with aiohttp.ClientSession(read_timeout=timeout_secs) as session:
                try:
                    async with session.get(site.url, proxy=proxy_url) as resp:
                        if resp.status == 200 and site.keyword in await resp.text():
                            succ += 1
                except (aiohttp.ClientError, asyncio.TimeoutError, UnicodeDecodeError) as e:
                    eprint("{!r} when accessing {} with {}".format(e, site, proxy))
            total += 1
        # Compare against the exact product: `round()` rounds halves to the
        # nearest even integer, so e.g. one site at ratio 0.5 would give a
        # threshold of 0 and accept a proxy that never succeeded.
        if succ >= total * EXPECTED_SUCC_RATIO:
            elapsed = next(stopwatch)
            out_coro.send((proxy, elapsed + (total - succ) * timeout_secs))
def main():
    """Command-line entry point: read proxies, validate them, write the good ones.

    Expects exactly two arguments, INPUT_FILE and OUT_FILE; `-` selects
    stdin or stdout respectively. OUT_FILE is opened in append mode. With any
    other argument count the usage text is printed and the process exits
    with status 1.

    256 `check_proxy` workers share one input generator and one output
    generator. Opened files and the event loop are closed even if validation
    raises.
    """
    if len(sys.argv) != 3:
        # Fill in the program name; the placeholder used to be printed as-is.
        print("Usage: {} INPUT_FILE OUT_FILE\n"
              "Notice: - can be used instead for stdin/out respectively."
              .format(sys.argv[0]))
        # Usage errors exit with a non-zero status.
        sys.exit(1)

    in_path, out_path = sys.argv[1:]
    input_file = None
    output_file = None
    # Create the loop explicitly: `asyncio.get_event_loop()` without a
    # current loop is deprecated and fails on recent Python versions.
    loop = asyncio.new_event_loop()
    try:
        if in_path != "-":
            input_file = open(in_path)
        if out_path != "-":
            output_file = open(out_path, "a")
        in_gen = proxies_in(sys.stdin if input_file is None else input_file)
        out_gen = proxies_out(sys.stdout if output_file is None else output_file)

        async def run_workers():
            # Build the gather inside the running loop so that its futures
            # are bound to `loop` rather than to an implicit current loop.
            await asyncio.gather(
                *[check_proxy(in_gen, out_gen) for _ in range(256)]
            )

        loop.run_until_complete(run_workers())
        # One extra loop iteration before closing — presumably to let pending
        # callbacks (e.g. connection cleanup) run.
        loop.run_until_complete(asyncio.sleep(0))
    finally:
        loop.close()
        if input_file is not None:
            input_file.close()
        if output_file is not None:
            output_file.close()


if __name__ == "__main__":
    main()
# | |
""" File: proxy_checker.py | |
Size: 3458 Blocks: 8 IO Block: 4096 regular file | |
Device: fe01h/65025d Inode: 1052831 Links: 1 | |
Access: (0755/-rwxr-xr-x) Uid: ( 1000/ gowe) Gid: ( 1000/ gowe) | |
Access: 2019-09-26 00:04:38.026662286 +0900 | |
Modify: 2019-06-26 09:16:10.840239430 +0900 | |
Change: 2019-06-26 09:16:10.840239430 +0900 | |
Birth: 2018-05-07 19:15:52.643333334 +0900""" |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment