Skip to content

Instantly share code, notes, and snippets.

@Gowee
Created September 25, 2019 15:09
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Gowee/0a19c54e68b5f9493daa7d9eeb555dfe to your computer and use it in GitHub Desktop.
A python script to validate HTTP proxies with aiohttp, a side project of https://github.com/Gowee/NEMUserCrawler
#!/usr/bin/env python3
import asyncio
import aiohttp
import re
import sys
import time
from collections import namedtuple
from functools import wraps
# program-independent helper functions start
def coroutine(func):
    """Decorator that primes a generator-based coroutine.

    Calling the decorated function constructs the generator and advances
    it past its first ``yield``, so the result is immediately ready to
    receive values via ``.send()``.
    """
    @wraps(func)
    def _primed(*args, **kwargs):
        gen = func(*args, **kwargs)
        next(gen)  # advance to the first yield so .send() works right away
        return gen
    return _primed
def eprint(*args, **kwargs):
    """print() variant that writes to the standard error stream."""
    print(*args, **kwargs, file=sys.stderr)
def timer():
    """Stopwatch generator: each resume yields seconds since the last mark.

    ``next(t)`` yields the wall-clock time elapsed since the previous
    resume; ``t.send(v)`` with a truthy ``v`` makes ``v`` the new mark
    instead of the current time (useful for back-dating the checkpoint).
    """
    mark = time.time()
    while True:
        current = time.time()
        sent = yield current - mark
        # A truthy value sent in overrides the checkpoint; otherwise
        # the checkpoint becomes "now".
        mark = sent if sent else current
# end
# A proxy endpoint parsed from a "host:port" input line.
ProxyServer = namedtuple("ProxyServer", ["hostname", "port"])
# Captures a non-empty hostname (everything before the first colon) and a
# numeric port; group names match the ProxyServer fields.
PROXY_REGEX = re.compile(
    r"(?P<hostname>[^:]+):(?P<port>\d+)")
# A probe target: fetch `url` and expect `keyword` in the response body.
TargetSite = namedtuple("TargetSite", ["url", "keyword"])
# maxsplit=1 splits only the URL off the front, so the keyword may itself
# contain spaces.  The original maxsplit=2 could produce three fields and
# make TargetSite(*...) raise TypeError.
TARGET_SITES = [TargetSite(*site.split(maxsplit=1)) for site in
                ["http://music.163.com 网易云音乐",
                 # NOTE(review): duplicated entry — each proxy is probed
                 # twice against the same site; confirm this is intended.
                 "http://music.163.com 网易云音乐"]]
# A proxy passes when it succeeds on at least this fraction of the probes.
EXPECTED_SUCC_RATIO = 0.5
def proxies_in(file):
    """Yield ProxyServer entries parsed from a line-oriented input stream.

    Each line is expected to look like "host:port"; lines that do not
    match PROXY_REGEX are silently skipped rather than treated as fatal.
    """
    for line in file:
        m = PROXY_REGEX.match(line.strip())
        # The pattern requires a non-empty hostname ([^:]+) and a non-empty
        # port (\d+), so the original extra checks for empty groups were
        # unreachable; a successful match alone is sufficient.
        if m is not None:
            yield ProxyServer(**m.groupdict())
@coroutine
def proxies_out(file):
    """Coroutine sink that writes accepted proxies to *file*.

    Expects ``.send()`` of a ``(ProxyServer, score)`` tuple — only the
    server part is printed, one "host:port" per line.
    """
    while True:
        entry = yield
        server = entry[0]  # entry is (proxy, score); the score is dropped
        print("{hostname}:{port}".format(**server._asdict()), file=file)
async def check_proxy(in_gen, out_coro):
    """Probe proxies from *in_gen* and push survivors into *out_coro*.

    Each proxy is tried against every TARGET_SITES entry with a 15 s read
    timeout.  A probe succeeds when the response is HTTP 200 and contains
    the site's keyword.  A proxy is accepted when its success count
    reaches round(total * EXPECTED_SUCC_RATIO); it is sent to *out_coro*
    as ``(proxy, score)`` where score is the elapsed wall time plus a
    15 s penalty for every failed probe.
    """
    clock = timer()
    for proxy in in_gen:
        succeeded = 0
        attempted = 0
        next(clock)  # reset the stopwatch for this proxy
        for site in TARGET_SITES:
            # A fresh session per probe keeps requests independent.
            async with aiohttp.ClientSession(read_timeout=15) as session:
                proxy_url = "http://{hostname}:{port}".format(**proxy._asdict())
                try:
                    async with session.get(site.url, proxy=proxy_url) as resp:
                        # Only download the body when the status is OK.
                        if resp.status == 200 and site.keyword in await resp.text():
                            succeeded += 1
                except (aiohttp.ClientError, asyncio.TimeoutError, UnicodeDecodeError) as e:
                    eprint("{!r} when accessing {} with {}".format(e, site, proxy))
            attempted += 1
        if succeeded >= round(attempted * EXPECTED_SUCC_RATIO):
            elapsed = next(clock)
            # Each failure presumably burned (up to) the 15 s timeout;
            # fold that into the score so flaky proxies rank worse.
            out_coro.send((proxy, elapsed + (attempted - succeeded) * 15))
def main():
    """CLI entry point: read proxies, check them concurrently, write survivors.

    Usage: proxy_checker.py INPUT_FILE OUT_FILE ("-" means stdin/stdout).
    The output file is opened in append mode so repeated runs accumulate.
    """
    if len(sys.argv) != 3:
        # Bug fix: the original never applied .format(), so the literal
        # "{}" appeared in the usage text instead of the program name.
        print("Usage: {} INPUT_FILE OUT_FILE\n"
              "Notice: - can be used instead for stdin/out respectively."
              .format(sys.argv[0]))
        exit()
    input_file = None if sys.argv[1] == "-" else open(sys.argv[1])
    output_file = None if sys.argv[2] == "-" else open(sys.argv[2], "a")
    in_gen = proxies_in(input_file or sys.stdin)
    out_gen = proxies_out(output_file or sys.stdout)
    loop: asyncio.BaseEventLoop = asyncio.get_event_loop()
    # 256 workers pull from the single shared generator, so each proxy is
    # checked exactly once while up to 256 checks run concurrently.
    loop.run_until_complete(asyncio.gather(
        *[check_proxy(in_gen, out_gen) for _ in range(256)]
    ))
    loop.run_until_complete(asyncio.sleep(0))  # let pending callbacks drain
    loop.close()
    input_file and input_file.close()
    output_file and output_file.close()


if __name__ == "__main__":
    main()
#
""" File: proxy_checker.py
Size: 3458 Blocks: 8 IO Block: 4096 regular file
Device: fe01h/65025d Inode: 1052831 Links: 1
Access: (0755/-rwxr-xr-x) Uid: ( 1000/ gowe) Gid: ( 1000/ gowe)
Access: 2019-09-26 00:04:38.026662286 +0900
Modify: 2019-06-26 09:16:10.840239430 +0900
Change: 2019-06-26 09:16:10.840239430 +0900
Birth: 2018-05-07 19:15:52.643333334 +0900"""
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment