Last active
October 19, 2018 04:28
-
-
Save jjf012/a94776bea9fad6dab6e9695972774681 to your computer and use it in GitHub Desktop.
An improved version of the original builtwith library (avoids redundant per-app work during detection).
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import json | |
import os | |
import re | |
import requests | |
import requests.adapters | |
import requests.utils | |
import requests.exceptions | |
import sys | |
# from functools import partial | |
# Cap connection retries at the urllib3 adapter level so unreachable
# hosts fail fast instead of retrying repeatedly.
requests.adapters.DEFAULT_RETRIES = 1
# Browser-like request headers; 'Connection: close' prevents pooled
# keep-alive sockets from piling up when probing many hosts.
HEADERS = {
    'User-Agent': "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36",
    'Connection': 'close'}
# Extracts (name, content) attribute pairs from <meta> tags in page HTML.
meta_regex = re.compile('<meta[^>]*?name=[\'"]([^>]*?)[\'"][^>]*?content=[\'"]([^>]*?)[\'"][^>]*?>', re.IGNORECASE)
def load_apps(filename='data/apps.json'):
    """Load app signatures from a Wappalyzer JSON file.

    filename: path to the apps JSON, resolved relative to this script's
        directory (absolute paths are used as-is, since os.path.join
        discards earlier components when a later one is absolute).

    Returns the parsed JSON as a dict (keys: 'apps', 'categories').
    Raises OSError if the file is missing and ValueError/JSONDecodeError
    on malformed JSON.
    """
    # Resolve relative to this module so it works regardless of cwd.
    # XXX add support to download updates of the signature file.
    path = os.path.join(os.getcwd(), os.path.dirname(__file__), filename)
    # The original leaked the file handle; close it deterministically.
    with open(path, encoding='utf-8') as fp:
        return json.load(fp)
# data = load_apps() | |
def builtwith(data, url, headers=None, html=None, user_agent='builtwith'):
    """Detect the technologies used to build a website.

    data: parsed Wappalyzer signatures, as returned by load_apps().
    url: the URL to inspect.
    headers: optional pre-fetched response headers; fetched when None.
    html: optional pre-fetched page HTML; fetched when None.
    user_agent: kept for backward compatibility -- NOTE(review): unused;
        requests are sent with the module-level HEADERS.

    Returns a dict mapping category name -> list of detected app names.

    Example::

        data = load_apps()
        builtwith(data, 'http://wordpress.com')
        # -> {'blogs': ['PHP', 'WordPress'], 'cms': ['WordPress'], ...}
    """
    techs = {}
    # Download whatever the caller did not supply.
    if None in (headers, html):
        try:
            if html and headers is None:
                # Already have HTML, so a HEAD request suffices for headers.
                resp = requests.head(url, headers=HEADERS)
                headers = resp.headers
            else:
                resp = requests.get(url, headers=HEADERS)
                headers = resp.headers
                html = decode_response_text(resp)
        except Exception as e:
            # Best effort: detection continues with whatever we have.
            print('[Error]: ', e)
    # Parse the <meta> tags ONCE. The previous version rebuilt this dict
    # inside the per-app loop below, i.e. O(apps * len(html)) work.
    metas = dict(meta_regex.findall(html)) if html else {}
    # The original upstream builtwith rescanned the whole app dict per
    # check; here each app is visited exactly once.
    for app_name, app_spec in data['apps'].items():
        # Check the URL pattern.
        if 'url' in app_spec and contains(url, app_spec['url']):
            add_app(data, techs, app_name, app_spec)
        # Check the response headers.
        if headers and 'headers' in app_spec:
            if contains_dict(headers, app_spec['headers']):
                add_app(data, techs, app_name, app_spec)
        if html:
            # Check <meta name=... content=...> tags.
            for name, content in app_spec.get('meta', {}).items():
                if name in metas and contains(metas[name], content):
                    add_app(data, techs, app_name, app_spec)
                    break
            # Check raw HTML and script-src patterns.
            for key in ('html', 'script'):
                snippets = app_spec.get(key, [])
                if not isinstance(snippets, list):
                    snippets = [snippets]
                for snippet in snippets:
                    if contains(html, snippet):
                        add_app(data, techs, app_name, app_spec)
                        break
    return techs
def decode_response_text(resp):
    """Decode a requests Response to text.

    Prefers the encoding declared inside the page content (meta/XML
    declaration) over the transport header, falling back to the
    detected apparent encoding, then UTF-8.

    Returns the decoded text; falls back to resp.text when decoding
    with the guessed codec fails.
    """
    # Scans for <meta charset> / XML encoding declarations in the body.
    encodings = requests.utils.get_encodings_from_content(resp.text)
    if encodings:
        encoding = encodings[0]
    else:
        # apparent_encoding can be None (empty body / no detector);
        # the original crashed on .lower() in that case.
        encoding = resp.apparent_encoding or 'utf-8'
    # GB2312 is a subset of GBK; pages declaring gb2312 often contain
    # GBK-only characters, so decode with the superset.
    encoding = "GBK" if encoding.lower() == "gb2312" else encoding
    try:
        return resp.content.decode(encoding)
    except Exception:
        print("Decode Failed:{}".format(resp.request.url))
        return resp.text
def add_app(data, techs, app_name, app_spec):
    """Record app_name under each of its categories in techs, then
    recurse into any technologies this app implies.

    Only recurses the first time an app is added to a category, which
    keeps mutually-implying apps from looping forever.
    """
    for category in get_categories(data, app_spec):
        try:
            bucket = techs.setdefault(category['name'], [])
            if app_name in bucket:
                continue
            bucket.append(app_name)
            implied = app_spec.get('implies', [])
            if not isinstance(implied, list):
                implied = [implied]
            for implied_name in implied:
                add_app(data, techs, implied_name, data['apps'][implied_name])
        except Exception:
            # Malformed category / unknown implied app: report and move on.
            print(category)
def get_categories(data, app_spec):
    """Look up the category record for each category id listed in
    app_spec['cats'] (ids are stored as string keys in the data)."""
    categories = data['categories']
    return [categories[str(cat_id)] for cat_id in app_spec['cats']]
def contains(v, regex):
    r"""Search *v* for the Wappalyzer pattern *regex*, ignoring case.

    Wappalyzer patterns may carry metadata after a literal '\;'
    separator (e.g. 'jquery\;confidence:50'); only the part before the
    separator is used as the regular expression.

    Returns a match object on success, otherwise None.
    """
    pattern, _sep, _meta = regex.partition('\\;')
    return re.search(pattern, v, re.IGNORECASE)
def contains_dict(d1, d2):
    """Return True iff every (key, pattern) item of d2 is satisfied by d1.

    A key is satisfied when d1 holds a truthy value for it and that
    value matches the pattern (per contains()).
    """
    for key, pattern in d2.items():
        value = d1.get(key)
        if not (value and contains(value, pattern)):
            return False
    return True
if __name__ == '__main__':
    import time
    urls = sys.argv[1:]
    data = load_apps()
    # time.clock() was deprecated in 3.3 and removed in Python 3.8;
    # perf_counter() is the portable high-resolution replacement.
    start = time.perf_counter()
    if urls:
        for url in urls:
            results = builtwith(data, url)
            for result in sorted(results.items()):
                print('%s: %s' % result)
        # Total elapsed time for all URLs, in seconds.
        print('%.2f' % (time.perf_counter() - start))
    else:
        print('Usage: %s url1 [url2 url3 ...]' % sys.argv[0])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment