Skip to content

Instantly share code, notes, and snippets.

@jjf012
Last active October 19, 2018 04:28
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jjf012/a94776bea9fad6dab6e9695972774681 to your computer and use it in GitHub Desktop.
Save jjf012/a94776bea9fad6dab6e9695972774681 to your computer and use it in GitHub Desktop.
An improved version of the original builtwith module (single pass over the Wappalyzer apps dict instead of one pass per check).
import json
import os
import re
import requests
import requests.adapters
import requests.utils
import requests.exceptions
import sys
# from functools import partial
# NOTE(review): this routes through requests' adapter layer to urllib3's
# retry count; 1 presumably means "retry once on connection failure" — confirm.
requests.adapters.DEFAULT_RETRIES = 1

# Headers sent with every probe request. A desktop-Chrome User-Agent avoids
# trivial bot blocking; 'Connection: close' stops keep-alive sockets from
# piling up when many hosts are scanned in sequence.
HEADERS = {
    'User-Agent': "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36",
    'Connection': 'close'}

# Captures the (name, content) attribute values of <meta ...> tags.
# Compiled once at import time because it is run against every fetched page.
meta_regex = re.compile('<meta[^>]*?name=[\'"]([^>]*?)[\'"][^>]*?content=[\'"]([^>]*?)[\'"][^>]*?>', re.IGNORECASE)
def load_apps(filename='data/apps.json'):
    """Load app signatures from a Wappalyzer apps.json file
    (https://github.com/ElbertF/Wappalyzer).

    :param filename: path to the JSON file; absolute paths pass through
        os.path.join unchanged, relative ones resolve against this script's
        directory.
    :return: the parsed JSON as a dict (keys 'apps' and 'categories').
    :raises OSError: if the file cannot be opened.
    :raises json.JSONDecodeError: if the file is not valid JSON.
    """
    # XXX add support to download update
    filename = os.path.join(os.getcwd(), os.path.dirname(__file__), filename)
    # Use a context manager so the file handle is closed deterministically
    # (the original passed an open() result straight to json.load and leaked it).
    with open(filename, encoding='utf-8') as fp:
        return json.load(fp)
# data = load_apps()
def builtwith(data, url, headers=None, html=None, user_agent='builtwith'):
    """Detect the technologies used to build a website.

    Matches the Wappalyzer app definitions in ``data`` against the URL,
    the HTTP response headers, and the page HTML, downloading whatever
    content the caller did not supply.

    :param data: parsed Wappalyzer apps.json (see ``load_apps``).
    :param url: URL of the site to analyse.
    :param headers: pre-fetched response headers, or None to fetch them.
    :param html: pre-fetched page HTML, or None to fetch it.
    :param user_agent: NOTE(review): currently unused — requests go out with
        the module-level HEADERS; kept for API compatibility with builtwith.
    :return: dict mapping category name -> list of detected app names.

    >>> builtwith(load_apps(), 'http://wordpress.com')  # doctest: +SKIP
    {'blogs': ['PHP', 'WordPress'], 'cms': ['WordPress'], ...}
    """
    techs = {}
    # download whatever content the caller did not supply
    if None in (headers, html):
        try:
            if html and headers is None:
                # already have HTML so just need to make HEAD request for headers
                resp = requests.head(url, headers=HEADERS)
                headers = resp.headers
            else:
                resp = requests.get(url, headers=HEADERS)
                headers = resp.headers
                html = decode_response_text(resp)
        except Exception as e:
            # best effort: fall through and match on whatever we already have
            print('[Error]: ', e)
    # Parse the <meta name=... content=...> pairs ONCE. The original rebuilt
    # this dict inside the per-app loop, costing O(apps * len(html)).
    metas = dict(meta_regex.findall(html)) if html else {}
    # Unlike the upstream builtwith, which walks the whole apps dict once per
    # check type, iterate it a single time and run every check per app.
    for app_name, app_spec in data['apps'].items():
        # check url pattern
        if 'url' in app_spec:
            if contains(url, app_spec['url']):
                add_app(data, techs, app_name, app_spec)
        # check response headers
        if headers:
            if 'headers' in app_spec:
                if contains_dict(headers, app_spec['headers']):
                    add_app(data, techs, app_name, app_spec)
        if html:
            # check meta tags
            for name, content in app_spec.get('meta', {}).items():
                if name in metas:
                    if contains(metas[name], content):
                        add_app(data, techs, app_name, app_spec)
                        break
            # check raw html and script-src patterns
            for key in 'html', 'script':
                snippets = app_spec.get(key, [])
                if not isinstance(snippets, list):
                    snippets = [snippets]
                for snippet in snippets:
                    if contains(html, snippet):
                        add_app(data, techs, app_name, app_spec)
                        break
    return techs
def decode_response_text(resp):
    """Decode a response body using the encoding declared in the page itself.

    Prefers the charset found in the HTML content (meta/xml declaration) over
    requests' chardet guess, and upgrades gb2312 to its superset GBK because
    pages declaring gb2312 routinely contain GBK-only characters.

    NOTE(review): requests.utils.get_encodings_from_content is deprecated in
    newer requests releases — confirm against the pinned requests version.

    :param resp: a requests.Response object.
    :return: the decoded body; falls back to resp.text if decoding fails.
    """
    encodings = requests.utils.get_encodings_from_content(resp.text)
    if encodings:
        encoding = encodings[0]
    else:
        encoding = resp.apparent_encoding
    if not encoding:
        # apparent_encoding can be None (e.g. empty body); the original then
        # crashed on encoding.lower(). Fall back to requests' own decoding.
        return resp.text
    encoding = "GBK" if encoding.lower() == "gb2312" else encoding
    try:
        return resp.content.decode(encoding)
    except (LookupError, UnicodeDecodeError):
        # unknown codec name or undecodable bytes
        print("Decode failed: {}".format(resp.request.url))
        return resp.text
def add_app(data, techs, app_name, app_spec):
    """Record ``app_name`` under each of its categories, then recurse into
    the apps it implies.

    :param data: parsed Wappalyzer apps.json.
    :param techs: result dict (category name -> list of app names),
        mutated in place; names are de-duplicated.
    :param app_name: name of the detected app.
    :param app_spec: the app's spec dict from data['apps'].
    """
    for category in get_categories(data, app_spec):
        try:
            names = techs.setdefault(category['name'], [])
            if app_name not in names:
                names.append(app_name)
        except Exception:
            # malformed category entry; best-effort, as in the original
            print(category)
    # Process implied apps once, OUTSIDE the category loop. The original
    # rebound the loop variable ``app_name`` while iterating ``implies``,
    # so from the second category onward it recorded the last implied
    # app's name instead of the detected app's.
    implies = app_spec.get('implies', [])
    if not isinstance(implies, list):
        implies = [implies]
    for implied in implies:
        try:
            add_app(data, techs, implied, data['apps'][implied])
        except Exception:
            # implied name missing from data['apps']; skip it
            print(implied)
def get_categories(data, app_spec):
    """Look up the category entries referenced by an app spec.

    :param data: parsed Wappalyzer apps.json (uses data['categories']).
    :param app_spec: app spec whose 'cats' list holds numeric category ids.
    :return: list of category entries, in the order given by 'cats'.
    """
    categories = data['categories']
    return [categories[str(cat_id)] for cat_id in app_spec['cats']]
def contains(v, regex):
    """Search ``v`` for a Wappalyzer pattern, ignoring case.

    Wappalyzer regexes may carry trailing metadata separated by a literal
    ``\\;`` (e.g. ``foo\\;confidence:50``); everything after the first
    separator is discarded before matching.

    :return: a match object if the pattern is found, else None.
    """
    pattern = regex.split('\\;')[0]
    return re.search(pattern, v, flags=re.IGNORECASE)
def contains_dict(d1, d2):
    """Return True if every key/pattern pair of ``d2`` matches ``d1``.

    Each key of ``d2`` must be present in ``d1`` with a truthy value, and
    that value must satisfy ``contains`` for the corresponding pattern.
    """
    for key, pattern in d2.items():
        value = d1.get(key)
        if not value:
            # key missing (or empty value) in d1 -> cannot match
            return False
        if not contains(value, pattern):
            return False
    return True
if __name__ == '__main__':
    import time

    urls = sys.argv[1:]
    if urls:
        data = load_apps()
        # time.clock() was removed in Python 3.8; perf_counter() is the
        # documented replacement for elapsed-time measurement.
        start = time.perf_counter()
        for url in urls:
            results = builtwith(data, url)
            for result in sorted(results.items()):
                print('%s: %s' % result)
        print('%.2f' % (time.perf_counter() - start))
    else:
        # print usage without touching the data file or the clock
        print('Usage: %s url1 [url2 url3 ...]' % sys.argv[0])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment