Skip to content

Instantly share code, notes, and snippets.

@jjf012
Last active October 19, 2018 04:28
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jjf012/a94776bea9fad6dab6e9695972774681 to your computer and use it in GitHub Desktop.
Save jjf012/a94776bea9fad6dab6e9695972774681 to your computer and use it in GitHub Desktop.
An improved version of the original builtwith module (single pass over the Wappalyzer apps dict instead of one pass per check).
import json
import os
import re
import requests
import requests.adapters
import requests.utils
import requests.exceptions
import sys
# from functools import partial
# NOTE(review): this routes through requests' adapter layer to urllib3's
# retry count; 1 presumably means "retry once on connection failure" — confirm.
requests.adapters.DEFAULT_RETRIES = 1

# Headers sent with every probe request. A desktop-Chrome User-Agent avoids
# trivial bot blocking; 'Connection: close' stops keep-alive sockets from
# piling up when many hosts are scanned in sequence.
HEADERS = {
    'User-Agent': "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36",
    'Connection': 'close'}

# Captures the (name, content) attribute values of <meta ...> tags.
# Compiled once at import time because it is run against every fetched page.
meta_regex = re.compile('<meta[^>]*?name=[\'"]([^>]*?)[\'"][^>]*?content=[\'"]([^>]*?)[\'"][^>]*?>', re.IGNORECASE)
def load_apps(filename='data/apps.json'):
    """Load app signatures from a Wappalyzer apps.json file
    (https://github.com/ElbertF/Wappalyzer).

    :param filename: path to the JSON file; absolute paths pass through
        os.path.join unchanged, relative ones resolve against this script's
        directory.
    :return: the parsed JSON as a dict (keys 'apps' and 'categories').
    :raises OSError: if the file cannot be opened.
    :raises json.JSONDecodeError: if the file is not valid JSON.
    """
    # XXX add support to download update
    filename = os.path.join(os.getcwd(), os.path.dirname(__file__), filename)
    # Use a context manager so the file handle is closed deterministically
    # (the original passed an open() result straight to json.load and leaked it).
    with open(filename, encoding='utf-8') as fp:
        return json.load(fp)
# data = load_apps()
def builtwith(data, url, headers=None, html=None, user_agent='builtwith'):
    """Detect the technologies used to build a website.

    Matches the Wappalyzer app definitions in ``data`` against the URL,
    the HTTP response headers, and the page HTML, downloading whatever
    content the caller did not supply.

    :param data: parsed Wappalyzer apps.json (see ``load_apps``).
    :param url: URL of the site to analyse.
    :param headers: pre-fetched response headers, or None to fetch them.
    :param html: pre-fetched page HTML, or None to fetch it.
    :param user_agent: NOTE(review): currently unused — requests go out with
        the module-level HEADERS; kept for API compatibility with builtwith.
    :return: dict mapping category name -> list of detected app names.

    >>> builtwith(load_apps(), 'http://wordpress.com')  # doctest: +SKIP
    {'blogs': ['PHP', 'WordPress'], 'cms': ['WordPress'], ...}
    """
    techs = {}
    # download whatever content the caller did not supply
    if None in (headers, html):
        try:
            if html and headers is None:
                # already have HTML so just need to make HEAD request for headers
                resp = requests.head(url, headers=HEADERS)
                headers = resp.headers
            else:
                resp = requests.get(url, headers=HEADERS)
                headers = resp.headers
                html = decode_response_text(resp)
        except Exception as e:
            # best effort: fall through and match on whatever we already have
            print('[Error]: ', e)
    # Parse the <meta name=... content=...> pairs ONCE. The original rebuilt
    # this dict inside the per-app loop, costing O(apps * len(html)).
    metas = dict(meta_regex.findall(html)) if html else {}
    # Unlike the upstream builtwith, which walks the whole apps dict once per
    # check type, iterate it a single time and run every check per app.
    for app_name, app_spec in data['apps'].items():
        # check url pattern
        if 'url' in app_spec:
            if contains(url, app_spec['url']):
                add_app(data, techs, app_name, app_spec)
        # check response headers
        if headers:
            if 'headers' in app_spec:
                if contains_dict(headers, app_spec['headers']):
                    add_app(data, techs, app_name, app_spec)
        if html:
            # check meta tags
            for name, content in app_spec.get('meta', {}).items():
                if name in metas:
                    if contains(metas[name], content):
                        add_app(data, techs, app_name, app_spec)
                        break
            # check raw html and script-src patterns
            for key in 'html', 'script':
                snippets = app_spec.get(key, [])
                if not isinstance(snippets, list):
                    snippets = [snippets]
                for snippet in snippets:
                    if contains(html, snippet):
                        add_app(data, techs, app_name, app_spec)
                        break
    return techs
def decode_response_text(resp):
    """Decode a response body using the encoding declared in the page itself.

    Prefers the charset found in the HTML content (meta/xml declaration) over
    requests' chardet guess, and upgrades gb2312 to its superset GBK because
    pages declaring gb2312 routinely contain GBK-only characters.

    NOTE(review): requests.utils.get_encodings_from_content is deprecated in
    newer requests releases — confirm against the pinned requests version.

    :param resp: a requests.Response object.
    :return: the decoded body; falls back to resp.text if decoding fails.
    """
    encodings = requests.utils.get_encodings_from_content(resp.text)
    if encodings:
        encoding = encodings[0]
    else:
        encoding = resp.apparent_encoding
    if not encoding:
        # apparent_encoding can be None (e.g. empty body); the original then
        # crashed on encoding.lower(). Fall back to requests' own decoding.
        return resp.text
    encoding = "GBK" if encoding.lower() == "gb2312" else encoding
    try:
        return resp.content.decode(encoding)
    except (LookupError, UnicodeDecodeError):
        # unknown codec name or undecodable bytes
        print("Decode failed: {}".format(resp.request.url))
        return resp.text
def add_app(data, techs, app_name, app_spec):
    """Record ``app_name`` under each of its categories, then recurse into
    the apps it implies.

    :param data: parsed Wappalyzer apps.json.
    :param techs: result dict (category name -> list of app names),
        mutated in place; names are de-duplicated.
    :param app_name: name of the detected app.
    :param app_spec: the app's spec dict from data['apps'].
    """
    for category in get_categories(data, app_spec):
        try:
            names = techs.setdefault(category['name'], [])
            if app_name not in names:
                names.append(app_name)
        except Exception:
            # malformed category entry; best-effort, as in the original
            print(category)
    # Process implied apps once, OUTSIDE the category loop. The original
    # rebound the loop variable ``app_name`` while iterating ``implies``,
    # so from the second category onward it recorded the last implied
    # app's name instead of the detected app's.
    implies = app_spec.get('implies', [])
    if not isinstance(implies, list):
        implies = [implies]
    for implied in implies:
        try:
            add_app(data, techs, implied, data['apps'][implied])
        except Exception:
            # implied name missing from data['apps']; skip it
            print(implied)
def get_categories(data, app_spec):
    """Look up the category entries referenced by an app spec.

    :param data: parsed Wappalyzer apps.json (uses data['categories']).
    :param app_spec: app spec whose 'cats' list holds numeric category ids.
    :return: list of category entries, in the order given by 'cats'.
    """
    categories = data['categories']
    return [categories[str(cat_id)] for cat_id in app_spec['cats']]
def contains(v, regex):
    """Search ``v`` for a Wappalyzer pattern, ignoring case.

    Wappalyzer regexes may carry trailing metadata separated by a literal
    ``\\;`` (e.g. ``foo\\;confidence:50``); everything after the first
    separator is discarded before matching.

    :return: a match object if the pattern is found, else None.
    """
    pattern = regex.split('\\;')[0]
    return re.search(pattern, v, flags=re.IGNORECASE)
def contains_dict(d1, d2):
    """Return True if every key/pattern pair of ``d2`` matches ``d1``.

    Each key of ``d2`` must be present in ``d1`` with a truthy value, and
    that value must satisfy ``contains`` for the corresponding pattern.
    """
    for key, pattern in d2.items():
        value = d1.get(key)
        if not value:
            # key missing (or empty value) in d1 -> cannot match
            return False
        if not contains(value, pattern):
            return False
    return True
if __name__ == '__main__':
    import time

    urls = sys.argv[1:]
    if urls:
        data = load_apps()
        # time.clock() was removed in Python 3.8; perf_counter() is the
        # documented replacement for elapsed-time measurement.
        start = time.perf_counter()
        for url in urls:
            results = builtwith(data, url)
            for result in sorted(results.items()):
                print('%s: %s' % result)
        print('%.2f' % (time.perf_counter() - start))
    else:
        # print usage without touching the data file or the clock
        print('Usage: %s url1 [url2 url3 ...]' % sys.argv[0])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment