#!/usr/bin/python3

# I don't believe in license.
# You can do whatever you want with this program.
# Author: Gwendal Le Coguic
# Original file: https://github.com/gwen001/github-search/blob/master/github-endpoints.py
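#
# Usage:
#   ./github-endpoints.py -d example.com -t <github_token>[,<token2>,...]
# Tokens can also be kept in a .github_tokens file next to the script, one per line.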

import os
import sys
import re
import time
import requests
import random
import argparse
import tldextract  # used in readCode(); imported up front rather than mid-script
from urllib.parse import urlparse
from functools import partial
from colored import fg, bg, attr
from multiprocessing.dummy import Pool

TOKENS_FILE = os.path.dirname(os.path.realpath(__file__)) + '/.github_tokens'
MIN_LENGTH = 5

# characters allowed in a captured endpoint, and the complement class
_url_chars = r'[a-zA-Z0-9\-\.\?\#\$&@%=_:/\]\[]'
_not_url_chars = r'[^a-zA-Z0-9\-\.\?\#\$&@%=_:/\]\[]'

t_endpoints = []
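
# Noise filters: a captured endpoint matching any of these patterns is dropped.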
t_exclude = [
    r'^http://$',
    r'^https://$',
    r'^javascript:$',
    r'^tel:$',
    r'^mailto:$',
    r'^application/json$',
    r'^text/plain$',
    r'^text/x-python$',
    r'^text/css$',
    r'^image/png$',
    r'^image/jpeg$',
    r'^image/x-icon$',
    r'^img/favicon.ico$',
    r'^application/x-www-form-urlencoded$',
    r'/Users/[0-9a-zA-Z\-\_]+/Desktop',
    r'www.w3.org',
    r'schemas.android.com',
    r'www.apple.com',
    r'\.jpeg',
    r'\.jpg',
    r'\.gif',
    r'\.tif',
    r'\.tiff',
    r'\.ttf',
    r'\.png',
    r'\.css',
    r'\.ico',
    r'\.woff2',
    r'\.woff',
    r'\.swf',
    # r'^#',
    # r'^\?',
    # r'^javascript:',
    # r'^mailto:',
]
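
# Extraction patterns: each regex captures one candidate endpoint from a file's raw content.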
t_regexp = [
    r'[\'"\(].*(http[s]?://' + _url_chars + r'*?)[\'"\)]',
    r'[\'"\(](http[s]?://' + _url_chars + r')',
    r'[\'"\(](' + _url_chars + r'+\.sdirect' + _url_chars + r'*)',
    r'[\'"\(](' + _url_chars + r'+\.htm' + _url_chars + r'*)',
    r'[\'"\(](' + _url_chars + r'+\.php' + _url_chars + r'*)',
    r'[\'"\(](' + _url_chars + r'+\.asp' + _url_chars + r'*)',
    r'[\'"\(](' + _url_chars + r'+\.js' + _url_chars + r'*)',
    r'[\'"\(](' + _url_chars + r'+\.xml' + _url_chars + r'*)',
    r'[\'"\(](' + _url_chars + r'+\.ini' + _url_chars + r'*)',
    r'[\'"\(](' + _url_chars + r'+\.conf' + _url_chars + r'*)',
    r'[\'"\(](' + _url_chars + r'+\.cfm' + _url_chars + r'*)',
    r'href\s*[.=]\s*[\'"](' + _url_chars + r'+)',
    r'src\s*[.=]\s*[\'"](' + _url_chars + r'+\.js*)',
    r'url\s*[:=]\s*[\'"](' + _url_chars + r'+\.js*)',
    r'urlRoot\s*[:=]\s*[\'"](' + _url_chars + r'+\.js*)',
    r'endpoint[s]?\s*[:=]\s*[\'"](' + _url_chars + r'+\.js*)',
    r'script[s]?\s*[:=]\s*[\'"](' + _url_chars + r'+\.js*)',
    r'\.ajax\s*\(\s*[\'"](' + _url_chars + r'+\.js*)',
    r'\.get\s*\(\s*[\'"](' + _url_chars + r'+\.js*)',
    r'\.post\s*\(\s*[\'"](' + _url_chars + r'+\.js*)',
    r'\.load\s*\(\s*[\'"](' + _url_chars + r'+\.js*)',
    ### a bit noisy
    # r'[\'"](' + _url_chars + '+/' + _url_chars + '+)?[\'"]',
    # r'content\s*[.=]\s*[\'"](' + _url_chars + '+)',
]
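
# Run one page of a GitHub code search; returns the parsed JSON response,
# or False on a network error.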
def githubApiSearchCode( token, search, page, sort, order ):
    headers = { "Authorization":"token "+token }
    url = 'https://api.github.com/search/code?per_page=100&s=' + sort + '&type=Code&o=' + order + '&q=' + search + '&page=' + str(page)
    # print(">>> "+url)
    try:
        r = requests.get( url, headers=headers, timeout=5 )
        json = r.json()
        # print(r.text)
        return json
    except Exception as e:
        print( "%s[-] error occurred: %s%s" % (fg('red'),e,attr(0)) )
        return False
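
# Map a search result's html_url to its raw.githubusercontent.com download url.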
def getRawUrl( result ):
    raw_url = result['html_url']
    raw_url = raw_url.replace( 'https://github.com/', 'https://raw.githubusercontent.com/' )
    raw_url = raw_url.replace( '/blob/', '/' )
    return raw_url
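
# Worker run by the thread pool: download one search result and print/collect
# every endpoint found in its content.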
def readCode( regexp, source, confirm, relative, alldomains, result ):
    time.sleep( random.random() )

    url = getRawUrl( result )
    if url in t_history_urls:
        return

    buf = ''
    t_local_endpoints = []
    t_history_urls.append( url )

    code = doGetCode( url )
    # print( code )
    if code:
        if source:
            buf = "\n%s>>> %s%s\n\n" % (fg('yellow'),result['html_url'],attr(0))
        matches = re.findall( regexp, code, re.IGNORECASE )
        if matches:
            for r in t_regexp:
                edpt = re.findall( r, code, re.IGNORECASE )
                if edpt:
                    for endpoint in edpt:
                        endpoint = endpoint.strip()
                        if len(endpoint) >= MIN_LENGTH:
                            goodbye = False
                            for exclude in t_exclude:
                                # re.search, not re.match: unanchored filters such as
                                # r'\.jpeg' must match anywhere in the endpoint
                                if re.search( exclude, endpoint ):
                                    goodbye = True
                                    break
                            if goodbye:
                                continue
                            is_relative = not endpoint.startswith('http')
                            if not relative and is_relative:
                                continue
                            if endpoint in t_local_endpoints:
                                continue
                            if not source and endpoint in t_endpoints:
                                continue
                            if not alldomains and not is_relative:
                                try:
                                    t_url_parse = urlparse( endpoint )
                                    t_host_parse = tldextract.extract( t_url_parse.netloc )
                                    domain = t_host_parse.domain
                                    sss = re.findall( regexp, t_url_parse.netloc )
                                    if not sss:
                                        continue
                                except Exception as e:
                                    sys.stdout.write( "%s[-] error occurred: %s%s\n" % (fg('red'),e,attr(0)) )
                                    continue
                            t_endpoints.append( endpoint )
                            t_local_endpoints.append( endpoint )
                            if source:
                                buf = buf + ("%s\n" % endpoint)
                            else:
                                sys.stdout.write( "%s\n" % endpoint )

    if source and len(t_local_endpoints):
        sys.stdout.write( buf )
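
# Fetch a raw file; returns its text, or False on a network error.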
def doGetCode( url ):
    try:
        r = requests.get( url, timeout=5 )
    except Exception as e:
        sys.stdout.write( "%s[-] error occurred: %s%s\n" % (fg('red'),e,attr(0)) )
        return False
    return r.text

parser = argparse.ArgumentParser()
parser.add_argument( "-t","--token",help="your github token (required)" )
parser.add_argument( "-d","--domain",help="domain you are looking for (required)" )
parser.add_argument( "-e","--extend",help="also look for <dummy>example.com", action="store_true" )
parser.add_argument( "-a","--all",help="display urls of all other domains", action="store_true" )
parser.add_argument( "-r","--relative",help="also display relative urls", action="store_true" )
parser.add_argument( "-s","--source",help="display the url where each endpoint was found", action="store_true" )
parser.add_argument( "-v","--verbose",help="verbose mode, for debugging purposes", action="store_true" )
args = parser.parse_args()

t_tokens = []

if args.token:
    t_tokens = args.token.split(',')
elif os.path.isfile(TOKENS_FILE):
    fp = open(TOKENS_FILE,'r')
    # one token per line; skip blank lines
    t_tokens = list( filter(None,fp.read().split("\n")) )
    fp.close()

if not len(t_tokens):
    parser.error( 'auth token is missing' )

_source = args.source
_relative = args.relative
_alldomains = args.all

if args.domain:
    _domain = args.domain
else:
    parser.error( 'domain is missing' )
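
# The code search API only returns the first 1000 results of a query; replaying
# the same search with different sort orders is a way to reach more of them.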
t_sort_order = [
    { 'sort':'indexed', 'order':'desc' },
    { 'sort':'indexed', 'order':'asc' },
    { 'sort':'', 'order':'desc' }
]

t_history = []
t_history_urls = []
_search = '"' + _domain + '"' | |
### this is a test, looks like we got more result that way | |
import tldextract | |
t_host_parse = tldextract.extract( _domain ) | |
if args.extend: | |
# which one is | |
_search = '"' + t_host_parse.domain + '"' | |
else: | |
# the most effective ? | |
_search = '"' + t_host_parse.domain + '.' + t_host_parse.suffix + '"' | |
# or simply ? | |
# _search = '"' + _domain + '"' | |
# print(_search) | |
# exit() | |
### | |
if args.extend: | |
_regexp = r'(([0-9a-z_\-\.]+\.)?([0-9a-z_\-]+)?'+t_host_parse.domain+'([0-9a-z_\-\.]+)?\.[a-z]{1,5})' | |
_confirm = t_host_parse.domain | |
else: | |
_regexp = r'((([0-9a-z_\-\.]+)\.)?' + _domain.replace('.','\.')+')' | |
_confirm = _domain | |

if args.verbose:
    print( "Search: %s" % _search )
    print( "Regexp: %s" % _regexp )
    print( "Confirm: %s" % _confirm )
for so in t_sort_order:
    page = 1
    if args.verbose:
        print( '\n----- %s %s\n' % (so['sort'],so['order']) )
    while True:
        if args.verbose:
            print( "page %d" % page )
        time.sleep( random.random() )
        token = random.choice( t_tokens )
        t_json = githubApiSearchCode( token, _search, page, so['sort'], so['order'] )
        # print(t_json)
        if not t_json or 'documentation_url' in t_json:
            # error or rate limited: drop this token and retry the same page
            if args.verbose:
                print(t_json)
            t_tokens.remove(token)
            if len(t_tokens) == 0:
                exit()
            continue
        page = page + 1
        if 'items' in t_json and len(t_json['items']):
            pool = Pool( 30 )
            pool.map( partial(readCode,_regexp,_source,_confirm,_relative,_alldomains), t_json['items'] )
            pool.close()
            pool.join()
        else:
            break