@telecastr
Created May 23, 2021 19:17
Dirty fix for anantshri/svn-extractor
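A minimal usage sketch (the script filename and target URL are placeholders; the flags match the argparse options defined in main() below):

    python svn_extractor.py --url http://target.example --match "\.php$" --exclude jpg,png,gif --userlist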
#!/usr/bin/env python
import requests
import sys
import argparse
import os
import sqlite3
import traceback
import re
# disable InsecureRequestWarning (certificate verification is intentionally skipped)
from requests.packages.urllib3.exceptions import InsecureRequestWarning
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
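# The helper functions below share module-level globals (show_debug,
# no_extract, excludes, author_list); main() initialises them before any
# helper runs.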
def getext(filename):
    name, ext = os.path.splitext(filename)
    return ext[1:]
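# readsvn walks the plain-text .svn/entries format used by SVN < 1.7. Entry
# names appear on the line *before* their marker, so the loop keeps the
# previous line in old_line: the line before "file"/"dir" is the entry name,
# and the line before "has-props" is treated as a committer username.
# Directories are recursed into via their own .svn/entries file.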
def readsvn(data, urli, match, proxy_dict):
    old_line = ""
    file_list = ""
    dir_list = ""
    user = ""
    pattern = re.compile(match, re.IGNORECASE)
    global author_list
    global excludes
    if not urli.endswith('/'):
        urli = urli + "/"
    for a in data.text.splitlines():
        # collect usernames: in the entries file the author line precedes "has-props"
        if a == "has-props":
            author_list.append(old_line)
        if a == "file":
            if not pattern.search(old_line):
                continue
            ignore = getext(old_line) in excludes
            if ignore:
                print('{}{} (not extracted)'.format(urli, old_line))
            else:
                print('{}{}'.format(urli, old_line))
            if no_extract and not ignore:
                save_url_svn(urli, old_line, proxy_dict)
            file_list = file_list + ";" + old_line
        if a == "dir":
            if old_line != "":
                folder_path = os.path.join("output", urli.replace("http://", "").replace("https://", "").replace("/", os.path.sep), old_line)
                if not os.path.exists(folder_path):
                    if no_extract:
                        os.makedirs(folder_path)
                dir_list = dir_list + ";" + old_line
                print('{}{}'.format(urli, old_line))
                try:
                    d = requests.get(urli + old_line + "/.svn/entries", verify=False, proxies=proxy_dict)
                    readsvn(d, urli + old_line, match, proxy_dict)
                except Exception:
                    print("Error reading {}{}/.svn/entries, skipping".format(urli, old_line))
        old_line = a
    return file_list, dir_list, user
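# readwc handles SVN >= 1.7, where working-copy metadata lives in a single
# SQLite database (.svn/wc.db); the downloaded wc.db is queried for file paths
# together with the location of their pristine copies under .svn/pristine/.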
def readwc(data, urli, match, proxy_dict):
    folder = os.path.join("output", urli.replace("http://", "").replace("https://", "").replace("/", os.path.sep))
    pattern = re.compile(match, re.IGNORECASE)
    global author_list
    global excludes
    if not folder.endswith(os.path.sep):
        folder = folder + os.path.sep
    with open(folder + "wc.db", "wb") as f:
        f.write(data.content)
    conn = sqlite3.connect(folder + "wc.db")
    c = conn.cursor()
    try:
        # checksums are stored as "$sha1$<hex>": substr(checksum, 7) strips the
        # "$sha1$" prefix, and its first two hex chars name the pristine subfolder
        c.execute('select local_relpath, ".svn/pristine/" || substr(checksum,7,2) || "/" || '
                  'substr(checksum,7) || ".svn-base" as alpha from NODES where kind="file";')
        list_items = c.fetchall()
        # collect all usernames that have committed at least once
        c.execute('select distinct changed_author from nodes;')
        author_list = [r[0] for r in c.fetchall()]
        c.close()
        for filename, url_path in list_items:
            if not pattern.search(filename):
                continue
            ignore = getext(filename) in excludes
            if ignore:
                print('{}{} (not extracted)'.format(urli, filename))
            else:
                print("{}{}".format(urli, filename))
            if no_extract and not ignore:
                save_url_wc(urli, filename, url_path, proxy_dict)
    except Exception:
        print("Error reading wc.db: either the database is corrupt or it is not a valid file")
        if show_debug:
            traceback.print_exc()
        return 1
    return 0
def show_list(_list, statement):
    print(statement)
    for cnt, x in enumerate(set(_list), start=1):
        print("{} : {}".format(cnt, x))
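# save_url_wc mirrors the remote path under output/ and downloads the pristine
# copy referenced by wc.db (svn_path); svn_path is None for directory entries,
# in which case only the local folder is created.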
def save_url_wc(url, filename, svn_path, proxy_dict):
    global author_list
    if filename != "":
        if svn_path is None:
            # directory entry: just mirror the folder locally
            folder_path = os.path.join("output", url.replace("http://", "").replace("https://", "").replace("/", os.path.sep), filename.replace("/", os.path.sep))
            if not os.path.exists(folder_path):
                os.makedirs(folder_path)
        else:
            folder = os.path.join("output", url.replace("http://", "").replace("https://", "").replace("/", os.path.sep), os.path.dirname(filename).replace("/", os.path.sep))
            if not os.path.exists(folder):
                os.makedirs(folder)
            if not folder.endswith(os.path.sep):
                folder = folder + os.path.sep
            try:
                r = requests.get(url + "/" + svn_path, verify=False, proxies=proxy_dict)
                with open(folder + os.path.basename(filename), "wb") as f:
                    f.write(r.content)
            except Exception:
                print("Error while accessing: {}{}".format(url, svn_path))
                if show_debug:
                    traceback.print_exc()
    return 0
def save_url_svn(url, filename, proxy_dict):
    global author_list
    folder = os.path.join("output", url.replace("http://", "").replace("https://", "").replace("/", os.path.sep))
    if not folder.endswith(os.path.sep):
        folder = folder + os.path.sep
    try:
        # SVN < 1.7 keeps pristine copies under .svn/text-base/<name>.svn-base
        r = requests.get(url + "/.svn/text-base/" + filename + ".svn-base", verify=False, proxies=proxy_dict)
        if not os.path.isdir(folder + filename):
            with open(folder + filename, "wb") as f:
                f.write(r.content)
    except Exception:
        print("Problem saving {}".format(url + "/.svn/text-base/" + filename + ".svn-base"))
        if show_debug:
            traceback.print_exc()
    return 0
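# Overall flow in main(): probe the target URL, then try wc.db (SVN >= 1.7)
# unless --entries was given, and fall back to the .svn/entries file
# (SVN < 1.7) unless --wcdb was given.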
def main(argv):
    # globals shared with the helper functions above
    global show_debug
    global no_extract
    global excludes
    global author_list
    author_list = []
    desc = """This program extracts hidden SVN files from a web host, provided
either the .svn/entries file (SVN < 1.7)
or wc.db (SVN >= 1.7) is available online.
It automates the directory navigation and text extraction process."""
    epilog = """Credit (C) Anant Shrivastava http://anantshri.info. Contributions from orf, sullo, paddlesteamer.
Greets to Amol Naik, Akash Mahajan, Prasanna K, Lava Kumar for valuable inputs"""
    parser = argparse.ArgumentParser(description=desc, epilog=epilog)
    parser.add_argument("--url", help="Provide URL", dest='target', required=True)
    parser.add_argument("--debug", help="Provide debug information", action="store_true")
    # store_false means no_extract defaults to True (extract files);
    # passing --noextract flips it to False (only list the content)
    parser.add_argument("--noextract", help="Don't extract files, just show content", action="store_false")
    parser.add_argument("--userlist", help="show the usernames used for commits", action="store_true")
    parser.add_argument("--wcdb", help="check only wc.db", action="store_true")
    parser.add_argument("--entries", help="check only the .svn/entries file", action="store_true")
    parser.add_argument("--proxy", help="Provide HTTP proxy in http(s)://host:port format", dest='proxy', required=False)
    parser.add_argument("--match", help="only download files that match the regex")
    parser.add_argument("--exclude", help="exclude files with extensions separated by ','", dest='excludes', default='')
    x = parser.parse_args()
    url = x.target
    no_extract = x.noextract
    show_debug = x.debug
    match = x.match
    excludes = x.excludes.split(',')
    prox = x.proxy
    if prox is not None:
        print(prox)
        print("Proxy defined")
        proxy_dict = {"http": prox, "https": prox}
    else:
        print("Proxy not defined")
        proxy_dict = ""
    if match:
        print("Only downloading matches to {}".format(match))
        match = "(" + match + "|entries$|wc.db$)"  # the entries file and wc.db themselves must still match
    else:
        match = ""
    if x.wcdb and x.entries:
        print("Checking both wc.db and .svn/entries (default behaviour, no need to specify both switches)")
        x.wcdb = False
        x.entries = False
    if url is None:
        exit()
    print(url)
    #if not url.endswith('/'):
    #    url = url + "/"
    print("Checking if URL is correct")
    try:
        r = requests.get(url, verify=False, proxies=proxy_dict)
    except Exception as e:
        print("Problem connecting to URL")
        print(e)
        if show_debug:
            traceback.print_exc()
        exit()
    # 403/500 still suggest the host is alive, so keep probing in those cases too
    if r.status_code in (200, 403, 500):
        print("URL is active")
        if no_extract:
            folder_path = os.path.join("output", url.replace("http://", "").replace("https://", "").replace("/", os.path.sep))
            if not os.path.exists(folder_path):
                os.makedirs(folder_path)
        if not x.entries:
            print("Checking for presence of wc.db")
            r = requests.get(url + "/.svn/wc.db", verify=False, allow_redirects=False, proxies=proxy_dict)
            if r.status_code == 200:
                print("wc.db found")
                rwc = readwc(r, url, match, proxy_dict)
                if rwc == 0:
                    if x.userlist:
                        show_list(author_list, "List of usernames used to commit in svn:")
                    exit()
            else:
                if show_debug:
                    print("Status code returned: {}".format(r.status_code))
                    print("Full response")
                    print(r.text)
                print("wc.db lookup FAILED")
        if not x.wcdb:
            print("Let's see if we can find .svn/entries")
            # disable redirects so a redirect-based 200 OK is not treated as a hit
            r = requests.get(url + "/.svn/entries", verify=False, allow_redirects=False, proxies=proxy_dict)
            if r.status_code == 200:
                print("SVN entries found; if no files are listed, check wc.db too")
                readsvn(r, url, match, proxy_dict)
                if 'author_list' in globals() and x.userlist:
                    show_list(author_list, "List of usernames used to commit in svn:")
                exit()
            else:
                if show_debug:
                    print("Status code returned: {}".format(r.status_code))
                    print("Full response")
                    print(r.text)
                print(".svn/entries lookup FAILED")
                print("{} doesn't contain any SVN repository".format(url))
    else:
        print("URL returns {}".format(r.status_code))
        exit()


if __name__ == "__main__":
    main(sys.argv[1:])