TensorFlow Dependencies Repository Fetcher for Offline Build
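The script below scans a TensorFlow source tree's WORKSPACE and tensorflow/workspace.bzl for http_archive, mkl_repository, java_import_external, and filegroup_external rules, downloads every referenced archive into a tf_localrepo/ directory while verifying its sha256 checksum, and then rewrites each rule's urls list to try a local file:// mirror first. To use it, unpack tensorflow-1.10.0 next to the script and run it with Python 3.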
import os
import sys
import re
import pprint
import hashlib
import time
import urllib.request as urlreq

verbose_debug = True
def reporthook(count, block_size, total_size):
    # reference: https://blog.shichao.io/2012/10/04/progress_speed_indicator_for_urlretrieve_in_python.html
    global start_time
    if count == 0:
        start_time = time.time()
        return
    # guard against division by zero when the first chunk arrives very quickly
    duration = max(time.time() - start_time, 1e-6)
    progress_size = int(count * block_size)
    speed = int(progress_size / (1024 * duration))
    percent = int(count * block_size * 100 / total_size)
    sys.stdout.write("\r ...%d%%, %d MB, %d KB/s, %d seconds passed" %
                     (percent, progress_size / (1024 * 1024), speed, duration))
    sys.stdout.flush()
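# reporthook plugs directly into urllib.request.urlretrieve, which calls it as
# reporthook(block_count, block_size, total_size) after every chunk; a minimal,
# hypothetical usage sketch (URL and filename are placeholders):
#
#   urlreq.urlretrieve(url='https://example.com/dep.tar.gz',
#                      filename='/tmp/dep.tar.gz', reporthook=reporthook)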
def fetch_candidate_dict(targets):
    global verbose_debug
    fetch_candid_dict = dict()
    # rule calls whose blocks carry downloadable dependencies
    fetch_rules = ('http_archive(', 'mkl_repository(', 'java_import_external(', 'filegroup_external(')
    for target in targets:
        # for debugging
        print('target:', target)
        fetch_candidate = list()
        filename_target = os.path.split(target)
        buffer = str()
        sw_buffering = False
        with open(file=target, mode='r') as file_target:
            for line in file_target:
                stripped = line.strip()
                # buffer switch trigger: start collecting at an uncommented dependency
                # rule call, stop at the bare closing parenthesis
                if stripped.endswith(fetch_rules) and not stripped.startswith('#'):
                    sw_buffering = True
                elif sw_buffering and stripped == ')':
                    sw_buffering = False
                    buffer += ")"
                    fetch_candidate.append(buffer)
                    buffer = str()
                    continue
                if sw_buffering:
                    buffer += line.lstrip()
        if verbose_debug:
            print('>>:', len(fetch_candidate), fetch_candidate)
        if len(fetch_candidate) > 0:
            dict_setup = dict({
                'path_full': target,
                'candidates': parse_web_archive(fetch_candidate)
            })
            fetch_candid_dict.update({
                filename_target[1]: dict_setup
            })
    # fin
    return fetch_candid_dict
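# The returned mapping is keyed by file basename; an abridged, illustrative shape
# (names and values are hypothetical):
#
#   {'workspace.bzl': {
#        'path_full': '/path/to/tensorflow-1.10.0/tensorflow/workspace.bzl',
#        'candidates': {
#            'com_googlesource_code_re2': {
#                'type': 'tf_http_archive',
#                'sha256': '...',
#                'urls': ['https://mirror.bazel.build/...', 'https://github.com/...'],
#                'topurl': 'https://mirror.bazel.build/...'}}}}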
def parse_web_archive(_plain_list):
    global verbose_debug
    parse_dict = dict()
    for item in _plain_list:
        # flatten the buffered rule call into a single line, dropping comments
        inline = str()
        for line in item.split('\n'):
            if not line.strip().startswith('#'):
                inline += line.strip().split('#')[0]
        m = re.match(pattern=r'([A-Za-z._]+)\((.*)\)', string=inline)
        item_type = m.group(1)
        item_contents = m.group(2)
        if verbose_debug:
            print(item_contents)
        item_name = re.findall(r'name = \"(.+?)\",', item_contents)[-1]
        try:
            if item_type != 'filegroup_external':
                item_sha256 = re.findall(r'(jar_sha256|sha256)(\s)*=(\s)*\"(.+?)\",', item_contents)[-1][-1]
                item_urls_set = re.findall(r'(jar_urls|urls)(\s)*=(\s)*\[(.+?)\],', item_contents)[-1][-1]
            else:
                # 'filegroup_external' keeps its checksum and URLs in a sha256_urls dict
                sha256_urls = re.findall(r'sha256_urls(\s)*=(\s)*{\"(.+?)\"(\s)*:(\s)*\[(.+?)\},', item_contents)[-1]
                item_sha256 = sha256_urls[2]
                item_urls_set = sha256_urls[5].replace('],', '').replace(']', '')
            # keep only proper http(s) URLs
            item_urls_ = [x.replace('"', '') for x in item_urls_set.split(',')]
            item_urls = [url for url in item_urls_ if url.startswith('http')]
            parse_dict.update({
                item_name: dict({
                    'type': item_type,
                    'sha256': item_sha256,
                    'urls': item_urls,
                    'topurl': item_urls[0],
                })
            })
        except Exception as err:
            print(err, inline, file=sys.stderr)
    # fin
    return parse_dict
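# Illustrative excerpt of the kind of rule block the regexes above parse
# (abridged; the sha256 and archive paths are placeholders):
#
#   tf_http_archive(
#       name = "com_googlesource_code_re2",
#       urls = [
#           "https://mirror.bazel.build/github.com/google/re2/archive/....tar.gz",
#           "https://github.com/google/re2/archive/....tar.gz",
#       ],
#       sha256 = "...",
#   )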
def download_dependencies(path_root, dicts):
    path_local_repo = os.path.join(path_root, 'tf_localrepo')
    try:
        # fail fast if a stale local repository is already present
        os.makedirs(path_local_repo)
    except FileExistsError:
        print('[E] Directory \'tf_localrepo\' already exists in the tensorflow directory; please remove it first.')
        sys.exit(-1)
    # loop through target files
    for key_target, value_target in dicts.items():
        candidates = value_target['candidates']
        for key_item in candidates.keys():
            print('[I] fetching {:s}...'.format(key_item))
            key_sha256 = candidates[key_item]['sha256']
            urls = candidates[key_item]['urls']
            for i in range(len(urls)):
                print(' [{:02d}] trying at {:s} ...'.format(i + 1, urls[i]))
                path_local_repo_key = os.path.join(path_local_repo, key_item)
                os.makedirs(path_local_repo_key, exist_ok=True)
                try:
                    filename = urls[i].split('/')[-1]
                    path_local_repo_key_full = os.path.join(path_local_repo_key, filename)
                    if os.path.exists(path_local_repo_key_full):
                        os.remove(path_local_repo_key_full)
                    urlreq.urlretrieve(url=urls[i], filename=path_local_repo_key_full, reporthook=reporthook)
                    print()
                except Exception as e:
                    print('[E] {:s}'.format(str(e)))
                    continue
                # accept the first mirror whose sha256 checksum matches
                if os.path.exists(path_local_repo_key_full):
                    with open(path_local_repo_key_full, mode='rb') as f_test:
                        if hashlib.sha256(f_test.read()).hexdigest() == key_sha256:
                            print('[I] download complete ({:s}) as {:s} with valid sha256 checksum.'.format(
                                key_item, path_local_repo_key_full))
                            break
    # fin
    return
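# Resulting on-disk layout (illustrative; the archive name is a placeholder):
#
#   <path_root>/tf_localrepo/<dependency_name>/<archive_filename>
#   e.g. tensorflow-1.10.0/tf_localrepo/com_googlesource_code_re2/<release>.tar.gz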
def insert_localized_deps(dicts, localrepo='/tmp/tf_localrepo'):
    global verbose_debug
    for key_target, value_target in dicts.items():
        target_name = key_target
        path_full = value_target['path_full']
        with open(path_full, mode='r') as file_check:
            path_target_contents = file_check.read()
        # keep a backup of the original file before editing it in place
        with open(path_full + ".bak", mode='w') as file_backup:
            file_backup.write(path_target_contents)
        print('[I] backup of {:s} complete as {:s}.'.format(target_name, path_full + ".bak"))
        if verbose_debug:
            print('[ORIGINAL]:{:s}'.format(target_name))
            print('Contents:', path_target_contents)
        candidates = value_target['candidates']
        for key_item in candidates.keys():
            key_rendered = "name = \"{:s}\"".format(key_item)
            # locate the first occurrence of the top URL after the name declaration;
            # re.escape keeps regex metacharacters in the URL from being interpreted
            topurl = "\n(.*)\"{:s}\"".format(re.escape(candidates[key_item]['topurl']))
            m_key = re.search(key_rendered, path_target_contents)
            m_url = None
            for _m_url in re.finditer(topurl, path_target_contents):
                if m_key.start() < _m_url.start() and m_key.end() < _m_url.start():
                    m_url = _m_url
                    break
            m_url_fetched = path_target_contents[m_url.start():m_url.end()]
            whitespace = m_url_fetched.replace('"' + candidates[key_item]['topurl'] + '"', '')
            local_url_join = os.path.join(localrepo, key_item, candidates[key_item]['topurl'].split('/')[-1])
            # prepend a file:// mirror on its own line, reusing the original indentation
            inject_local_url = "{:s}\"file://{:s}\",".format(whitespace, local_url_join)
            path_target_contents = \
                path_target_contents[:m_url.start()] + inject_local_url + path_target_contents[m_url.start():]
        with open(path_full, mode='w') as file_edit:
            file_edit.write(path_target_contents)
        print('[I] Local Repository ({:s}) update done.'.format(target_name))
        if verbose_debug:
            print('[UPDATED]:{:s}'.format(target_name))
            print(path_target_contents)
    # fin
    return
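# Effect on the edited workspace file (illustrative): each matched urls list gains
# a local file:// mirror immediately before its original top URL, e.g.:
#
#   urls = [
#       "file:///tmp/tf_localrepo/com_googlesource_code_re2/<release>.tar.gz",
#       "https://mirror.bazel.build/github.com/google/re2/archive/<release>.tar.gz",
#       ...
#   ],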
def main():
    # tensorflow/tensorflow commit 6b6d843ccab78f9f91c3b98a43ca09ffecad4747 requires 2 modifications:
    # WORKSPACE
    # tensorflow/workspace.bzl
    global verbose_debug
    tf_rel = 'tensorflow-1.10.0'
    path_root = os.path.join(os.getcwd(), tf_rel)
    path_localrepo = '/tmp/tf_localrepo'
    # path_cmake_external = 'tensorflow/contrib/cmake/external'
    targets = list()
    targets.append(os.path.join(path_root, 'WORKSPACE'))
    targets.append(os.path.join(path_root, 'tensorflow/workspace.bzl'))
    # for _path, dirs, files in os.walk(top=os.path.join(path_root, path_cmake_external)):
    #     for _file in files:
    #         targets.append(os.path.join(_path, _file))
    fetch_candid_dict = fetch_candidate_dict(targets=targets)
    if verbose_debug:
        pp = pprint.PrettyPrinter()
        pp.pprint(fetch_candid_dict)
    # download dependencies
    download_dependencies(path_root=path_root, dicts=fetch_candid_dict)
    # insert local dependencies
    insert_localized_deps(dicts=fetch_candid_dict, localrepo=path_localrepo)
    # fin
    return


if __name__ == '__main__':
    main()
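Once the script finishes, the offline build should work as usual: Bazel tries the entries of each urls list in order, so the injected file:// mirrors are resolved first and the standard ./configure and bazel build steps can proceed without network access, assuming every dependency was downloaded with a valid checksum.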