scovetta/update-to-https.py

## update-to-https.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
The purpose of this script is to scan a directory (default: ports) for
port configuration files that contain http-based URLs, and convert
them to https if possible.

It only looks for files with an extension of .cmake or .json.

The "if possible" check is whether or not the bytes returned from a
GET request to the http URL are identical to those returned from a
GET request to the https URL.

The only external module needed is requests (pip install requests).

Author: Michael Scovetta <michael.scovetta@microsoft.com>
License: MIT
Copyright: Microsoft Corporation
Last Updated: 1/3/2022
"""
import os
import re
import sys
import requests
import hashlib
import logging

# Version string of this script.
VERSION = "1.0.0"

# Configure the root logger once at import time: emit DEBUG and above,
# formatted as "<LEVELNAME> <message>".
logging.basicConfig(format='%(levelname)s %(message)s', level=logging.DEBUG)

class HttpsChecker:
    """
    Finds http:// URLs in files and rewrites them to https:// when safe.

    A URL is considered safe to rewrite when a GET request to the http URL
    and a GET request to the equivalent https URL both succeed and return
    identical bytes.

    Attributes:
        updated_files: Paths of the files rewritten by this instance, with
            one entry per rewritten file.
    """

    # Request timeout (in seconds) for the network calls
    REQUEST_TIMEOUT = 30

    # Matches http:// URLs inside raw file bytes. Compiled once and shared.
    # This regex was modified from one found at:
    # https://stackoverflow.com/questions/55663644/how-to-search-for-a-href-from-a-text-file-through-python-regex
    _URL_PATTERN = re.compile(rb'http://(?:[-\w.\${}/]|(?:%[\da-fA-F]{2}))+')

    def __init__(self):
        # Per-instance list, so separate checkers never share their results.
        self.updated_files = []

    def is_upgradeable(self, http_link: str) -> bool:
        """
        Check to see if the http URL is upgradeable.

        Args:
            http_link: The http:// URL to test.

        Returns:
            True iff both the http URL and its https equivalent can be
            fetched and return identical content.
            False if it is not, or on any error.
        """
        logging.debug('is_upgradeable(%s)', http_link)

        scheme = 'http://'
        if not http_link.lower().startswith(scheme):
            # Without this guard a non-http link would be fetched twice and
            # compared with itself, which wrongly reports it as upgradeable.
            logging.debug('URL is not upgradeable because it is not an http:// URL.')
            return False

        if '${' in http_link:
            logging.debug('URL is not upgradeable because it contains a variable.')
            return False

        https_link = 'https://' + http_link[len(scheme):]

        # Any failure (network error, bad status, ...) deliberately means
        # "not upgradeable" rather than aborting the whole scan.
        try:
            http_res = requests.get(http_link, timeout=self.REQUEST_TIMEOUT)
            http_res.raise_for_status()
        except Exception as msg:
            logging.debug('Error loading %s: %s', http_link, msg)
            return False

        try:
            https_res = requests.get(https_link, timeout=self.REQUEST_TIMEOUT)
            https_res.raise_for_status()
        except Exception as msg:
            logging.debug('Error loading %s: %s', https_link, msg)
            return False

        if http_res.content != https_res.content:
            return False

        logging.info("Upgradeable URL: %s, Size=%d, Hash=%s", http_link,
                     len(http_res.content), hashlib.sha256(http_res.content).hexdigest())
        return True

    def process_file(self, filename: str) -> None:
        """
        Processes a given file. This means extracting URLs, checking each one, and
        modifying the file with the new URL if safe.

        The file is only rewritten when at least one URL was upgraded, and it
        is then recorded exactly once in ``self.updated_files``.

        Args:
            filename: Path of the file to scan; anything that is not a
                regular file is ignored.

        Returns: None
        """
        logging.info('Processing file: %s', filename)

        if not os.path.isfile(filename):
            return

        # Work on raw bytes so that the file is written back unchanged apart
        # from the URL scheme.
        with open(filename, 'rb') as f:
            content = f.read()

        # TODO: URLs built from CMake variables (e.g. "${NAME}" filled in by
        # 'set(NAME VALUE)', as in ports like antlr4) are not resolved; they
        # are rejected by is_upgradeable() for now.

        urls = self._URL_PATTERN.findall(content)
        logging.debug("Found %d URLs: [%s]", len(urls), urls)

        upgradeable = set()
        for url in sorted(set(urls)):
            url_str = url.decode('utf-8')
            if self.is_upgradeable(url_str):
                logging.debug('URL [%s] was upgradeable.', url_str)
                upgradeable.add(url)
            else:
                logging.debug('URL [%s] was NOT upgradeable.', url_str)

        if not upgradeable:
            return

        def upgrade(match):
            # Rewrite only complete matches that were verified. A plain
            # bytes.replace() would also rewrite longer, unverified URLs that
            # merely start with an upgradeable one.
            found = match.group(0)
            if found in upgradeable:
                return b'https://' + found[len(b'http://'):]
            return found

        with open(filename, 'wb') as f:
            f.write(self._URL_PATTERN.sub(upgrade, content))

        # One entry per file, no matter how many URLs were upgraded in it.
        self.updated_files.append(filename)

def main(argv=None):
    """
    Scan a directory tree and upgrade the http URLs found in port files.

    Args:
        argv: Command-line arguments without the program name. Defaults to
            ``sys.argv[1:]``. The first argument is the directory to scan;
            when it is absent, 'ports' is used.

    Returns:
        The process exit code: 0 on success, 1 if the directory does not exist.
    """
    args = sys.argv[1:] if argv is None else list(argv)
    directory = args[0] if args else 'ports'
    if not os.path.isdir(directory):
        print("Usage: python update-to-https.py DIRECTORY", file=sys.stderr)
        return 1

    checker = HttpsChecker()
    for root, _, files in os.walk(directory):
        for name in files:
            # Only port configuration files are of interest.
            if name.endswith(('.cmake', '.json')):
                checker.process_file(os.path.join(root, name))

    # Count distinct paths so a file is never reported more than once.
    print("Updated %d file(s)." % len(set(checker.updated_files)))
    return 0


if __name__ == '__main__':
    sys.exit(main())
	#!/usr/bin/env python
	# -*- coding: utf-8 -*-

	"""
	The purpose of this script is to scan a directory (default: ports) for
	port configuration files that contain http-based URLs, and convert
	them to https if possible.

	It only looks for files with an extension of .cmake or .json.

	The "if possible" check is whether or not the bytes returned from a
	GET request to the http URL are identical to those returned from a
	GET request to the https URL.

	The only external module needed is requests (pip install requests).

	Author: Michael Scovetta <michael.scovetta@microsoft.com>
	License: MIT
	Copyright: Microsoft Corporation
	Last Updated: 1/3/2022
	"""
	import os
	import re
	import sys
	import requests
	import hashlib
	import logging

	# Version string of this script.
	VERSION = "1.0.0"

	# Configure the root logger once at import time: emit DEBUG and above,
	# formatted as "<LEVELNAME> <message>".
	logging.basicConfig(format='%(levelname)s %(message)s', level=logging.DEBUG)

	class HttpsChecker:
	    """
	    Finds http:// URLs in files and rewrites them to https:// when safe.

	    A URL is considered safe to rewrite when a GET request to the http URL
	    and a GET request to the equivalent https URL both succeed and return
	    identical bytes.

	    Attributes:
	        updated_files: Paths of the files rewritten by this instance, with
	            one entry per rewritten file.
	    """

	    # Request timeout (in seconds) for the network calls
	    REQUEST_TIMEOUT = 30

	    # Matches http:// URLs inside raw file bytes. Compiled once and shared.
	    # This regex was modified from one found at:
	    # https://stackoverflow.com/questions/55663644/how-to-search-for-a-href-from-a-text-file-through-python-regex
	    _URL_PATTERN = re.compile(rb'http://(?:[-\w.\${}/]|(?:%[\da-fA-F]{2}))+')

	    def __init__(self):
	        # Per-instance list, so separate checkers never share their results.
	        self.updated_files = []

	    def is_upgradeable(self, http_link: str) -> bool:
	        """
	        Check to see if the http URL is upgradeable.

	        Args:
	            http_link: The http:// URL to test.

	        Returns:
	            True iff both the http URL and its https equivalent can be
	            fetched and return identical content.
	            False if it is not, or on any error.
	        """
	        logging.debug('is_upgradeable(%s)', http_link)

	        scheme = 'http://'
	        if not http_link.lower().startswith(scheme):
	            # Without this guard a non-http link would be fetched twice and
	            # compared with itself, which wrongly reports it as upgradeable.
	            logging.debug('URL is not upgradeable because it is not an http:// URL.')
	            return False

	        if '${' in http_link:
	            logging.debug('URL is not upgradeable because it contains a variable.')
	            return False

	        https_link = 'https://' + http_link[len(scheme):]

	        # Any failure (network error, bad status, ...) deliberately means
	        # "not upgradeable" rather than aborting the whole scan.
	        try:
	            http_res = requests.get(http_link, timeout=self.REQUEST_TIMEOUT)
	            http_res.raise_for_status()
	        except Exception as msg:
	            logging.debug('Error loading %s: %s', http_link, msg)
	            return False

	        try:
	            https_res = requests.get(https_link, timeout=self.REQUEST_TIMEOUT)
	            https_res.raise_for_status()
	        except Exception as msg:
	            logging.debug('Error loading %s: %s', https_link, msg)
	            return False

	        if http_res.content != https_res.content:
	            return False

	        logging.info("Upgradeable URL: %s, Size=%d, Hash=%s", http_link,
	                     len(http_res.content), hashlib.sha256(http_res.content).hexdigest())
	        return True

	    def process_file(self, filename: str) -> None:
	        """
	        Processes a given file. This means extracting URLs, checking each one, and
	        modifying the file with the new URL if safe.

	        The file is only rewritten when at least one URL was upgraded, and it
	        is then recorded exactly once in ``self.updated_files``.

	        Args:
	            filename: Path of the file to scan; anything that is not a
	                regular file is ignored.

	        Returns: None
	        """
	        logging.info('Processing file: %s', filename)

	        if not os.path.isfile(filename):
	            return

	        # Work on raw bytes so that the file is written back unchanged apart
	        # from the URL scheme.
	        with open(filename, 'rb') as f:
	            content = f.read()

	        # TODO: URLs built from CMake variables (e.g. "${NAME}" filled in by
	        # 'set(NAME VALUE)', as in ports like antlr4) are not resolved; they
	        # are rejected by is_upgradeable() for now.

	        urls = self._URL_PATTERN.findall(content)
	        logging.debug("Found %d URLs: [%s]", len(urls), urls)

	        upgradeable = set()
	        for url in sorted(set(urls)):
	            url_str = url.decode('utf-8')
	            if self.is_upgradeable(url_str):
	                logging.debug('URL [%s] was upgradeable.', url_str)
	                upgradeable.add(url)
	            else:
	                logging.debug('URL [%s] was NOT upgradeable.', url_str)

	        if not upgradeable:
	            return

	        def upgrade(match):
	            # Rewrite only complete matches that were verified. A plain
	            # bytes.replace() would also rewrite longer, unverified URLs that
	            # merely start with an upgradeable one.
	            found = match.group(0)
	            if found in upgradeable:
	                return b'https://' + found[len(b'http://'):]
	            return found

	        with open(filename, 'wb') as f:
	            f.write(self._URL_PATTERN.sub(upgrade, content))

	        # One entry per file, no matter how many URLs were upgraded in it.
	        self.updated_files.append(filename)

	def main(argv=None):
	    """
	    Scan a directory tree and upgrade the http URLs found in port files.

	    Args:
	        argv: Command-line arguments without the program name. Defaults to
	            ``sys.argv[1:]``. The first argument is the directory to scan;
	            when it is absent, 'ports' is used.

	    Returns:
	        The process exit code: 0 on success, 1 if the directory does not exist.
	    """
	    args = sys.argv[1:] if argv is None else list(argv)
	    directory = args[0] if args else 'ports'
	    if not os.path.isdir(directory):
	        print("Usage: python update-to-https.py DIRECTORY", file=sys.stderr)
	        return 1

	    checker = HttpsChecker()
	    for root, _, files in os.walk(directory):
	        for name in files:
	            # Only port configuration files are of interest.
	            if name.endswith(('.cmake', '.json')):
	                checker.process_file(os.path.join(root, name))

	    # Count distinct paths so a file is never reported more than once.
	    print("Updated %d file(s)." % len(set(checker.updated_files)))
	    return 0


	if __name__ == '__main__':
	    sys.exit(main())