# jigpu/license-merge.py

## license-merge.py
#!/usr/bin/env python

import sys
import re

"""
Extract and merge the copyright headers from a list of files.

For each file:
  1. Find a copyright header
  2. Split it into "copyright notice" and "license text" sections
  3. Merge copyright notices into a common list (ignore minor differences)
  4. Merge license texts into a common list (ignore minor differences)
When done, print out merged copyright notices and license texts
"""

def getFiletype(filename):
    """Return the lower-cased extension of *filename*, or None if it has none.

    Only the final path component is examined, so a dot inside a directory
    name (e.g. "conf.d/Makefile") is not mistaken for an extension.

    Names without an extension yield None rather than the whole file name.
    findCommentBlocks() treats a None hint as "try every comment syntax",
    so extension-less files such as LICENSE or Makefile are still scanned.
    """
    import os.path  # local import: the file header only imports sys and re

    extension = os.path.splitext(os.path.basename(filename))[1]
    # splitext keeps the leading dot; a bare trailing dot means "no extension".
    return extension[1:].lower() or None

def readBlockComments(fileText, startMarker, endMarker=None):
    """Collect the comment blocks found in *fileText*.

    Two modes are supported:

    * Line comments (endMarker is None): every line containing startMarker
      contributes the text after the first marker. Consecutive such lines
      form one block; any line without the marker ends the block.
    * Delimited comments: a block opens at the first startMarker and runs
      until a line containing endMarker. On that line, the text before the
      last endMarker is kept.

    The end marker is searched for only in the text that follows the opening
    marker. Testing the whole line instead would make the opening marker
    close the block whenever both markers are identical (Python triple
    quotes), losing the body of every multi-line docstring.

    Returns a list of strings, one per block, with the markers removed and
    the lines joined by newlines. A block that is still open at the end of
    the input is returned as well.
    """
    blocks = []
    block = None  # lines of the block being collected, or None when outside

    for line in fileText.splitlines():
        if endMarker is None:
            # Line-comment mode: the marker must appear on every line.
            _, found, text = line.partition(startMarker)
            if found:
                if block is None:
                    block = []
                block.append(text)
            elif block is not None:
                blocks.append("\n".join(block))
                block = None
            continue

        # Delimited mode.
        text = line
        if block is None:
            _, found, text = line.partition(startMarker)
            if not found:
                continue
            block = []

        body, found, _ = text.rpartition(endMarker)
        if found:
            block.append(body)
            blocks.append("\n".join(block))
            block = None
        else:
            block.append(text)

    if block is not None:
        blocks.append("\n".join(block))

    return blocks

def readCComments(fileText):
    """Return the C-style block comments of *fileText* without decoration.

    Each comment is read via readBlockComments(). A leading run of
    asterisks followed by a space is stripped from every line, and a line
    holding nothing but asterisks is replaced by a newline.
    """
    leadingStars = re.compile(r"^\s*\*+ ", flags=re.MULTILINE)
    bareStars = re.compile(r"^\s*\*+$", flags=re.MULTILINE)
    return [
        bareStars.sub("\n", leadingStars.sub("", rawComment))
        for rawComment in readBlockComments(fileText, '/*', '*/')
    ]

def readCPPComments(fileText):
    """Return the runs of consecutive double-slash line comments in *fileText*."""
    lineMarker = '//'
    return readBlockComments(fileText, lineMarker, endMarker=None)

def readShellComments(fileText):
    """Return the runs of consecutive hash-comment lines in *fileText*.

    If the first block begins with a shebang (seen here as "!/..." because
    the hash has already been stripped), only that first line is removed.
    A shebang is often followed directly by the license header, in which
    case both end up in the same block; dropping the whole block would
    discard the header. A block holding nothing but the shebang is dropped
    entirely.
    """
    comments = readBlockComments(fileText, '#')
    if comments and comments[0].startswith("!/"):
        _, newline, remainder = comments[0].partition("\n")
        # No newline means the shebang was the only line of the block.
        comments = ([remainder] if newline else []) + comments[1:]
    return comments

def readXMLComments(fileText):
    """Return the XML/HTML comments of *fileText* without tilde decoration.

    Each comment is read via readBlockComments(). A leading run of tildes
    followed by a space is stripped from every line, and a line holding
    nothing but tildes is replaced by a newline.
    """
    leadingTildes = re.compile(r"^\s*~+ ", flags=re.MULTILINE)
    bareTildes = re.compile(r"^\s*~+$", flags=re.MULTILINE)
    return [
        bareTildes.sub("\n", leadingTildes.sub("", rawComment))
        for rawComment in readBlockComments(fileText, '<!--', '-->')
    ]

def readPythonBlockComments(fileText):
    """Return the triple-double-quoted string blocks found in *fileText*."""
    tripleQuote = '"""'
    return readBlockComments(fileText, tripleQuote, tripleQuote)

def findCommentBlocks(fileText, fileHint=None):
    """Return every comment block of *fileText* for the given file type.

    *fileHint* is a lower-case extension. Each group of readers below runs
    when the hint is one of the group's extensions, or when the hint is
    None, in which case every reader is tried. Results are concatenated in
    table order.
    """
    readersByHint = (
        (('c', 'cpp', 'java', 'kt'), (readCComments, readCPPComments)),
        (('sh', 'py'), (readShellComments,)),
        (('py',), (readPythonBlockComments,)),
        (('xml', 'html'), (readXMLComments,)),
    )

    found = []
    for hints, readers in readersByHint:
        if fileHint is not None and fileHint not in hints:
            continue
        for reader in readers:
            found.extend(reader(fileText))
    return found

def findHeaders(fileText, filetype):
    """Return the comment blocks of *fileText* that mention "copyright".

    The match is case-insensitive; blocks keep the order in which
    findCommentBlocks() produced them.
    """
    headers = []
    for candidate in findCommentBlocks(fileText, filetype):
        if "copyright" in candidate.lower():
            headers.append(candidate)
    return headers

def splitHeader(headerText):
    """Split one header into copyright notices and license texts.

    Lines are classified one at a time:

    * While a license paragraph is being collected, every line (including
      the blank line that ends the paragraph) is added to it.
    * Otherwise a line starting with "copyright" or "all rights reserved"
      (case-insensitive, ignoring leading whitespace) is a notice, and it
      closes the license text gathered so far.
    * Any other non-blank line starts, or resumes, license collection.

    Returns a tuple (notices, licenses) of lists of strings.
    """
    noticePrefixes = ("copyright", "all rights reserved")
    notices, licenses = [], []
    pending = []        # lines of the license text not yet flushed
    collecting = False  # True while inside a license paragraph
    previous = None     # normalized form of the preceding line

    for rawLine in headerText.splitlines():
        normalized = rawLine.lower().strip()

        # A blank line ends the paragraph, so the next line is re-classified.
        if previous == '':
            collecting = False
        previous = normalized

        if collecting:
            pending.append(rawLine)
            continue

        if normalized.startswith(noticePrefixes):
            notices.append(rawLine)
            if pending:
                licenses.append("\n".join(pending))
                pending = []
            continue

        if normalized:
            collecting = True
            pending.append(rawLine)

    if pending:
        licenses.append("\n".join(pending))
    return (notices, licenses)

def splitHeaders(headerBlocks):
    """Run splitHeader() on every block and concatenate the results.

    Returns a tuple (notices, licenses) covering all blocks, in input order.
    """
    allNotices, allLicenses = [], []
    for header in headerBlocks:
        blockNotices, blockLicenses = splitHeader(header)
        allNotices += blockNotices
        allLicenses += blockLicenses
    return (allNotices, allLicenses)

def simpleStringMatch(stringA, stringB):
    """Return True if the two strings are equal ignoring case and whitespace."""
    def squash(text):
        # Lower-case the text and drop every whitespace character.
        return re.sub(r"\s", "", text.lower())

    return squash(stringA) == squash(stringB)

def mergeDuplicatesInPlace(inputList, compare_fn):
    """Remove from *inputList* every item matching an earlier kept item.

    compare_fn(earlier, later) decides whether two items are duplicates.
    The first occurrence is kept and order is preserved. The list is
    modified in place; nothing is returned.
    """
    kept = []
    for candidate in inputList:
        if not any(compare_fn(earlier, candidate) for earlier in kept):
            kept.append(candidate)
    inputList[:] = kept

def processFiles(filenames):
    """Extract and merge the copyright headers of the given files.

    Each file is read as text; a file that raises UnicodeDecodeError is
    skipped. Notices and license texts from all files are gathered, then
    near-duplicates (per simpleStringMatch) are removed.

    Returns a tuple (noticeList, licenseList).
    """
    noticeList = []
    licenseList = []

    for path in filenames:
        with open(path) as handle:
            try:
                contents = handle.read()
            except UnicodeDecodeError:
                # Probably not a text file, so let's ignore it.
                continue

        headers = findHeaders(contents, getFiletype(path))
        notices, licenses = splitHeaders(headers)
        noticeList.extend(notices)
        licenseList.extend(licenses)

    for merged in (noticeList, licenseList):
        mergeDuplicatesInPlace(merged, simpleStringMatch)
    return (noticeList, licenseList)

def printResults(noticeList, licenseList):
    """Print the merged notices and license texts to standard output.

    Notices are printed one per line under their heading; license texts
    follow under a second heading, separated by "###" lines.
    """
    noticeSection = "\n".join(noticeList)
    licenseSection = "\n###\n".join(licenseList)
    report = "===== Copyright Notices =====\n{}\n\n===== Licenses =====\n{}".format(
        noticeSection, licenseSection)
    print(report)

def main(args):
    """Process the files named in args[1:] and print the merged result.

    *args* is an argv-style list; its first element is skipped.
    """
    filenames = args[1:]
    notices, licenses = processFiles(filenames)
    printResults(notices, licenses)

# Run the merge only when executed as a script. The full argv is passed
# through; main() skips the first element itself.
if __name__ == "__main__":
    main(sys.argv)
	#!/usr/bin/env python

	import sys
	import re

	"""
	Extract and merge the copyright headers from a list of files.

	For each file:
	1. Find a copyright header
	2. Split it into "copyright notice" and "license text" sections
	3. Merge copyright notices into a common list (ignore minor differences)
	4. Merge license texts into a common list (ignore minor differences)
	When done, print out merged copyright notices and license texts
	"""

	def getFiletype(filename):
	    """Return the lower-cased extension of *filename*, or None if it has none.

	    Only the final path component is examined, so a dot inside a directory
	    name (e.g. "conf.d/Makefile") is not mistaken for an extension.

	    Names without an extension yield None rather than the whole file name.
	    findCommentBlocks() treats a None hint as "try every comment syntax",
	    so extension-less files such as LICENSE or Makefile are still scanned.
	    """
	    import os.path  # local import: the file header only imports sys and re

	    extension = os.path.splitext(os.path.basename(filename))[1]
	    # splitext keeps the leading dot; a bare trailing dot means "no extension".
	    return extension[1:].lower() or None

	def readBlockComments(fileText, startMarker, endMarker=None):
	    """Collect the comment blocks found in *fileText*.

	    Two modes are supported:

	    * Line comments (endMarker is None): every line containing startMarker
	      contributes the text after the first marker. Consecutive such lines
	      form one block; any line without the marker ends the block.
	    * Delimited comments: a block opens at the first startMarker and runs
	      until a line containing endMarker. On that line, the text before the
	      last endMarker is kept.

	    The end marker is searched for only in the text that follows the
	    opening marker. Testing the whole line instead would make the opening
	    marker close the block whenever both markers are identical (Python
	    triple quotes), losing the body of every multi-line docstring.

	    Returns a list of strings, one per block, with the markers removed and
	    the lines joined by newlines. A block that is still open at the end of
	    the input is returned as well.
	    """
	    blocks = []
	    block = None  # lines of the block being collected, or None when outside

	    for line in fileText.splitlines():
	        if endMarker is None:
	            # Line-comment mode: the marker must appear on every line.
	            _, found, text = line.partition(startMarker)
	            if found:
	                if block is None:
	                    block = []
	                block.append(text)
	            elif block is not None:
	                blocks.append("\n".join(block))
	                block = None
	            continue

	        # Delimited mode.
	        text = line
	        if block is None:
	            _, found, text = line.partition(startMarker)
	            if not found:
	                continue
	            block = []

	        body, found, _ = text.rpartition(endMarker)
	        if found:
	            block.append(body)
	            blocks.append("\n".join(block))
	            block = None
	        else:
	            block.append(text)

	    if block is not None:
	        blocks.append("\n".join(block))

	    return blocks

	def readCComments(fileText):
	    """Return the C-style block comments of *fileText* without decoration.

	    Each comment is read via readBlockComments() using the slash-asterisk
	    and asterisk-slash delimiters. A leading run of asterisks followed by
	    a space is stripped from every line, and a line holding nothing but
	    asterisks is replaced by a newline.
	    """
	    leadingStars = re.compile(r"^\s*\*+ ", flags=re.MULTILINE)
	    bareStars = re.compile(r"^\s*\*+$", flags=re.MULTILINE)
	    return [
	        bareStars.sub("\n", leadingStars.sub("", rawComment))
	        for rawComment in readBlockComments(fileText, '/*', '*/')
	    ]

	def readCPPComments(fileText):
	    """Return the runs of consecutive double-slash line comments in *fileText*."""
	    return readBlockComments(fileText, '//')

	def readShellComments(fileText):
	    """Return the runs of consecutive hash-comment lines in *fileText*.

	    If the first block begins with a shebang (seen here as "!/..." because
	    the hash has already been stripped), only that first line is removed.
	    A shebang is often followed directly by the license header, in which
	    case both end up in the same block; dropping the whole block would
	    discard the header. A block holding nothing but the shebang is dropped
	    entirely.
	    """
	    comments = readBlockComments(fileText, '#')
	    if comments and comments[0].startswith("!/"):
	        _, newline, remainder = comments[0].partition("\n")
	        # No newline means the shebang was the only line of the block.
	        comments = ([remainder] if newline else []) + comments[1:]
	    return comments

	def readXMLComments(fileText):
	    """Return the XML/HTML comments of *fileText* without tilde decoration.

	    Each comment is read via readBlockComments(). A leading run of tildes
	    followed by a space is stripped from every line, and a line holding
	    nothing but tildes is replaced by a newline.
	    """
	    result = []
	    for comment in readBlockComments(fileText, '<!--', '-->'):
	        comment = re.sub(r"^\s*~+ ", "", comment, flags=re.MULTILINE)
	        comment = re.sub(r"^\s*~+$", "\n", comment, flags=re.MULTILINE)
	        result.append(comment)
	    return result

	def readPythonBlockComments(fileText):
	    """Return the triple-double-quoted string blocks found in *fileText*."""
	    return readBlockComments(fileText, '"""', '"""')

	def findCommentBlocks(fileText, fileHint=None):
	    """Return every comment block of *fileText* for the given file type.

	    *fileHint* is a lower-case extension. Each reader group runs when the
	    hint is one of the extensions listed for it, or when the hint is None,
	    in which case every reader is tried. Results are concatenated in the
	    order the groups appear below.
	    """
	    blocks = []
	    if fileHint is None or fileHint in ('c', 'cpp', 'java', 'kt'):
	        blocks.extend(readCComments(fileText))
	        blocks.extend(readCPPComments(fileText))
	    if fileHint is None or fileHint in ('sh', 'py'):
	        blocks.extend(readShellComments(fileText))
	    if fileHint is None or fileHint == 'py':
	        blocks.extend(readPythonBlockComments(fileText))
	    if fileHint is None or fileHint in ('xml', 'html'):
	        blocks.extend(readXMLComments(fileText))
	    return blocks

	def findHeaders(fileText, filetype):
	    """Return the comment blocks of *fileText* that mention "copyright".

	    The match is case-insensitive; blocks keep the order in which
	    findCommentBlocks() produced them.
	    """
	    commentBlocks = findCommentBlocks(fileText, filetype)
	    return [block for block in commentBlocks if "copyright" in block.lower()]

	def splitHeader(headerText):
	    """Split one header into copyright notices and license texts.

	    Lines are classified one at a time:

	    * While a license paragraph is being collected, every line (including
	      the blank line that ends the paragraph) is added to it.
	    * Otherwise a line starting with "copyright" or "all rights reserved"
	      (case-insensitive, ignoring leading whitespace) is a notice, and it
	      closes the license text gathered so far.
	    * Any other non-blank line starts, or resumes, license collection.

	    Returns a tuple (notices, licenses) of lists of strings.
	    """
	    noticePrefixes = ("copyright", "all rights reserved")
	    notices, licenses = [], []
	    pending = []        # lines of the license text not yet flushed
	    collecting = False  # True while inside a license paragraph
	    previous = None     # normalized form of the preceding line

	    for rawLine in headerText.splitlines():
	        normalized = rawLine.lower().strip()

	        # A blank line ends the paragraph, so the next line is re-classified.
	        if previous == '':
	            collecting = False
	        previous = normalized

	        if collecting:
	            pending.append(rawLine)
	            continue

	        if normalized.startswith(noticePrefixes):
	            notices.append(rawLine)
	            if pending:
	                licenses.append("\n".join(pending))
	                pending = []
	            continue

	        if normalized:
	            collecting = True
	            pending.append(rawLine)

	    if pending:
	        licenses.append("\n".join(pending))
	    return (notices, licenses)

	def splitHeaders(headerBlocks):
	    """Run splitHeader() on every block and concatenate the results.

	    Returns a tuple (notices, licenses) covering all blocks, in input order.
	    """
	    notices = []
	    licenses = []
	    for block in headerBlocks:
	        blockNotices, blockLicenses = splitHeader(block)
	        notices.extend(blockNotices)
	        licenses.extend(blockLicenses)
	    return (notices, licenses)

	def simpleStringMatch(stringA, stringB):
	    """Return True if the two strings are equal ignoring case and whitespace."""
	    def squash(text):
	        # Lower-case the text and drop every whitespace character.
	        return re.sub(r"\s", "", text.lower())

	    return squash(stringA) == squash(stringB)

	def mergeDuplicatesInPlace(inputList, compare_fn):
	    """Remove from *inputList* every item matching an earlier kept item.

	    compare_fn(earlier, later) decides whether two items are duplicates.
	    The first occurrence is kept and order is preserved. The list is
	    modified in place; nothing is returned.
	    """
	    kept = []
	    for candidate in inputList:
	        if not any(compare_fn(earlier, candidate) for earlier in kept):
	            kept.append(candidate)
	    inputList[:] = kept

	def processFiles(filenames):
	    """Extract and merge the copyright headers of the given files.

	    Each file is read as text; a file that raises UnicodeDecodeError is
	    skipped. Notices and license texts from all files are gathered, then
	    near-duplicates (per simpleStringMatch) are removed.

	    Returns a tuple (noticeList, licenseList).
	    """
	    noticeList = []
	    licenseList = []

	    for name in filenames:
	        with open(name) as file:
	            try:
	                fileText = file.read()
	            except UnicodeDecodeError:
	                # Probably not a text file, so let's ignore it.
	                continue

	        fileHeaderBlocks = findHeaders(fileText, getFiletype(name))
	        fileNotices, fileLicenses = splitHeaders(fileHeaderBlocks)
	        noticeList.extend(fileNotices)
	        licenseList.extend(fileLicenses)

	    mergeDuplicatesInPlace(noticeList, simpleStringMatch)
	    mergeDuplicatesInPlace(licenseList, simpleStringMatch)
	    return (noticeList, licenseList)

	def printResults(noticeList, licenseList):
	    """Print the merged notices and license texts to standard output.

	    Notices are printed one per line under their heading; license texts
	    follow under a second heading, separated by "###" lines.
	    """
	    noticeSection = "\n".join(noticeList)
	    licenseSection = "\n###\n".join(licenseList)
	    print("===== Copyright Notices =====\n{}\n\n===== Licenses =====\n{}".format(
	        noticeSection, licenseSection))

	def main(args):
	    """Process the files named in args[1:] and print the merged result.

	    *args* is an argv-style list; its first element is skipped.
	    """
	    noticeList, licenseList = processFiles(args[1:])
	    printResults(noticeList, licenseList)

	# Run the merge only when executed as a script. The full argv is passed
	# through; main() skips the first element itself.
	if __name__ == "__main__":
	    main(sys.argv)