Skip to content

Instantly share code, notes, and snippets.

@jigpu
Created January 12, 2023 18:39
Show Gist options
  • Save jigpu/d1d6311de32f82fbfdf55a5f4fc52315 to your computer and use it in GitHub Desktop.
Save jigpu/d1d6311de32f82fbfdf55a5f4fc52315 to your computer and use it in GitHub Desktop.
Extract and merge the copyright headers from a list of files
#!/usr/bin/env python
import sys
import re
"""
Extract and merge the copyright headers from a list of files.
For each file:
1. Find a copyright header
2. Split it into "copyright notice" and "license text" sections
3. Merge copyright notices into a common list (ignore minor differences)
4. Merge license texts into a common list (ignore minor differences)
When done, print out merged copyright notices and license texts
"""
def getFiletype(filename):
    """
    Return the lower-cased extension of `filename`, without the leading dot.

    Only the final path component is considered, so a dot inside a
    directory name is never mistaken for an extension. Names that have no
    extension (e.g. "Makefile" or "src.d/README") yield an empty string
    rather than the whole file name.
    """
    # Imported locally so this function does not depend on the file's
    # top-level import list.
    import os.path

    # splitext() returns the extension including its dot ("" if none);
    # slicing off the first character is safe for the empty string too.
    extension = os.path.splitext(filename.lower())[1]
    return extension[1:]
def readBlockComments(fileText, startMarker, endMarker=None):
    """
    Extract comment blocks from `fileText` and return them as a list of
    strings (one string per block, lines joined with a newline).

    Two modes are supported:

    * Line comments (`endMarker` is None): every line containing
      `startMarker` contributes the text after the first marker.
      Consecutive such lines form one block; any line without the marker
      ends the current block.
    * Delimited comments (`endMarker` given): a block opens at the first
      `startMarker` found while no block is open and closes on the first
      subsequent line that contains `endMarker`. The markers themselves
      are removed. Because the start marker is only searched for while no
      block is open, identical start and end markers (such as Python's
      triple quotes) correctly span multiple lines.

    A block still open at the end of the text is returned as-is.
    """
    blocks = []
    block = None  # lines of the block being collected, or None if closed

    for line in fileText.splitlines():
        if endMarker is None:
            # Line-comment mode.
            _, marker, text = line.partition(startMarker)
            if marker:
                if block is None:
                    block = []
                block.append(text)
            elif block is not None:
                blocks.append("\n".join(block))
                block = None
            continue

        # Delimited mode.
        text = line
        if block is None:
            _, marker, text = line.partition(startMarker)
            if not marker:
                continue
            block = []

        # Only look for the end marker in the text that follows the start
        # marker, so an opening line is not closed by its own opener.
        head, marker, _ = text.rpartition(endMarker)
        if marker:
            block.append(head)
            blocks.append("\n".join(block))
            block = None
        else:
            block.append(text)

    if block is not None:
        blocks.append("\n".join(block))
    return blocks
def readCComments(fileText):
    """
    Return the `/* ... */` comment blocks of `fileText` with the usual
    asterisk decoration removed.
    """
    def stripStars(text):
        # Drop a leading run of whitespace and asterisks that is followed
        # by a space at the start of a line.
        text = re.sub(r"^\s*\*+ ", "", text, flags=re.MULTILINE)
        # Lines made up solely of whitespace and asterisks become a newline.
        return re.sub(r"^\s*\*+$", "\n", text, flags=re.MULTILINE)

    return [stripStars(raw) for raw in readBlockComments(fileText, '/*', '*/')]
def readCPPComments(fileText):
    """Return the blocks of `//` line comments found in `fileText`."""
    lineMarker = '//'
    return readBlockComments(fileText, lineMarker)
def readShellComments(fileText):
    """
    Return the blocks of `#` line comments found in `fileText`.

    If the first block begins with a shebang (its text starts with "!/"
    once the `#` has been removed), only that shebang line is discarded.
    Any comment lines that follow it directly -- typically the copyright
    header -- are kept. A first block consisting of nothing but the
    shebang is removed entirely.
    """
    comments = readBlockComments(fileText, '#')
    if comments and comments[0].startswith("!/"):
        _shebang, newline, remainder = comments[0].partition("\n")
        if newline:
            # The shebang shares a block with further comment lines; keep them.
            comments[0] = remainder
        else:
            comments = comments[1:]
    return comments
def readXMLComments(fileText):
    """
    Return the `<!-- ... -->` comment blocks of `fileText` with tilde
    decoration removed.
    """
    def stripTildes(text):
        # Drop a leading run of whitespace and tildes that is followed by
        # a space at the start of a line.
        text = re.sub(r"^\s*~+ ", "", text, flags=re.MULTILINE)
        # Lines made up solely of whitespace and tildes become a newline.
        return re.sub(r"^\s*~+$", "\n", text, flags=re.MULTILINE)

    return [stripTildes(raw) for raw in readBlockComments(fileText, '<!--', '-->')]
def readPythonBlockComments(fileText):
    """Return the triple-double-quoted blocks found in `fileText`."""
    tripleQuote = '"""'
    return readBlockComments(fileText, tripleQuote, tripleQuote)
def findCommentBlocks(fileText, fileHint=None):
    """
    Collect comment blocks from `fileText` using the comment readers that
    apply to `fileHint` (a lower-case file extension).

    When `fileHint` is None every reader is tried. A hint that matches no
    entry in the table yields an empty list.
    """
    # (extensions, readers) pairs, evaluated top to bottom so that the
    # order of the returned blocks is stable.
    readerTable = (
        (('c', 'cpp', 'java', 'kt'), (readCComments, readCPPComments)),
        (('sh', 'py'), (readShellComments,)),
        (('py',), (readPythonBlockComments,)),
        (('xml', 'html'), (readXMLComments,)),
    )

    blocks = []
    for extensions, readers in readerTable:
        if fileHint is None or fileHint in extensions:
            for reader in readers:
                blocks.extend(reader(fileText))
    return blocks
def findHeaders(fileText, filetype):
    """
    Return the comment blocks of `fileText` that mention "copyright"
    (case-insensitively).
    """
    headers = []
    for candidate in findCommentBlocks(fileText, filetype):
        if "copyright" in candidate.lower():
            headers.append(candidate)
    return headers
def splitHeader(headerText):
    """
    Split one header comment into copyright notices and license texts.

    Returns a `(notices, licenses)` tuple of string lists. A line whose
    stripped, lower-cased form starts with "copyright" or "all rights
    reserved" is a notice, unless it falls inside a paragraph of license
    text. License text accumulates across paragraphs and is emitted as one
    entry when a notice line is encountered or the header ends.
    """
    noticePrefixes = ("copyright", "all rights reserved")

    notices = []
    licenses = []
    pending = []        # license lines gathered since the last flush
    collecting = False  # True while inside a paragraph of license text
    previous = None     # normalized form of the preceding line

    for rawLine in headerText.splitlines():
        normalized = rawLine.lower().strip()

        # A blank line ends the current license paragraph, which allows
        # the line after it to be recognised as a notice again.
        if previous == '':
            collecting = False
        previous = normalized

        if collecting:
            pending.append(rawLine)
            continue

        if normalized.startswith(noticePrefixes):
            notices.append(rawLine)
            if pending:
                licenses.append("\n".join(pending))
                pending = []
        elif normalized:
            collecting = True
            pending.append(rawLine)

    if pending:
        licenses.append("\n".join(pending))
    return (notices, licenses)
def splitHeaders(headerBlocks):
    """
    Run `splitHeader` over every block in `headerBlocks` and concatenate
    the results into a single `(notices, licenses)` tuple.
    """
    allNotices, allLicenses = [], []
    for header in headerBlocks:
        found = splitHeader(header)
        allNotices += found[0]
        allLicenses += found[1]
    return (allNotices, allLicenses)
def simpleStringMatch(stringA, stringB):
    """
    Return True when the two strings are equal after lower-casing them and
    removing all whitespace.
    """
    def normalize(text):
        return re.sub(r"\s", "", text.lower())

    return normalize(stringA) == normalize(stringB)
def mergeDuplicatesInPlace(inputList, compare_fn):
    """
    Remove from `inputList`, in place, every item that `compare_fn` reports
    as matching an earlier item. The first occurrence is kept and the
    relative order of the survivors is preserved. Returns None.
    """
    position = 0
    while position < len(inputList):
        keeper = inputList[position]
        position += 1
        # Rewrite the tail without the items that duplicate `keeper`.
        inputList[position:] = [
            candidate
            for candidate in inputList[position:]
            if not compare_fn(keeper, candidate)
        ]
def processFiles(filenames):
    """
    Read every file in `filenames`, extract its copyright notices and
    license texts, and return the de-duplicated `(noticeList, licenseList)`
    tuple.

    Files whose contents raise UnicodeDecodeError while being read are
    skipped.
    """
    noticeList, licenseList = [], []

    for path in filenames:
        with open(path) as handle:
            try:
                contents = handle.read()
            except UnicodeDecodeError:
                # Most likely a binary file; nothing to extract from it.
                continue

        headers = findHeaders(contents, getFiletype(path))
        notices, licenses = splitHeaders(headers)
        noticeList += notices
        licenseList += licenses

    for collected in (noticeList, licenseList):
        mergeDuplicatesInPlace(collected, simpleStringMatch)
    return (noticeList, licenseList)
def printResults(noticeList, licenseList):
    """
    Print the merged copyright notices (one per line) followed by the
    merged license texts (separated by a "###" line) to standard output.
    """
    noticeText = "\n".join(noticeList)
    licenseText = "\n###\n".join(licenseList)
    template = "===== Copyright Notices =====\n{}\n\n===== Licenses =====\n{}"
    print(template.format(noticeText, licenseText))
def main(args):
    """
    Script entry point. `args` is an argv-style list: the first element is
    ignored and the remaining elements are the files to process.
    """
    filenames = args[1:]
    results = processFiles(filenames)
    printResults(*results)
# Run only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main(sys.argv)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment