# miraculixx/getlicense.py

## getlicense.py
def getlicenses(dir=None):
    """
    simple licence collector

    Walks ``dir`` recursively and records one entry per package found:

    * files whose name starts with ``LICENSE`` are taken as license files
    * files whose name ends in ``css``, ``js``, ``min`` or ``json`` are
      recorded when a license identifier is found in them (for ``.json``
      files the ``license`` key of the parsed document is used)

    The entries are written to ``licenses.json`` and, as readable text
    including the license text where available, to ``licenses.md``. Both
    files are created in the current working directory.

    :param dir: directory to scan, defaults to the current directory
    :return: list of dicts with the keys dir, file, lic, source,
       copyright, title, email, author and licfile
    """
    import json
    import os
    import re

    SOURCE_SUFFIXES = ('css', 'js', 'min', 'json')
    # identifiers must not be part of a longer word ("permitted" is not
    # MIT); a version suffix such as GPLv3 or GPL3 is accepted
    LICENSE_IDS = re.compile(
        r'(?<![A-Za-z])(MIT|BSD|LGPL|GNU.GPL|GPL|APACHE)'
        r'(?=v?\d|[^A-Za-z]|$)',
        re.IGNORECASE)
    SOURCE_LINKS = re.compile(r'(https?://[^\s"\'<>)]+)')
    COPYRIGHT = re.compile(r'(copyright.*|\(c\).[0-9]+.*|©.[0-9]+.*)',
                           re.IGNORECASE)
    EMAIL = re.compile(r"(\w+(?:[-+.']\w+)*@\w+(?:[-.]\w+)*\.\w+(?:[-.]\w+)*)")

    def as_text(value):
        # JSON values may be of any type, only strings are usable here
        return value if isinstance(value, str) else ''

    def read_lines(path):
        # undecodable bytes are replaced so that minified or binary-ish
        # files cannot abort the scan; unreadable files yield no lines
        try:
            with open(path, 'r', encoding='utf-8', errors='replace') as f:
                return f.readlines()
        except OSError:
            return []

    def first_hit(pattern, lines, whole_line=False):
        # first match in lines: the captured text, or the complete line
        for line in lines:
            found = pattern.search(line)
            if found:
                return line if whole_line else found.group(1).strip()
        return ''

    packages = []
    # normalize license information
    for folder, subdirs, files in os.walk(dir or '.'):
        # a fixed traversal order keeps the generated files reproducible
        subdirs.sort()
        for name in sorted(files):
            path = os.path.join(folder, name)
            if name.startswith('LICENSE'):
                # actual license file
                lines = read_lines(path)
                packages.append(dict(
                    dir=folder,
                    file=name,
                    lic=first_hit(LICENSE_IDS, lines),
                    source=first_hit(SOURCE_LINKS, lines) or 'n/a',
                    copyright=(first_hit(COPYRIGHT, lines, True).strip()
                               or 'n/a'),
                    title=name,
                    email=first_hit(EMAIL, lines),
                    author='',
                    licfile=name,
                ))
                continue
            if not name.endswith(SOURCE_SUFFIXES):
                continue
            lines = read_lines(path)
            lic = title = author = email = ''
            if name.endswith('.json'):
                # see if we have some sensible content
                try:
                    descr = json.loads(''.join(lines))
                except ValueError:
                    continue
                if not isinstance(descr, dict):
                    continue
                lic = descr.get('license')
                if isinstance(lic, dict):
                    lic = lic.get('type') or lic.get('name')
                lic = as_text(lic)
                title = as_text(descr.get('title') or descr.get('name'))
                author = descr.get('author')
                if isinstance(author, dict):
                    email = as_text(author.get('email'))
                    author = author.get('name')
                author = as_text(author)
            else:
                # any other file
                lic = first_hit(LICENSE_IDS, lines)
            if not lic:
                continue
            # got licence data, generate; the file name is escaped because
            # it is arbitrary text, not a regular expression
            title_re = re.compile(
                r'(%s|@package.*|\s\*\s.*)' % re.escape(name), re.IGNORECASE)
            title_line = first_hit(title_re, lines, True)
            packages.append(dict(
                dir=folder,
                file=name,
                lic=lic,
                source=first_hit(SOURCE_LINKS, lines) or 'n/a',
                copyright=first_hit(COPYRIGHT, lines) or 'n/a',
                title=title or title_line.replace(' * ', '').strip() or name,
                email=email or first_hit(EMAIL, lines),
                author=author,
                licfile='',
            ))
    # write licence file
    with open('licenses.json', 'w') as licf:
        licf.write(json.dumps(packages))
    txt = ' '.join((
        'This product includes software from {title} as licensed',
        ' {author} {email} under the terms of the {license} license.',
        '\n{copyright}\n',
        '\n\n```\n{lictext}\n```\n\n',
        '\n\n---\n\n',
    ))
    with open('licenses.md', 'w', encoding='utf-8') as licf:
        for pkg in packages:
            lictext = ''
            licurl = ''
            if pkg['licfile']:
                fn = os.path.join(pkg['dir'], pkg['licfile'])
                lictext = ''.join(read_lines(fn))
            else:
                # best effort: the lookup module is optional and its
                # failure modes are not known here, so any error simply
                # leaves the license text empty
                try:
                    import license as liclookup
                    # requires git+https://github.com/miraculixx/license.git
                    found = liclookup.find_by_key('rpm', pkg['lic'])[0]
                    lictext = found.render(name=pkg['title'],
                                           email=pkg['email']) or ''
                    licurl = found.url
                except Exception:
                    lictext = ''
                    licurl = ''
            opts = dict(
                title=pkg['title'],
                author=pkg['author'],
                email=pkg['email'],
                copyright=pkg['copyright'],
                license=pkg['lic'] or 'following',
            )
            # the one-line fields must not break the sentence, the license
            # text keeps its line breaks inside the fenced block
            opts = {k: v.replace('\n', ' ') for k, v in opts.items()}
            opts.update(lictext=lictext, licurl=licurl)
            licf.write(txt.format(**opts))
    return packages
	def getlicenses(dir=None):
	    """
	    simple licence collector

	    Walks ``dir`` recursively and records one entry per package found:

	    * files whose name starts with ``LICENSE`` are taken as license files
	    * files whose name ends in ``css``, ``js``, ``min`` or ``json`` are
	      recorded when a license identifier is found in them (for ``.json``
	      files the ``license`` key of the parsed document is used)

	    The entries are written to ``licenses.json`` and, as readable text
	    including the license text where available, to ``licenses.md``. Both
	    files are created in the current working directory.

	    :param dir: directory to scan, defaults to the current directory
	    :return: list of dicts with the keys dir, file, lic, source,
	       copyright, title, email, author and licfile
	    """
	    import json
	    import os
	    import re

	    SOURCE_SUFFIXES = ('css', 'js', 'min', 'json')
	    # identifiers must not be part of a longer word ("permitted" is not
	    # MIT); a version suffix such as GPLv3 or GPL3 is accepted
	    LICENSE_IDS = re.compile(
	        r'(?<![A-Za-z])(MIT|BSD|LGPL|GNU.GPL|GPL|APACHE)'
	        r'(?=v?\d|[^A-Za-z]|$)',
	        re.IGNORECASE)
	    SOURCE_LINKS = re.compile(r'(https?://[^\s"\'<>)]+)')
	    COPYRIGHT = re.compile(r'(copyright.*|\(c\).[0-9]+.*|©.[0-9]+.*)',
	                           re.IGNORECASE)
	    EMAIL = re.compile(r"(\w+(?:[-+.']\w+)*@\w+(?:[-.]\w+)*\.\w+(?:[-.]\w+)*)")

	    def as_text(value):
	        # JSON values may be of any type, only strings are usable here
	        return value if isinstance(value, str) else ''

	    def read_lines(path):
	        # undecodable bytes are replaced so that minified or binary-ish
	        # files cannot abort the scan; unreadable files yield no lines
	        try:
	            with open(path, 'r', encoding='utf-8', errors='replace') as f:
	                return f.readlines()
	        except OSError:
	            return []

	    def first_hit(pattern, lines, whole_line=False):
	        # first match in lines: the captured text, or the complete line
	        for line in lines:
	            found = pattern.search(line)
	            if found:
	                return line if whole_line else found.group(1).strip()
	        return ''

	    packages = []
	    # normalize license information
	    for folder, subdirs, files in os.walk(dir or '.'):
	        # a fixed traversal order keeps the generated files reproducible
	        subdirs.sort()
	        for name in sorted(files):
	            path = os.path.join(folder, name)
	            if name.startswith('LICENSE'):
	                # actual license file
	                lines = read_lines(path)
	                packages.append(dict(
	                    dir=folder,
	                    file=name,
	                    lic=first_hit(LICENSE_IDS, lines),
	                    source=first_hit(SOURCE_LINKS, lines) or 'n/a',
	                    copyright=(first_hit(COPYRIGHT, lines, True).strip()
	                               or 'n/a'),
	                    title=name,
	                    email=first_hit(EMAIL, lines),
	                    author='',
	                    licfile=name,
	                ))
	                continue
	            if not name.endswith(SOURCE_SUFFIXES):
	                continue
	            lines = read_lines(path)
	            lic = title = author = email = ''
	            if name.endswith('.json'):
	                # see if we have some sensible content
	                try:
	                    descr = json.loads(''.join(lines))
	                except ValueError:
	                    continue
	                if not isinstance(descr, dict):
	                    continue
	                lic = descr.get('license')
	                if isinstance(lic, dict):
	                    lic = lic.get('type') or lic.get('name')
	                lic = as_text(lic)
	                title = as_text(descr.get('title') or descr.get('name'))
	                author = descr.get('author')
	                if isinstance(author, dict):
	                    email = as_text(author.get('email'))
	                    author = author.get('name')
	                author = as_text(author)
	            else:
	                # any other file
	                lic = first_hit(LICENSE_IDS, lines)
	            if not lic:
	                continue
	            # got licence data, generate; the file name is escaped because
	            # it is arbitrary text, not a regular expression
	            title_re = re.compile(
	                r'(%s|@package.*|\s\*\s.*)' % re.escape(name), re.IGNORECASE)
	            title_line = first_hit(title_re, lines, True)
	            packages.append(dict(
	                dir=folder,
	                file=name,
	                lic=lic,
	                source=first_hit(SOURCE_LINKS, lines) or 'n/a',
	                copyright=first_hit(COPYRIGHT, lines) or 'n/a',
	                title=title or title_line.replace(' * ', '').strip() or name,
	                email=email or first_hit(EMAIL, lines),
	                author=author,
	                licfile='',
	            ))
	    # write licence file
	    with open('licenses.json', 'w') as licf:
	        licf.write(json.dumps(packages))
	    txt = ' '.join((
	        'This product includes software from {title} as licensed',
	        ' {author} {email} under the terms of the {license} license.',
	        '\n{copyright}\n',
	        '\n\n```\n{lictext}\n```\n\n',
	        '\n\n---\n\n',
	    ))
	    with open('licenses.md', 'w', encoding='utf-8') as licf:
	        for pkg in packages:
	            lictext = ''
	            licurl = ''
	            if pkg['licfile']:
	                fn = os.path.join(pkg['dir'], pkg['licfile'])
	                lictext = ''.join(read_lines(fn))
	            else:
	                # best effort: the lookup module is optional and its
	                # failure modes are not known here, so any error simply
	                # leaves the license text empty
	                try:
	                    import license as liclookup
	                    # requires git+https://github.com/miraculixx/license.git
	                    found = liclookup.find_by_key('rpm', pkg['lic'])[0]
	                    lictext = found.render(name=pkg['title'],
	                                           email=pkg['email']) or ''
	                    licurl = found.url
	                except Exception:
	                    lictext = ''
	                    licurl = ''
	            opts = dict(
	                title=pkg['title'],
	                author=pkg['author'],
	                email=pkg['email'],
	                copyright=pkg['copyright'],
	                license=pkg['lic'] or 'following',
	            )
	            # the one-line fields must not break the sentence, the license
	            # text keeps its line breaks inside the fenced block
	            opts = {k: v.replace('\n', ' ') for k, v in opts.items()}
	            opts.update(lictext=lictext, licurl=licurl)
	            licf.write(txt.format(**opts))
	    return packages