awesomebytes/rosdistro_further_stats.py

## rosdistro_further_stats.py
#!/usr/bin/env python

"""
Count lines of code of ROS packages.
Based on https://gist.github.com/mintar/269c62f1f2b4f00b057696ad8c324d03
by Martin Guenther (https://github.com/mintar)

Part of this discussion: https://discourse.ros.org/t/are-serious-things-done-with-ros-in-python/4359/7

Author: Sammy Pfeiffer <Sammy.Pfeiffer at student.uts.edu.au>
"""
from __future__ import division

import yaml
import os
import json
from glob import glob
from collections import defaultdict
from operator import itemgetter
from urllib2 import urlopen, Request
import sys


# Get all repo urls
# Download the kinetic distribution file and collect the source URL of every
# repository entry that declares one.  The result, `repourls`, drives the
# cloning step below.
print('Loading rosdistro repos...')
dist_url = 'https://raw.githubusercontent.com/ros/rosdistro/master/kinetic/distribution.yaml'
dist_response = urlopen(dist_url)
try:
    # safe_load rather than load: the document comes from the network and
    # only plain mappings and strings are read from it, so constructing
    # arbitrary Python objects is never wanted here.
    distribution = yaml.safe_load(dist_response)
finally:
    dist_response.close()

repourls = []
for info in distribution['repositories'].itervalues():
    try:
        repourls.append(info['source']['url'])
    except KeyError:
        # Entry without a 'source' section or without a 'url': skip it.
        pass
print('... done (%d repos loaded).' % len(repourls))


# Clone all repos
# Shallow-clone (depth 1) every collected URL into the current directory.
import subprocess

for repourl in repourls:
    print("Cloning: " + repourl)
    # NOTE: a workaround for bitbucket URLs (rewriting 'https://' to
    # 'ssh://hg@') was tried by the original author and left disabled.
    try:
        # Argument list instead of a shell string: the URL comes from a
        # downloaded file and must not be interpreted by the shell.
        clone_status = subprocess.call(
            ["git", "clone", "--depth", "1", repourl])
    except OSError as exc:
        # git itself could not be started, so no later clone can work either.
        print("ERROR: could not run git: " + str(exc))
        break
    if clone_status != 0:
        # Best effort: report the failure and carry on with the next repo.
        print("WARNING: git clone failed for " + repourl)


# Run cloc on every repo
# Every sub-directory of the current directory is treated as a cloned repo
# and gets a "<folder>.yaml" cloc report written next to it.
import subprocess

# glob("./*/") yields "./folder_name/"; keep just "folder_name".  Slicing off
# the fixed prefix and suffix (instead of str.replace) keeps names that
# themselves contain "./" -- e.g. a folder ending in a dot -- intact.
repofoldernames = [fname[len('./'):-len('/')] for fname in glob("./*/")]

for foldername in repofoldernames:
    reportfilename = foldername + ".yaml"
    # Requires cloc: sudo apt-get install cloc
    try:
        # Argument list instead of a shell string so that folder names with
        # spaces or shell metacharacters are passed through unchanged.
        subprocess.call(["cloc", "--yaml",
                         "--report-file=" + reportfilename, foldername])
    except OSError as exc:
        # cloc could not be started at all; retrying per folder is pointless.
        print("ERROR: could not run cloc: " + str(exc))
        break


# Collect stats
# Each "./<name>.yaml" report in the current directory is reduced to its C++
# (sources + headers) and Python line counts.  Results:
#   num_pkgs_cpp / num_pkgs_python / num_pkgs_both -- repos with such code
#   pkgs_both_list   -- names of repos containing both languages
#   total_cpp_loc / total_python_loc -- summed lines of code
#   summary_dict     -- per-repo counts and percentages
yaml_files = glob('./*.yaml')
num_pkgs_cpp = 0
num_pkgs_python = 0
num_pkgs_both = 0
pkgs_both_list = []
total_cpp_loc = 0
total_python_loc = 0
summary_dict = {}
for yaml_f in yaml_files:
    # Close the report file deterministically instead of leaking the handle.
    with open(yaml_f, 'r') as report_file:
        stats = yaml.safe_load(report_file)
    if not isinstance(stats, dict):
        # Empty or non-mapping YAML file: it cannot be queried per language.
        print("Skipping " + yaml_f + ": not a cloc report")
        continue

    # A language missing from the report contributes 0 lines.
    cpp_locs = (stats.get("C++", {}).get('code', 0) +
                stats.get("C/C++ Header", {}).get('code', 0))
    pyt_loc = stats.get("Python", {}).get('code', 0)

    if cpp_locs > 0:
        num_pkgs_cpp += 1
    if pyt_loc > 0:
        num_pkgs_python += 1
    total_cpp_loc += cpp_locs
    total_python_loc += pyt_loc

    # Strip only the trailing extension.  The leading "./" is kept on purpose:
    # later lookups use keys of the form "./<folder>".
    repo_name = yaml_f[:-len('.yaml')]

    if pyt_loc > 0 and cpp_locs > 0:
        num_pkgs_both += 1
        pkgs_both_list.append(repo_name)

    total_loc = float(cpp_locs + pyt_loc)
    if total_loc > 0.0:
        python_share = pyt_loc / total_loc
        print("For repo_name:" + str(repo_name))
        print("pct_python: " + str(pyt_loc) + " / " +
              str(total_loc) + " = " + str(python_share))
        print("pct_python %: " + str(python_share * 100.0))
        # The share cannot exceed 100 %: total_loc is the sum of two
        # non-negative counts, one of which is pyt_loc itself.
        summary_dict[repo_name] = {'cpp_loc': cpp_locs,
                                   'python_loc': pyt_loc,
                                   'pct_cpp': cpp_locs / total_loc * 100.0,
                                   'pct_python': python_share * 100.0}


# Print the aggregated counts and the overall C++ / Python split.
print("From " + str(len(yaml_files)) + " packages analysed")
print("There are " + str(num_pkgs_cpp) + " packages using C++")
print("With " + str(total_cpp_loc) + " LOC")

print("There are " + str(num_pkgs_python) + " packages using Python")
print("With " + str(total_python_loc) + " LOC")

print("And, actually, using both languages: " + str(num_pkgs_both))


total_loc = float(total_cpp_loc + total_python_loc)

if total_loc > 0.0:
    pct_cpp = total_cpp_loc / total_loc * 100.0
    pct_python = total_python_loc / total_loc * 100.0
else:
    # Nothing was counted (e.g. no reports were produced): report 0 % for
    # both languages instead of dividing by zero.
    pct_cpp = 0.0
    pct_python = 0.0

print(str(pct_cpp) + " % is CPP code")
print(str(pct_python) + " % is Python code")


# Recompute with the % of the repo we got before
# Ask for a GitHub token and validate it against the rate-limit endpoint.
# `token` and `response` (the rate-limit reply) are used further below.
from urllib2 import HTTPError

print("Please generate a personal access token here: https://github.com/settings/tokens/new .")
print("You don't need to give it any permissions, it is only required to increase the rate limit ")
print("when accessing the GitHub API.")
print("")
# Strip stray whitespace from a pasted token; it would corrupt the header.
token = unicode(raw_input('Enter GitHub personal access token: ')).strip()

request = Request(u'https://api.github.com/rate_limit')
request.add_header('Authorization', 'token %s' % token)
try:
    response = urlopen(request)
except HTTPError as exc:
    # urlopen raises for error statuses rather than returning them, so a
    # rejected token shows up here and not in the status check below.
    print('ERROR: wrong access token (HTTP %d)' % exc.code)
    sys.exit(-1)
if response.code != 200:
    print('ERROR: wrong access token')
    sys.exit(-1)

# Build the list of GitHub "owner/name" identifiers to query and make sure
# the remaining API rate allows one request per repository.
print('\nWorking, please be patient. This will take 5-10 minutes.')
print('Loading rosdistro repos...')
reponames = []
# safe_load: the document is fetched over the network and only plain
# mappings and strings are read from it.
for info in yaml.safe_load(urlopen(dist_url))['repositories'].itervalues():
    try:
        reponames.append(info['source']['url'])
    except KeyError:
        # Entry without a 'source' section or without a 'url': skip it.
        pass
print('... done (%d repos loaded).' % len(reponames))

# reponames = ['https://github.com/ros/kdl_parser.git', ...]

# filter out non-github repos, remove github prefix + '.git'
github_prefix = 'https://github.com/'
git_suffix = '.git'
github_names = []
for repo_url in reponames:
    if not repo_url.startswith(github_prefix):
        continue
    github_name = repo_url[len(github_prefix):].rstrip('/')
    # Only drop '.git' when it is really there; a blind [:-4] would cut the
    # last four characters off URLs written without the suffix.
    if github_name.endswith(git_suffix):
        github_name = github_name[:-len(git_suffix)]
    github_names.append(github_name)
reponames = github_names
# reponames = ['ros/kdl_parser', ...]

# `response` is the reply of the rate-limit request made during token check.
remaining_rate = json.load(response)['rate']['remaining']
if len(reponames) > remaining_rate:
    print('ERROR: more repos to process (%d) than remaining rate (%d)' % (len(reponames), remaining_rate))
    sys.exit(-1)

# Fetch the GitHub API description of every repository into `repos`.
from urllib2 import HTTPError

repos = []
for i, reponame in enumerate(reponames, 1):
    print('Reading repo %d/%d...' % (i, len(reponames)))
    request = Request(u'https://api.github.com/repos/%s' % reponame)
    request.add_header('Authorization', 'token %s' % token)
    try:
        response = urlopen(request)
    except HTTPError as exc:
        # urlopen raises for error statuses instead of returning them.
        if exc.code == 404:
            # One missing repository should not discard all the requests
            # already made: skip it and continue.
            print('WARNING: %s not found on GitHub, skipping' % reponame)
            continue
        print('ERROR: rate limited? (HTTP %d)' % exc.code)
        sys.exit(-1)
    if response.code != 200:
        print('ERROR: rate limited?')
        sys.exit(-1)
    repos.append(json.load(response))

# Print a markdown table of all fetched repos ordered by star count.  For
# repos whose GitHub language is C++ or Python and for which line counts were
# collected, the language column also shows the C++ / Python percentages.
print('\n\n### ROS repos by popularity with percentage\n')
stargazers = []
for repo in repos:
    lang = repo['language']
    if lang == "C++" or lang == "Python":
        short_name = repo['full_name'].split('/')[1]
        # summary_dict keys carry a leading "./"
        shares = summary_dict.get("./" + short_name)
        print("reponame: " + short_name)
        print("dict: " + str(shares))
        if shares:
            cpp_part = "C++ (" + str(round(shares['pct_cpp'], 1)) + ")"
            python_part = "Python (" + str(round(shares['pct_python'], 1)) + ")"
            # The language reported by GitHub is listed first.
            if lang == "C++":
                lang = cpp_part + ", " + python_part
            else:
                lang = python_part + ", " + cpp_part
    stargazers.append((repo['full_name'], repo['stargazers_count'], lang))

print('| rank | repo name                                                    | stars | language        |')
print('|------|--------------------------------------------------------------|------:|-----------------|')
ranked = sorted(stargazers, key=itemgetter(1), reverse=True)
for rank, (full_name, stars, language) in enumerate(ranked, 1):
    print('| {:3d}. | {:60} | {:5d} | {:15} |'.format(rank, full_name, stars, language))


# Count packages with actual *.py files
# Maps each repo folder containing at least one "*.py" path to
# {'num_files': <count>, 'file_list': [<paths>]}.
import subprocess
packages_with_python_files = {}
for foldername in repofoldernames:
    try:
        # A single find call, given as an argument list: no shell is
        # involved, so folder names need no quoting and "*.py" reaches find
        # unexpanded.
        outputf = subprocess.check_output(
            ["find", foldername, "-name", "*.py"])
    except subprocess.CalledProcessError as exc:
        # find exited non-zero; keep whatever it managed to list.
        outputf = exc.output
    # One path per line; splitting on lines (not on any whitespace) keeps
    # paths that contain spaces in one piece.
    filelist = outputf.splitlines()
    if filelist:
        packages_with_python_files[foldername] = {'num_files': len(filelist),
                                                  'file_list': filelist}
num_pkgs_with_python_files = len(packages_with_python_files)
print("There are " + str(num_pkgs_with_python_files) +
      " packages with Python files")

# List every package holding fewer than five Python files, then the count.
print("Packages with less than 5 Python files:")
small_packages = [(name, details['file_list'])
                  for name, details in packages_with_python_files.iteritems()
                  if details['num_files'] < 5]
for name, python_files in small_packages:
    print(name + " python files:")
    print(python_files)
num_less_5 = len(small_packages)

print("There are " + str(num_less_5) +
      " packages with less than 5 Python files")

# check just the packages with C++ & Python
# print their package name, number of files, and the file list
# useful for doing a grep later on
# My output: https://pastebin.com/aiY5c1t2
for pkg in pkgs_both_list:
    # Names in pkgs_both_list start with "./"; drop only that prefix.
    pkg_name = pkg[len('./'):] if pkg.startswith('./') else pkg
    # The line counts and the "*.py" search are independent, so a package
    # with Python LOC may have no entry here (presumably e.g. Python scripts
    # without a .py extension -- unverified).  Report zero files instead of
    # failing with a KeyError.
    d = packages_with_python_files.get(pkg_name)
    if d is None:
        print(pkg_name + " (0) python files:")
        continue
    print(pkg_name + " (" + str(d['num_files']) + ") python files:")
    for f in d['file_list']:
        print("   " + f)
	#!/usr/bin/env python

	"""
	Count lines of code of ROS packages.
	Based on https://gist.github.com/mintar/269c62f1f2b4f00b057696ad8c324d03
	by Martin Martin Guenther (https://github.com/mintar)

	Part of this discussion: https://discourse.ros.org/t/are-serious-things-done-with-ros-in-python/4359/7

	Author: Sammy Pfeiffer <Sammy.Pfeiffer at student.uts.edu.au>
	"""
	from __future__ import division

	import yaml
	import os
	import json
	from glob import glob
	from collections import defaultdict
	from operator import itemgetter
	from urllib2 import urlopen, Request
	import sys


	# Get all repo urls
	repourls = []
	dist_url = 'https://raw.githubusercontent.com/ros/rosdistro/master/kinetic/distribution.yaml'
	for info in yaml.load(
	urlopen(dist_url))['repositories'].itervalues():
	try:
	repourls.append(info['source']['url'])
	except KeyError:
	pass
	print '... done (%d repos loaded).' % len(repourls)


	# Clone all repos
	for repourl in repourls:
	# clone repo
	print("Cloning: " + repourl)
	# Tried to workaround bitbucket cloning not working on my machine...
	# if 'bitbucket' in repourl:
	# repourl = repourl.replace('https://', 'ssh://hg@')
	os.system("git clone --depth 1 " + repourl)


	# Run cloc on every repo
	repofoldernames = glob("./*/")
	cleaned = []
	for fname in repofoldernames:
	# From "./folder_name/" to "folder_name"
	fname = fname.replace('./', '')
	fname = fname.replace('/', '')
	cleaned.append(fname)
	repofoldernames = cleaned
	for foldername in repofoldernames:
	reportfilename = foldername + ".yaml"
	# sudo apt-get install cloc
	os.system("cloc --yaml --report-file=" + reportfilename + " " + foldername)


	# Collect stats
	yaml_files = glob('./*.yaml')
	num_pkgs_cpp = 0
	num_pkgs_python = 0
	num_pkgs_both = 0
	pkgs_both_list = []
	total_cpp_loc = 0
	total_python_loc = 0
	summary_dict = {}
	for yaml_f in yaml_files:
	stats = yaml.load(open(yaml_f, 'r'))
	cpp = stats.get("C++", {'code': 0})
	cpp_loc = cpp.get('code', 0)

	cpp_h = stats.get("C/C++ Header", {'code': 0})
	cpp_h_loc = cpp_h.get('code', 0)

	cpp_locs = cpp_loc + cpp_h_loc
	if cpp_locs > 0:
	num_pkgs_cpp += 1
	total_cpp_loc += cpp_locs

	pyt = stats.get("Python", {'code': 0})
	pyt_loc = pyt.get('code', 0)

	if pyt_loc > 0:
	num_pkgs_python += 1
	total_python_loc += pyt_loc

	repo_name = yaml_f.replace('.yaml', '')

	if pyt_loc > 0 and cpp_locs > 0:
	num_pkgs_both += 1
	pkgs_both_list.append(repo_name)

	total_loc = float(cpp_locs + pyt_loc)
	if total_loc > 0.0:
	print("For repo_name:" + str(repo_name))
	print("pct_python: " + str(pyt_loc) + " / " +
	str(total_loc) + " = " + str(pyt_loc / total_loc))
	print("pct_python %: " + str(pyt_loc / total_loc * 100.0))
	if (pyt_loc / total_loc * 100.0) > 100.0:
	print("--------------- OVER 100% WTF")
	summary_dict[repo_name] = {'cpp_loc': cpp_locs,
	'python_loc': pyt_loc,
	'pct_cpp': cpp_locs / total_loc * 100.0,
	'pct_python': pyt_loc / total_loc * 100.0}


	print("From " + str(len(yaml_files)) + " packages analysed")
	print("There are " + str(num_pkgs_cpp) + " packages using C++")
	print("With " + str(total_cpp_loc) + " LOC")

	print("There are " + str(num_pkgs_python) + " packages using Python")
	print("With " + str(total_python_loc) + " LOC")

	print("And, actually, using both languages: " + str(num_pkgs_both))


	total_loc = float(total_cpp_loc + total_python_loc)

	pct_cpp = total_cpp_loc / total_loc * 100.0
	pct_python = total_python_loc / total_loc * 100.0

	print(str(pct_cpp) + " % is CPP code")
	print(str(pct_python) + " % is Python code")


	# Recompute with the % of the repo we got before

	print "Please generate a personal access token here: https://github.com/settings/tokens/new ."
	print "You don't need to give it any permissions, it is only required to increase the rate limit "
	print "when accessing the GitHub API."
	print
	token = unicode(raw_input('Enter GitHub personal access token: '))

	request = Request(u'https://api.github.com/rate_limit')
	request.add_header('Authorization', 'token %s' % token)
	response = urlopen(request)
	if response.code != 200:
	print 'ERROR: wrong access token'
	sys.exit(-1)

	print '\nWorking, please be patient. This will take 5-10 minutes.'
	print 'Loading rosdistro repos...'
	reponames = []
	for info in yaml.load(urlopen('https://raw.githubusercontent.com/ros/rosdistro/master/kinetic/distribution.yaml'))[
	'repositories'].itervalues():
	try:
	reponames.append(info['source']['url'])
	except KeyError:
	pass
	print '... done (%d repos loaded).' % len(reponames)

	# reponames = ['https://github.com/ros/kdl_parser.git', ...]

	# filter out non-github repos, remove github prefix + '.git'
	reponames = [r[19:-4] for r in reponames if r.find('https://github.com/') == 0]
	# reponames = ['ros/kdl_parser', ...]

	remaining_rate = json.load(response)['rate']['remaining']
	if len(reponames) > remaining_rate:
	print 'ERROR: more repos to process (%d) than remaining rate (%d)' % (len(reponames), remaining_rate)
	sys.exit(-1)

	repos = []
	i = 0
	for reponame in reponames:
	i += 1
	print 'Reading repo %d/%d...' % (i, len(reponames))
	request = Request(u'https://api.github.com/repos/%s' % reponame)
	request.add_header('Authorization', 'token %s' % token)
	response = urlopen(request)
	if response.code != 200:
	print 'ERROR: rate limited?'
	sys.exit(-1)
	repos.append(json.load(response))

	print '\n\n### ROS repos by popularity with percentage\n'
	stargazers = []
	for repo in repos:
	lang = repo['language']
	if lang in ("C++", "Python"):
	reponame = repo['full_name'].split('/')[1]
	# messed up with the ./, sorry
	d = summary_dict.get("./" + reponame, None)
	print("reponame: " + reponame)
	print("dict: " + str(d))
	if d:
	if lang == "C++":
	lang = "C++ (" + str(round(d['pct_cpp'], 1)) + \
	"), Python (" + str(round(d['pct_python'], 1)) + ")"
	elif lang == "Python":
	lang = "Python (" + str(round(d['pct_python'], 1)) + \
	"), C++ (" + str(round(d['pct_cpp'], 1)) + ")"
	stargazers.append((repo['full_name'],
	repo['stargazers_count'],
	lang))

	print '\| rank \| repo name \| stars \| language \|'
	print '\|------\|--------------------------------------------------------------\|------:\|-----------------\|'
	i = 0
	for item in sorted(stargazers, key=itemgetter(1), reverse=True):
	i += 1
	print '\| {:3d}. \| {:60} \| {:5d} \| {:15} \|'.format(i, *item)


	# Count packages with actual *.py files
	import subprocess
	packages_with_python_files = {}
	for foldername in repofoldernames:
	# os.system("find " + foldername + ' -name "*.py" \| wc -l')
	output = subprocess.check_output(
	"find " + foldername + ' -name "*.py" \| wc -l', shell=True)
	if int(output) != 0:
	# print(foldername + ": " + output)
	outputf = subprocess.check_output(
	"find " + foldername + ' -name "*.py"', shell=True)
	filelist = outputf.split()
	packages_with_python_files[foldername] = {'num_files': int(output),
	'file_list': filelist}
	num_pkgs_with_python_files = len(packages_with_python_files)
	print("There are " + str(num_pkgs_with_python_files) +
	" packages with Python files")

	print("Packages with less than 5 Python files:")
	num_less_5 = 0
	for k, v in packages_with_python_files.iteritems():
	if v['num_files'] < 5:
	num_less_5 += 1
	print(k + " python files:")
	print(v['file_list'])

	print("There are " + str(num_less_5) +
	" packages with less than 5 Python files")

	# check just the packages with C++ & Python
	# print their package name, number of files, and the file list
	# useful for doing a grep later on
	# My output: https://pastebin.com/aiY5c1t2
	for pkg in pkgs_both_list:
	pkg_name = pkg.replace('./', '')
	d = packages_with_python_files[pkg_name]
	print(pkg_name + " (" + str(d['num_files']) + ") python files:")
	for f in d['file_list']:
	print(" " + f)