# /github-analyzer.py

## github-analyzer.py
#!/usr/bin/env python
#
# Copyright (c) 2012, Max Jonas Werner <mail@makk.es>
# All rights reserved.

# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
#    * Redistributions of source code must retain the above copyright notice,
#      this list of conditions and the following disclaimer.
#    * Redistributions in binary form must reproduce the above copyright notice,
#      this list of conditions and the following disclaimer in the documentation
#      and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

from beautifulscraper import BeautifulScraper
from collections import defaultdict, Counter
import urllib2, re

# File names treated as license files (case-insensitive, used with ``match``):
# any name containing "license", or any name starting with "copying" or
# "copyright".  The alternatives are grouped so that both anchors apply to
# every one of them; ungrouped, "^" bound only to the first alternative and
# "$" only to the last, so "COPYING.txt" matched while "COPYRIGHT.txt" did not.
LICENSE_FILE_REGEX = re.compile(r'^(?:.*license.*|copying.*|copyright.*)$', re.IGNORECASE)
# README file names: "readme" optionally followed by dotted extensions.
README_REGEX = re.compile(r'^readme(\..+)*$', re.IGNORECASE)
# A line consisting only of the word "license", optionally preceded by
# "#" / "=" heading markers and padded with spaces.
LICENSE_REGEX = re.compile(r'^[#=]* *license *$', re.IGNORECASE)

def top_languages():
    """Scrape GitHub's languages page for the languages listed as popular.

    Returns a list of ``(link text, href)`` tuples, one per anchor selected
    from the popular-languages table.
    """
    page = BeautifulScraper().go("https://github.com/languages")
    languages = []
    for anchor in page.select("#languages .popular .left td[width=100] a"):
        languages.append((anchor.string, anchor["href"]))
    return languages

def analyze_language(language):
    """Analyze the most-watched projects of a single language.

    ``language`` is a ``(name, href)`` pair; only the href (index 1) is
    needed to build the listing URL. Returns the result of
    ``analyze_projects`` for that language's "most_watched" page.
    """
    langhref = language[1]
    return analyze_projects("https://github.com" + langhref + "/most_watched")

def analyze_projects(url):
    """Count licensed and unlicensed projects in a paginated project list.

    Starting at ``url``, every project linked from the listing page is
    fetched and its file links are handed to ``find_license``. The
    "a.next_page" link is followed until a page has none.

    Returns a ``Counter`` keyed by ``True`` (a license was found) and
    ``False`` (none was found). Projects whose analysis raises
    ``urllib2.HTTPError`` are reported on stdout and left out of the counts.
    An HTTP error on a listing page itself propagates to the caller.
    """
    scraper = BeautifulScraper()
    # Always a Counter, so the return type no longer depends on whether the
    # listing had more than one page.
    licensed = Counter()
    # Pages are walked in a loop; recursing once per page could exhaust the
    # recursion limit on long listings.
    while True:
        body = scraper.go(url)
        for project in body.select("ul.repolist > li > h3 > a"):
            try:
                projbody = scraper.go("https://github.com" + project["href"])
                license = find_license(projbody.select("table.tree-browser tbody td.content > a"))
                licensed[license is not None] += 1
            except urllib2.HTTPError as err:
                # The bare name ``HTTPError`` is not imported in this file, so
                # the handler itself used to fail with NameError.
                # NOTE(review): assumes BeautifulScraper surfaces urllib2's
                # HTTPError - confirm against the scraper library.
                print("Error analyzing project " + project.string + ": " + str(err))
        nextlink = body.select("a.next_page")
        if not nextlink:
            return licensed
        url = "https://github.com" + nextlink[0]["href"]

def find_license(file_list):
    """Return the license indication for a project's file links, or ``None``.

    Thin pass-through to ``find_license_file``.
    """
    return find_license_file(file_list)

def find_license_file(file_list):
    """Look for evidence of a license among a project's file links.

    ``file_list`` is an iterable of elements exposing ``.string`` (the file
    name) and ``["href"]``. Entries are examined in order:

    * a name matching ``LICENSE_FILE_REGEX`` is returned as is;
    * a name matching ``README_REGEX`` has its contents scanned by
      ``check_readme``, and the matching line is returned if there is one.

    Returns ``None`` when no entry indicates a license.
    """
    for fname in file_list:
        name = fname.string
        if name is None:
            # Matching a regex against None raises TypeError.
            # NOTE(review): presumably these are BeautifulSoup tags, whose
            # ``.string`` can be None - confirm.
            continue
        if LICENSE_FILE_REGEX.match(name):
            return name
        if README_REGEX.match(name):
            readme_license = check_readme(fname)
            if readme_license is not None:
                return readme_license
            # A README without a license heading is not conclusive: keep
            # scanning instead of giving up on the remaining files.
    return None

def check_readme(fname):
    """Scan a README on GitHub for a line that is just a "license" heading.

    ``fname`` is the README's link element; its ``href`` is turned into the
    raw-content URL by replacing "/blob/" with "/raw/".

    Returns the first line matching ``LICENSE_REGEX`` with surrounding
    whitespace stripped, or ``None`` when no line matches. Errors raised by
    ``urllib2`` while opening or reading the URL propagate to the caller.
    """
    url = ("https://github.com" + fname["href"]).replace('/blob/', '/raw/')
    response = urllib2.build_opener().open(url)
    try:
        for line in response:
            # Drop the line terminator first: a "\r\n" ending would otherwise
            # keep the "$" anchor from matching.
            if LICENSE_REGEX.match(line.rstrip("\r\n")):
                return line.strip()
    finally:
        # Release the connection on every exit path.
        response.close()
    return None

def run():
    """Print one "<language>: <license counts>" line per popular language."""
    for language in top_languages():
        counts = analyze_language(language)
        print("{0}: {1}".format(language[0], counts))

# Run the analysis only when executed as a script, not when imported.
if __name__ == "__main__":
    run()
	#!/usr/bin/env python
	#
	# Copyright (c) 2012, Max Jonas Werner <mail@makk.es>
	# All rights reserved.

	# Redistribution and use in source and binary forms, with or without
	# modification, are permitted provided that the following conditions are met:
	#
	# * Redistributions of source code must retain the above copyright notice,
	# this list of conditions and the following disclaimer.
	# * Redistributions in binary form must reproduce the above copyright notice,
	# this list of conditions and the following disclaimer in the documentation
	# and/or other materials provided with the distribution.
	#
	# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
	# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
	# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
	# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
	# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
	# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
	# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
	# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
	# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
	# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

	from beautifulscraper import BeautifulScraper
	from collections import defaultdict, Counter
	import urllib2, re

	# File names treated as license files (case-insensitive, used with
	# ``match``): any name containing "license", or any name starting with
	# "copying" or "copyright".  This pattern had lost its "*" quantifiers and
	# had its "|" alternation escaped into literal pipes; both are restored,
	# and the alternatives are grouped so "^" and "$" apply to each of them.
	LICENSE_FILE_REGEX = re.compile(r'^(?:.*license.*|copying.*|copyright.*)$', re.IGNORECASE)
	# README file names: "readme" optionally followed by dotted extensions.
	README_REGEX = re.compile(r'^readme(\..+)*$', re.IGNORECASE)
	# A line consisting only of the word "license", optionally preceded by
	# "#" / "=" heading markers and padded with any number of spaces (the
	# mangled pattern demanded exactly one space on each side).
	LICENSE_REGEX = re.compile(r'^[#=]* *license *$', re.IGNORECASE)

	def top_languages():
		"""Scrape GitHub's languages page for the languages listed as popular.

		Returns a list of ``(link text, href)`` tuples, one per anchor
		selected from the popular-languages table.
		"""
		scraper = BeautifulScraper()
		body = scraper.go("https://github.com/languages")
		return [(x.string, x["href"]) for x in body.select("#languages .popular .left td[width=100] a")]

	def analyze_language(language):
		"""Analyze the most-watched projects of a single language.

		``language`` is a ``(name, href)`` pair; only the href (index 1) is
		needed to build the listing URL. Returns the result of
		``analyze_projects`` for that language's "most_watched" page.
		"""
		langhref = language[1]
		return analyze_projects("https://github.com" + langhref + "/most_watched")

	def analyze_projects(url):
		"""Count licensed and unlicensed projects in a paginated project list.

		Starting at ``url``, every project linked from the listing page is
		fetched and its file links are handed to ``find_license``. The
		"a.next_page" link is followed until a page has none.

		Returns a ``Counter`` keyed by ``True`` (a license was found) and
		``False`` (none was found). Projects whose analysis raises
		``urllib2.HTTPError`` are reported on stdout and left out of the
		counts. An HTTP error on a listing page itself propagates.
		"""
		scraper = BeautifulScraper()
		# Always a Counter, so the return type does not depend on page count.
		licensed = Counter()
		# Pages are walked in a loop; recursing once per page could exhaust
		# the recursion limit on long listings.
		while True:
			body = scraper.go(url)
			for project in body.select("ul.repolist > li > h3 > a"):
				try:
					projbody = scraper.go("https://github.com" + project["href"])
					license = find_license(projbody.select("table.tree-browser tbody td.content > a"))
					licensed[license is not None] += 1
				except urllib2.HTTPError as err:
					# The bare name ``HTTPError`` is not imported in this
					# file, so the handler itself used to fail with NameError.
					# NOTE(review): assumes BeautifulScraper surfaces
					# urllib2's HTTPError - confirm.
					print("Error analyzing project " + project.string + ": " + str(err))
			nextlink = body.select("a.next_page")
			if not nextlink:
				return licensed
			url = "https://github.com" + nextlink[0]["href"]

	def find_license(file_list):
		"""Return the license indication for a project's file links, or ``None``.

		Thin pass-through to ``find_license_file``.
		"""
		return find_license_file(file_list)

	def find_license_file(file_list):
		"""Look for evidence of a license among a project's file links.

		``file_list`` is an iterable of elements exposing ``.string`` (the
		file name) and ``["href"]``. Entries are examined in order:

		* a name matching ``LICENSE_FILE_REGEX`` is returned as is;
		* a name matching ``README_REGEX`` has its contents scanned by
		  ``check_readme``, and the matching line is returned if there is one.

		Returns ``None`` when no entry indicates a license.
		"""
		for fname in file_list:
			name = fname.string
			if name is None:
				# Matching a regex against None raises TypeError.
				# NOTE(review): presumably these are BeautifulSoup tags,
				# whose ``.string`` can be None - confirm.
				continue
			if LICENSE_FILE_REGEX.match(name):
				return name
			if README_REGEX.match(name):
				readme_license = check_readme(fname)
				if readme_license is not None:
					return readme_license
				# A README without a license heading is not conclusive:
				# keep scanning the remaining files.
		return None

	def check_readme(fname):
		"""Scan a README on GitHub for a line that is just a "license" heading.

		``fname`` is the README's link element; its ``href`` is turned into
		the raw-content URL by replacing "/blob/" with "/raw/".

		Returns the first line matching ``LICENSE_REGEX`` with surrounding
		whitespace stripped, or ``None`` when no line matches. Errors raised
		by ``urllib2`` while opening or reading the URL propagate.
		"""
		url = ("https://github.com" + fname["href"]).replace('/blob/', '/raw/')
		response = urllib2.build_opener().open(url)
		try:
			for line in response:
				# Drop the line terminator first: a "\r\n" ending would
				# otherwise keep the "$" anchor from matching.
				if LICENSE_REGEX.match(line.rstrip("\r\n")):
					return line.strip()
		finally:
			# Release the connection on every exit path.
			response.close()
		return None

	def run():
		"""Print one "<language>: <license counts>" line per popular language."""
		for language in top_languages():
			print("{0}: {1}".format(language[0], analyze_language(language)))

	# Run the analysis only when executed as a script, not when imported.
	if __name__ == "__main__":
		run()