marvin-roesch/_README.md

## _README.md

      
    Raw
  

              _README.md
            
          
    Curse Language Statistics

These are two simple Python scripts to gather some basic data about Minecraft mods on CurseForge.
curselangs.py scrapes the project listing, while curselangs-stats.py evaluates the scraped data and outputs some accumulated results.
Mind you, I'm far from a Python expert, so the scripts might not be the best written.
In order to save you the effort of scraping, I've dumped the latest 1.12 data (as of 2017-10-21) in a separate Gist.
The evaluated data is visualized in a Google Spreadsheet.

  
## curselangs-stats.py
import heapq
import json
from collections import defaultdict

# Load the project records from the JSON dump.
with open('data-1.12.json') as data_file:
    data = json.load(data_file)

stats = defaultdict(int)
grouped_lang = defaultdict(list)
grouped_license = defaultdict(int)

# One pass over the data: bucket projects by language, count licenses.
for project in data:
    grouped_lang[project["language"]].append(project)
    grouped_license[project["license"]] += 1

# Per language: record the project count and print the three most
# downloaded project URLs, highest first.
for language, members in grouped_lang.items():
    stats[language] = len(members)
    top_three = heapq.nlargest(3, members, key=lambda entry: entry['downloads'])
    top_three.sort(key=lambda entry: -entry['downloads'])
    top_urls = [entry['url'] for entry in top_three]
    print(language + ": " + str(top_urls))

# List every project whose language is 'kotlin'.
for kotlin_project in grouped_lang['kotlin']:
    print(kotlin_project['url'])

print(stats)
print("Licenses:")
# License tallies, least common first.
license_counts = list(grouped_license.items())
license_counts.sort(key=lambda pair: pair[1])
for license_name, count in license_counts:
    print(license_name + "\t" + str(count))

## curselangs.py
import http.client
import json
import operator
from collections import defaultdict
from itertools import groupby
import heapq

from bs4 import BeautifulSoup
from github import Github
from pybitbucket.repository import Repository

# Path of the mod listing, filtered by game version and sorted by downloads.
baseUrl = '/mc-mods?filter-game-version=2020709689%3A6580&filter-sort=downloads'
# A single HTTPS connection is reused for every request to CurseForge.
connection = http.client.HTTPSConnection('minecraft.curseforge.com')

# GitHub API client; the argument is a placeholder for a real credential.
gh = Github('{MUH SECRIT}')

# Accumulators filled in while scraping.
projectData = []
stats = defaultdict(int)
repos = 0

# Pagination state: current page number and the URL to fetch next.
page = 1
nextUrl = baseUrl + "&page=" + str(page)


def inspect_github(repo_link):
    """Return the lower-cased dominant language of a GitHub repository.

    The 'https://github.com/' prefix is stripped from ``repo_link`` and the
    remainder is looked up through the module-level ``gh`` client.  The
    language whose entry has the largest value wins, and the module-level
    ``repos`` counter is incremented on success.

    Returns "unknown" when the repository reports no languages and
    "failure" (after printing the link) when anything raises.
    """
    global repos
    repo_name = repo_link.replace('https://github.com/', '')
    try:
        language_sizes = gh.get_repo(repo_name).get_languages()
        if len(language_sizes) > 0:
            dominant, _ = max(language_sizes.items(), key=lambda pair: pair[1])
            repos += 1
            return dominant.lower()
        print('No language found')
        return "unknown"
    except Exception:
        # Best effort: report the offending link and let the caller carry on.
        print(repo_link)
        return "failure"


def inspect_bitbucket(repo_link):
    """Return the lower-cased language recorded for a Bitbucket repository.

    The 'https://bitbucket.org/' prefix is stripped from ``repo_link`` and
    the remainder is looked up via ``Repository.find_repository_by_full_name``.
    The module-level ``repos`` counter is incremented on success.

    Returns "unknown" when the language field is empty and "failure"
    (after printing the link) when anything raises.
    """
    global repos
    repo = repo_link.replace('https://bitbucket.org/', '')
    try:
        result = Repository.find_repository_by_full_name(repo)
        language = result.data['language']
        if len(language) != 0:
            repos += 1
            # Use the value fetched above, not the scraping loop's
            # module-level ``lang`` variable.
            return language.lower()
        else:
            print('No language found')
            return "unknown"
    except Exception:
        # Best effort: report the offending link and let the caller carry on.
        print(repo_link)
        return "failure"


# Walk the paginated listing.  For every project on a page, fetch its own
# page and record language, download count and licence in projectData.
while nextUrl is not None:
    connection.request('GET', nextUrl)
    response = connection.getresponse()
    soup = BeautifulSoup(response.read().decode(), 'html.parser')
    projects = soup.select('ul.project-listing .name-wrapper a')
    for project in projects:
        projectUrl = project['href']
        print('Inspecting ' + projectUrl)
        connection.request('GET', projectUrl)
        response = connection.getresponse()
        projectSoup = BeautifulSoup(response.read().decode(), 'html.parser')

        # Look for a "Source" menu entry pointing at GitHub or Bitbucket.
        menu = projectSoup.select('.e-header-nav .e-menu a')
        lang = "unknown"
        for link in menu:
            # Tag.string is None when the anchor has more than one child
            # node; treat that as "no text" instead of raising TypeError.
            if "Source" not in (link.string or ''):
                continue
            repoLink = link['href']
            if "github" in repoLink:
                lang = inspect_github(repoLink)
            elif "bitbucket" in repoLink:
                lang = inspect_bitbucket(repoLink)

        # Pull download count and licence out of the details list.
        licence = 'unknown'
        downloads = 0
        details = projectSoup.select('.cf-details.project-details li')
        for detail in details:
            label_node = detail.select_one('.info-label')
            if label_node is None or label_node.string is None:
                # Entry without a plain-text label: nothing to match on.
                continue
            detail_label = label_node.string.strip()
            if detail_label == 'Total Downloads':
                downloads = int(detail.select_one('.info-data').string.replace(',', ''))
            elif detail_label == 'License':
                licenceLink = detail.select_one('.info-data a')
                if licenceLink is None:
                    # No anchor in the licence entry: keep 'unknown'.
                    continue
                licence = licenceLink.string
                if licence is None:
                    # Fall back to the title attribute of the nested span.
                    licence = licenceLink.select_one('span')['title']
                licence = licence.strip()

        # A failed repository lookup is stored as 'unknown' plus a notice.
        notice = 'none'
        if lang == 'failure':
            notice = 'Failed: ' + repoLink
            lang = 'unknown'
        projectData.append({
            'url': 'https://minecraft.curseforge.com' + projectUrl,
            'language': lang,
            'downloads': downloads,
            'license': licence,
            'notice': notice
        })
    print("Done inspecting page " + str(page))
    # Follow the "next" pagination link until there is none.
    next_button = soup.select_one('.listing-header .paging-list a[rel="next"]')
    if next_button is None:
        nextUrl = None
    else:
        nextUrl = next_button["href"]
        page += 1

# Bucket the scraped projects by language.  The dict is filled directly
# because itertools.groupby only merges *adjacent* items with equal keys,
# and projectData is not sorted by language.
grouped = {}
for entry in projectData:
    grouped.setdefault(entry['language'], []).append(entry)

# Per language: record the project count and print the URLs of the three
# most downloaded projects.
for l, ps in grouped.items():
    stats[l] = len(ps)
    top_urls = [p['url'] for p in heapq.nlargest(3, ps, key=lambda p: p['downloads'])]
    print(l + ": " + str(top_urls))

# Persist the raw records as JSON.
with open('data-1.12.json', 'w') as outfile:
    json.dump(projectData, outfile, indent=4)
	import heapq
	import json
	from collections import defaultdict

	# Load the project records from the JSON dump.
	with open('data-1.12.json') as data_file:
		data = json.load(data_file)

	stats = defaultdict(int)
	grouped_lang = defaultdict(list)
	grouped_license = defaultdict(int)
	# One pass over the data: bucket projects by language, count licenses.
	for p in data:
		grouped_lang[p["language"]].append(p)
		grouped_license[p["license"]] += 1

	# Per language: record the project count and print the three most
	# downloaded project URLs, highest first.
	for l, ps in grouped_lang.items():
		stats[l] = len(ps)
		top_three = sorted(heapq.nlargest(3, ps, key=lambda p: p['downloads']),
		                   key=lambda p1: -p1['downloads'])
		print(l + ": " + str([p['url'] for p in top_three]))
	# List every project whose language is 'kotlin'.
	for p in grouped_lang['kotlin']:
		print(p['url'])

	print(stats)
	print("Licenses:")
	# License tallies, least common first.
	for l, ps in sorted(grouped_license.items(), key=lambda xs: xs[1]):
		print(l + "\t" + str(ps))
	import http.client
	import json
	import operator
	from collections import defaultdict
	from itertools import groupby
	import heapq

	from bs4 import BeautifulSoup
	from github import Github
	from pybitbucket.repository import Repository

	# Path of the mod listing, filtered by game version and sorted by downloads.
	baseUrl = '/mc-mods?filter-game-version=2020709689%3A6580&filter-sort=downloads'
	# A single HTTPS connection is reused for every request to CurseForge.
	connection = http.client.HTTPSConnection('minecraft.curseforge.com')

	# GitHub API client; the argument is a placeholder for a real credential.
	gh = Github('{MUH SECRIT}')

	# Accumulators filled in while scraping.
	projectData = []
	stats = defaultdict(int)
	repos = 0

	# Pagination state: current page number and the URL to fetch next.
	page = 1
	nextUrl = baseUrl + "&page=" + str(page)


	def inspect_github(repo_link):
		"""Return the lower-cased dominant language of a GitHub repository.

		The 'https://github.com/' prefix is stripped from ``repo_link`` and
		the remainder is looked up through the module-level ``gh`` client.
		The language whose entry has the largest value wins, and the
		module-level ``repos`` counter is incremented on success.

		Returns "unknown" when the repository reports no languages and
		"failure" (after printing the link) when anything raises.
		"""
		global repos
		repo = repo_link.replace('https://github.com/', '')
		try:
			languages = gh.get_repo(repo).get_languages()
			if len(languages) == 0:
				print('No language found')
				return "unknown"
			else:
				used_lang = max(languages.items(), key=operator.itemgetter(1))[0]
				repos += 1
				return used_lang.lower()
		except Exception:
			# Best effort: report the offending link and let the caller carry on.
			print(repo_link)
			return "failure"


	def inspect_bitbucket(repo_link):
		"""Return the lower-cased language recorded for a Bitbucket repository.

		The 'https://bitbucket.org/' prefix is stripped from ``repo_link``
		and the remainder is looked up via
		``Repository.find_repository_by_full_name``.  The module-level
		``repos`` counter is incremented on success.

		Returns "unknown" when the language field is empty and "failure"
		(after printing the link) when anything raises.
		"""
		global repos
		repo = repo_link.replace('https://bitbucket.org/', '')
		try:
			result = Repository.find_repository_by_full_name(repo)
			language = result.data['language']
			if len(language) != 0:
				repos += 1
				# Use the value fetched above, not the scraping loop's
				# module-level ``lang`` variable.
				return language.lower()
			else:
				print('No language found')
				return "unknown"
		except Exception:
			# Best effort: report the offending link and let the caller carry on.
			print(repo_link)
			return "failure"


	# Walk the paginated listing.  For every project on a page, fetch its own
	# page and record language, download count and licence in projectData.
	while nextUrl is not None:
		connection.request('GET', nextUrl)
		response = connection.getresponse()
		soup = BeautifulSoup(response.read().decode(), 'html.parser')
		projects = soup.select('ul.project-listing .name-wrapper a')
		for project in projects:
			projectUrl = project['href']
			print('Inspecting ' + projectUrl)
			connection.request('GET', projectUrl)
			response = connection.getresponse()
			projectSoup = BeautifulSoup(response.read().decode(), 'html.parser')

			# Look for a "Source" menu entry pointing at GitHub or Bitbucket.
			menu = projectSoup.select('.e-header-nav .e-menu a')
			lang = "unknown"
			for link in menu:
				# Tag.string is None when the anchor has more than one child
				# node; treat that as "no text" instead of raising TypeError.
				if "Source" not in (link.string or ''):
					continue
				repoLink = link['href']
				if "github" in repoLink:
					lang = inspect_github(repoLink)
				elif "bitbucket" in repoLink:
					lang = inspect_bitbucket(repoLink)

			# Pull download count and licence out of the details list.
			licence = 'unknown'
			downloads = 0
			details = projectSoup.select('.cf-details.project-details li')
			for detail in details:
				label_node = detail.select_one('.info-label')
				if label_node is None or label_node.string is None:
					# Entry without a plain-text label: nothing to match on.
					continue
				detail_label = label_node.string.strip()
				if detail_label == 'Total Downloads':
					downloads = int(detail.select_one('.info-data').string.replace(',', ''))
				elif detail_label == 'License':
					licenceLink = detail.select_one('.info-data a')
					if licenceLink is None:
						# No anchor in the licence entry: keep 'unknown'.
						continue
					licence = licenceLink.string
					if licence is None:
						# Fall back to the title attribute of the nested span.
						licence = licenceLink.select_one('span')['title']
					licence = licence.strip()

			# A failed repository lookup is stored as 'unknown' plus a notice.
			notice = 'none'
			if lang == 'failure':
				notice = 'Failed: ' + repoLink
				lang = 'unknown'
			projectData.append({
				'url': 'https://minecraft.curseforge.com' + projectUrl,
				'language': lang,
				'downloads': downloads,
				'license': licence,
				'notice': notice
			})
		print("Done inspecting page " + str(page))
		# Follow the "next" pagination link until there is none.
		next_button = soup.select_one('.listing-header .paging-list a[rel="next"]')
		if next_button is None:
			nextUrl = None
		else:
			nextUrl = next_button["href"]
			page += 1

	# Bucket the scraped projects by language.  The dict is filled directly
	# because itertools.groupby only merges *adjacent* items with equal keys,
	# and projectData is not sorted by language.
	grouped = {}
	for entry in projectData:
		grouped.setdefault(entry['language'], []).append(entry)

	# Per language: record the project count and print the URLs of the three
	# most downloaded projects.
	for l, ps in grouped.items():
		stats[l] = len(ps)
		top_urls = [p['url'] for p in heapq.nlargest(3, ps, key=lambda p: p['downloads'])]
		print(l + ": " + str(top_urls))

	# Persist the raw records as JSON.
	with open('data-1.12.json', 'w') as outfile:
		json.dump(projectData, outfile, indent=4)