Navigation Menu

Skip to content

Instantly share code, notes, and snippets.

@glouppe
Last active December 23, 2015 21:39
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 2 You must be signed in to fork a gist
  • Save glouppe/6698145 to your computer and use it in GitHub Desktop.
Save glouppe/6698145 to your computer and use it in GitHub Desktop.
Generate a sparse matrix such that rows=users, columns=filenames and data[i, j]=number of commits of user i on file j, and then find the 3 nearest neighbors of each scikit-learn contributor.
import numpy as np
import os
from collections import defaultdict
from git import Repo
from scipy.sparse import csc_matrix
path = "/home/gilles/Sources/scikit-learn/sklearn/"
extensions = ["py", "pyx", "pxd"]


def list_source_files(root, extensions):
    """Return the sorted paths of all files under `root` with a given suffix.

    Parameters
    ----------
    root : str
        Directory that is walked recursively.
    extensions : iterable of str
        File extensions without the leading dot, e.g. ``["py", "pyx"]``.

    Returns
    -------
    list of str
        Sorted paths (``dirpath`` joined with the file name) of the matches.
    """
    # Match on ".ext" rather than "ext": a bare endswith("py") would also
    # accept files such as "happy" or "copy" that have no extension at all.
    suffixes = tuple("." + extension for extension in extensions)
    found = []
    # A single walk with a tuple of suffixes replaces one walk per extension.
    for dirpath, _, files in os.walk(root):
        for filename in files:
            if filename.endswith(suffixes):
                found.append(os.path.join(dirpath, filename))
    return sorted(found)


# List all files: map each path to a column index, in sorted path order.
filenames = {filename: filename_id
             for filename_id, filename
             in enumerate(list_source_files(path, extensions))}
# Traverse all commits of all files.
# nb_commits[filename_id][author_id] -> number of commits by that author
# touching that file; authors maps a normalized author name to its integer id.
nb_commits = {}
authors = {}
author_count = 0
# `path` points at a package directory, presumably below the repository root
# (TODO confirm). Current GitPython only accepts such a path when asked to
# search the parent directories for the enclosing repository.
repo = Repo(path, search_parent_directories=True)
for filename, filename_id in sorted(filenames.items()):
    # Progress output; the call form prints the same text on Python 2 and 3.
    print(filename)
    nb_commits[filename_id] = defaultdict(int)
    for commit in repo.iter_commits(paths=filename):
        # Title-case the name so that case-only variants share one id.
        author = commit.author.name.title()
        if author not in authors:
            # First time this author is seen: hand out the next free id.
            authors[author] = author_count
            author_count += 1
        author_id = authors[author]
        nb_commits[filename_id][author_id] += 1
# Build sparse matrix rows=users, columns=filenames, data=nb of commits.
# The three arrays follow the compressed-sparse-column layout: column k
# (one file) owns the entries data[indptr[k]:indptr[k + 1]], and `indices`
# holds the row (author id) of each of those entries.
data = []
indices = []
indptr = [0]
for _, per_author in sorted(nb_commits.items()):
    column = sorted(per_author.items())
    indices.extend(row for row, _ in column)
    data.extend(commits for _, commits in column)
    # The end of this column is simply the number of entries stored so far.
    indptr.append(len(data))
data = np.array(data, dtype=np.int32)
indices = np.array(indices, dtype=np.int32)
indptr = np.array(indptr, dtype=np.int32)
X = csc_matrix((data, indices, indptr))
# Nearest neighbors: for every author, print the 3 authors whose per-file
# commit counts are closest to theirs.
from sklearn.neighbors import NearestNeighbors
# Reverse lookup: author id (row index of X) -> author name.
r_authors = {author_id: author for author, author_id in authors.items()}
# toarray() gives a plain ndarray; todense() returns np.matrix, which
# current scikit-learn estimators refuse as input.
X = X.toarray()
# Ask for 4 neighbors because the query author is normally one of them.
model = NearestNeighbors(n_neighbors=4).fit(X)
neighbors = model.kneighbors(X, return_distance=False)
for author, author_id in sorted(authors.items()):
    # Do not assume the author is in column 0: authors with identical rows
    # tie at distance 0 and may be returned in any order, which previously
    # caused authors to be listed as their own neighbor. Drop the author's
    # own id wherever it appears and keep the 3 closest remaining ones.
    nearest = [r_authors[neighbor_id]
               for neighbor_id in neighbors[author_id]
               if neighbor_id != author_id][:3]
    print(author, nearest[0], nearest[1], nearest[2])
(u'A. Flaxman', u'Miroslav Batchkarov', u'A. Flaxman', u'Michael Eickenberg')
(u'Adrien Gaidon', u'Justin Pati', u'Rafael Cunha De Almeida', u'Raul Garreta')
(u'Alemagnani', u'Roman', u'Matti Lyra', u'Lqdc')
(u'Alex Companioni', u'Jakemick', u'Miroslav Shubernetskiy', u'Benjamin Peterson')
(u'Alexander Fabisch', u'Alexander Fabisch', u'Benjamin Peterson', u'Miroslav Shubernetskiy')
(u'Alexandre Abraham', u'Jan Hendrik Metzen', u'Cow', u'Miroslav Shubernetskiy')
(u'Alexandre Gramfort', u'Gaelvaroquaux', u'Immanuel Bayer', u'Jaques Grobler')
(u'Alexandre Passos', u'Nzer0', u'Benjamin Peterson', u'Miroslav Shubernetskiy')
(u'Alexis Mignon', u'Jamestwebber', u'Daniel Velkov', u'Roman')
(u'Andreas Mueller', u'Lars Buitinck', u'Olivier Grisel', u'Alexandre Gramfort')
(u'Andrew Winterman', u'Kyle Kastner', u'Approximateidentity', u'Felix Brockherde')
(u'Andy', u'Francois Savard', u'Nick Wilson', u'David Warde-Farley')
(u'Anze', u'Alex Companioni', u'A. Flaxman', u'Jansoe')
(u'Approximateidentity', u'Felix Brockherde', u'Rafael Cunha De Almeida', u'Robertlayton')
(u'Arnaud Joly', u'Joel Nothman', u'Satrajit Ghosh', u'Conrad Lee')
(u'Aymeric Masurelle', u'Hrishikesh Huilgolkar', u'Erwin Marsi', u'Robertlayton')
(u'Balu', u'Sergio Medina', u'Joshua Vredevoogd', u'Bussonnier Matthias')
(u'Bastiaan Van Den Berg', u'Jakemick', u'Miroslav Shubernetskiy', u'Benjamin Peterson')
(u'Bdholt1', u'Jakemick', u'Miroslav Shubernetskiy', u'Benjamin Peterson')
(u'Benjamin Peterson', u'Benjamin Peterson', u'Jakemick', u'F\xe9lix-Antoine Fortin')
(u'Brandyn A. White', u'Peter Welinder', u'Brandyn A. White', u'Brandyn White')
(u'Brandyn White', u'Peter Welinder', u'Brandyn A. White', u'Brandyn White')
(u'Brian Cajes', u'Jansoe', u'Miroslav Batchkarov', u'A. Flaxman')
(u'Brian Cheung', u'Cow', u'Szabo Roland', u'Alexandre Abraham')
(u'Brian Holt', u'Tim Sheerman-Chase', u'John Benediktsson', u'Bdholt1')
(u'Brooke Osborn', u'Miroslav Batchkarov', u'Eugene Nizhibitsky', u'A. Flaxman')
(u'Bthirion', u'Jaques Grobler', u'Shiqiao Du', u'Daniel Nouri')
(u'Bussonnier Matthias', u'Jakemick', u'Miroslav Shubernetskiy', u'Benjamin Peterson')
(u'Carlos Scheidegger', u'Jakemick', u'Miroslav Shubernetskiy', u'Benjamin Peterson')
(u'Charles Mccarthy', u'Jansoe', u'A. Flaxman', u'Miroslav Batchkarov')
(u'Charles-Pierre Astolfi', u'Emmanuelle Gouillart', u'Benjamin Peterson', u'Miroslav Shubernetskiy')
(u'Christian Jauvin', u'Scott Dickerson', u'Tadej Jane\u017e', u'Michael Eickenberg')
(u'Claire Revillet', u'Jakemick', u'Miroslav Shubernetskiy', u'Benjamin Peterson')
(u'Clay Woolam', u'Anze', u'Bdholt1', u'Miroslav Shubernetskiy')
(u'Conrad Lee', u'Kyle Beauchamp', u'Olivier Hervieu', u'Jim Holmstr\xf6m')
(u'Corey Lynch', u'Diego Molla', u'Daniel Velkov', u'Roman')
(u'Cow', u'Szabo Roland', u'Benjamin Peterson', u'Miroslav Shubernetskiy')
(u'Daniel Duckworth', u'Naoki Orii', u'Robertlayton', u'Miroslav Shubernetskiy')
(u'Daniel Nouri', u'Tadej Jane\u017e', u'Christian Jauvin', u'Scott Dickerson')
(u'Daniel Velkov', u'Jakemick', u'Miroslav Shubernetskiy', u'Benjamin Peterson')
(u'David Cournapeau', u'Roman', u'Claire Revillet', u'Alemagnani')
(u'David Marek', u'Tadej Jane\u017e', u'Christian Jauvin', u'Scott Dickerson')
(u'David Warde-Farley', u'Nick Wilson', u'Francois Savard', u'Miroslav Shubernetskiy')
(u'Dengemann', u'Osdf', u'Denis A. Engemann', u'Denis Engemann')
(u'Denis A. Engemann', u'Denis Engemann', u'Jansoe', u'Federico Vaggi')
(u'Denis Engemann', u'Jansoe', u'Benjamin Peterson', u'Miroslav Shubernetskiy')
(u'Denton Cockburn', u'Brandyn A. White', u'Peter Welinder', u'John Zwinck')
(u'Diego Molla', u'Jakemick', u'Miroslav Shubernetskiy', u'Benjamin Peterson')
(u'Doug Coleman', u'Richard T. Guy', u'Fedev Ubuntu Laptop', u'Sebastian Berg')
(u'Dougal Sutherland', u'Sergey Karayev', u'A. Flaxman', u'Matthias Ekman')
(u'Draix', u'Jakemick', u'Miroslav Shubernetskiy', u'Benjamin Peterson')
(u'Edouard Duchesnay', u'Jakemick', u'Miroslav Shubernetskiy', u'Benjamin Peterson')
(u'Emanuele', u'Emanuele', u'Scott White', u'Miroslav Shubernetskiy')
(u'Emmanuelle Gouillart', u'Emmanuelle Gouillart', u'Benjamin Peterson', u'Miroslav Shubernetskiy')
(u'Erwin Marsi', u'Robertlayton', u'Daniel Duckworth', u'Hrishikesh Huilgolkar')
(u'Eugene Nizhibitsky', u'Emanuele', u'Scott White', u'Miroslav Shubernetskiy')
(u'Fabian Pedregosa', u'Gaelvaroquaux', u'Jaques Grobler', u'Michael')
(u'Fazlul Shahriar', u'Jakemick', u'Miroslav Shubernetskiy', u'Benjamin Peterson')
(u'Fcostin', u'Nicolas Pinto', u'Michael', u'Charles-Pierre Astolfi')
(u'Federico Vaggi', u'Fedev Ubuntu Laptop', u'Jansoe', u'Miroslav Shubernetskiy')
(u'Fedev Ubuntu Laptop', u'Jakemick', u'Miroslav Shubernetskiy', u'Benjamin Peterson')
(u'Felix Brockherde', u'Miroslav Batchkarov', u'Jansoe', u'A. Flaxman')
(u'Flyingimmidev', u'Jamestwebber', u'Unknown', u'Anze')
(u'Francois Savard', u'Juan Manuel Caicedo Carvajal', u'Nick Wilson', u'Jansoe')
(u'F\xe9lix-Antoine Fortin', u'Alexander Fabisch', u'Benjamin Peterson', u'Miroslav Shubernetskiy')
(u'Gael Varoquaux', u'Gaelvaroquaux', u'Jaques Grobler', u'Brian Cheung')
(u'Gaelvaroquaux', u'Jaques Grobler', u'Stefano Lattarini', u'Flyingimmidev')
(u'Gilles Louppe', u'Brian Holt', u'Noel Dawe', u'Peter Prettenhofer')
(u'Hannes Schulz', u'Leonpalafox', u'Benjamin Peterson', u'Miroslav Shubernetskiy')
(u'Hrishikesh Huilgolkar', u'Erwin Marsi', u'Robertlayton', u'Fedev Ubuntu Laptop')
(u'Immanuel Bayer', u'Flyingimmidev', u'Jamestwebber', u'Yaroslav Halchenko')
(u'Imran Haque', u'Shaun Jackman', u'Udi Weinsberg', u'Alexandre Passos')
(u'Jack Hale', u'Jakemick', u'Miroslav Shubernetskiy', u'Benjamin Peterson')
(u'Jacob Vanderplas', u'Naoki Orii', u'Benjamin Peterson', u'Miroslav Shubernetskiy')
(u'Jacques Kvam', u'Scott White', u'Eugene Nizhibitsky', u'Emanuele')
(u'Jake Vanderplas', u'David Marek', u'Jaques Grobler', u'Joonas Sillanp\xe4\xe4')
(u'Jakemick', u'F\xe9lix-Antoine Fortin', u'Miroslav Shubernetskiy', u'Benjamin Peterson')
(u'James Bergstra', u'Francois Savard', u'Nick Wilson', u'Imran Haque')
(u'Jamestwebber', u'Jakemick', u'Miroslav Shubernetskiy', u'Benjamin Peterson')
(u'Jan Hendrik Metzen', u'Alexandre Abraham', u'Benjamin Peterson', u'Miroslav Shubernetskiy')
(u'Jansoe', u'Jakemick', u'Miroslav Shubernetskiy', u'Benjamin Peterson')
(u'Jaques Grobler', u'Mikhail Korobov', u'Stefano Lattarini', u'Jim Holmstr\xf6m')
(u'Jim Holmstr\xf6m', u'Kyle Beauchamp', u'Conrad Lee', u'Olivier Hervieu')
(u'Jnothman', u'Brandyn A. White', u'Peter Welinder', u'John Zwinck')
(u'Jochen Wersd\xf6rfer', u'Peter Welinder', u'John Zwinck', u'Roman')
(u'Joel Nothman', u'Satrajit Ghosh', u'Jim Holmstr\xf6m', u'Conrad Lee')
(u'Johannes Sch\xf6nberger', u'Daniel Duckworth', u'Robertlayton', u'Miroslav Shubernetskiy')
(u'John Benediktsson', u'Bdholt1', u'Jnothman', u'Fazlul Shahriar')
(u'John Zwinck', u'Peter Welinder', u'Brandyn A. White', u'Brandyn White')
(u'Joonas Sillanp\xe4\xe4', u'Naoki Orii', u'Daniel Duckworth', u'Bastiaan Van Den Berg')
(u'Joshua Vredevoogd', u'Sergio Medina', u'Benjamin Peterson', u'Miroslav Shubernetskiy')
(u'Juan Manuel Caicedo Carvajal', u'Miroslav Batchkarov', u'Jansoe', u'A. Flaxman')
(u'Justin Pati', u'Jakemick', u'Miroslav Shubernetskiy', u'Benjamin Peterson')
(u'Justin Vincent', u'Adrien Gaidon', u'Justin Pati', u'Jochen Wersd\xf6rfer')
(u'Karol Pysniak', u'Jakemick', u'Miroslav Shubernetskiy', u'Benjamin Peterson')
(u'Kemal Eren', u'Kenneth C. Arnold', u'Unknown', u'Charles-Pierre Astolfi')
(u'Kenneth C. Arnold', u'Jakemick', u'Miroslav Shubernetskiy', u'Benjamin Peterson')
(u'Kernc', u'Lqdc', u'Matti Lyra', u'Alemagnani')
(u'Kuantkid', u'Satrajit Ghosh', u'Conrad Lee', u'Jim Holmstr\xf6m')
(u'Kyle Beauchamp', u'Olivier Hervieu', u'Conrad Lee', u'Jim Holmstr\xf6m')
(u'Kyle Kastner', u'Felix Brockherde', u'Approximateidentity', u'Jansoe')
(u'Lars Buitinck', u'Fabian Pedregosa', u'Mathieu Blondel', u'Alexandre Gramfort')
(u'Leonpalafox', u'Hannes Schulz', u'Benjamin Peterson', u'Miroslav Shubernetskiy')
(u'Lqdc', u'Matti Lyra', u'Roman', u'Alemagnani')
(u'Luis Pedro Coelho', u'Jakemick', u'Miroslav Shubernetskiy', u'Benjamin Peterson')
(u'Marko Burjek', u'Dougal Sutherland', u'Sergey Karayev', u'Jansoe')
(u'Mathieu Blondel', u'Rob Zinkov', u'Michael', u'Fabian Pedregosa')
(u'Matthias Ekman', u'Miroslav Batchkarov', u'A. Flaxman', u'Michael Eickenberg')
(u'Matti Lyra', u'Matti Lyra', u'Roman', u'Alemagnani')
(u'Michael', u'Fcostin', u'Nicolas Pinto', u'Charles-Pierre Astolfi')
(u'Michael Eickenberg', u'Scott Dickerson', u'Miroslav Batchkarov', u'A. Flaxman')
(u'Mikhail Korobov', u'Jochen Wersd\xf6rfer', u'Nick Wilson', u'Joshua Vredevoogd')
(u'Miroslav Batchkarov', u'Miroslav Batchkarov', u'A. Flaxman', u'Michael Eickenberg')
(u'Miroslav Shubernetskiy', u'Benjamin Peterson', u'Jakemick', u'F\xe9lix-Antoine Fortin')
(u'Mr.Shu', u'Shaun Jackman', u'Udi Weinsberg', u'Alexandre Passos')
(u'Naoki Orii', u'Jacob Vanderplas', u'Daniel Duckworth', u'Miroslav Shubernetskiy')
(u'Nelle Varoquaux', u'Andy', u'Approximateidentity', u'Brooke Osborn')
(u'Nick Wilson', u'David Warde-Farley', u'Francois Savard', u'John Zwinck')
(u'Nicolas Pinto', u'Fcostin', u'Emmanuelle Gouillart', u'Charles-Pierre Astolfi')
(u'Nicolas Tr\xe9segnie', u'Adrien Gaidon', u'Justin Pati', u'Michael Eickenberg')
(u'Noel Dawe', u'Mikhail Korobov', u'Jim Holmstr\xf6m', u'Tim Sheerman-Chase')
(u'Norbert Crombach', u'Jakemick', u'Miroslav Shubernetskiy', u'Benjamin Peterson')
(u'Nzer0', u'Nzer0', u'Benjamin Peterson', u'Miroslav Shubernetskiy')
(u'Olivier Grisel', u'Gael Varoquaux', u'Gaelvaroquaux', u'Fabian Pedregosa')
(u'Olivier Hervieu', u'Kyle Beauchamp', u'Conrad Lee', u'Jnothman')
(u'Osdf', u'Jansoe', u'A. Flaxman', u'Miroslav Batchkarov')
(u'Paolo Losi', u'Miroslav Batchkarov', u'Jansoe', u'A. Flaxman')
(u'Peter Prettenhofer', u'Brian Holt', u'Jacques Kvam', u'Scott White')
(u'Peter Welinder', u'Peter Welinder', u'Brandyn A. White', u'Brandyn White')
(u'Philippe Gervais', u'Aymeric Masurelle', u'Hrishikesh Huilgolkar', u'Erwin Marsi')
(u'Rafael Cunha De Almeida', u'Jakemick', u'Miroslav Shubernetskiy', u'Benjamin Peterson')
(u'Raul Garreta', u'Seamus Abshere', u'Benjamin Peterson', u'Miroslav Shubernetskiy')
(u'Richard T. Guy', u'Miroslav Batchkarov', u'Jansoe', u'A. Flaxman')
(u'Rob Speer', u'Charles Mccarthy', u'Miroslav Batchkarov', u'A. Flaxman')
(u'Rob Zinkov', u'Marko Burjek', u'Sergey Karayev', u'Dougal Sutherland')
(u'Robert Layton', u'Philippe Gervais', u'Aymeric Masurelle', u'Hrishikesh Huilgolkar')
(u'Robert Marchman', u'Kernc', u'Alemagnani', u'Xinfan Meng')
(u'Robert Mcgibbon', u'Jakemick', u'Miroslav Shubernetskiy', u'Benjamin Peterson')
(u'Robertlayton', u'Daniel Duckworth', u'Erwin Marsi', u'Miroslav Shubernetskiy')
(u'Roman', u'Matti Lyra', u'Lqdc', u'Alemagnani')
(u'Satrajit Ghosh', u'Jim Holmstr\xf6m', u'Conrad Lee', u'Kyle Beauchamp')
(u'Scott Dickerson', u'Michael Eickenberg', u'A. Flaxman', u'Miroslav Batchkarov')
(u'Scott White', u'Emanuele', u'Eugene Nizhibitsky', u'Jacques Kvam')
(u'Seamus Abshere', u'Jakemick', u'Miroslav Shubernetskiy', u'Benjamin Peterson')
(u'Sebastian Berg', u'Shaun Jackman', u'Udi Weinsberg', u'Alexandre Passos')
(u'Sergey Feldman', u'Jakemick', u'Miroslav Shubernetskiy', u'Benjamin Peterson')
(u'Sergey Karayev', u'Dougal Sutherland', u'Marko Burjek', u'Jansoe')
(u'Sergeyf', u'Sergey Feldman', u'Alexandre Passos', u'Udi Weinsberg')
(u'Sergio Medina', u'Jakemick', u'Miroslav Shubernetskiy', u'Benjamin Peterson')
(u'Sergio Pascual', u'Hannes Schulz', u'Leonpalafox', u'Benjamin Peterson')
(u'Shaun Jackman', u'Udi Weinsberg', u'Benjamin Peterson', u'Miroslav Shubernetskiy')
(u'Shiqiao', u'Jakemick', u'Miroslav Shubernetskiy', u'Benjamin Peterson')
(u'Shiqiao Du', u'Daniel Nouri', u'Shiqiao', u'John Benediktsson')
(u'Stefano Lattarini', u'A. Flaxman', u'Rafael Cunha De Almeida', u'Fazlul Shahriar')
(u'Steven De Gryze', u'Seamus Abshere', u'Miroslav Batchkarov', u'A. Flaxman')
(u'Sturla Molden', u'Jakemick', u'Miroslav Shubernetskiy', u'Benjamin Peterson')
(u'Subhodeep Moitra', u'Christian Jauvin', u'Michael Eickenberg', u'Scott Dickerson')
(u'Syhw', u'Yann N. Dauphin', u'Carlos Scheidegger', u'Udi Weinsberg')
(u'Szabo Roland', u'Cow', u'Benjamin Peterson', u'Miroslav Shubernetskiy')
(u'Tadej Jane\u017e', u'Christian Jauvin', u'Scott Dickerson', u'Michael Eickenberg')
(u'Tiago Nunes', u'Raul Garreta', u'Alexandre Passos', u'Udi Weinsberg')
(u'Tim Sheerman-Chase', u'Bdholt1', u'Sebastian Berg', u'Udi Weinsberg')
(u'Tiziano Zito', u'Jakemick', u'Miroslav Shubernetskiy', u'Benjamin Peterson')
(u'Udi Weinsberg', u'Udi Weinsberg', u'Benjamin Peterson', u'Miroslav Shubernetskiy')
(u'Unknown', u'Emmanuelle Gouillart', u'Charles-Pierre Astolfi', u'Varoquaux')
(u'Varoquaux', u'Jakemick', u'Miroslav Shubernetskiy', u'Benjamin Peterson')
(u'Vene', u'Tiziano Zito', u'Varoquaux', u'Jamestwebber')
(u'Virgilefritsch', u'Johannes Sch\xf6nberger', u'Brooke Osborn', u'Balu')
(u'Vlad Niculae', u'Vene', u'Aymeric Masurelle', u'Stefano Lattarini')
(u'Xinfan Meng', u'Alemagnani', u'Roman', u'Lqdc')
(u'Yann N. Dauphin', u'Syhw', u'Alexandre Passos', u'Udi Weinsberg')
(u'Yannick Schwartz', u'Tadej Jane\u017e', u'Christian Jauvin', u'David Marek')
(u'Yaroslav Halchenko', u'Matthias Ekman', u'Miroslav Batchkarov', u'A. Flaxman')
@larsmans
Copy link

from git import Repo? Which library do I need for that?

@glouppe
Copy link
Author

glouppe commented Sep 25, 2013

from git import Repo? Which library do I need for that?

GitPython

@mblondel
Copy link

Can't we call git from the command line to retrieve the data? If we want to make this an example, that would be better.

@mblondel
Copy link

I think data normalization is very important (but perhaps w.r.t. files instead of w.r.t. users).

@amueller
Copy link

pca

# Project the rows of X onto 2 components and draw one labelled point per row.
# NOTE(review): relies on X and r_authors from the gist script, and on `plt`
# being matplotlib.pyplot already imported in the session -- confirm.
from sklearn.decomposition import RandomizedPCA
trans = RandomizedPCA(n_components=2).fit_transform(X)
plt.scatter(trans[:, 0], trans[:, 1])
axes = plt.gca()
# Row i of the projection is labelled with r_authors[i], i.e. rows are
# looked up by author id.
for i, p in enumerate(trans):
    axes.text(p[0], p[1], r_authors[i])
plt.show()

@amueller
Copy link

isomap:isomap
Now back to the thesis ^^

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment