Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Attributed Github Dataset
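-- list the organizations each user is a member of (one row per user, org ids comma-separated)
-- NOTE(review): the header originally here was a copy of the languages-query comment below ("find programming languages a user has been using (only as an owner)", "contains 19M rows 10M of them are not NULL"); those figures most likely describe the projects table, not organization_members -- confirm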
-- GROUP_CONCAT results are truncated (with only a warning) once they exceed
-- group_concat_max_len, which defaults to 1024; raise the limit for this
-- session so long organization lists are not cut off.
SET SESSION group_concat_max_len = 1000000;

-- One row per user: that user's org ids in ascending order, comma-separated.
SELECT user_id,
       GROUP_CONCAT(org_id ORDER BY org_id SEPARATOR ',')
FROM organization_members
GROUP BY user_id;

-- Dump every row of the followers table unchanged.
SELECT *
FROM followers;
-- find the programming languages each user has used (only in projects they own)
-- the projects table contains 19M rows; 10M of them are not NULL (presumably refers to the language column)
-- GROUP_CONCAT results are truncated (with only a warning) once they exceed
-- group_concat_max_len, which defaults to 1024. A truncated list could end in
-- a partial language name, so raise the limit for this session first.
SET SESSION group_concat_max_len = 1000000;

-- One row per project owner: the distinct languages of the projects they own,
-- sorted alphabetically and comma-separated. GROUP_CONCAT skips NULL
-- languages; an owner whose languages are all NULL gets a NULL result.
SELECT owner_id,
       GROUP_CONCAT(DISTINCT language ORDER BY language SEPARATOR ',')
FROM projects
-- owner_id -1 is excluded (presumably a placeholder for "no owner" -- confirm).
WHERE owner_id != -1
GROUP BY owner_id;
from sklearn.preprocessing import MultiLabelBinarizer as MLB
import sys
def parse_rows(lines):
    """Parse ``user<TAB>lang1,lang2,...`` rows into users and label sets.

    Args:
        lines: Iterable of text lines with the line terminator already
            removed.

    Returns:
        A ``(users, label_sets)`` tuple of two parallel lists: the user field
        of every row, and the set of non-empty language names on that row.

    Raises:
        ValueError: If a non-blank line does not consist of exactly two
            tab-separated fields.
    """
    users = []
    label_sets = []
    for lineno, line in enumerate(lines, start=1):
        # Blank lines (e.g. a trailing empty line) carry no row; skip them
        # instead of failing on the tuple unpacking below.
        if not line.strip():
            continue
        fields = line.split('\t')
        if len(fields) != 2:
            raise ValueError(
                'line {}: expected "user<TAB>languages", got {!r}'.format(
                    lineno, line))
        user, languages = fields
        users.append(user)
        # An empty languages field means "no languages", not a language
        # named ''; drop empty tokens so they do not become a label.
        # NOTE(review): if the rows come from the mysql client, a NULL
        # language list may arrive as the literal text "NULL" and would be
        # kept as a label here -- confirm how the input is exported.
        label_sets.append({lang for lang in languages.split(',') if lang})
    return users, label_sets


def main():
    """Read rows from stdin and print one binary language vector per user.

    Each output line is the user field followed by the space-separated
    entries of that user's row in the indicator matrix produced by
    ``MultiLabelBinarizer.fit_transform``.
    """
    users, label_sets = parse_rows(sys.stdin.read().splitlines())
    if not users:
        # Nothing to binarize; avoid fitting on an empty sample list.
        return
    feats = MLB().fit_transform(label_sets)
    for user, feat in zip(users, feats):
        print('{} {}'.format(user, ' '.join(feat.astype(str))))


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment