Instantly share code, notes, and snippets.

@d4l3k /main.go
Last active Jul 8, 2017

Embed
What would you like to do?
nwHacks Registration Machine Learning Pipeline
package main
import (
"context"
"encoding/csv"
"flag"
"fmt"
"log"
"net/http"
"os"
"sort"
"strings"
"sync"
"unicode"
"github.com/d4l3k/docconv"
"github.com/google/go-github/github"
"github.com/jasonwinn/geocoder"
geo "github.com/kellydunn/golang-geo"
"github.com/xlvector/hector"
"github.com/xlvector/hector/core"
"../db"
)
// Command-line flags.
var (
// file is the path that generate opens and parses as CSV.
file = flag.String("f", "", "the file to load")
// model, when non-empty, makes main load that model and print a prediction
// for a built-in sample registration instead of generating datasets.
model = flag.String("model", "", "path to the model to test")
// classifierName is the name handed to hector.GetClassifier in main.
classifierName = flag.String("classifier", "rf", "the classifier to use")
)
// extractWords lowercases words and splits it into tokens, treating every rune
// that is neither a letter nor a digit as a separator. Runs of separators never
// produce empty tokens.
func extractWords(words string) []string {
	isSeparator := func(r rune) bool {
		if unicode.IsLetter(r) {
			return false
		}
		return !unicode.IsDigit(r)
	}
	lowered := strings.ToLower(words)
	return strings.FieldsFunc(lowered, isSeparator)
}
// State behind wordToID.
var (
// wordMap maps each cleaned word seen so far to its integer ID. IDs are
// handed out by wordToID starting at 1, in first-seen order.
wordMap = map[string]int{}
// wordMapMu is held by wordToID around every read and write of wordMap;
// wordToID is reached from the worker goroutines started in generate.
wordMapMu sync.Mutex
)
// wordToID returns the integer ID associated with word after normalising it
// with clean. A word that has not been seen before is assigned the next unused
// ID (one more than the number of words already known). Access to the shared
// table is serialised with wordMapMu.
func wordToID(word string) int {
	key := clean(word)

	wordMapMu.Lock()
	defer wordMapMu.Unlock()

	if known, found := wordMap[key]; found {
		return known
	}
	next := len(wordMap) + 1
	wordMap[key] = next
	return next
}
// wordsToIDs maps every entry of words through wordToID, preserving order.
// The result is nil when words is empty.
func wordsToIDs(words []string) []int {
	var ids []int
	for i := 0; i < len(words); i++ {
		ids = append(ids, wordToID(words[i]))
	}
	return ids
}
// countIDs tallies how many times each ID occurs in ids. The returned map is
// always non-nil, even for empty input.
func countIDs(ids []int) map[int]int {
	tally := make(map[int]int)
	for i := range ids {
		tally[ids[i]] += 1
	}
	return tally
}
// openDataset creates the dataset file at name for writing, truncating it if it
// already exists, and returns the open handle. The caller owns the handle and is
// responsible for closing it.
//
// The file is created with mode 0644: datasets are plain text and have no
// reason to be executable. Failure to open the file is fatal.
func openDataset(name string) *os.File {
	f, err := os.OpenFile(name, os.O_CREATE|os.O_TRUNC|os.O_WRONLY, 0644)
	if err != nil {
		// Use an explicit format so the file name and the error are not
		// printed run together.
		log.Fatalf("opening dataset %q: %v", name, err)
	}
	return f
}
// writeFeaturesToDataset appends one sample line to ds in the form
//
//	<label> <id>:<value> <id>:<value> ...\n
//
// where the label is "1" when positive is true and "0" otherwise, features are
// emitted in ascending ID order, and values are formatted with %f.
//
// The line is assembled in memory and handed to ds in a single Write call, so
// there is exactly one error to check per sample (previously the per-feature
// and newline writes were unchecked). A write error is fatal.
func writeFeaturesToDataset(ds *os.File, positive bool, features map[int]float64) {
	featureIDs := make([]int, 0, len(features))
	for id := range features {
		featureIDs = append(featureIDs, id)
	}
	// Map iteration order is unspecified; sort so the output is deterministic.
	sort.Ints(featureIDs)

	line := []byte("0")
	if positive {
		line = []byte("1")
	}
	for _, id := range featureIDs {
		line = append(line, fmt.Sprintf(" %d:%f", id, features[id])...)
	}
	line = append(line, '\n')

	if _, err := ds.Write(line); err != nil {
		log.Fatal(err)
	}
}
// btof converts a boolean into a numeric feature value: 1 for true, 0 for
// false.
func btof(b bool) float64 {
	var v float64
	if b {
		v = 1
	}
	return v
}
// getGithubFeatures queries the GitHub API for username and stores the
// account's public statistics in features:
//
//	30: public repository count
//	31: public gist count
//	32: follower count
//	33: following count
//	34: stargazers summed over all listed repositories
//
// username may be a bare login or a github.com profile URL; surrounding
// whitespace, the scheme, a leading "www.", the host and stray slashes are
// stripped before the lookup.
//
// The returned string is the user's bio followed by the description and name of
// every listed repository, separated by single spaces.
//
// On error the returned string is empty and features is left untouched, so a
// sample never ends up with only part of the GitHub features set.
func getGithubFeatures(username string, features map[int]float64) (string, error) {
	t := &github.UnauthenticatedRateLimitedTransport{
		ClientID:     "",
		ClientSecret: "",
	}
	ctx := context.Background()
	client := github.NewClient(t.Client())

	// Reduce a profile URL such as "https://www.github.com/name/" to "name".
	username = strings.TrimSpace(username)
	username = strings.TrimPrefix(username, "https://")
	username = strings.TrimPrefix(username, "http://")
	username = strings.TrimPrefix(username, "www.")
	username = strings.TrimPrefix(username, "github.com/")
	username = strings.Trim(username, "/")

	user, _, err := client.Users.Get(ctx, username)
	if err != nil {
		return "", err
	}

	// Collect the text pieces and join once at the end instead of growing a
	// string with += on every repository.
	parts := []string{user.GetBio()}
	stars := 0

	// The API returns repositories one page at a time; follow NextPage until
	// it is zero so the star total and text cover every repository rather
	// than just the first page.
	opt := &github.RepositoryListOptions{
		ListOptions: github.ListOptions{PerPage: 100},
	}
	for {
		repos, resp, err := client.Repositories.List(ctx, username, opt)
		if err != nil {
			return "", err
		}
		for _, repo := range repos {
			stars += repo.GetStargazersCount()
			parts = append(parts, repo.GetDescription(), repo.GetName())
		}
		if resp == nil || resp.NextPage == 0 {
			break
		}
		opt.Page = resp.NextPage
	}

	// Every call succeeded; only now publish the features.
	features[30] = float64(user.GetPublicRepos())
	features[31] = float64(user.GetPublicGists())
	features[32] = float64(user.GetFollowers())
	features[33] = float64(user.GetFollowing())
	features[34] = float64(stars)
	return strings.Join(parts, " "), nil
}
// getFeatures builds the sparse feature vector for one registration.
//
// Fixed feature IDs:
//
//	0: first hackathon (0/1)
//	1: mentor (0/1)
//	2: GitHub field filled in (0/1)
//	3: personal site filled in (0/1)
//	4: LinkedIn field filled in (0/1)
//	5: number of non-blank, comma-separated teammates
//	6: great-circle distance between the applicant's city and vancouver
//	30-34: GitHub statistics, see getGithubFeatures
//
// IDs from 1000 upwards are 1000 plus a wordToID value: a 1.0 indicator for the
// school (keyed as "school:<name>") and, for every word found in the reason
// text, the resume PDF and the GitHub text, that word's share of all words.
//
// A resume that cannot be downloaded or converted is fatal. GitHub and
// geocoding failures are logged and the corresponding features are omitted.
func getFeatures(reg *db.Registration) map[int]float64 {
	words := extractWords(reg.Reason)

	resume := reg.ResumeLink()
	if len(resume) > 0 {
		resp, err := http.Get(resume)
		if err != nil {
			log.Fatal(err)
		}
		res, _, err := docconv.ConvertPDF(resp.Body)
		// Release the connection before looking at the conversion result.
		resp.Body.Close()
		if err != nil {
			log.Fatal(err)
		}
		words = append(words, extractWords(res)...)
	}

	features := map[int]float64{}
	features[0] = btof(reg.FirstHackathon)
	features[1] = btof(reg.Mentor)
	features[2] = btof(len(reg.GitHub) > 0)
	features[3] = btof(len(reg.PersonalSite) > 0)
	features[4] = btof(len(reg.LinkedIn) > 0)

	// strings.Split returns one (empty) element for an empty string, so count
	// only non-blank names; otherwise "no teammates" and "one teammate" are
	// indistinguishable.
	teammates := 0
	for _, name := range strings.Split(reg.Teammates, ",") {
		if strings.TrimSpace(name) != "" {
			teammates++
		}
	}
	features[5] = float64(teammates)

	if len(reg.GitHub) > 0 {
		out, err := getGithubFeatures(reg.GitHub, features)
		if err != nil {
			log.Println(err)
		} else {
			words = append(words, extractWords(out)...)
		}
	}

	if len(reg.School) > 0 {
		features[1000+wordToID("school:"+reg.School)] = 1.0
	}

	if len(reg.City) > 0 {
		lat, lng, err := geocoder.Geocode(reg.City)
		if err != nil {
			// Without coordinates there is no meaningful distance; leave the
			// feature unset instead of measuring from a zero lat/lng.
			log.Println(err)
		} else {
			city := geo.NewPoint(lat, lng)
			features[6] = vancouver.GreatCircleDistance(city)
		}
	}

	wordIDs := wordsToIDs(words)
	counts := countIDs(wordIDs)
	// counts is empty whenever words is, so the division below never sees a
	// zero denominator.
	for id, count := range counts {
		features[id+1000] = float64(count) / float64(len(words))
	}
	return features
}
// featuresToSample copies a feature map into a newly allocated core.Sample,
// one core.Feature per map entry. The order of the resulting Features slice
// follows map iteration and is therefore unspecified.
func featuresToSample(features map[int]float64) *core.Sample {
	sample := new(core.Sample)
	for id, val := range features {
		feature := core.Feature{Id: int64(id), Value: val}
		sample.Features = append(sample.Features, feature)
	}
	return sample
}
// vancouver is the reference point that getFeatures measures each applicant's
// city against. main assigns it from the geocoded "Vancouver, BC, Canada"
// before any features are computed.
var vancouver *geo.Point
// workers is the number of goroutines generate starts to compute features, and
// also the buffer size of the channels feeding and draining them.
const workers = 32
// main parses the flags, geocodes Vancouver into the package-level reference
// point, and then runs in one of two modes: without -model it generates the
// datasets; with -model it loads that model into the selected classifier and
// logs the prediction for a built-in sample registration.
func main() {
	flag.Parse()
	geocoder.SetAPIKey("")

	lat, lng, err := geocoder.Geocode("Vancouver, BC, Canada")
	if err != nil {
		log.Fatal(err)
	}
	vancouver = geo.NewPoint(lat, lng)

	// No model given: produce the training data and stop.
	if *model == "" {
		generate()
		return
	}

	classifier := hector.GetClassifier(*classifierName)
	classifier.LoadModel(*model)

	example := &db.Registration{
		Name:           "Tristan Rice",
		School:         "University of British Columbia",
		City:           "Vancouver",
		GitHub:         "d4l3k",
		LinkedIn:       "d4l3k",
		Reason:         "I really want to come to nwHacks and make some cool stuff! I've gone that past couple of years and really enjoyed it.",
		Resume:         "https://fn.lc/resume.pdf",
		Mentor:         true,
		FirstHackathon: false,
		Teammates:      "jinny, roy",
		PersonalSite:   "https://fn.lc",
		Email:          "rice@fn.lc",
	}
	features := getFeatures(example)
	sample := featuresToSample(features)
	log.Printf("Predicted value = %f", classifier.Predict(sample))
}
// generate writes five dataset files in the current directory, one sample line
// per qualifying registration:
//
//	accepted.data         label: status is accepted           (all registrations)
//	holisticCheckin.data  label: checked in                   (all registrations)
//	holisticSubmit.data   label: name or e-mail in the CSV    (all registrations)
//	checkin.data          label: checked in                   (only those who RSVPed or checked in)
//	submit.data           label: name or e-mail in the CSV    (only those who checked in)
//
// Registrations come from db.NewDB().AllRegistrations(). Their features are
// computed concurrently by `workers` goroutines, while all file writes happen
// on the calling goroutine. Sample order in the files follows completion order
// and is therefore not deterministic.
func generate() {
	accepted := openDataset("./accepted.data")
	defer accepted.Close()
	checkin := openDataset("./checkin.data")
	defer checkin.Close()
	holisticCheckin := openDataset("./holisticCheckin.data")
	defer holisticCheckin.Close()
	submit := openDataset("./submit.data")
	defer submit.Close()
	holisticSubmit := openDataset("./holisticSubmit.data")
	defer holisticSubmit.Close()

	submitted := loadSubmitted(*file)

	firebase := db.NewDB()
	regs, err := firebase.AllRegistrations()
	if err != nil {
		log.Fatal(err)
	}

	// Producer: feed every registration to the workers, then signal the end.
	regChan := make(chan *db.Registration, workers)
	go func() {
		defer close(regChan)
		for _, reg := range regs {
			regChan <- reg
		}
	}()

	type fetchedReg struct {
		reg      *db.Registration
		features map[int]float64
	}
	fetchedChan := make(chan *fetchedReg, workers)

	var wg sync.WaitGroup
	for i := 0; i < workers; i++ {
		wg.Add(1)
		go func() {
			defer wg.Done()
			for reg := range regChan {
				fetchedChan <- &fetchedReg{
					reg:      reg,
					features: getFeatures(reg),
				}
			}
		}()
	}
	// Close the results channel once every worker has drained regChan so the
	// consumer loop below terminates.
	go func() {
		wg.Wait()
		close(fetchedChan)
	}()

	count := 0
	for fetched := range fetchedChan {
		count++
		log.Printf("Fetched %d/%d", count, len(regs))

		reg := fetched.reg
		features := fetched.features

		isAccepted := reg.Status == db.StatusAccepted
		writeFeaturesToDataset(accepted, isAccepted, features)
		writeFeaturesToDataset(holisticCheckin, reg.CheckedIn, features)

		didSubmit := submitted[clean(reg.Name)] || submitted[clean(reg.Email)]
		writeFeaturesToDataset(holisticSubmit, didSubmit, features)

		if len(reg.RSVP) > 0 || reg.CheckedIn {
			writeFeaturesToDataset(checkin, reg.CheckedIn, features)
		}
		if reg.CheckedIn {
			writeFeaturesToDataset(submit, didSubmit, features)
		}
	}
}

// loadSubmitted reads the CSV file at path and returns the set of cleaned,
// non-empty values found from the eleventh column onwards of every record.
// generate matches these values against registration names and e-mails.
//
// Records with ten or fewer columns contribute nothing (slicing them would
// panic), and blank cells are skipped so that a registration with an empty name
// or e-mail is not treated as a match. Failure to open or parse the file is
// fatal.
func loadSubmitted(path string) map[string]bool {
	input, err := os.Open(path)
	if err != nil {
		log.Fatal(err)
	}
	defer input.Close()

	reader := csv.NewReader(input)
	// Rows may have differing numbers of columns.
	reader.FieldsPerRecord = -1
	records, err := reader.ReadAll()
	if err != nil {
		log.Fatal(err)
	}

	submitted := map[string]bool{}
	for _, record := range records {
		if len(record) <= 10 {
			continue
		}
		for _, entry := range record[10:] {
			if key := clean(entry); key != "" {
				submitted[key] = true
			}
		}
	}
	return submitted
}
// clean normalises str for use as a lookup key: leading and trailing
// whitespace is removed and the remainder is lowercased.
func clean(str string) string {
	trimmed := strings.TrimSpace(str)
	return strings.ToLower(trimmed)
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment