-
-
Save statsccpr/a22329bbe1bb0d9ea1173239f3df65b6 to your computer and use it in GitHub Desktop.
exer_merge
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
name | nba | school_college | school_high | source | url_src | |
---|---|---|---|---|---|---|
Lonzo Ball | LAL | University of California, Los Angeles | Chino Hills in Chino Hills | 1 | https://www.basketball-reference.com/players/b/balllo01.html | |
Kyle Kuzma | LAL | University of Utah | Rise Academy in Philadelphia | 1 | https://www.basketball-reference.com/players/k/kuzmaky01.html | |
Luc Mbah a Moute | MIL | University of California, Los Angeles | Monteverde Academy in St. Monteverde, Florida | 1 | https://www.basketball-reference.com/players/m/mbahalu01.html | |
Nene | NYK | Instituto Alvaro Guiao in Sao Carlos, Brazil | 1 | https://www.basketball-reference.com/players/h/hilarne01.html | ||
Andrew Nene | UMass-Lowell | Abbey Kelley School | 2 | https://www.sports-reference.com/cbb/players/andrew-nene-1.html | ||
Bobby Brown | SAC | California State University, Fullerton | Westchester | 1 | https://www.basketball-reference.com/players/b/brownbo02.html | |
Bobby Brown | Alabama State | Lithia Springs HS | 2 | https://www.sports-reference.com/cbb/players/bobby-brown-2.html | ||
Josh Smith | ATL | Oak Hill Academy in Mouth of Wilson | 1 | https://www.basketball-reference.com/players/s/smithjo03.html | ||
Josh Smith | Savannah State | 2 | https://www.sports-reference.com/cbb/players/josh-smith-1.html | |||
Josh Smith | Clemson | Olymipc HS | 2 | https://www.sports-reference.com/cbb/players/josh-smith-2.html |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
name | nba | school_college | school_high | source | url_src | |
---|---|---|---|---|---|---|
Lonzo Ball | LAL | University of California, Los Angeles | Chino Hills in Chino Hills | 1 | https://www.basketball-reference.com/players/b/balllo01.html | |
Kyle Kuzma | LAL | University of Utah | Rise Academy in Philadelphia | 1 | https://www.basketball-reference.com/players/k/kuzmaky01.html | |
Luc Mbah a Moute | MIL | University of California, Los Angeles | Monteverde Academy in St. Monteverde, Florida | 1 | https://www.basketball-reference.com/players/m/mbahalu01.html | |
Nene | NYK | Instituto Alvaro Guiao in Sao Carlos, Brazil | 1 | https://www.basketball-reference.com/players/h/hilarne01.html | ||
Bobby Brown | SAC | California State University, Fullerton | Westchester | 1 | https://www.basketball-reference.com/players/b/brownbo02.html | |
Josh Smith | ATL | Oak Hill Academy in Mouth of Wilson | 1 | https://www.basketball-reference.com/players/s/smithjo03.html |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
name | nba | school_college | school_high | source | url_src | |
---|---|---|---|---|---|---|
Lonzo Ball | UCLA | Chino HIlls High School | 2 | https://www.sports-reference.com/cbb/players/lonzo-ball-1.html | ||
Kyle Kuzma | Utah | Rise Academy | 2 | https://www.sports-reference.com/cbb/players/kyle-kuzma-1.html | ||
Luc Richard Mbah a Moute | UCLA | 2 | https://www.sports-reference.com/cbb/players/luc-richard-mbah-a-moute-1.html | |||
Andrew Nene | UMass-Lowell | Abbey Kelley School | 2 | https://www.sports-reference.com/cbb/players/andrew-nene-1.html | ||
Bobby Brown | Cal State Fullerton | 2 | https://www.sports-reference.com/cbb/players/bobby-brown-1.html | |||
Bobby Brown | Alabama State | Lithia Springs HS | 2 | https://www.sports-reference.com/cbb/players/bobby-brown-2.html | ||
Josh Smith | Savannah State | 2 | https://www.sports-reference.com/cbb/players/josh-smith-1.html | |||
Josh Smith | Clemson | Olymipc HS | 2 | https://www.sports-reference.com/cbb/players/josh-smith-2.html |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# setup -------------------------------------------------------------------
# Load the two input tables (NBA roster and NCAA roster) and keep untouched
# working copies (`*_in`) that every later section joins on.

# NOTE: the script no longer calls setwd("~"). Changing the working directory
# is a global side effect, and it also moves the directory from which `here`
# looks for the project root. Run the script from the folder (or project) that
# holds the csv files instead.
getwd()

# Already the default on R >= 4.0; kept so older R versions read strings as
# character rather than factor.
options(stringsAsFactors = FALSE)

# No leading slash: here::here() expects path components relative to the
# project root. A leading "/" makes the component look like an absolute path
# (or yields a doubled separator, depending on the here/rprojroot version).
dat_nba <- read.csv(here::here("dat_nba.csv"))
dat_ncaa <- read.csv(here::here("dat_ncaa.csv"))

# Quick structural sanity checks.
str(dat_nba)
head(dat_nba)
head(dat_ncaa)

# Working copies used as join inputs below.
dat_nba_in <- dat_nba
dat_ncaa_in <- dat_ncaa

# Interactive inspection (needs a data viewer, e.g. RStudio).
View(dat_nba_in)
View(dat_ncaa_in)
# deterministic -----------------------------------------------------------
# Exact-key linkage: this is just a SQL-style full outer join on `name`.
# https://www.w3schools.com/sql/
library(dplyr)

# Full join on the exact player name; keep only the columns worth comparing.
dat_nba_in %>%
  dplyr::full_join(dat_ncaa_in, by = "name") %>%
  select(
    contains("name"),
    contains("source"),
    contains("high"),
    contains("college")
  ) %>%
  View()

# try to join by= "name" and "school_college"
# (exercise: extend the `by` vector below)
dat_nba_in %>%
  dplyr::full_join(dat_ncaa_in, by = c("name")) %>%
  select(
    contains("name"),
    contains("source"),
    contains("high"),
    contains("college")
  ) %>%
  View()
# approximate -------------------------------------------------------------
# Fuzzy linkage: join rows whose key strings are within `max_dist` of each
# other under the Jaro-Winkler ("jw") string distance. The per-pair distance
# is returned in the `distance` column so the match quality can be inspected.
library(fuzzyjoin)

# Strict threshold: only near-identical names are linked.
dat_nba_in %>%
  stringdist_full_join(
    dat_ncaa_in,
    by = "name",
    max_dist = 0.1,
    method = "jw",
    distance_col = "distance"
  ) %>%
  select(
    contains("name"),
    contains("source"),
    contains("high"),
    contains("college"),
    contains("distance")
  ) %>%
  View()

# try changing max_dist=0.5,
# The original call left `max_dist` empty, so it did not run with the looser
# 0.5 threshold this step is about; it is filled in here as the comment asks.
dat_nba_in %>%
  stringdist_full_join(
    dat_ncaa_in,
    by = "name",
    max_dist = 0.5,
    method = "jw",
    distance_col = "distance"
  ) %>%
  select(
    contains("name"),
    contains("source"),
    contains("high"),
    contains("college"),
    contains("distance")
  ) %>%
  View()

# try using fuzzy on "name" and "school_college"
dat_nba_in %>%
  stringdist_full_join(
    dat_ncaa_in,
    by = c("name", "school_college"),
    max_dist = 0.5,
    method = "jw",
    distance_col = "distance"
  ) %>%
  select(
    contains("name"),
    contains("source"),
    contains("high"),
    contains("college"),
    contains("distance")
  ) %>%
  View()

# Same idea, but pairing the name with the high school instead.
dat_nba_in %>%
  stringdist_full_join(
    dat_ncaa_in,
    by = c("name", "school_high"),
    max_dist = 0.5,
    method = "jw",
    distance_col = "distance"
  ) %>%
  select(
    contains("name"),
    contains("source"),
    contains("high"),
    contains("college"),
    contains("distance")
  ) %>%
  View()

# try "name","school_college","school_high"
# (exercise: extend the `by` vector below)
dat_nba_in %>%
  stringdist_full_join(
    dat_ncaa_in,
    by = c("name"),
    max_dist = 0.7,
    method = "jw",
    distance_col = "distance"
  ) %>%
  select(
    contains("name"),
    contains("source"),
    contains("high"),
    contains("college"),
    contains("distance")
  ) %>%
  View()
# probabilistic -----------------------------------------------------------
# Probabilistic record linkage with fastLink.
# library(devtools)
# install_github("kosukeimai/fastLink",dependencies=TRUE)
library(fastLink)

# quick umbrella function
?fastLink()

# more control, specific functions under the hood
# define pattern features + estimate + filter + lookup
dfA <- dat_nba_in
dfB <- dat_ncaa_in
names(dfA)

# patterns aka 'features / predictors'
# Each gamma* call compares one field of dfA against the same field of dfB.
?gammaCKpar
feat_1 <- gammaCKpar(
  dfA$name, dfB$name,
  # cut.p=0.7,
  cut.p = 0.5,
  cut.a = 0.8
)
glimpse(feat_1)

?gammaCK2par
feat_1_ck2 <- gammaCK2par(dfA$name, dfB$name, cut.a = 0.5)

# try different threshold values of cut.a
feat_2 <- gammaCK2par(dfA$school_high, dfB$school_high, cut.a = 0.5)
feat_3 <- gammaCK2par(dfA$school_college, dfB$school_college, cut.a = 0.5)
# try gammaCKpar() for feat_2 and feat_3

# Feature sets of increasing size: name only, + high school, + college.
ls_feat_1 <- list(feat_1)
ls_feat_12 <- list(feat_1, feat_2)
ls_feat_123 <- list(feat_1, feat_2, feat_3)

# Print the agreement-pattern counts for each feature set ...
tableCounts(ls_feat_1, nobs.a = nrow(dfA), nobs.b = nrow(dfB))
tableCounts(ls_feat_12, nobs.a = nrow(dfA), nobs.b = nrow(dfB))
tableCounts(ls_feat_123, nobs.a = nrow(dfA), nobs.b = nrow(dfB))

# ... and keep them for the EM step.
tc_feat_123 <- tableCounts(ls_feat_123, nobs.a = nrow(dfA), nobs.b = nrow(dfB))
tc_feat_12 <- tableCounts(ls_feat_12, nobs.a = nrow(dfA), nobs.b = nrow(dfB))
tc_feat_1 <- tableCounts(ls_feat_1, nobs.a = nrow(dfA), nobs.b = nrow(dfB))

# estimate 'importance/relevance' parameters of features
# just use name feature
em <- emlinkMARmov(
  patterns = tc_feat_1,
  nobs.a = nrow(dfA),
  nobs.b = nrow(dfB)
)

# Pick pairs above the threshold, then look the matched rows up in both
# tables.
matchesLink(
  gammalist = ls_feat_1,
  nobs.a = nrow(dfA),
  nobs.b = nrow(dfB),
  em = em,
  thresh = .2
) %>%
  getMatches(
    dfA = dat_nba_in,
    dfB = dat_ncaa_in,
    fl.out = .,
    combine.dfs = FALSE
  ) %>%
  View()

# josh smith missing college but oak hill
# josh smith college but missing high school

# Fitted-model summaries when the school fields are added as features.
emlinkMARmov(
  patterns = tc_feat_12,
  nobs.a = nrow(dfA),
  nobs.b = nrow(dfB)
) %>%
  summary(.)

emlinkMARmov(
  patterns = tc_feat_123,
  nobs.a = nrow(dfA),
  nobs.b = nrow(dfB)
) %>%
  summary(.)

# Full pipeline with all three features: estimate -> match -> look up rows.
emlinkMARmov(
  patterns = tc_feat_123,
  nobs.a = nrow(dfA),
  nobs.b = nrow(dfB)
) %>%
  matchesLink(
    em = .,
    gammalist = ls_feat_123,
    nobs.a = nrow(dfA),
    nobs.b = nrow(dfB),
    thresh = .2
  ) %>%
  getMatches(
    fl.out = .,
    dfA = dat_nba_in,
    dfB = dat_ncaa_in,
    combine.dfs = FALSE
  ) %>%
  View()

# for this example, probably should only rely on name
# need to clean high school and university before they can be used reliably
# misc scrub b4 fun --------------------------------------------------------------
# Example of scrubbing / parsing a string before linkage, for the case where
# the string is a postal address.
#
# step 0) install libpostal library first
#         https://github.com/openvenues/libpostal
# step 1) install the r binding / wrapper
#         devtools::install_github("ironholds/poster")
library(poster)

?normalise_addr

# Same address with and without a space after the house number.
parse_addr("926 Broxton Ave, Los Angeles, CA 90024")
parse_addr("926Broxton Ave, Los Angeles, CA 90024")

# Rebuild "<house number> <road>" from the individually extracted parts.
paste(
  house_number("781 st Street"),
  road("781 st Street")
)
?regex

# Insert a space between a leading run of digits and the letter that directly
# follows it. sub() rewrites only the first match; the perl / fixed / useBytes
# arguments are left at their FALSE defaults.
sub("([0-9]+)([a-z])", "\\1 \\2", "781ststreet", ignore.case = TRUE)

sub(
  "([0-9]+)([a-z])",
  "\\1 \\2",
  "926Broxton Ave, Los Angeles, CA 90024",
  ignore.case = TRUE
)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment