Skip to content

Instantly share code, notes, and snippets.

@statsccpr
Last active January 30, 2019 00:27
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save statsccpr/a22329bbe1bb0d9ea1173239f3df65b6 to your computer and use it in GitHub Desktop.
Save statsccpr/a22329bbe1bb0d9ea1173239f3df65b6 to your computer and use it in GitHub Desktop.
exer_merge
name nba school_college school_high source url_src
Lonzo Ball LAL University of California, Los Angeles Chino Hills in Chino Hills 1 https://www.basketball-reference.com/players/b/balllo01.html
Kyle Kuzma LAL University of Utah Rise Academy in Philadelphia 1 https://www.basketball-reference.com/players/k/kuzmaky01.html
Luc Mbah a Moute MIL University of California, Los Angeles Monteverde Academy in St. Monteverde, Florida 1 https://www.basketball-reference.com/players/m/mbahalu01.html
Nene NYK Instituto Alvaro Guiao in Sao Carlos, Brazil 1 https://www.basketball-reference.com/players/h/hilarne01.html
Andrew Nene UMass-Lowell Abbey Kelley School 2 https://www.sports-reference.com/cbb/players/andrew-nene-1.html
Bobby Brown SAC California State University, Fullerton Westchester 1 https://www.basketball-reference.com/players/b/brownbo02.html
Bobby Brown Alabama State Lithia Springs HS 2 https://www.sports-reference.com/cbb/players/bobby-brown-2.html
Josh Smith ATL Oak Hill Academy in Mouth of Wilson 1 https://www.basketball-reference.com/players/s/smithjo03.html
Josh Smith Savannah State 2 https://www.sports-reference.com/cbb/players/josh-smith-1.html
Josh Smith Clemson Olymipc HS 2 https://www.sports-reference.com/cbb/players/josh-smith-2.html
name nba school_college school_high source url_src
Lonzo Ball LAL University of California, Los Angeles Chino Hills in Chino Hills 1 https://www.basketball-reference.com/players/b/balllo01.html
Kyle Kuzma LAL University of Utah Rise Academy in Philadelphia 1 https://www.basketball-reference.com/players/k/kuzmaky01.html
Luc Mbah a Moute MIL University of California, Los Angeles Monteverde Academy in St. Monteverde, Florida 1 https://www.basketball-reference.com/players/m/mbahalu01.html
Nene NYK Instituto Alvaro Guiao in Sao Carlos, Brazil 1 https://www.basketball-reference.com/players/h/hilarne01.html
Bobby Brown SAC California State University, Fullerton Westchester 1 https://www.basketball-reference.com/players/b/brownbo02.html
Josh Smith ATL Oak Hill Academy in Mouth of Wilson 1 https://www.basketball-reference.com/players/s/smithjo03.html
name nba school_college school_high source url_src
Lonzo Ball UCLA Chino HIlls High School 2 https://www.sports-reference.com/cbb/players/lonzo-ball-1.html
Kyle Kuzma Utah Rise Academy 2 https://www.sports-reference.com/cbb/players/kyle-kuzma-1.html
Luc Richard Mbah a Moute UCLA 2 https://www.sports-reference.com/cbb/players/luc-richard-mbah-a-moute-1.html
Andrew Nene UMass-Lowell Abbey Kelley School 2 https://www.sports-reference.com/cbb/players/andrew-nene-1.html
Bobby Brown Cal State Fullerton 2 https://www.sports-reference.com/cbb/players/bobby-brown-1.html
Bobby Brown Alabama State Lithia Springs HS 2 https://www.sports-reference.com/cbb/players/bobby-brown-2.html
Josh Smith Savannah State 2 https://www.sports-reference.com/cbb/players/josh-smith-1.html
Josh Smith Clemson Olymipc HS 2 https://www.sports-reference.com/cbb/players/josh-smith-2.html
# setup -------------------------------------------------------------------
# Load the two player tables (NBA and NCAA) and keep untouched working copies
# that the join sections below operate on.

# NOTE(review): changing the working directory inside a script is fragile;
# it is kept here because here::here() derives its root from the working
# directory, so removing it could change where the CSV files are looked up.
setwd("~")
getwd()

# Keep text columns as character vectors (already the default since R 4.0).
options(stringsAsFactors = FALSE)

# here::here() expects path components *relative* to the project root.
# A leading "/" either makes the component an absolute path (file looked up
# at the filesystem root) or produces a doubled separator, depending on the
# installed here/rprojroot version, so the components are passed without it.
dat_nba <- read.csv(here::here("dat_nba.csv"))
dat_ncaa <- read.csv(here::here("dat_ncaa.csv"))

# Quick sanity checks on what was read.
str(dat_nba)
head(dat_nba)
head(dat_ncaa)

# Working copies used as join inputs.
dat_nba_in <- dat_nba
dat_ncaa_in <- dat_ncaa

# Interactive inspection of both join inputs.
View(dat_nba_in)
View(dat_ncaa_in)
# deterministic -----------------------------------------------------------
# Exact-key matching: equivalent to a SQL FULL OUTER JOIN.
# https://www.w3schools.com/sql/
library(dplyr)

# Full join on the exact player name, then show only the columns that are
# useful for judging the match quality.
dat_nba_in %>%
  full_join(dat_ncaa_in, by = "name") %>%
  select(
    contains("name"), contains("source"),
    contains("high"), contains("college")
  ) %>%
  View()

# Exercise: extend `by` below to join on both "name" and "school_college".
dat_nba_in %>%
  full_join(dat_ncaa_in, by = c("name")) %>%
  select(
    contains("name"), contains("source"),
    contains("high"), contains("college")
  ) %>%
  View()
# approximate -------------------------------------------------------------
# Fuzzy matching: rows are joined when the string distance between the key
# columns is at most `max_dist`.
library(fuzzyjoin)

# Strict threshold on the name; the computed distance is kept in the
# "distance" column so it can be inspected next to the matched rows.
dat_nba_in %>%
  stringdist_full_join(
    dat_ncaa_in,
    by = "name",
    method = "jw",
    max_dist = 0.1,
    distance_col = "distance"
  ) %>%
  select(
    contains("name"), contains("source"),
    contains("high"), contains("college"),
    contains("distance")
  ) %>%
  View()
# Same fuzzy join on "name" with a looser threshold (max_dist = 0.5).
# The threshold was previously left blank (`max_dist= ,`), which passes an
# empty argument and makes the call fail at run time; it is now set to the
# value the exercise asks for. Change it to explore other thresholds.
stringdist_full_join(
  x = dat_nba_in,
  y = dat_ncaa_in,
  by = "name",
  max_dist = 0.5,
  method = "jw",
  distance_col = "distance"
) %>%
  select(
    contains("name"), contains("source"),
    contains("high"), contains("college"),
    contains("distance")
  ) %>%
  View()
# Fuzzy join on two columns at once: "name" and "school_college".
dat_nba_in %>%
  stringdist_full_join(
    dat_ncaa_in,
    by = c("name", "school_college"),
    method = "jw",
    max_dist = 0.5,
    distance_col = "distance"
  ) %>%
  select(
    contains("name"), contains("source"),
    contains("high"), contains("college"),
    contains("distance")
  ) %>%
  View()

# Same idea with "name" and "school_high".
dat_nba_in %>%
  stringdist_full_join(
    dat_ncaa_in,
    by = c("name", "school_high"),
    method = "jw",
    max_dist = 0.5,
    distance_col = "distance"
  ) %>%
  select(
    contains("name"), contains("source"),
    contains("high"), contains("college"),
    contains("distance")
  ) %>%
  View()

# Exercise: extend `by` below to "name", "school_college" and "school_high".
dat_nba_in %>%
  stringdist_full_join(
    dat_ncaa_in,
    by = c("name"),
    method = "jw",
    max_dist = 0.7,
    distance_col = "distance"
  ) %>%
  select(
    contains("name"), contains("source"),
    contains("high"), contains("college"),
    contains("distance")
  ) %>%
  View()
# probabilistic -----------------------------------------------------------
# Installation (run once):
# library(devtools)
# install_github("kosukeimai/fastLink",dependencies=TRUE)
library(fastLink)

# fastLink() is the all-in-one wrapper.
?fastLink()

# The lower-level functions give more control over each step:
# define pattern features + estimate + filter + lookup.
dfA <- dat_nba_in
dfB <- dat_ncaa_in
names(dfA)

# Agreement patterns, i.e. the 'features / predictors' of the model.
?gammaCKpar
feat_1 <- gammaCKpar(
  dfA$name, dfB$name,
  # cut.p=0.7,
  cut.a = 0.8,
  cut.p = 0.5
)
glimpse(feat_1)

?gammaCK2par
feat_1_ck2 <- gammaCK2par(dfA$name, dfB$name, cut.a = 0.5)

# Exercise: try different threshold values of cut.a.
feat_2 <- gammaCK2par(dfA$school_high, dfB$school_high, cut.a = 0.5)
feat_3 <- gammaCK2par(dfA$school_college, dfB$school_college, cut.a = 0.5)
# Exercise: try gammaCKpar() for feat_2 and feat_3.

# Nested feature sets: name only, name + high school, all three.
ls_feat_1 <- list(feat_1)
ls_feat_12 <- c(ls_feat_1, list(feat_2))
ls_feat_123 <- c(ls_feat_12, list(feat_3))

# Print the pattern counts for each feature set ...
tableCounts(ls_feat_1, nobs.a = nrow(dfA), nobs.b = nrow(dfB))
tableCounts(ls_feat_12, nobs.a = nrow(dfA), nobs.b = nrow(dfB))
tableCounts(ls_feat_123, nobs.a = nrow(dfA), nobs.b = nrow(dfB))

# ... and store them for the estimation step.
tc_feat_1 <- tableCounts(ls_feat_1, nobs.a = nrow(dfA), nobs.b = nrow(dfB))
tc_feat_12 <- tableCounts(ls_feat_12, nobs.a = nrow(dfA), nobs.b = nrow(dfB))
tc_feat_123 <- tableCounts(ls_feat_123, nobs.a = nrow(dfA), nobs.b = nrow(dfB))
# Estimate the 'importance/relevance' parameters of the features.

# Model 1: name feature only.
em <- emlinkMARmov(
  patterns = tc_feat_1,
  nobs.a = nrow(dfA),
  nobs.b = nrow(dfB)
)

# Keep the pairs above the threshold and look up the matched rows.
matchesLink(
  gammalist = ls_feat_1,
  em = em,
  thresh = .2,
  nobs.a = nrow(dfA),
  nobs.b = nrow(dfB)
) %>%
  getMatches(
    fl.out = .,
    dfA = dat_nba_in,
    dfB = dat_ncaa_in,
    combine.dfs = FALSE
  ) %>%
  View()

# Observations on the name-only result:
# - one Josh Smith has no college but lists Oak Hill as high school
# - another Josh Smith has a college but no high school

# Model 2: name + high school; inspect the fitted model.
emlinkMARmov(
  patterns = tc_feat_12,
  nobs.a = nrow(dfA),
  nobs.b = nrow(dfB)
) %>%
  summary()

# Model 3: name + high school + college; inspect the fitted model.
emlinkMARmov(
  patterns = tc_feat_123,
  nobs.a = nrow(dfA),
  nobs.b = nrow(dfB)
) %>%
  summary()

# Model 3 again, this time carried through to the matched rows.
emlinkMARmov(
  patterns = tc_feat_123,
  nobs.a = nrow(dfA),
  nobs.b = nrow(dfB)
) %>%
  matchesLink(
    gammalist = ls_feat_123,
    em = .,
    thresh = .2,
    nobs.a = nrow(dfA),
    nobs.b = nrow(dfB)
  ) %>%
  getMatches(
    fl.out = .,
    dfA = dat_nba_in,
    dfB = dat_ncaa_in,
    combine.dfs = FALSE
  ) %>%
  View()

# For this example we should probably rely on the name only: the high
# school and university fields need cleaning before they can be used reliably.
# misc scrub b4 fun --------------------------------------------------------------
# Example of scrubbing / parsing strings before matching, for the case where
# the string is a postal address.
#
# step 0) install the libpostal library first
#         https://github.com/openvenues/libpostal
# step 1) install the R binding / wrapper
#         devtools::install_github("ironholds/poster")
library(poster)
?normalise_addr

# Parse a well-formed address, then one with the house number glued to the
# street name.
parse_addr("926 Broxton Ave, Los Angeles, CA 90024")
parse_addr("926Broxton Ave, Los Angeles, CA 90024")

# Rebuild "house number + road" from the parsed components.
paste(
  house_number("781 st Street"),
  road("781 st Street")
)
?regex

# Insert a space between a run of digits and the letter that immediately
# follows it (first occurrence only, case-insensitive).
sub("([0-9]+)([a-z])", "\\1 \\2", "781ststreet", ignore.case = TRUE)

sub(
  "([0-9]+)([a-z])", "\\1 \\2",
  "926Broxton Ave, Los Angeles, CA 90024",
  ignore.case = TRUE
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment