Skip to content

Instantly share code, notes, and snippets.

@boennecd
Last active April 14, 2021 06:28
Show Gist options
  • Save boennecd/d42fc0ced67033c222ddc8c95945af00 to your computer and use it in GitHub Desktop.
Save boennecd/d42fc0ced67033c222ddc8c95945af00 to your computer and use it in GitHub Desktop.
Provides a faster version of the kinship2::makefamid.

Provides a faster version of the kinship2::makefamid function if sourced using Rcpp::sourceCpp. It also provides a new implementation and quite different implementation based largely on C++ called makefamid_new. You can try the functions by calling

download.file("https://gist.githubusercontent.com/boennecd/d42fc0ced67033c222ddc8c95945af00/raw/makefamid-fast.cpp", 
              src_file <- paste0(tempfile(), ".cpp"))
Rcpp::sourceCpp(src_file)

This will add three functions to the global environment: table_int_fast which is a fast version of table only for integers and makefamid_fast which is a fast version of makefamid, and makefamid_new which is the new mainly C++ implementation. Two examples of using these functions are provided below:

#####
# test and benchmark the table_int_fast function
set.seed(1)
x <- sample.int(10L, 1000L, replace = TRUE)

# we get the same
v1 <- table(x)
v2 <- table_int_fast(x)
all.equal(as.vector(unclass(v1)), v2, check.attributes = FALSE)

# the new version is faster
microbenchmark::microbenchmark(table = table(x),
                               table_int_fast = table_int_fast(x))

#####
# test and benchmark the makefamid_fast and makefamid_new functions
base_fam <- data.frame(
  id  =c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14),
  mom =c(0, 0, 0, 0, 2, 2, 4, 4, 6,  2,  0,  0, 12, 13),
  dad =c(0, 0, 0, 0, 1, 1, 3, 3, 3,  7,  0,  0, 11, 10),
  sex =c(0, 1, 0, 1, 0, 1, 0, 1, 0,  0,  0,  1,  1,  1))
f1 <- base_fam
f2 <- within(base_fam, {
  id <- id + 14L
  mom[mom > 0] <- mom[mom > 0] + 14L
  dad[dad > 0] <- dad[dad > 0] + 14L
})
dat <- rbind(f1, f2)
dat <- dat[sample.int(NROW(dat)), ] # scramble data order

library(kinship2)
r1 <- with(dat, makefamid_fast(id = id, father.id = dad, mother.id = mom))
r2 <- with(dat, makefamid     (id = id, father.id = dad, mother.id = mom))
r3 <- with(dat, makefamid_new (id = id, father.id = dad, mother.id = mom))
all.equal(r1, r2) # we get the same
all.equal(r2, r3) # we get the same

# compare computation time
microbenchmark::microbenchmark(
  kinship2 = 
    with(dat, makefamid     (id = id, father.id = dad, mother.id = mom)), 
  makefamid_fast = 
    with(dat, makefamid_fast(id = id, father.id = dad, mother.id = mom)),
  makefamid_new = 
    with(dat, makefamid_new (id = id, father.id = dad, mother.id = mom)))

Performance differences may be greater on larger data sets. The functions will not work if there are NAs in the data.

#include <Rcpp.h>
#include <numeric>
#include <map>
#include <unordered_map>
#include <unordered_set>
#include <memory>
#include <algorithm>
#include <limits>
// fast table for integer vectors. There is no checks for NAs.
// [[Rcpp::export(rng = false)]]
Rcpp::IntegerVector table_int_fast(Rcpp::IntegerVector const x){
std::map<int, R_xlen_t> counter_sorted;
{
std::unordered_map<int, R_xlen_t> counter;
int const *xi = &x[0];
for(R_xlen_t i = 0; i < x.size(); ++i)
counter[*xi++]++;
for(auto &v : counter)
counter_sorted[v.first] = v.second;
}
Rcpp::IntegerVector out(counter_sorted.size());
Rcpp::CharacterVector names(counter_sorted.size());
R_xlen_t i = 0L;
for(auto &v : counter_sorted){
names[i ] = v.first;
out [i++] = v.second;
}
out.names() = names;
return out;
}
// function to reduce the computation time of kinship2::makefamid. Note that
// argument newid is by reference. This only works if typeof(newid) is integer.
// [[Rcpp::export(".makefamid_fast_inner", rng = false)]]
void makefamid_fast_inner
(Rcpp::IntegerVector newid, Rcpp::IntegerVector const xid2,
Rcpp::IntegerVector const newid_sorted,
Rcpp::IntegerVector const grp_sizes){
int * const p_newid = &newid[0];
int const * p_xid2 = &xid2[0],
* p_newid_sorted = &newid_sorted[0],
* p_grp_size = &grp_sizes[0];
R_len_t const n_grps = grp_sizes.size();
if(n_grps != xid2.size())
Rcpp::stop("lengths of n_grps and mid do not match");
for(R_len_t i = 0; i < n_grps; ++i, ++p_grp_size, ++p_xid2){
int out = std::numeric_limits<int>::max();
for(int j = 0; j < *p_grp_size; ++j, ++p_newid_sorted)
if(out > *p_newid_sorted)
out = *p_newid_sorted;
if(p_newid[*p_xid2] > out)
p_newid[*p_xid2] = out;
}
}
namespace {
/// keeps track of family members
struct family_tracker {
struct family_member {
R_len_t id;
family_tracker * family;
family_member * next_member = nullptr;
};
R_len_t fam_id = std::numeric_limits<R_len_t>::max();
family_member * first_ele = nullptr,
* last_ele = nullptr;
size_t n_ele = 0L;
inline void add_member(family_member * const new_member){
if(!new_member)
return;
fam_id = std::min(fam_id, new_member->id);
new_member->family = this;
if(n_ele < 1L){
// the first member
first_ele = new_member;
last_ele = new_member;
n_ele = 1L;
return;
}
// add the member
last_ele->next_member = new_member;
last_ele = new_member;
++n_ele;
}
// should not be used
family_tracker() = default;
void merge(family_tracker &other){
if(other.fam_id != fam_id)
add_members(other);
}
void merge(std::unordered_set<family_tracker*> &others){
for(auto x : others)
if(x->fam_id != fam_id)
add_members(*x);
}
private:
inline void add_members(family_tracker &other){
// change the families
{
family_member * x = other.first_ele;
for(size_t i = 0; i < other.n_ele; ++i, x = x->next_member)
x->family = this;
}
// update the last element, the number of elements, and the family id
last_ele->next_member = other.first_ele;
last_ele = other.last_ele;
n_ele += other.n_ele;
fam_id = std::min(fam_id, other.fam_id);
// update other
other.first_ele = nullptr;
other.last_ele = nullptr;
other.n_ele = 0L;
}
};
} // namespace
// [[Rcpp::export(".makefamid_new", rng = false)]]
Rcpp::IntegerVector makefamid_new(Rcpp::IntegerVector const drow,
Rcpp::IntegerVector const mrow){
// setup families
R_len_t const n = drow.size();
std::unique_ptr<family_tracker::family_member[]> members(
new family_tracker::family_member[n]);
std::unique_ptr<family_tracker[]> trackers(new family_tracker[n]);
for(R_len_t j = 0; j < n; ++j){
members[j] = family_tracker::family_member { j, nullptr };
trackers[j].add_member(&members[j]);
}
// compute the order for drow and mrow
auto get_ord = [&](Rcpp::IntegerVector const x) -> std::unique_ptr<int[]> {
std::unique_ptr<int[]> out(new int[x.size()]);
std::iota(out.get(), out.get() + x.size(), 0);
int const *xp = &x[0];
std::sort(out.get(), out.get() + x.size(),
[&](int const i, int const j){ return xp[i] < xp[j]; });
return out;
};
std::unique_ptr<int[]> const m_ord = get_ord(mrow),
d_ord = get_ord(drow);
// find the family ids. Assign a lambda function to add all the relations
// the mothers or fathers
auto merge_families =
[&](int const * prow, int const * p_ord) -> void {
std::unordered_set<family_tracker*> families;
for(R_len_t c = 0L; c < n; ){
R_len_t const cur_parent = prow[p_ord[c]];
if(cur_parent >= n)
// we are done (reached the founders with no parents)
break;
// find the families to merge. We add the families to the largest one.
// This is computationally attractive. Thus, we need to keep track of the
// largest family
families.clear();
family_tracker * largest_family(nullptr);
size_t largest_family_size(0L);
// adds a family to be merged
auto add_family = [&](family_tracker * const new_fam) -> void {
auto const ret_info = families.insert(new_fam);
if(ret_info.second and
new_fam->n_ele > largest_family_size){
largest_family_size = new_fam->n_ele;
largest_family = new_fam;
}
};
// add the children to this family
{
R_len_t p_ord_c;
while(c < n and prow[(p_ord_c = p_ord[c])] == cur_parent){
add_family(members[p_ord_c].family);
++c;
}
}
// merge the families
if(families.size() < 1)
// nothing to merge
continue;
add_family(members[cur_parent].family);
largest_family->merge(families);
}
};
// add the relations through the mothers and fathers
merge_families(&mrow[0], &m_ord[0]);
merge_families(&drow[0], &d_ord[0]);
// setup the return object and return
Rcpp::IntegerVector out(n);
for(R_len_t j = 0; j < n; ++j)
out[j] = members[j].family->fam_id + 1L;
return out;
}
/*** R
# faster version of kinship2::makefamid.
makefamid_fast <- function(id, father.id, mother.id){
n <- length(id)
mid <- c(match(mother.id, id, nomatch = n + 1), n + 1)
did <- c(match(father.id, id, nomatch = n + 1), n + 1)
## ADDED ##
m_newid <- rep(NA_integer_, length(mid))
m_ord <- order(order(mid))
m_grp_size <- table_int_fast(mid)
d_newid <- rep(NA_integer_, length(did))
d_ord <- order(order(did))
d_grp_size <- table_int_fast(did)
###########
## CHANGED ##
# mid2 <- sort(unique(mid))
# did2 <- sort(unique(did))
mid2_m1 <- sort(unique(mid)) - 1L
did2_m1 <- sort(unique(did)) - 1L
#############
famid <- 1:(n + 1)
temp <- kindepth(id, father.id, mother.id)
for (i in 1:n) {
newid <- pmin(famid, famid[mid], famid[did])
## CHANGED ##
# newid[mid2] <- pmin(newid[mid2], tapply(newid, mid, min))
m_newid[m_ord] <- newid
.makefamid_fast_inner(newid = newid, xid2 = mid2_m1,
newid_sorted = m_newid, grp_sizes = m_grp_size)
#############
newid[n + 1] <- n + 1L
## CHANGED ##
# newid[did2] <- pmin(newid[did2], tapply(newid, did, min))
d_newid[d_ord] <- newid
dum <- .makefamid_fast_inner(newid = newid, xid2 = did2_m1,
newid_sorted = d_newid, grp_sizes = d_grp_size)
#############
newid[n + 1] <- n + 1L
if (all(newid == famid))
break
else if (i < n)
famid <- newid
}
if (all(newid == famid)) {
famid <- famid[1:n]
xx <- table_int_fast(famid)
if (any(xx == 1)) {
singles <- as.integer(names(xx[xx == 1]))
famid[!is.na(match(famid, singles))] <- 0L
match(famid, sort(unique(famid))) - 1L
}
else match(famid, sort(unique(famid)))
}
else stop("Bug in routine: seem to have found an infinite loop")
}
# a completely new implementation of kinship2::makefamid largely based on C++.
makefamid_new <- function(id, father.id, mother.id){
stopifnot(require(kinship2), !anyDuplicated(id))
n <- length(id)
mrow <- match(mother.id, id, nomatch = n + 1) - 1L
drow <- match(father.id, id, nomatch = n + 1) - 1L
out <- .makefamid_new(mrow = mrow, drow = drow)
xx <- table_int_fast(out)
if(any(xx == 1)) {
singles <- as.integer(names(xx[xx == 1]))
out[!is.na(match(out, singles))] <- 0L
match(out, sort(unique(out))) - 1L
}
else match(out, sort(unique(out)))
}
*/
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment