boennecd/README.md

## README.md

      
    Raw
  

              README.md
            
          
    Provides a faster version of the kinship2::makefamid function if sourced using Rcpp::sourceCpp.
It also provides a new implementation and quite different implementation based largely on C++
called makefamid_new. You can try the functions by calling
download.file("https://gist.githubusercontent.com/boennecd/d42fc0ced67033c222ddc8c95945af00/raw/makefamid-fast.cpp", 
              src_file <- paste0(tempfile(), ".cpp"))
Rcpp::sourceCpp(src_file)
This will add three functions to the global environment: table_int_fast which is a fast version of
table only for integers and makefamid_fast which is a fast version of makefamid, and makefamid_new
which is the new mainly C++ implementation.
Two examples of using these functions are provided below:
#####
# test and benchmark the table_int_fast function
set.seed(1)
x <- sample.int(10L, 1000L, replace = TRUE)

# we get the same
v1 <- table(x)
v2 <- table_int_fast(x)
all.equal(as.vector(unclass(v1)), v2, check.attributes = FALSE)

# the new version is faster
microbenchmark::microbenchmark(table = table(x),
                               table_int_fast = table_int_fast(x))

#####
# test and benchmark the makefamid_fast and makefamid_new functions
base_fam <- data.frame(
  id  =c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14),
  mom =c(0, 0, 0, 0, 2, 2, 4, 4, 6,  2,  0,  0, 12, 13),
  dad =c(0, 0, 0, 0, 1, 1, 3, 3, 3,  7,  0,  0, 11, 10),
  sex =c(0, 1, 0, 1, 0, 1, 0, 1, 0,  0,  0,  1,  1,  1))
f1 <- base_fam
f2 <- within(base_fam, {
  id <- id + 14L
  mom[mom > 0] <- mom[mom > 0] + 14L
  dad[dad > 0] <- dad[dad > 0] + 14L
})
dat <- rbind(f1, f2)
dat <- dat[sample.int(NROW(dat)), ] # scramble data order

library(kinship2)
r1 <- with(dat, makefamid_fast(id = id, father.id = dad, mother.id = mom))
r2 <- with(dat, makefamid     (id = id, father.id = dad, mother.id = mom))
r3 <- with(dat, makefamid_new (id = id, father.id = dad, mother.id = mom))
all.equal(r1, r2) # we get the same
all.equal(r2, r3) # we get the same

# compare computation time
microbenchmark::microbenchmark(
  kinship2 = 
    with(dat, makefamid     (id = id, father.id = dad, mother.id = mom)), 
  makefamid_fast = 
    with(dat, makefamid_fast(id = id, father.id = dad, mother.id = mom)),
  makefamid_new = 
    with(dat, makefamid_new (id = id, father.id = dad, mother.id = mom)))
Performance differences may be greater on larger data sets.
The functions will not work if there are NAs in the data.

  
## makefamid-fast.cpp
#include <Rcpp.h>
#include <numeric>
#include <map>
#include <unordered_map>
#include <unordered_set>
#include <memory>
#include <algorithm>
#include <limits>

// fast table for integer vectors. There is no checks for NAs.
// [[Rcpp::export(rng = false)]]
Rcpp::IntegerVector table_int_fast(Rcpp::IntegerVector const x){
  std::map<int, R_xlen_t> counter_sorted;
  {
    std::unordered_map<int, R_xlen_t> counter;
    int const *xi = &x[0];
    for(R_xlen_t i = 0; i < x.size(); ++i)
      counter[*xi++]++;

    for(auto &v : counter)
      counter_sorted[v.first] = v.second;
  }

  Rcpp::IntegerVector out(counter_sorted.size());
  Rcpp::CharacterVector names(counter_sorted.size());
  R_xlen_t i = 0L;
  for(auto &v : counter_sorted){
    names[i  ] = v.first;
    out  [i++] = v.second;
  }

  out.names() = names;
  return out;
}

// function to reduce the computation time of kinship2::makefamid. Note that
// argument newid is by reference. This only works if typeof(newid) is integer.
// [[Rcpp::export(".makefamid_fast_inner", rng = false)]]
void makefamid_fast_inner
  (Rcpp::IntegerVector newid, Rcpp::IntegerVector const xid2,
   Rcpp::IntegerVector const newid_sorted,
   Rcpp::IntegerVector const grp_sizes){
  int       * const p_newid        = &newid[0];
  int const *       p_xid2         = &xid2[0],
            *       p_newid_sorted = &newid_sorted[0],
            *       p_grp_size     = &grp_sizes[0];

  R_len_t const n_grps = grp_sizes.size();
  if(n_grps != xid2.size())
    Rcpp::stop("lengths of n_grps and mid do not match");

  for(R_len_t i = 0; i < n_grps; ++i, ++p_grp_size, ++p_xid2){
    int out = std::numeric_limits<int>::max();
    for(int j = 0; j < *p_grp_size; ++j, ++p_newid_sorted)
      if(out > *p_newid_sorted)
        out = *p_newid_sorted;

    if(p_newid[*p_xid2] > out)
      p_newid[*p_xid2] = out;
  }
}

namespace {
/// keeps track of family members
struct family_tracker {
  struct family_member {
    R_len_t id;
    family_tracker * family;
    family_member * next_member = nullptr;
  };

  R_len_t fam_id = std::numeric_limits<R_len_t>::max();
  family_member * first_ele = nullptr,
                * last_ele = nullptr;
  size_t n_ele = 0L;

  inline void add_member(family_member * const new_member){
    if(!new_member)
      return;

    fam_id = std::min(fam_id, new_member->id);
    new_member->family = this;

    if(n_ele < 1L){
      // the first member
      first_ele = new_member;
      last_ele = new_member;
      n_ele = 1L;
      return;
    }

    // add the member
    last_ele->next_member = new_member;
    last_ele = new_member;
    ++n_ele;
  }

  // should not be used
  family_tracker() = default;

  void merge(family_tracker &other){
    if(other.fam_id != fam_id)
      add_members(other);
  }

  void merge(std::unordered_set<family_tracker*> &others){
    for(auto x : others)
      if(x->fam_id != fam_id)
        add_members(*x);
  }

private:
  inline void add_members(family_tracker &other){
    // change the families
    {
      family_member * x = other.first_ele;
      for(size_t i = 0; i < other.n_ele; ++i, x = x->next_member)
        x->family = this;
    }

    // update the last element, the number of elements, and the family id
    last_ele->next_member = other.first_ele;
    last_ele = other.last_ele;
    n_ele += other.n_ele;
    fam_id = std::min(fam_id, other.fam_id);

    // update other
    other.first_ele = nullptr;
    other.last_ele = nullptr;
    other.n_ele = 0L;
  }
};

} // namespace

// [[Rcpp::export(".makefamid_new", rng = false)]]
Rcpp::IntegerVector makefamid_new(Rcpp::IntegerVector const drow,
                                  Rcpp::IntegerVector const mrow){
  // setup families
  R_len_t const n = drow.size();

  std::unique_ptr<family_tracker::family_member[]> members(
    new family_tracker::family_member[n]);
  std::unique_ptr<family_tracker[]> trackers(new family_tracker[n]);
  for(R_len_t j = 0; j < n; ++j){
    members[j] = family_tracker::family_member { j, nullptr };
    trackers[j].add_member(&members[j]);
  }

  // compute the order for drow and mrow
  auto get_ord = [&](Rcpp::IntegerVector const x) -> std::unique_ptr<int[]> {
    std::unique_ptr<int[]> out(new int[x.size()]);
    std::iota(out.get(), out.get() + x.size(), 0);

    int const *xp = &x[0];
    std::sort(out.get(), out.get() + x.size(),
              [&](int const i, int const j){ return xp[i] < xp[j]; });

    return out;
  };

  std::unique_ptr<int[]> const m_ord = get_ord(mrow),
                               d_ord = get_ord(drow);

  // find the family ids. Assign a lambda function to add all the relations
  // the mothers or fathers
  auto merge_families =
    [&](int const * prow, int const * p_ord) -> void {
    std::unordered_set<family_tracker*> families;
    for(R_len_t c = 0L; c < n; ){
      R_len_t const cur_parent = prow[p_ord[c]];
      if(cur_parent >= n)
        // we are done (reached the founders with no parents)
        break;

      // find the families to merge. We add the families to the largest one.
      // This is computationally attractive. Thus, we need to keep track of the
      // largest family
      families.clear();
      family_tracker * largest_family(nullptr);
      size_t largest_family_size(0L);

      // adds a family to be merged
      auto add_family = [&](family_tracker * const new_fam) -> void {
        auto const ret_info = families.insert(new_fam);
        if(ret_info.second and
             new_fam->n_ele > largest_family_size){
          largest_family_size = new_fam->n_ele;
          largest_family = new_fam;
        }
      };

      // add the children to this family
      {
        R_len_t p_ord_c;
        while(c < n and prow[(p_ord_c = p_ord[c])] == cur_parent){
          add_family(members[p_ord_c].family);
          ++c;
        }
      }

      // merge the families
      if(families.size() < 1)
        // nothing to merge
        continue;

      add_family(members[cur_parent].family);
      largest_family->merge(families);
    }
  };

  // add the relations through the mothers and fathers
  merge_families(&mrow[0], &m_ord[0]);
  merge_families(&drow[0], &d_ord[0]);

  // setup the return object and return
  Rcpp::IntegerVector out(n);
  for(R_len_t j = 0; j < n; ++j)
    out[j] = members[j].family->fam_id + 1L;

  return out;
}

/*** R
# faster version of kinship2::makefamid.
makefamid_fast <- function(id, father.id, mother.id){
  n <- length(id)
  mid <- c(match(mother.id, id, nomatch = n + 1), n + 1)
  did <- c(match(father.id, id, nomatch = n + 1), n + 1)
  ## ADDED ##
  m_newid <- rep(NA_integer_, length(mid))
  m_ord <- order(order(mid))
  m_grp_size <- table_int_fast(mid)

  d_newid <- rep(NA_integer_, length(did))
  d_ord <- order(order(did))
  d_grp_size <- table_int_fast(did)
  ###########
  ## CHANGED ##
  # mid2 <- sort(unique(mid))
  # did2 <- sort(unique(did))
  mid2_m1 <- sort(unique(mid)) - 1L
  did2_m1 <- sort(unique(did)) - 1L
  #############
  famid <- 1:(n + 1)
  temp <- kindepth(id, father.id, mother.id)
  for (i in 1:n) {
    newid <- pmin(famid, famid[mid], famid[did])
    ## CHANGED ##
    # newid[mid2] <- pmin(newid[mid2], tapply(newid, mid, min))
    m_newid[m_ord] <- newid
    .makefamid_fast_inner(newid = newid, xid2 = mid2_m1,
                          newid_sorted = m_newid, grp_sizes = m_grp_size)
    #############
    newid[n + 1] <- n + 1L
    ## CHANGED ##
    # newid[did2] <- pmin(newid[did2], tapply(newid, did, min))
    d_newid[d_ord] <- newid
    dum <- .makefamid_fast_inner(newid = newid, xid2 = did2_m1,
                                 newid_sorted = d_newid, grp_sizes = d_grp_size)
    #############
    newid[n + 1] <- n + 1L
    if (all(newid == famid))
      break
    else if (i < n)
      famid <- newid
  }
  if (all(newid == famid)) {
    famid <- famid[1:n]
    xx <- table_int_fast(famid)
    if (any(xx == 1)) {
      singles <- as.integer(names(xx[xx == 1]))
      famid[!is.na(match(famid, singles))] <- 0L
      match(famid, sort(unique(famid))) - 1L
    }
    else match(famid, sort(unique(famid)))
  }
  else stop("Bug in routine: seem to have found an infinite loop")
}

# a completely new implementation of kinship2::makefamid largely based on C++.
makefamid_new <- function(id, father.id, mother.id){
  stopifnot(require(kinship2), !anyDuplicated(id))
  n <- length(id)
  mrow <- match(mother.id, id, nomatch = n + 1) - 1L
  drow <- match(father.id, id, nomatch = n + 1) - 1L

  out <- .makefamid_new(mrow = mrow, drow = drow)

  xx <- table_int_fast(out)
  if(any(xx == 1)) {
    singles <- as.integer(names(xx[xx == 1]))
    out[!is.na(match(out, singles))] <- 0L
    match(out, sort(unique(out))) - 1L
  }
  else match(out, sort(unique(out)))
}
*/
	#include <Rcpp.h>
	#include <numeric>
	#include <map>
	#include <unordered_map>
	#include <unordered_set>
	#include <memory>
	#include <algorithm>
	#include <limits>

	// fast table for integer vectors. There is no checks for NAs.
	// [[Rcpp::export(rng = false)]]
	Rcpp::IntegerVector table_int_fast(Rcpp::IntegerVector const x){
	std::map<int, R_xlen_t> counter_sorted;
	{
	std::unordered_map<int, R_xlen_t> counter;
	int const *xi = &x[0];
	for(R_xlen_t i = 0; i < x.size(); ++i)
	counter[*xi++]++;

	for(auto &v : counter)
	counter_sorted[v.first] = v.second;
	}

	Rcpp::IntegerVector out(counter_sorted.size());
	Rcpp::CharacterVector names(counter_sorted.size());
	R_xlen_t i = 0L;
	for(auto &v : counter_sorted){
	names[i ] = v.first;
	out [i++] = v.second;
	}

	out.names() = names;
	return out;
	}

	// function to reduce the computation time of kinship2::makefamid. Note that
	// argument newid is by reference. This only works if typeof(newid) is integer.
	// [[Rcpp::export(".makefamid_fast_inner", rng = false)]]
	void makefamid_fast_inner
	(Rcpp::IntegerVector newid, Rcpp::IntegerVector const xid2,
	Rcpp::IntegerVector const newid_sorted,
	Rcpp::IntegerVector const grp_sizes){
	int * const p_newid = &newid[0];
	int const * p_xid2 = &xid2[0],
	* p_newid_sorted = &newid_sorted[0],
	* p_grp_size = &grp_sizes[0];

	R_len_t const n_grps = grp_sizes.size();
	if(n_grps != xid2.size())
	Rcpp::stop("lengths of n_grps and mid do not match");

	for(R_len_t i = 0; i < n_grps; ++i, ++p_grp_size, ++p_xid2){
	int out = std::numeric_limits<int>::max();
	for(int j = 0; j < *p_grp_size; ++j, ++p_newid_sorted)
	if(out > *p_newid_sorted)
	out = *p_newid_sorted;

	if(p_newid[*p_xid2] > out)
	p_newid[*p_xid2] = out;
	}
	}

	namespace {
	/// keeps track of family members
	struct family_tracker {
	struct family_member {
	R_len_t id;
	family_tracker * family;
	family_member * next_member = nullptr;
	};

	R_len_t fam_id = std::numeric_limits<R_len_t>::max();
	family_member * first_ele = nullptr,
	* last_ele = nullptr;
	size_t n_ele = 0L;

	inline void add_member(family_member * const new_member){
	if(!new_member)
	return;

	fam_id = std::min(fam_id, new_member->id);
	new_member->family = this;

	if(n_ele < 1L){
	// the first member
	first_ele = new_member;
	last_ele = new_member;
	n_ele = 1L;
	return;
	}

	// add the member
	last_ele->next_member = new_member;
	last_ele = new_member;
	++n_ele;
	}

	// should not be used
	family_tracker() = default;

	void merge(family_tracker &other){
	if(other.fam_id != fam_id)
	add_members(other);
	}

	void merge(std::unordered_set<family_tracker*> &others){
	for(auto x : others)
	if(x->fam_id != fam_id)
	add_members(*x);
	}

	private:
	inline void add_members(family_tracker &other){
	// change the families
	{
	family_member * x = other.first_ele;
	for(size_t i = 0; i < other.n_ele; ++i, x = x->next_member)
	x->family = this;
	}

	// update the last element, the number of elements, and the family id
	last_ele->next_member = other.first_ele;
	last_ele = other.last_ele;
	n_ele += other.n_ele;
	fam_id = std::min(fam_id, other.fam_id);

	// update other
	other.first_ele = nullptr;
	other.last_ele = nullptr;
	other.n_ele = 0L;
	}
	};

	} // namespace

	// [[Rcpp::export(".makefamid_new", rng = false)]]
	Rcpp::IntegerVector makefamid_new(Rcpp::IntegerVector const drow,
	Rcpp::IntegerVector const mrow){
	// setup families
	R_len_t const n = drow.size();

	std::unique_ptr<family_tracker::family_member[]> members(
	new family_tracker::family_member[n]);
	std::unique_ptr<family_tracker[]> trackers(new family_tracker[n]);
	for(R_len_t j = 0; j < n; ++j){
	members[j] = family_tracker::family_member { j, nullptr };
	trackers[j].add_member(&members[j]);
	}

	// compute the order for drow and mrow
	auto get_ord = [&](Rcpp::IntegerVector const x) -> std::unique_ptr<int[]> {
	std::unique_ptr<int[]> out(new int[x.size()]);
	std::iota(out.get(), out.get() + x.size(), 0);

	int const *xp = &x[0];
	std::sort(out.get(), out.get() + x.size(),
	[&](int const i, int const j){ return xp[i] < xp[j]; });

	return out;
	};

	std::unique_ptr<int[]> const m_ord = get_ord(mrow),
	d_ord = get_ord(drow);

	// find the family ids. Assign a lambda function to add all the relations
	// the mothers or fathers
	auto merge_families =
	[&](int const * prow, int const * p_ord) -> void {
	std::unordered_set<family_tracker*> families;
	for(R_len_t c = 0L; c < n; ){
	R_len_t const cur_parent = prow[p_ord[c]];
	if(cur_parent >= n)
	// we are done (reached the founders with no parents)
	break;

	// find the families to merge. We add the families to the largest one.
	// This is computationally attractive. Thus, we need to keep track of the
	// largest family
	families.clear();
	family_tracker * largest_family(nullptr);
	size_t largest_family_size(0L);

	// adds a family to be merged
	auto add_family = [&](family_tracker * const new_fam) -> void {
	auto const ret_info = families.insert(new_fam);
	if(ret_info.second and
	new_fam->n_ele > largest_family_size){
	largest_family_size = new_fam->n_ele;
	largest_family = new_fam;
	}
	};

	// add the children to this family
	{
	R_len_t p_ord_c;
	while(c < n and prow[(p_ord_c = p_ord[c])] == cur_parent){
	add_family(members[p_ord_c].family);
	++c;
	}
	}

	// merge the families
	if(families.size() < 1)
	// nothing to merge
	continue;

	add_family(members[cur_parent].family);
	largest_family->merge(families);
	}
	};

	// add the relations through the mothers and fathers
	merge_families(&mrow[0], &m_ord[0]);
	merge_families(&drow[0], &d_ord[0]);

	// setup the return object and return
	Rcpp::IntegerVector out(n);
	for(R_len_t j = 0; j < n; ++j)
	out[j] = members[j].family->fam_id + 1L;

	return out;
	}

	/*** R
	# faster version of kinship2::makefamid.
	makefamid_fast <- function(id, father.id, mother.id){
	n <- length(id)
	mid <- c(match(mother.id, id, nomatch = n + 1), n + 1)
	did <- c(match(father.id, id, nomatch = n + 1), n + 1)
	## ADDED ##
	m_newid <- rep(NA_integer_, length(mid))
	m_ord <- order(order(mid))
	m_grp_size <- table_int_fast(mid)

	d_newid <- rep(NA_integer_, length(did))
	d_ord <- order(order(did))
	d_grp_size <- table_int_fast(did)
	###########
	## CHANGED ##
	# mid2 <- sort(unique(mid))
	# did2 <- sort(unique(did))
	mid2_m1 <- sort(unique(mid)) - 1L
	did2_m1 <- sort(unique(did)) - 1L
	#############
	famid <- 1:(n + 1)
	temp <- kindepth(id, father.id, mother.id)
	for (i in 1:n) {
	newid <- pmin(famid, famid[mid], famid[did])
	## CHANGED ##
	# newid[mid2] <- pmin(newid[mid2], tapply(newid, mid, min))
	m_newid[m_ord] <- newid
	.makefamid_fast_inner(newid = newid, xid2 = mid2_m1,
	newid_sorted = m_newid, grp_sizes = m_grp_size)
	#############
	newid[n + 1] <- n + 1L
	## CHANGED ##
	# newid[did2] <- pmin(newid[did2], tapply(newid, did, min))
	d_newid[d_ord] <- newid
	dum <- .makefamid_fast_inner(newid = newid, xid2 = did2_m1,
	newid_sorted = d_newid, grp_sizes = d_grp_size)
	#############
	newid[n + 1] <- n + 1L
	if (all(newid == famid))
	break
	else if (i < n)
	famid <- newid
	}
	if (all(newid == famid)) {
	famid <- famid[1:n]
	xx <- table_int_fast(famid)
	if (any(xx == 1)) {
	singles <- as.integer(names(xx[xx == 1]))
	famid[!is.na(match(famid, singles))] <- 0L
	match(famid, sort(unique(famid))) - 1L
	}
	else match(famid, sort(unique(famid)))
	}
	else stop("Bug in routine: seem to have found an infinite loop")
	}

	# a completely new implementation of kinship2::makefamid largely based on C++.
	makefamid_new <- function(id, father.id, mother.id){
	stopifnot(require(kinship2), !anyDuplicated(id))
	n <- length(id)
	mrow <- match(mother.id, id, nomatch = n + 1) - 1L
	drow <- match(father.id, id, nomatch = n + 1) - 1L

	out <- .makefamid_new(mrow = mrow, drow = drow)

	xx <- table_int_fast(out)
	if(any(xx == 1)) {
	singles <- as.integer(names(xx[xx == 1]))
	out[!is.na(match(out, singles))] <- 0L
	match(out, sort(unique(out))) - 1L
	}
	else match(out, sort(unique(out)))
	}
	*/