Skip to content

Instantly share code, notes, and snippets.

@asemic-horizon
Last active October 7, 2021 19:35
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save asemic-horizon/2830ed3637cfd278e7937f56450fd271 to your computer and use it in GitHub Desktop.
Save asemic-horizon/2830ed3637cfd278e7937f56450fd271 to your computer and use it in GitHub Desktop.
rust bottlenecking

In this example we calculate "distance covariance" matrices (look it up on Wikipedia)

Python code

import numpy as np
from math import sqrt

def distance_matrix(X):
  D = np.empty(shape=(X.shape[0],X.shape[0]))
  for j in range(X.shape[0]):
    for k in range(j+1):
      uv = (X[j,:]-X[k,:])
      D[j,k] = np.sqrt(np.sum([x*x for x in uv]))
      D[k,j] = D[j,k]
  return D

def doubly_center(X):
  grand_mean = np.mean(X)
  Z = np.empty(shape=X.shape)
  for j, k in product(range(X.shape[0]),repeat=2):
    Z[j,k] = X[j,k] - np.mean(X[j,:]) - np.mean(X[:,k]) + grand_mean
  return Z
  
def dcov(X,Y):
  A = doubly_center(distance_matrix(X))
  B = doubly_center(distance_matrix(Y))
  return np.sqrt(np.mean(A*B))

def dcor(X,Y):
  xvar = dcov(X,X)
  yvar = dcov(Y,Y)
  xycov = dcov(X,Y)
  denom = sqrt((xvar*yvar))
  return xycov / denom

Rust code (functionality)

extern crate ndarray;
use ndarray::prelude::*;

fn euclidean_distance(u: ArrayView<f64,Ix1> , v: ArrayView<f64,Ix1>)-> f64 {
  let uv = u.to_owned() - v.to_owned();
  uv.mapv(|x| x*x).sum().sqrt()
}

fn distance_matrix(x: &Array<f64,Ix2>) -> Array<f64,Ix2>{
  let mut d = Array::<f64,Ix2>::zeros((x.nrows(),x.nrows()));
  for j in 0..x.nrows() {
    for k in 0..j+1 {
      let dist = euclidean_distance(x.slice(s![j,..]),x.slice(s![k,..]));
      d[[j,k]] = dist;
      d[[k,j]] = dist;
    }
  }
  d
}

fn doubly_center(x: &Array<f64,Ix2>) -> Array<f64,Ix2>{
  let grand_mean = x.mean().unwrap();
  let mut xbar = Array::<f64,Ix2>::zeros(x.raw_dim());
  for j in 0..x.dim().0 {
    for k in 0..x.dim().1 {
      xbar[[j,k]] = x[[j,k]]
                   - x.slice(s![j,..]).mean().unwrap() 
                   - x.slice(s![..,k]).mean().unwrap() 
                   + grand_mean;
    }
  }
  xbar
}

pub fn dcov(x : &Array<f64,Ix2>, y: &Array<f64,Ix2>) -> f64 {
  let a = doubly_center(&distance_matrix(x));
  let b = doubly_center(&distance_matrix(y));
  (a*b).mean().unwrap().sqrt()
}

pub fn dvariance(x: &Array<f64, Ix2>) -> f64 {
    dcov(&x,&x)
}
pub fn dcor(x : &Array<f64,Ix2>, y: &Array<f64,Ix2>) -> f64 {
  let xvar = dvariance(x);
  let yvar = dvariance(y);
  let xycov = dcov(x,y);
  let denom = (xvar * yvar).sqrt();
  xycov/denom
}

pub fn vec_dcor(x: Vec<f64>, y: Vec<f64>) -> f64 {
  let xvec = Array::from_shape_vec((x.len(),1),x).unwrap();
  let yvec = Array::from_shape_vec((y.len(),1),y).unwrap();
  dcor(&xvec,&yvec)
}

pub fn vec_dcov(x: Vec<f64>, y: Vec<f64>) -> f64 {
    let xvec = Array::from_shape_vec((x.len(),1),x).unwrap();
    let yvec = Array::from_shape_vec((y.len(),1),y).unwrap();
    dcov(&xvec,&yvec)
}

Rust bridging code

extern crate ndarray;
use ndarray::prelude::*;

use pyo3::prelude::*;
mod dcov; 


#[pyfunction]
fn dcov(xs: Vec<f64>, ys: Vec<f64>) -> f64 {
    dcov::vec_dcov(xs,ys)
}

#[pyfunction]
fn dcor(xs: Vec<f64>, ys: Vec<f64>) -> f64 {
    dcov::vec_dcor(xs,ys)
}

 #[pymodule]
fn hsmode(_py: Python, m: &PyModule) -> PyResult<()> {

     m.add_function(wrap_pyfunction!(dcov, m)?)?;
     m.add_function(wrap_pyfunction!(dcor, m)?)?;
     Ok(())
 }

Python testing code (note that I'm building the matrices in Python!)

def perf_test(M,N):
  xs = np.random.uniform(size=(N,M))
  ys = np.random.uniform(size=(N,M)) + 0.005 * xs
  data = np.concatenate([xs,ys], axis=1)

  t0 = time()
  mat = np.empty((data.shape[1],data.shape[1]))
  for i in range(data.shape[1]):
    for j in range(i+1):
      mat[i,j] = hsmode.dcor(data[:,i],data[:,j])
      mat[j,i] = mat[i,j]

  rs_dt = time()-t0;
  print(rs_dt, end=" ", flush=True)
  t0 = time();

  pymat = np.empty((data.shape[1],data.shape[1]))
  for i in range(data.shape[1]):
    for j in range(i+1):
      pymat[i,j] = py_dcov.dcor(data[:,i].reshape(-1,1),data[:,j].reshape(-1,1))
      pymat[j,i] = mat[i,j]
  py_dt = time() - t0
  print(py_dt, end=" ", flush=True)

  diff = np.linalg.norm(mat-pymat)
  print(diff)

  return rs_dt, py_dt, diff
@bgraves
Copy link

bgraves commented Oct 7, 2021

Something like this might go a little faster if you wanted to stick with numpy

import numpy as np
from math import sqrt

def distance_matrix(X):
    height, width = X.shape
    D = np.zeros((height, height))
    for i in range(width):
        column = X[:,[i]]
        d = (column - column.T)
        D += d*d
    return np.sqrt(D)

def doubly_center(X):
    mean0 = np.mean(X, axis=0, keepdims=True)
    return X - mean0 - mean0.T + np.mean(X)

def dcor(X, Y):
    Dx = doubly_center(distance_matrix(X))
    Dy = doubly_center(distance_matrix(Y))
    xvar  = np.sqrt(np.mean(Dx * Dx))
    yvar  = np.sqrt(np.mean(Dy * Dy))
    xycov = np.sqrt(np.mean(Dx * Dy))
    return xycov / sqrt(xvar * yvar)

@asemic-horizon
Copy link
Author

Here's some random benchmarks with your version

   n_cols  n_obs     rs_dt     py_dt          diff      rs/py
0    14.0    1.0  0.001631  0.041193  0.000000e+00   0.039589
1     5.0   13.0  0.007658  0.006880  3.236829e-16   1.113083
2     9.0   95.0  2.083242  0.038936  1.274416e-15  53.504066
3     8.0   87.0  1.320364  0.028660  1.125642e-15  46.070612
4    12.0   50.0  0.744650  0.043561  8.861160e-16  17.094325
5    15.0   43.0  0.809348  0.062195  1.322692e-15  13.013053
6     9.0   34.0  0.174201  0.022482  6.156484e-16   7.748351
7    23.0   49.0  2.533264  0.153347  1.946060e-15  16.519841
8    23.0   45.0  2.131848  0.150471  1.763626e-15  14.167811
9    14.0   13.0  0.055805  0.044774  1.091327e-15   1.246368

@asemic-horizon
Copy link
Author

I should add -- not having seen the opportunity for (column - column.T) makes me feel dumb.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment