GitHub gists · pengyu (FantasyVR) · Taichi Graphics, Beijing

import taichi as ti
import numpy as np
ti.init(arch=ti.cpu, cpu_max_num_threads=1)
# cube with 5 tetrahedrons
pos_np = np.array(
    [[-1.0, 0.0, -1.0], [-1.0, 0.0, 1.0], [1.0, 0.0, 1.0], [1.0, 0.0, -1.0],
     [-1.0, 1.0, -1.0], [-1.0, 1.0, 1.0], [1.0, 1.0, 1.0], [1.0, 1.0, -1.0]],
    dtype=np.float32)
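
The eight vertices above admit the standard five-tetrahedron split of a cube (four corner tets plus one central, regular tet). A minimal sketch of a matching index array; the name tet_np and this particular corner ordering are my assumption, not taken from the gist:

# Five tetrahedra covering the cube; vertex ids refer to pos_np above
# (0-3 bottom face at y = 0, 4-7 top face at y = 1).
tet_np = np.array(
    [[0, 1, 2, 5],   # corner tet at vertex 1
     [0, 2, 3, 7],   # corner tet at vertex 3
     [0, 4, 5, 7],   # corner tet at vertex 4
     [2, 5, 6, 7],   # corner tet at vertex 6
     [0, 2, 5, 7]],  # central regular tet
    dtype=np.int32)
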
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <iostream>
#include <vector>
#include <fstream>
#include <string>
#include <sstream>
#include <cassert>
# MPM-MLS in 88 lines of Taichi code, originally created by @yuanming-hu
import taichi as ti
ti.init(arch=ti.gpu)
n_particles = 8192
n_grid = 128
dx = 1 / n_grid
dt = 2e-4
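
The preview is cut off here. For orientation, a hedged sketch of the particle and grid state that the public mpm88 example (which this gist names as its source) declares next; treat these lines as reference material, not as the gist's own continuation:

p_rho = 1
p_vol = (dx * 0.5) ** 2
p_mass = p_vol * p_rho
x = ti.Vector.field(2, float, n_particles)      # particle positions
v = ti.Vector.field(2, float, n_particles)      # particle velocities
C = ti.Matrix.field(2, 2, float, n_particles)   # affine velocity field (APIC)
J = ti.field(float, n_particles)                # volume ratio per particle
grid_v = ti.Vector.field(2, float, (n_grid, n_grid))  # grid momentum / velocity
grid_m = ti.field(float, (n_grid, n_grid))            # grid mass
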
numPoints 117
0.365071 0.0626429
0.489804 0.0447059
0.595851 0.0482979
0.705 0.0581818
1.20242 0.0495
1.3474 0.0344667
1.49 0.0335058
1.63627 0.0534804
0.403448 0.148879
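
The block above is a snippet of a plain-text 2D point set: a "numPoints <count>" header followed by one x y pair per line (117 pairs in total, only the first few shown in the preview). A small reader sketch; the function and file names are made up for illustration:

import numpy as np

def read_points(path):
    # Parse a 'numPoints N' header followed by N lines of 'x y' pairs.
    with open(path) as f:
        tag, count = f.readline().split()
        assert tag == "numPoints"
        pts = np.loadtxt(f, dtype=np.float32, max_rows=int(count))
    return pts.reshape(-1, 2)

# usage: pts = read_points("points.txt")   # pts.shape == (117, 2)
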
/*
gcc cu_driver.cpp -fpermissive -ldl -g -fno-omit-frame-pointer
*/
#include <dlfcn.h>
#include <stdio.h>
typedef int (*init)(unsigned int Flags);            // matches the signature of cuInit
typedef int (*get_version)(int *version);           // matches cuDriverGetVersion
typedef int (*malloc)(void **devPtr, size_t size);  // matches cudaMalloc
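
These typedefs mirror the signatures of cuInit, cuDriverGetVersion, and cudaMalloc. A minimal sketch of how they could be resolved at run time with dlopen/dlsym, which is what the -ldl flag in the compile line suggests; the library name and the choice to exercise only the first two symbols are my assumptions:

// Resolve driver-API entry points at run time instead of linking libcuda.
int main(void) {
    void *libcuda = dlopen("libcuda.so.1", RTLD_NOW);
    if (!libcuda) { printf("dlopen failed: %s\n", dlerror()); return 1; }

    init cu_init = (init)dlsym(libcuda, "cuInit");
    get_version cu_get_version = (get_version)dlsym(libcuda, "cuDriverGetVersion");

    int version = 0;
    printf("cuInit -> %d\n", cu_init(0));
    cu_get_version(&version);
    printf("driver version: %d\n", version);

    dlclose(libcuda);
    return 0;
}
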
#include <dlfcn.h>
#include <stdio.h>
// Mirror of the CUDA runtime's cudaMemcpyKind, so this file needs no CUDA headers.
enum cudaMemcpyKind {
cudaMemcpyHostToHost = 0,
cudaMemcpyHostToDevice = 1,
cudaMemcpyDeviceToHost = 2,
cudaMemcpyDeviceToDevice = 3,
cudaMemcpyDefault = 4
};
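
Re-declaring cudaMemcpyKind by hand lets the file call into the runtime library without including any CUDA headers. A hedged sketch of how the enum could be used together with a dynamically resolved cudaMemcpy; the typedef name, helper function, and library name are illustrative only:

// cudaMemcpy(dst, src, count, kind) returns a cudaError_t, which is an int.
typedef int (*memcpy_fn)(void *dst, const void *src, size_t count,
                         enum cudaMemcpyKind kind);

static int copy_to_device(void *d_dst, const void *h_src, size_t bytes) {
    void *librt = dlopen("libcudart.so", RTLD_NOW);  // handle kept open on purpose
    if (!librt) return -1;
    memcpy_fn cuda_memcpy = (memcpy_fn)dlsym(librt, "cudaMemcpy");
    return cuda_memcpy(d_dst, h_src, bytes, cudaMemcpyHostToDevice);
}
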
import numpy as np
import taichi as ti
ti.init(arch=ti.cuda)
# Numpy arrays for taichi ndarrays
h_row_csr = np.asarray([0, 3, 4, 7, 9], dtype=np.int32)
h_col_csr = np.asarray([0, 2, 3, 1, 0, 2, 3, 1, 3], dtype=np.int32)
h_value_csr = np.asarray([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0],
                         dtype=np.float32)
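
These three arrays are the CSR form of a 4x4 matrix with 9 non-zeros. A quick way to see the dense matrix they encode; SciPy is used here only as a check and is not part of the gist:

from scipy.sparse import csr_matrix

A = csr_matrix((h_value_csr, h_col_csr, h_row_csr), shape=(4, 4))
print(A.toarray())
# [[1. 0. 2. 3.]
#  [0. 4. 0. 0.]
#  [5. 0. 6. 7.]
#  [0. 8. 0. 9.]]
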
import taichi as ti
import numpy as np
ti.init(arch=ti.gpu)
N = 32
dt = 1e-4
dx = 1 / N
rho = 4e1
NF = 2 * N**2 # number of faces
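
NF = 2 * N**2 because each cell of the N x N quad grid is split into two triangles. A small sketch of how such a triangle index list could be generated; the loop and the names are illustrative, not the gist's code:

NV = (N + 1) ** 2                      # number of grid vertices
faces = []
for i in range(N):
    for j in range(N):
        a = i * (N + 1) + j            # lower-left vertex of cell (i, j)
        b = a + 1
        c = a + (N + 1)
        d = c + 1
        faces.append((a, b, c))        # first triangle of the cell
        faces.append((b, d, c))        # second triangle of the cell
assert len(faces) == NF
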
#include <cuda_runtime_api.h> // cudaMalloc, cudaMemcpy, etc.
#include <cusparse.h> // cusparseSpMV
#include <stdio.h> // printf
#include <stdlib.h> // EXIT_FAILURE
/*
 * cuSPARSE version: 11.4.0
 *
 * How to compile:
 *   nvcc -arch=sm_75 -lcusparse cusparse_spmv.cu -o cusparse_spmv
 */
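
For reference, a condensed sketch of the cuSPARSE generic-API sequence such a file typically builds around, computing y = alpha*A*x + beta*y for the same 4x4 CSR matrix as the Taichi gist above and using the headers already included here. Error checking is omitted, and on older cuSPARSE 11.x releases the algorithm enum is CUSPARSE_MV_ALG_DEFAULT rather than CUSPARSE_SPMV_ALG_DEFAULT; this is an assumption-laden sketch, not the gist's own code:

int main(void) {
    // Host CSR data for the 4x4 example matrix; expected y = {19, 8, 51, 52}.
    const int num_rows = 4, num_cols = 4, nnz = 9;
    int   h_offsets[] = {0, 3, 4, 7, 9};
    int   h_columns[] = {0, 2, 3, 1, 0, 2, 3, 1, 3};
    float h_values[]  = {1, 2, 3, 4, 5, 6, 7, 8, 9};
    float h_x[] = {1, 2, 3, 4}, h_y[] = {0, 0, 0, 0};
    float alpha = 1.0f, beta = 0.0f;

    // Device copies (return codes ignored for brevity).
    int *d_offsets, *d_columns;  float *d_values, *d_x, *d_y;
    cudaMalloc((void **)&d_offsets, (num_rows + 1) * sizeof(int));
    cudaMalloc((void **)&d_columns, nnz * sizeof(int));
    cudaMalloc((void **)&d_values,  nnz * sizeof(float));
    cudaMalloc((void **)&d_x, num_cols * sizeof(float));
    cudaMalloc((void **)&d_y, num_rows * sizeof(float));
    cudaMemcpy(d_offsets, h_offsets, (num_rows + 1) * sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(d_columns, h_columns, nnz * sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(d_values,  h_values,  nnz * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(d_x, h_x, num_cols * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(d_y, h_y, num_rows * sizeof(float), cudaMemcpyHostToDevice);

    // Wrap the raw device pointers in cuSPARSE descriptors.
    cusparseHandle_t handle;
    cusparseSpMatDescr_t matA;
    cusparseDnVecDescr_t vecX, vecY;
    cusparseCreate(&handle);
    cusparseCreateCsr(&matA, num_rows, num_cols, nnz,
                      d_offsets, d_columns, d_values,
                      CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                      CUSPARSE_INDEX_BASE_ZERO, CUDA_R_32F);
    cusparseCreateDnVec(&vecX, num_cols, d_x, CUDA_R_32F);
    cusparseCreateDnVec(&vecY, num_rows, d_y, CUDA_R_32F);

    // Query the workspace size, then run the SpMV.
    size_t bufferSize = 0;  void *dBuffer = NULL;
    cusparseSpMV_bufferSize(handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                            &alpha, matA, vecX, &beta, vecY,
                            CUDA_R_32F, CUSPARSE_SPMV_ALG_DEFAULT, &bufferSize);
    cudaMalloc(&dBuffer, bufferSize);
    cusparseSpMV(handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
                 &alpha, matA, vecX, &beta, vecY,
                 CUDA_R_32F, CUSPARSE_SPMV_ALG_DEFAULT, dBuffer);

    cudaMemcpy(h_y, d_y, num_rows * sizeof(float), cudaMemcpyDeviceToHost);
    for (int i = 0; i < num_rows; ++i) printf("y[%d] = %f\n", i, h_y[i]);

    cusparseDestroySpMat(matA);  cusparseDestroyDnVec(vecX);  cusparseDestroyDnVec(vecY);
    cusparseDestroy(handle);
    cudaFree(dBuffer); cudaFree(d_offsets); cudaFree(d_columns);
    cudaFree(d_values); cudaFree(d_x); cudaFree(d_y);
    return 0;
}
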
"""
从jiafeng的reviewhttps://github.com/taichi-dev/taichi/pull/4841#discussion_r880193695
这里我发现 内存的layout确实会影响稀疏矩阵的构建
目前的方案是
1. 直接获取ndarry的 data_ptr
2. 从该指针处的数据按照[row, col, value]的三元组的内存排布创建Sparse matrix
目前的脚本尝试着各种ndarray的内存分布
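
As a plain NumPy/SciPy illustration of the layout point (not Taichi's internal API): storing the triplets as an (n, 3) C-contiguous array keeps each [row, col, value] group contiguous in memory, which is what a data_ptr-based construction from triplets assumes, whereas a Fortran-ordered array would store all rows, then all columns, then all values.

import numpy as np
from scipy.sparse import coo_matrix

# One [row, col, value] triplet per row; C order keeps each triplet contiguous.
triplets = np.array([[0, 0, 1.0],
                     [0, 2, 2.0],
                     [1, 1, 3.0],
                     [2, 0, 4.0]], dtype=np.float32, order='C')
print(triplets.ravel(order='K'))   # flat memory: r, c, v, r, c, v, ...

rows = triplets[:, 0].astype(np.int32)
cols = triplets[:, 1].astype(np.int32)
A = coo_matrix((triplets[:, 2], (rows, cols)), shape=(3, 3))
print(A.toarray())
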