Skip to content

Instantly share code, notes, and snippets.

@shubhsherl
Last active March 28, 2020 08:57
Show Gist options
  • Save shubhsherl/858d8bd444425552682b2e86111f5a7d to your computer and use it in GitHub Desktop.
Save shubhsherl/858d8bd444425552682b2e86111f5a7d to your computer and use it in GitHub Desktop.
CS422: Scribing 3 Cache friendly codes and its comparision
#include<bits/stdc++.h>
#define M 10000
#define N 10000
int a[M][N];
using namespace std;
using namespace std::chrono;
int sum_array_rows() {
int sum = 0;
for (int i = 0; i < M; i++)
for (int j = 0; j < N; j++)
sum += a[i][j];
return sum;
}
int sum_array_cols() {
int sum = 0;
for (int j = 0; j < N; j++)
for (int i = 0; i < M; i++)
sum += a[i][j];
return sum;
}
void fil_random() {
for(int i=0;i<M;i++)
for(int j=0;j<N;j++)
a[i][j] = rand()%10;
}
void evaluate(int i) {
auto start = high_resolution_clock::now();
switch(i) {
case 1:
sum_array_cols();
break;
case 2:
sum_array_rows();
}
auto stop = high_resolution_clock::now();
auto duration = duration_cast<microseconds>(stop - start);
cout << "Time taken by function "<<i<<" : "<< duration.count() << " microseconds" << endl;
}
int main() {
int sum;
fil_random();
for(int i=1;i<=2;i++)
evaluate(i);
}
#include<bits/stdc++.h>
// Matrix 1 : A (M*N)
// Matrix 2 : B (N*O)
// Output Matrix : C (M*O)
#define ORDER_MIN 0
#define ORDER_MAX 9
int order[9] = {100, 200, 400, 800, 1600, 2400, 4800, 7200, 10000};
int M=order[ORDER_MAX-1], N=order[ORDER_MAX-1], O=order[ORDER_MAX-1], CB1=240;
double result[5][ORDER_MAX];
typedef long long data_type;
using namespace std;
using namespace std::chrono;
vector<vector<data_type>> A( M , vector<data_type> ( N, 0 ) );
vector<vector<data_type>> B( N , vector<data_type> ( O, 0 ) );
vector<vector<data_type>> C( M , vector<data_type> ( O, 0 ) );
void print(vector<vector<data_type>> & mat, char m) {
cout<<"matrix "<<m<<": \n";
for(auto i = mat.begin(); i<mat.end();i++) {
for(auto j = i->begin();j<i->end();j++)
cout<<*j<<" ";
cout<<endl;
}
}
void print_res() {
for(int i=-1;i<5; i++) {
i==-1 ? cout<<"order:\t" : cout<<"method "<<(i+1)<<" :";
for(int j=ORDER_MIN;j<ORDER_MAX;j++)
i==-1 ? cout<<"\t"<<order[j] : cout<<"\t"<<result[i][j];
cout<<endl;
}
}
void fill_zero(vector<data_type> & row)
{
generate(row.begin(), row.end(), [](){ return 0; });
}
void fill_row(vector<data_type> & row)
{
generate(row.begin(), row.end(), [](){ return rand() % 100; });
}
void fill_matrix(vector<vector<data_type>> & mat)
{
for_each(mat.begin(), mat.end(), fill_row);
}
void matrix_multiply_1()
{
for (int i=0;i<M;i++)
for (int j=0;j<O;j++) {
data_type s = 0;
for (int k=0;k<N;k++)
s+=(A[i][k]*B[k][j]);
C[i][j]=s;
}
}
void matrix_muptiply_2() {
int B1=M/2, B2=O/2;
for (int ii = 0;ii<M;ii+=B1)
for (int jj = 0;jj<O;jj+=B2)
for (int i=ii;i<min(ii+B1,M);i++)
for (int j=jj;j<min(jj+B2,O);j++) {
data_type s = 0;
for (int k=0;k<N;k++)
s+=(A[i][k]*B[k][j]);
C[i][j]=s;
}
}
void matrix_muptiply_3() {
int B1=M/2,B2=O/2;
for (int ii = 0;ii<M;ii+=B1)
for (int jj = 0;jj<O;jj+=B2)
for (int i=ii;i<min(ii+B1,M);i+=2)
for (int j=jj;j<min(jj+B2,O);j+=2) {
data_type s = 0;
for (int k=0;k<N;k++)
s+=(A[i][k]*B[k][j]);
C[i][j]=s;
if (i+1<min(ii+B1,M)) {
s = 0;
for (int k=0;k<N;k++)
s+=(A[i+1][k]*B[k][j]);
C[i+1][j]=s;
}
if (j+1<min(jj+B2,O)) {
s = 0;
for (int k=0;k<N;k++)
s+=(A[i][k]*B[k][j+1]);
C[i][j+1]=s;
}
if (i+1<min(ii+B1,M) && j+1<min(jj+B2,O)) {
s = 0;
for (int k=0;k<N;k++)
s+=(A[i+1][k]*B[k][j+1]);
C[i+1][j+1]=s;
}
}
}
void matrix_muptiply_4() {
int B1=M/2,B2=O/2;
for (int ii = 0;ii<M;ii+=B1)
for (int jj = 0;jj<O;jj+=B2)
for (int i=ii;i<min(ii+B1,M);i+=2)
for (int j=jj;j<min(jj+B2,O);j+=2) {
data_type s[4] = {0};
for (int k=0;k<N;k++) {
s[0]+=(A[i][k]*B[k][j]);
if (i+1<min(ii+B1,M)) {
s[1]+=(A[i+1][k]*B[k][j]);
}
if (j+1<min(jj+B2,O)) {
s[2]+=(A[i][k]*B[k][j+1]);
}
if (i+1<min(ii+B1,M) && j+1<min(jj+B2,O)) {
s[3]+=(A[i+1][k]*B[k][j+1]);
}
}
C[i][j] = s[0];
C[i+1][j] = s[1];
C[i][j+1] = s[2];
C[i+1][j+1] = s[3];
}
}
void matrix_muptiply_5() {
for_each(C.begin(), C.end(), fill_zero);
int B1 = CB1, B2 = CB1;
for (int jj = 0;jj<O;jj+=B2)
for (int kk = 0;kk<N;kk+=B2)
for(int i=0;i<M;i++)
for (int j=jj;j<min(jj+B2,O);j++) {
data_type s = 0;
for (int k=kk;k<min(kk+B2,N);k++)
s+=A[i][k]*B[k][j];
C[i][j] += s;
}
}
auto evaluate(int i) {
auto start = high_resolution_clock::now();
switch(i) {
case 1:
matrix_multiply_1();
break;
case 2:
matrix_muptiply_2();
break;
case 3:
matrix_muptiply_3();
break;
case 4:
matrix_muptiply_4();
break;
case 5:
matrix_muptiply_5();
}
auto stop = high_resolution_clock::now();
auto duration = duration_cast<microseconds>(stop - start);
// cout << "Time taken by function "<<i<<" : "<< duration.count() << " microseconds" << endl;
return duration.count();
}
int main()
{
fill_matrix(A);
fill_matrix(B);
for(int j=ORDER_MIN;j<ORDER_MAX;j++) {
M = N = O = order[j]; CB1 = min(M/10, 480);
for(int i=1;i<=5;i++) {
cin.sync();
ofstream ofs("/proc/sys/vm/drop_caches");
ofs << "3" << endl;
ofs.close();
result[i-1][j] = evaluate(i);
//print(A,'A');
//print(B,'B');
//print(C,'C');
}
}
print_res();
return 0;
}
#include<bits/stdc++.h>
#define SIZE 100000
using namespace std;
using namespace std::chrono;
struct optimized_array {
long *add;
long value;
};
long *arr[SIZE];
optimized_array op_arr[SIZE];
void init() {
long temp[SIZE];
long flag[SIZE];
for(long i=0;i<SIZE;i++) {
arr[i] = new long(rand() % 100000);
temp[i] = *arr[i];
flag[i] = 0;
op_arr[i].add = new long(*arr[i]);
}
// get value for op_arr
sort(temp, temp+SIZE);
for(long i=0;i<SIZE;i++)
for(long j=0;j<SIZE;j++) {
if (temp[i] == *arr[j] && flag[j] == 0) {
flag[j]=1;
op_arr[j].value = i;
break;
}
}
for(long i=0;i<SIZE;i++)
if(flag[i]==0) {
cout<<"Soemthing is wrong!!";
break;
}
}
void sort_arr() {
for(long i=0;i<SIZE;i++)
for(long j=i;j<SIZE;j++) {
if (*arr[i] > *arr[j]) {
long *t = arr[i];
arr[i] = arr[j];
arr[j] = t;
}
}
}
void sort_op_arr() {
for(long i=0;i<SIZE;i++)
for(long j=i;j<SIZE;j++) {
if (op_arr[i].value > op_arr[j].value) {
optimized_array t = op_arr[i];
op_arr[i] = op_arr[j];
op_arr[j] = t;
}
}
}
void evaluate(int i) {
auto start = high_resolution_clock::now();
switch(i) {
case 1:
sort_arr();
break;
case 2:
sort_op_arr();
}
auto stop = high_resolution_clock::now();
auto duration = duration_cast<microseconds>(stop - start);
cout << "Time taken by function "<<i<<" : "<< duration.count() << " microseconds" << endl;
}
int main() {
init();
for(int i=1;i<=2;i++) {
evaluate(i);
}
}
@shubhsherl
Copy link
Author

shubhsherl commented Mar 23, 2020

Results for file: matrix_mult.cpp

order:		100    200      400     800          1600          2400           4800         7200          10000
method 1 :	7754	64025	548027	5.26076e+06	6.29403e+07	2.15464e+08   1.8651e+09	5.85912e+09   1.78609e+10
method 2 :	7886	63165	555527	4.89346e+06	6.21949e+07	2.36251e+08   1.79481e+09	5.86632e+09   1.81065e+10
method 3 :	7877	62084	542085	5.53331e+06	5.50849e+07	2.00772e+08   1.56022e+09	5.30873e+09   1.67668e+10
method 4 :	9034	73311	627083	5.94624e+06	5.42425e+07	1.91889e+08   1.37739e+09	4.66325e+09   1.51412e+10
method 5 :	57283	124322	643889	6.09029e+06	4.38997e+07	1.53847e+08   1.13993e+09	3.84289e+09   1.11036e+10

time in microseconds

Speedup:

order:	      100	      200	       400	       800    1600	2400	4800	7200	10000
method 1	1	        1	        1	       1.00	1.00	1.00	1.00	1.00	1.00
method 2	0.983261476	1.013615135	0.9864993061	1.08	1.01	0.91	1.04	1.00	0.99
method 3	0.9843849181	1.031264094	1.010961381	0.95	1.14	1.07	1.20	1.10	1.07
method 4	0.8583130396	0.8733341518	0.8739305642	0.88	1.16	1.12	1.35	1.26	1.18
method 5	0.1353630222	0.5149933238	0.8511203018	0.86	1.43	1.40	1.64	1.52	1.61

Speed up in performance for matrix multiplication
Execution time for matrix multiplication

CPU architecture:

Architecture:                    x86_64
Address sizes:                   39 bits physical, 48 bits virtual
CPU(s):                          8
On-line CPU(s) list:             0-7
Thread(s) per core:              2
Core(s) per socket:              4
Model name:                      Intel(R) Core(TM) i7-7700HQ CPU @ 2.80GHz
CPU MHz:                         3265.954
CPU max MHz:                     3800.0000
CPU min MHz:                     800.0000
L1d cache:                       128 KiB
L1i cache:                       128 KiB
L2 cache:                        1 MiB
L3 cache:                        6 MiB

@shubhsherl
Copy link
Author

Sorting time comparison for file: sorting.cpp

Time taken by function 1 : 45552680 microseconds
Time taken by function 2 : 24873190 microseconds

@shubhsherl
Copy link
Author

Array sum comparison for file: array_sum.cpp

Time taken by function 1 : 563559 microseconds
Time taken by function 2 : 179487 microseconds

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment