Nadav Rotem nadavrot

## Matrix.md

      
              7 files
            
          
              74 forks
            
          
              17 comments
            
          
              857 stars
            
          
                nadavrot
                / Matrix.md
            
            
              Last active
              April 2, 2024 06:45
            
              
                Efficient matrix multiplication
              
          
### High-Performance Matrix Multiplication

This is a short post that explains how to write a high-performance matrix
multiplication program on modern processors. In this tutorial I will use a
single core of the Skylake-client CPU with AVX2, but the principles in this post
also apply to other processors with different instruction sets (such as AVX512).

#### Intro

Matrix multiplication is a mathematical operation that defines the product of

  
## gist:50e856b4711798a1c8bc6ecc061d77d0
#ifndef _GNU_SOURCE
#define _GNU_SOURCE
#endif

#include <execinfo.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

#define BT_BUF_SIZE 100

## mymalloc.cc
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <pthread.h>
#include <unistd.h>

namespace {
/// A Block is a header to a managed memory buffer. Blocks are arranged as

## gist:9a071a02ea20ab483158c443638dea7a
#!/usr/bin/python

from __future__ import print_function
from bcc import BPF
from bcc.utils import printb
from time import sleep
import sys
import pdb

if len(sys.argv) < 2:

## gist:b17ac84c1ef88b69deade029151303a0
FROM ubuntu:21.04

# We install some useful packages.
RUN apt-get update -qq
RUN DEBIAN_FRONTEND="noninteractive" apt-get -y install tzdata
RUN apt-get install -y vim clang-format sudo python3 wget cmake g++
RUN apt-get install -y git clang linux-tools-generic ninja-build lldb zip curl
RUN apt-get install -y firefox
RUN apt-get install -y python3-pip
RUN apt-get install -y libssl-dev

## continued_fractions.py
https://en.wikipedia.org/wiki/Continued_fraction

from math import trunc, pi, sqrt

def approximate(alpha, iters):
    r = [alpha]
    a = []
    for i in range(iters):
        a.append(trunc(r[-1]))
        r.append(1/(r[-1] - a[-1]))
	#ifndef _GNU_SOURCE
	#define _GNU_SOURCE
	#endif

	#include <execinfo.h>
	#include <stdio.h>
	#include <stdlib.h>
	#include <unistd.h>

	#define BT_BUF_SIZE 100
	#include <algorithm>
	#include <cassert>
	#include <cstdint>
	#include <cstdio>
	#include <cstring>
	#include <pthread.h>
	#include <unistd.h>

	namespace {
	/// A Block is a header to a managed memory buffer. Blocks are arranged as
	#!/usr/bin/python

	from __future__ import print_function
	from bcc import BPF
	from bcc.utils import printb
	from time import sleep
	import sys
	import pdb

	if len(sys.argv) < 2:
	FROM ubuntu:21.04

	# We install some useful packages.
	RUN apt-get update -qq
	RUN DEBIAN_FRONTEND="noninteractive" apt-get -y install tzdata
	RUN apt-get install -y vim clang-format sudo python3 wget cmake g++
	RUN apt-get install -y git clang linux-tools-generic ninja-build lldb zip curl
	RUN apt-get install -y firefox
	RUN apt-get install -y python3-pip
	RUN apt-get install -y libssl-dev
	https://en.wikipedia.org/wiki/Continued_fraction

	from math import trunc, pi, sqrt

	def approximate(alpha, iters):
	r = [alpha]
	a = []
	for i in range(iters):
	a.append(trunc(r[-1]))
	r.append(1/(r[-1] - a[-1]))