Nikita Shulga malfet

## facbench.hs
import Text.Printf
import Control.Exception
import System.CPUTime

-- time function implementation borrowed from
-- http://www.haskell.org/haskellwiki/Timing_computations
time :: IO t -> IO t
time a = do
    start <- getCPUTime
    v <- a

## cubinsizes.py
#!/usr/bin/env python3
from elftools.elf.elffile import ELFFile
import struct
import sys


# From https://stackoverflow.com/questions/1094841/reusable-library-to-get-human-readable-version-of-file-size
def sizeof_fmt(num, suffix='B'):
    for unit in ['','Ki','Mi','Gi','Ti','Pi','Ei','Zi']:
        if abs(num) < 1024.0:

## jit flip
import torch

def hflip(img: torch.Tensor) -> torch.Tensor:
    """Return ``img`` flipped along its last dimension.

    The body is kept to plain tensor-method calls because the statement that
    follows this definition passes the function to ``torch.jit.script``.
    """
    flipped = img.flip(-1)
    return flipped
print(torch.jit.script(hflip)(torch.rand(3, 8, 8)))

## hello.S
# as -o hello.o hello.S ; cc -o hello hello.o -nostdlib
  .text
  .globl _start
  .type  _start, @function
_start:
  movl $1,    %eax  # sys_write(
  movl $1,    %edi  # fd = stdout,
  movl $.LC0, %esi  # buf = LC0,
  movl $12,   %edx     # 12);
  syscall

## wrong-vmul-ps.c
// gcc -c -Os -mavx512f -masm=intel
#include <immintrin.h>

// Multiplies a masked 512-bit load of `con` by a broadcast of con[1] and
// returns element 0 of the product.
// NOTE(review): the file name ("wrong-vmul-ps") together with the compile
// flags in the comment above suggests this is a minimal reproducer for a
// code-generation problem -- confirm before restructuring, since any rewrite
// may change the instructions the compiler emits.
float foo(float* con) {
    __mmask16 msk = 0x00ff;  // only the low 8 of the 16 mask bits are set
    __m512 a = _mm512_maskz_loadu_ps(msk, con);  // masked unaligned load from con under msk
    __m512 b = _mm512_set1_ps(con[1]);  // con[1] replicated into every element
    __m512 c = _mm512_mul_ps(a,b);  // element-wise product of a and b
    return ((float *)&c)[0];  // reinterpret c as floats and return the first one
}

## hello.cu
// nvcc -o hello hello.cu; ./hello
#include <stdio.h>

// Device kernel that prints a fixed greeting with device-side printf.
// Takes no arguments; main() below launches it with a <<<1,1>>> configuration.
__global__ void kernel() {
  printf("Hello World of CUDA\n");
}

int main() {
  kernel<<<1,1>>>();
  return cudaDeviceSynchronize();

## gist:8e33477f1971fcfd3ca90472edbe9b67
 	[Inline Frame] torch_cuda.dll!std::_Default_allocator_traits<std::allocator<std::_Tree_node<unsigned int,void *>>>::deallocate(std::allocator<std::_Tree_node<unsigned int,void *>> &) Line 689	C++
 	[Inline Frame] torch_cuda.dll!std::_Tree_node<unsigned int,void *>::_Freenode0(std::allocator<std::_Tree_node<unsigned int,void *>> &) Line 373	C++
 	[Inline Frame] torch_cuda.dll!std::_Tree_val<std::_Tree_simple_types<unsigned int>>::_Erase_head(std::allocator<std::_Tree_node<unsigned int,void *>> &) Line 753	C++
 	[Inline Frame] torch_cuda.dll!std::_Tree<std::_Tset_traits<unsigned int,std::less<unsigned int>,std::allocator<unsigned int>,0>>::{dtor}() Line 1191	C++
>	torch_cuda.dll!torch::jit::fuser::newForReduction(torch::jit::fuser::TensorView * tv, const std::vector<unsigned int,std::allocator<unsigned int>> & axes) Line 438	C++
 	torch_cuda.dll!torch::jit::fuser::reductionOp(torch::jit::fuser::BinaryOpType reduction_op_type, const std::vector<int,std::allocator<int>> & axes, torch::jit::fuser::Val * init, to

## NEON reciprocal example
#include <arm_neon.h>
#include <math.h>
#include <stdio.h>


void run_neon_reciproc(float data_in[4], float data_out[4]) {
	float32x4_t input = vld1q_f32(data_in);
	float32x4_t out = vrecpeq_f32(input);
	//out = vmulq_f32(vrecpsq_f32(input, out), out);
	//out = vmulq_f32(vrecpsq_f32(input, out), out);

## test_time_trends.py
#!/usr/bin/env python3
import boto3
import os
import bz2
import json
import subprocess
from datetime import datetime


def get_git_commit_history(path, branch="master"):

## gh-get-milestone-issues.py
#!/usr/bin/env python3

from datetime import datetime
from typing import Any, Dict, List, Optional, Union
from urllib.request import urlopen, Request
import json
import enum
import os
	import Text.Printf
	import Control.Exception
	import System.CPUTime

	-- time function implementation borrowed from
	-- http://www.haskell.org/haskellwiki/Timing_computations
	time :: IO t -> IO t
	time a = do
	start <- getCPUTime
	v <- a
	#!/usr/bin/env python3
	from elftools.elf.elffile import ELFFile
	import struct
	import sys


	# From https://stackoverflow.com/questions/1094841/reusable-library-to-get-human-readable-version-of-file-size
	def sizeof_fmt(num, suffix='B'):
	for unit in ['','Ki','Mi','Gi','Ti','Pi','Ei','Zi']:
	if abs(num) < 1024.0:
	import torch

	# Return `img` flipped along its last dimension (calls img.flip(-1)).
	def hflip(img: torch.Tensor) -> torch.Tensor: return img.flip(-1)
	print(torch.jit.script(hflip)(torch.rand(3, 8, 8)))
	# as -o hello.o hello.S ; cc -o hello hello.o -nostdlib
	.text
	.globl _start
	.type _start, @function
	_start:
	movl $1, %eax # sys_write(
	movl $1, %edi # fd = stdout,
	movl $.LC0, %esi # buf = LC0,
	movl $12, %edx # 12);
	syscall
	// gcc -c -Os -mavx512f -masm=intel
	#include <immintrin.h>

	// Multiplies a masked 512-bit load of `con` by a broadcast of con[1] and
	// returns element 0 of the product.
	// NOTE(review): this looks like a compiler code-generation reproducer --
	// confirm before restructuring.
	float foo(float* con) {
	__mmask16 msk = 0x00ff;  // only the low 8 of the 16 mask bits are set
	__m512 a = _mm512_maskz_loadu_ps(msk, con);  // masked unaligned load from con under msk
	__m512 b = _mm512_set1_ps(con[1]);  // con[1] replicated into every element
	__m512 c = _mm512_mul_ps(a,b);  // element-wise product of a and b
	return ((float *)&c)[0];  // reinterpret c as floats and return the first one
	}
	// nvcc -o hello hello.cu; ./hello
	#include <stdio.h>

	// Device kernel that prints a fixed greeting with device-side printf.
	// Takes no arguments; main() below launches it with a <<<1,1>>> configuration.
	__global__ void kernel() {
	printf("Hello World of CUDA\n");
	}

	int main() {
	kernel<<<1,1>>>();
	return cudaDeviceSynchronize();
	[Inline Frame] torch_cuda.dll!std::_Default_allocator_traits<std::allocator<std::_Tree_node<unsigned int,void *>>>::deallocate(std::allocator<std::_Tree_node<unsigned int,void *>> &) Line 689 C++
	[Inline Frame] torch_cuda.dll!std::_Tree_node<unsigned int,void *>::_Freenode0(std::allocator<std::_Tree_node<unsigned int,void *>> &) Line 373 C++
	[Inline Frame] torch_cuda.dll!std::_Tree_val<std::_Tree_simple_types<unsigned int>>::_Erase_head(std::allocator<std::_Tree_node<unsigned int,void *>> &) Line 753 C++
	[Inline Frame] torch_cuda.dll!std::_Tree<std::_Tset_traits<unsigned int,std::less<unsigned int>,std::allocator<unsigned int>,0>>::{dtor}() Line 1191 C++
	> torch_cuda.dll!torch::jit::fuser::newForReduction(torch::jit::fuser::TensorView * tv, const std::vector<unsigned int,std::allocator<unsigned int>> & axes) Line 438 C++
	torch_cuda.dll!torch::jit::fuser::reductionOp(torch::jit::fuser::BinaryOpType reduction_op_type, const std::vector<int,std::allocator<int>> & axes, torch::jit::fuser::Val * init, to
	#include <arm_neon.h>
	#include <math.h>
	#include <stdio.h>


	void run_neon_reciproc(float data_in[4], float data_out[4]) {
	float32x4_t input = vld1q_f32(data_in);
	float32x4_t out = vrecpeq_f32(input);
	//out = vmulq_f32(vrecpsq_f32(input, out), out);
	//out = vmulq_f32(vrecpsq_f32(input, out), out);
	#!/usr/bin/env python3
	import boto3
	import os
	import bz2
	import json
	import subprocess
	from datetime import datetime


	def get_git_commit_history(path, branch="master"):
	#!/usr/bin/env python3

	from datetime import datetime
	from typing import Any, Dict, List, Optional, Union
	from urllib.request import urlopen, Request
	import json
	import enum
	import os