Skip to content

Instantly share code, notes, and snippets.

@malfet
malfet / facbench.hs
Created September 29, 2012 18:01
Measuring 123456! evaluation time using various language constructs
import Text.Printf
import Control.Exception
import System.CPUTime
-- time function implementation borrowed from
-- http://www.haskell.org/haskellwiki/Timing_computations
time :: IO t -> IO t
time a = do
start <- getCPUTime
v <- a
#!/usr/bin/env python3
from elftools.elf.elffile import ELFFile
import struct
import sys
# From https://stackoverflow.com/questions/1094841/reusable-library-to-get-human-readable-version-of-file-size
def sizeof_fmt(num, suffix='B'):
for unit in ['','Ki','Mi','Gi','Ti','Pi','Ei','Zi']:
if abs(num) < 1024.0:
import torch
def hflip(img: torch.Tensor) -> torch.Tensor:
    """Return a copy of ``img`` with its last dimension reversed.

    Args:
        img: Input tensor; only the final dimension is flipped.

    Returns:
        A tensor of the same shape whose last dimension is in reverse order.
    """
    mirrored = torch.flip(img, [-1])
    return mirrored
print(torch.jit.script(hflip)(torch.rand(3, 8, 8)))
@malfet
malfet / hello.S
Created August 30, 2020 14:15
Hello World in x86_64 assembly
# as -o hello.o hello.S ; cc -o hello hello.o -nostdlib
.text
.globl _start
.type _start, @function
_start:
movl $1, %eax # sys_write(
movl $1, %edi # fd = stdout,
movl $.LC0, %esi # buf = LC0,
movl $12, %edx # 12);
syscall
@malfet
malfet / wrong-vmul-ps.c
Last active September 12, 2020 00:25
GCC -masm=intel bug
// gcc -c -Os -mavx512f -masm=intel
#include <immintrin.h>
/*
 * Multiply a masked 8-float load from `con` by a broadcast of con[1] and
 * return lane 0 of the product, i.e. con[0] * con[1].
 *
 * con: pointer to at least 8 readable floats (lanes 0..7 are loaded).
 *
 * NOTE(review): this looks like a minimal compiler-bug reproducer; keep the
 * statements exactly as written so the generated code stays comparable.
 */
float foo(float* con) {
/* Only the low 8 of the 16 mask bits are set. */
__mmask16 msk = 0x00ff;
/* Unaligned zero-masked load: lanes 0..7 come from con, lanes 8..15 are zeroed. */
__m512 a = _mm512_maskz_loadu_ps(msk, con);
/* Broadcast the scalar con[1] to all 16 lanes. */
__m512 b = _mm512_set1_ps(con[1]);
/* Lane-wise product of the two vectors. */
__m512 c = _mm512_mul_ps(a,b);
/* Reinterpret the vector as a float array and return its first lane. */
return ((float *)&c)[0];
}
// nvcc -o hello hello.cu; ./hello
#include <stdio.h>
// Device entry point: every thread that runs this kernel prints one fixed
// greeting line through device-side printf.
__global__ void kernel()
{
    printf("Hello World of CUDA\n");
}
int main() {
kernel<<<1,1>>>();
return cudaDeviceSynchronize();
[Inline Frame] torch_cuda.dll!std::_Default_allocator_traits<std::allocator<std::_Tree_node<unsigned int,void *>>>::deallocate(std::allocator<std::_Tree_node<unsigned int,void *>> &) Line 689 C++
[Inline Frame] torch_cuda.dll!std::_Tree_node<unsigned int,void *>::_Freenode0(std::allocator<std::_Tree_node<unsigned int,void *>> &) Line 373 C++
[Inline Frame] torch_cuda.dll!std::_Tree_val<std::_Tree_simple_types<unsigned int>>::_Erase_head(std::allocator<std::_Tree_node<unsigned int,void *>> &) Line 753 C++
[Inline Frame] torch_cuda.dll!std::_Tree<std::_Tset_traits<unsigned int,std::less<unsigned int>,std::allocator<unsigned int>,0>>::{dtor}() Line 1191 C++
> torch_cuda.dll!torch::jit::fuser::newForReduction(torch::jit::fuser::TensorView * tv, const std::vector<unsigned int,std::allocator<unsigned int>> & axes) Line 438 C++
torch_cuda.dll!torch::jit::fuser::reductionOp(torch::jit::fuser::BinaryOpType reduction_op_type, const std::vector<int,std::allocator<int>> & axes, torch::jit::fuser::Val * init, to
#include <arm_neon.h>
#include <math.h>
#include <stdio.h>
void run_neon_reciproc(float data_in[4], float data_out[4]) {
float32x4_t input = vld1q_f32(data_in);
float32x4_t out = vrecpeq_f32(input);
//out = vmulq_f32(vrecpsq_f32(input, out), out);
//out = vmulq_f32(vrecpsq_f32(input, out), out);
#!/usr/bin/env python3
from datetime import datetime
from typing import Any, Dict, List, Optional, Union
from urllib.request import urlopen, Request
import json
import enum
import os