Skip to content

Instantly share code, notes, and snippets.

View tanakamura's full-sized avatar

Takashi Nakamura tanakamura

View GitHub Profile
matmul_x86_fma_:
.LFB1165:
.cfi_startproc
pushq %rbp
.cfi_def_cfa_offset 16
.cfi_offset 6, -16
addq %rdi, %rcx
vxorps %xmm0, %xmm0, %xmm0
movq %rsp, %rbp
.cfi_def_cfa_register 6
#include <immintrin.h>
#include <stdio.h>
#include <intrin.h>
/*
lookup8:
subq $12, %rsp
.seh_stackalloc 12
.seh_endprologue
movl %edx, (%rsp)
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <time.h>
double
sec(void)
{
struct timespec ts;
/*
==4KB==
memset L1/L2 1.8[usec], 2230.9[MB/s]
memset L2 1.9[usec], 2198.6[MB/s]
memset L1 5.9[usec], 696.6[MB/s]
memset wo cache 19.9[usec], 206.2[MB/s]
flush wb 10.1[usec], 406.2[MB/s]
flush4k wb 8.9[usec], 459.9[MB/s]
flush empty 4.1[usec], 990.8[MB/s]
flush4k empty 3.2[usec], 1270.1[MB/s]
// top: pass-through from one input pin to one output pin.
//
// Ports
//   led_0  output, constrained to package pin T22 with I/O standard LVCMOS33
//   sw_0   input,  constrained to package pin F22 with I/O standard LVCMOS25
//
// The pin location and I/O standard are carried as attributes on the port
// declarations themselves rather than in a separate constraints file.
module top (
    (* IOSTANDARD = "LVCMOS33", PACKAGE_PIN = "T22" *)
    output wire led_0,

    (* IOSTANDARD = "LVCMOS25", PACKAGE_PIN = "F22" *)
    input  wire sw_0
);

    // Purely combinational: the output follows the input with no registers.
    assign led_0 = sw_0;

endmodule
# Non-project build flow: read one Verilog source, synthesize it, then run the
# first implementation steps. NOTE(review): the visible script stops after
# phys_opt_design; routing / bitstream steps are presumably further down.

# Directory for build outputs; created up front (file mkdir is a no-op if it
# already exists).
set output_dir ./out
file mkdir $output_dir

# Load the design source.
read_verilog top.v

# Synthesize with module "top" as the root for part xc7z020clg484-1.
# NOTE(review): "-flatten" is presumably an abbreviation of
# "-flatten_hierarchy" — confirm the tool version accepts the short form.
synth_design -top top -part xc7z020clg484-1 -flatten rebuilt

# Post-synthesis steps, in order: logic optimization, placement, then
# physical optimization of the placed design.
opt_design
place_design
phys_opt_design
module design_1_wrapper
();
wire [63:0] gpio_out0;
wire [63:0] gpio_in0;
assign gpio_in0[0] = gpio_out0[1];
PS7 PS7_i (
.EMIOGPIOO (gpio_out0),
/*
clflush 4278.300100[MiB/s]
clflushopt 44717.389375[MiB/s]
memset 30407.684046[MiB/s]
*/
#include <stdio.h>
#include <x86intrin.h>
#include <sys/time.h>
#include <string.h>
(https://github.com/tanakamura/instruction-bench)
Intel(R) Xeon Phi(TM) CPU 7210 @ 1.30GHz
== latency/throughput ==
reg64: add: latency: CPI= 1.11, IPC= 0.90
reg64: add:throughput: CPI= 0.64, IPC= 1.57
reg64: lea: latency: CPI= 1.03, IPC= 0.97
reg64: lea:throughput: CPI= 0.65, IPC= 1.55
reg64: load: latency: CPI= 4.04, IPC= 0.25
reg64: load:throughput: CPI= 1.02, IPC= 0.98
Performance counter stats for 'sh -c ../configure; make -j256':
631718.546417 task-clock (msec) # 4.435 CPUs utilized
79,129 context-switches # 0.125 K/sec
46,742 cpu-migrations # 0.074 K/sec
14,459,831 page-faults # 0.023 M/sec
794,193,966,671 cycles # 1.257 GHz (50.80%)
<not supported> stalled-cycles-frontend
<not supported> stalled-cycles-backend
396,607,076,972 instructions # 0.50 insns per cycle (75.66%)