Skip to content

Instantly share code, notes, and snippets.

View astojanov's full-sized avatar

Alen Stojanov astojanov

View GitHub Profile
#include <immintrin.h>
static inline __m256i avx_mm256_div_epi16_division(const __m256i &a_epi16, const __m256i &b_epi16) {
//
// Convert to two 32-bit integers
//
const __m256i a_hi_epi32 = _mm256_srai_epi32(a_epi16, 16);
const __m256i a_lo_epi32_shift = _mm256_slli_epi32(a_epi16, 16);
const __m256i a_lo_epi32 = _mm256_srai_epi32(a_lo_epi32_shift, 16);
const __m256i b_hi_epi32 = _mm256_srai_epi32(b_epi16, 16);
#include <immintrin.h>
//
// Performs division on 16-bit elements, using floating point division
//
static inline __m128i sse_mm_div_epi16_division(const __m128i &a_epi16, const __m128i &b_epi16) {
const __m128i lo_mask = _mm_set1_epi32(0xFFFF);
//
// Convert to two 32-bit integers
#include <immintrin.h>
#include <cstdint>
#include <iostream>
#include <iomanip>
#include <cassert>
//
// Transpose 8x8 registers
//
Java HotSpot(TM) 64-Bit Server VM warning: printing of assembly code is enabled; turning on DebugNonSafepoints to gain additional output
CompilerOracle: print *JVector8.dot
Compiled method (c1) 807 864 % 3 ch.ethz.acl.ngen.precison.JVector8::dot @ 35 (134 bytes)
total in heap [0x000000010f6dd010,0x000000010f6ddbb0] = 2976
relocation [0x000000010f6dd138,0x000000010f6dd1b0] = 120
main code [0x000000010f6dd1c0,0x000000010f6dd640] = 1152
stub code [0x000000010f6dd640,0x000000010f6dd6d0] = 144
oops [0x000000010f6dd6d0,0x000000010f6dd6d8] = 8
metadata [0x000000010f6dd6d8,0x000000010f6dd6e0] = 8
scopes data [0x000000010f6dd6e0,0x000000010f6dd830] = 336
// 1. package ch.ethz.acl.ngen.saxpy;
// 2.
// 3. public class JSaxpy {
// 4. public void apply(float[] a, float[] b, float s, int n){
// 5. for (int i = 0; i < n; i += 1) {
// 6. a[i] += b[i] * s;
// 7. }
// 8. }
// 9. }
// Code being analyzed
//
// 1. package ch.ethz.acl.ngen.saxpy;
// 2.
// 3. public class JSaxpy {
// 4. public void apply(int[] a, int[] b, int s, int n){
// 5. for (int i = 0; i < n; i += 1) {
// 6. a[i] += b[i] * s;
// 7. }
// 8. }
# Install ggplot2 only when it is not already available, so that re-running
# this script does not reinstall the package (and does not need network
# access) every time.
if (!requireNamespace("ggplot2", quietly = TRUE)) {
  install.packages("ggplot2")
}
library(ggplot2)

# Theme fragment that positions the legend at the bottom of the plot.
getLegend <- function() { return(theme(legend.position="bottom")) }
# Wraps ggtitle(): returns a plot title built from `tmp`.
getTitle <- function(tmp) { return(ggtitle(tmp)) }
# Text element: horizontally centred, size 8, black (for axis text, judging by the name).
getAxisTextElement <- function() { return(element_text(hjust=0.5, size=8, colour="black")) }
# Text element: horizontally centred, size 10 (for axis titles, judging by the name).
getAxisTitleElement <- function() { return(element_text(hjust=0.5, size=10)) }
# Text element: left-aligned (hjust=0), size 10 (for the plot title, judging by the name).
getTitleElement <- function () { return(element_text(hjust=0, size=10)) }
# Seven colours: black first, followed by six entries of the same dark grey.
colorPalette <- c("#000000", "#585858", "#585858", "#585858", "#585858", "#585858", "#585858")
# Five point-shape codes.
# NOTE(review): only 5 shapes vs. 7 colours - confirm no plot needs more than 5 series.
shapePalette <- c(16, 23, 24, 25, 22)
//
// Allocates and initializes an IPP big-number context of length `len`
// (the same `len` is passed to ippsBigNumGetSize, ippsBigNumInit and
// ippsSet_BN), optionally loading it with a positive value.
//
//   len   - length of the big number, in Ipp32u words of pData
//   pData - words to load as a positive value, or NULL to leave the
//           context initialized but unset
//
// Returns the new context, or NULL if sizing, allocation, initialization
// or loading the value fails. The buffer comes from ippMalloc, so the
// caller releases it with ippFree.
//
IppsBigNumState* createBigNumState(int len, const Ipp32u* pData) {
    int size = 0;
    // A failed size query (e.g. invalid len) would leave `size` meaningless.
    if (ippsBigNumGetSize(len, &size) != ippStsNoErr) {
        return NULL;
    }

    IppsBigNumState* pBN = (IppsBigNumState*) ippMalloc(size);
    if (pBN == NULL) {
        return NULL;
    }

    IppStatus status = ippsBigNumInit(len, pBN);
    if (status == ippStsNoErr && pData != NULL) {
        status = ippsSet_BN(IppsBigNumPOS, len, pData, pBN);
    }
    if (status != ippStsNoErr) {
        // Do not leak the buffer or hand back a half-initialized context.
        ippFree(pBN);
        return NULL;
    }
    return pBN;
}
@astojanov
astojanov / Makefile
Last active March 25, 2016 02:56
RSA Key Generation: Intel IPP vs OpenSSL
# Compiler flags for icpc; referenced by the compile recipes below.
CPPFLAGS = -O3 -xHost -no-multibyte-chars
INCLUDE = -I/opt/local/include/ -I/opt/intel/ipp/include
LIBS = -L/opt/local/lib/ -lssl -lcrypto -L/opt/intel/ipp/lib -ipp=crypto
# Builds the rsatest binary from main.cpp and perf.cpp.
all: perf.h
	# Use $(CPPFLAGS): the flags are defined under that name, and $(CFLAGS)
	# is never set in this file, so the optimization flags were being dropped.
	icpc $(INCLUDE) $(CPPFLAGS) -c main.cpp -o main.o
	icpc $(INCLUDE) $(CPPFLAGS) -c perf.cpp -o perf.o
	# Libraries follow the objects that reference them so single-pass
	# linkers resolve the symbols.
	icpc perf.o main.o $(LIBS) -o rsatest
clean: