Skip to content

Instantly share code, notes, and snippets.

@Chubek
Last active August 1, 2023 05:44
Show Gist options
  • Save Chubek/7c62f186b5589cef16050c5e552d827e to your computer and use it in GitHub Desktop.
Save Chubek/7c62f186b5589cef16050c5e552d827e to your computer and use it in GitHub Desktop.
Newton-Rhapson with Intel Intrinsics in Assembly + C

The following two files, newton.c and newton.S are mainly for demonstration and served their purpose as for me warming up to x86-64 Assembly once more. I did not test them because I have other stuff brewing. I share them with you so I can discuss the code only. And I want your opinions.

The Assembly subroutine, newton_rhapson only has 6 instructions and 3 of them are loading the arrays. Let's go through it together:

.global newton_rhapson
.text

newton_rhapson:
	#define NUMBERS %rdi
	#define JACOBIAN %rsi
	#define CURRENTX %rdx
	#define DESTINATION %rcx
	
	vmovupd NUMBERS, VPDNUMBERS
	vmovupd JACOBIAN, VPDJACOBIAN
	vmovupd CURRENTX, VPDCURRENTX

	vdivpd VPDNUMBERS, VPDJACOBIAN	
	vsubpd VPDJACOBIAN, VPDCURRENTX

	vmovupd VPDCURRENTX, DESTINATION

	ret

I first declrate the aforementioned lable as a global, so it can be seen by C as an extern symbol. And then, I used CPP to define several macros that will make the reading much easier. It's kinda Lisp-like ain't it?

I am going to explain the instructions now:

  • vmovupd -> This instruction loads an array of 64 bytes (given AVX512), 32 bytes (given AVX2) or 16 bytes (given SSE4), maps the bytes to double floats, and stores them in the extedned registers. Notice that above, at the beginning of the file, we have defined these extended registers to match with the right extension, e.g. ZMM registers for AVX512.

  • vdipd and vsubpd -> These instructions do a SIMD division and SIMD subtraction, SIMD being vectorized "single instruction, mulitple data".

  • vmovupd -> This time we do the reverse, excpet we copy the vector into the mmory address ad DESTINATION which is general-purpose register RCX.

In the C code, we declare this function as extern:

extern void newton_rhapson(DOUBLE *numbers, DOUBLE * jacobian, DOUBLE *currentx, DOUBLE *dest);

We must compile with:

gcc newton.S newton.c -o newton

We then accept the input from a redirected STDIN in form of echo 'N=[...] J=[...] X=[...]' | newton.

So as I said this code most probably does not work, but I just wanted to share it with you since the aim is to seek general advice / critique etc.

Please visit my mainpage Github profile where I store a looot of goodies that you may even find useful: https://github.com/Chubek

Thanks.

NOTE: Remember the extension for the Assembly file must be a captial S because then it will expand the CPP macros. You could, however, expand them yourself and pre-save it via cpp or gcc -E.

PS: I'm ashamed to say it so I say it backwards -> (: buj na ydeen em

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#ifdef __AVX512__
#define IMM_MAX 64
#elif __AVX2__
#define IMM_MAX 32
#elif __SSE4__
#define IMM_MAX 16
#else
#error \
"Intrinsic targets not supprtet either by your microprocessor, or your compiler"
#endif
#define TMP_PARSE 1024
#define BYTE char
#define STRING BYTE *
#define DOUBLE double
#define IMM_DOUBLE(NAME) double NAME[IMM_MAX]
extern void newton_rhapson(DOUBLE *numbers, DOUBLE * jacobian,
DOUBLE *currentx, DOUBLE *dest);
void parse_floats(DOUBLE *dst, STRING src) {
BYTE chr, stack[TMP_PARSE], *stackptr = &stack[0];
while ((chr = *src++)) {
if (chr == ',') {
src[src - stackptr] = '\0';
*(dst++) = strtod(stackptr, NULL);
stackptr = ++src;
}
}
}
void get_numbers_and_perform(void) {
BYTE letter1, letter2, letter3;
fscanf(stdin, "%c=[%*s%c=[%*s%c=[%*s", &letter1, &letter2, &letter3);
IMM_DOUBLE(numbers) = {0};
IMM_DOUBLE(jacobian) = {0};
IMM_DOUBLE(currentx) = {0};
fseek(stdin, 0, SEEK_SET);
if (letter1 != 'N') {
fprintf(stderr, "ERROR: First array must be marked with N\n");
exit(EXIT_FAILURE);
} else if (letter2 != 'J') {
fprintf(stderr, "ERROR: Second array must be marked with J\n");
exit(EXIT_FAILURE);
} else if (letter3 != 'X') {
fprintf(stderr, "ERROR: First array must be marked with X\n");
exit(EXIT_FAILURE);
}
BYTE buff[BUFFER_SIZE] = {0};
fscanf(stdin, "N=[%s]", &buff[0]);
parse_floats(&numbers[0], &buff[0]);
buff = {0};
fscanf(stdin, "J=[%s]", &buff[0]);
parse_floats(&jacobian[0], &buff[0]);
buff = {0};
fscanf(stdin, "X=[%s]", &buff[0]);
parse_floats(&currentx[0], &buff[0]);
buff = {0};
newton_rhapson(&numbers[0], &jacobian[0], &currentx[0], &buff[0]);
printf("%s\n", &buff[0]);
}
int main(int argc, char **argv) {
if (isatty(fileno(stdin))) {
fprintf(stderr,
"ERROR: Please redirect the floating points through STDIN, denoted "
"by N=[], "
"J=[], and X=[].k Example: echo 'N=[1.22, 144.2] J=[242, 111.2 "
"111] X=[113.22, 111.2 444.1]' | %s\n",
argv[0]);
exit(EXIT_FAILURE);
}
get_numbers_and_perform();
}
#ifdef __AVX512__
#define VPDNUMBERS %zmm1
#define VPDJACOBIAN %zmm2
#define VPDCURRENTX %zmm3
#elif __AVX2__
#define VPDNUMBERS %ymm1
#define VPDJACOBIAN %ymm2
#define VPDCURRENTX %ymm3
#elif __SSE4__
#define VPDNUMBERS %xmm1
#define VPDJACOBIAN %xmm2
#define VPDCURRENTX %xmm3
#else
#error "Intrinsic targets not supprtet either by your microprocessor, or your compiler"
#endif
.global newton_rhapson
.text
newton_rhapson:
#define NUMBERS %rdi
#define JACOBIAN %rsi
#define CURRENTX %rdx
#define DESTINATION %rcx
vmovupd NUMBERS, VPDNUMBERS
vmovupd JACOBIAN, VPDJACOBIAN
vmovupd CURRENTX, VPDCURRENTX
vdivpd VPDNUMBERS, VPDJACOBIAN
vsubpd VPDJACOBIAN, VPDCURRENTX
vmovupd VPDCURRENTX, DESTINATION
ret
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment