ynkdir/libcallex_win.diff

## libcallex_win.diff
diff --git a/autoload/Makefile.msc b/autoload/Makefile.msc
index d0907f2..ef13346 100644
--- a/autoload/Makefile.msc
+++ b/autoload/Makefile.msc
@@ -1,5 +1,20 @@
+!if "$(ARCH)" == "x64"
+ASMFILE=msc_x64_call
+!elseif "$(ARCH)" == "x86"
+ASMFILE=msc_x86_call
+!else
+!error set ARCH=x64|x86
+!endif
+
 all : libcallex.dll
 	@echo done

-libcallex.dll : libcallex.cxx
-	cl /LD /EHsc libcallex.cxx libffi.lib
+libcallex.dll : libcallex.cxx $(ASMFILE).obj
+	cl /LD /EHsc libcallex.cxx $(ASMFILE).obj
+
+msc_x64_call.obj : msc_x64_call.asm
+	ml64 /c msc_x64_call.asm
+
+msc_x86_call.obj : msc_x86_call.asm
+	ml /c msc_x86_call.asm
+
diff --git a/autoload/Makefile.w32 b/autoload/Makefile.w32
index dd3f22d..0404bdd 100644
--- a/autoload/Makefile.w32
+++ b/autoload/Makefile.w32
@@ -1,5 +1,16 @@
+
+ifeq ($(ARCH), x64)
+ASMFILE=mingw_x64_call.s
+else
+ifeq ($(ARCH), x86)
+ASMFILE=mingw_x86_call.s
+else
+$(error set ARCH=x64|x86)
+endif
+endif
+
 all : libcallex.dll
 	@echo done

-libcallex.dll : libcallex.cxx
-	g++ -static-libgcc -static-libstdc++ -shared -o libcallex.dll libcallex.cxx -lffi
+libcallex.dll : libcallex.cxx $(ASMFILE)
+	g++ -static -static-libgcc -static-libstdc++ -shared -o libcallex.dll libcallex.cxx $(ASMFILE)
diff --git a/autoload/libcallex.cxx b/autoload/libcallex.cxx
index 6f667cb..3ca98ce 100644
--- a/autoload/libcallex.cxx
+++ b/autoload/libcallex.cxx
@@ -85,86 +85,23 @@ const char* libcallex_call(const char* context) {
 			}
 			narg++;
 		}
-#if defined(_WIN64) && defined(_MVC_VER)
-		// XXX: NOT TESTED
-		// XXX: replace push to mov
-		// at lease 32 byte, aligned to 16 byte
-		INTPTR_T stackroom = 32;
-		if (narg > 4)
-			stackroom += 16 * (narg % 2);
-		_asm sub rsp, stackroom
-		for (unsigned long n = narg; n > 4; n--) {
-			INTPTR_T a_ = args[n - 1];
-			_asm push a_
-		}
-		if (narg > 3) {
-			INTPTR_T a_ = args[3];
-			_asm mov a_, r9
-		}
-		if (narg > 2) {
-			INTPTR_T a_ = args[2];
-			_asm mov a_, r8
-		}
-		if (narg > 1) {
-			INTPTR_T a_ = args[1];
-			_asm mov a_, rdx
-		}
-		if (narg > 0) {
-			INTPTR_T a_ = args[0];
-			_asm mov a_, rcx
-		}
-		_asm {
-			call p_
-			mov r_, rax
-		}
-		if (narg > 4) {
-			INTPTR_T a_ = (narg - 4) * sizeof(void *);
-			_asm add rsp, a_
-		}
-		_asm add rsp, stackroom
-#elif defined(_WIN32) && defined(_MVC_VER)
-		for (unsigned long n = 0; n < narg; n++) {
-			INTPTR_T a_ = args[narg-n-1];
-			_asm {
-				mov eax, a_
-				push eax
-			}
-		}
-		_asm {
-			call p_
-			mov r_, eax
-		}
-#elif defined(_WIN64) && defined(___GNUC__)
-		// XXX: NOT TESTED
-		// XXX: replace push to mov
-		// at lease 32 byte, aligned to 16 byte
-		INTPTR_T stackroom = 32;
-		if (narg > 4)
-			stackroom += 16 * (narg % 2);
-		__asm__ ("subq %0, %%rsp"::"r"(stackroom));
-		for (unsigned long n = narg; n > 4; n--)
-			__asm__ ("pushq %0"::"r"(args[n-1]));
-		if (narg > 3) __asm__ ("movq %0, %%r9"::"r"(args[3]));
-		if (narg > 2) __asm__ ("movq %0, %%r8"::"r"(args[2]));
-		if (narg > 1) __asm__ ("movq %0, %%rdx"::"r"(args[1]));
-		if (narg > 0) __asm__ ("movq %0, %%rcx"::"r"(args[0]));
-		__asm__ ("call %0":"=r"(r_):"r"(p_));
-		if (narg > 4)
-			__asm__ ("addq %0, %%rsp"::"r"((narg - 4) * sizeof(void*)));
-		__asm__ ("addq %0, %%rsp"::"r"(stackroom));
+#if defined(_WIN64) && defined(_MSC_VER)
+		// NOTE: Vim's Number is 32bit. We can not handle 64bit pointer as Number.
+		// FIXME: double is not supported
+		extern intptr_t msc_x64_call(FUNCTION p, long narg, INTPTR_T* args);
+		r_ = msc_x64_call(p_, narg, args);
+#elif defined(_WIN32) && defined(_MSC_VER)
+		// FIXME: double is not supported
+		extern intptr_t msc_x86_call(FUNCTION p, long narg, INTPTR_T* args);
+		r_ = msc_x86_call(p_, narg, args);
+#elif defined(_WIN64) && defined(__GNUC__)
+		// FIXME: double is not supported
+		extern intptr_t mingw_x64_call(FUNCTION p, long narg, INTPTR_T* args);
+		r_ = mingw_x64_call(p_, narg, args);
 #elif defined(_WIN32) && defined(__GNUC__)
-		for (unsigned long n = 0; n < narg; n++) {
-			INTPTR_T a_ = args[narg-n-1];
-			__asm__ (
-				"push %0"
-				::"r"(a_)
-			);
-		}
-		__asm__ (
-			"call %0"
-			:"=r"(r_)
-			:"r"(p_)
-		);
+		// FIXME: double is not supported
+		extern intptr_t mingw_x86_call(FUNCTION p, long narg, INTPTR_T* args);
+		r_ = mingw_x86_call(p_, narg, args);
 #elif defined(__linux__) && defined(__x86_64__) && defined(__GNUC__)
 		for (unsigned long n = narg; n > 6; n--)
 			__asm__ ("pushq %0"::"r"(args[n-1]));
diff --git a/autoload/libcallex.dll b/autoload/libcallex.dll
index cf4d871..ba4a068 100644
Binary files a/autoload/libcallex.dll and b/autoload/libcallex.dll differ
diff --git a/autoload/mingw_x64_call.s b/autoload/mingw_x64_call.s
new file mode 100644
index 0000000..b069e3b
--- /dev/null
+++ b/autoload/mingw_x64_call.s
@@ -0,0 +1,88 @@
+	.text
+	.global	mingw_x64_call
+	.def	mingw_x64_call;	.scl	2;	.type	32;	.endef
+# intptr_t mingw_x64_call(FUNCTION p, long narg, INTPTR_T* args)
+#
+# Calls p with args[0..narg-1] passed as integer/pointer arguments under the
+# Microsoft x64 calling convention and returns p's rax.
+# In:    rcx = p, edx = narg (32-bit), r8 = args
+# Out:   rax = value returned by p
+# Note:  floating-point arguments are not supported (xmm0-xmm3 are never loaded).
+# The three parameters are spilled to the home space the caller reserved,
+# which sits at 16(%rbp), 24(%rbp) and 32(%rbp) once rbp has been pushed.
+args$ = 32
+narg$ = 24
+p$ = 16
+mingw_x64_call:
+	.cfi_startproc
+	pushq %rbp
+	.cfi_def_cfa_offset 16
+	.cfi_offset 6, -16
+	movq %rsp, %rbp
+	.cfi_def_cfa_register 6
+	movq %rcx, p$(%rbp)
+	# narg is a 32-bit long, so the upper half of rdx is undefined on entry:
+	# sign-extend it, then treat a negative count as "no arguments".
+	movslq %edx, %rdx
+	xorl %eax, %eax
+	testq %rdx, %rdx
+	cmovs %rax, %rdx
+	movq %rdx, narg$(%rbp)
+	movq %r8, args$(%rbp)
+	# stack size is at least 32 bytes (shadow space), aligned to 16 bytes
+	# cutting corners with (4 + narg * 2) * 8
+	leaq 4(,%rdx,2), %rdx
+	leaq (,%rdx,8), %rdx
+	sub %rdx, %rsp
+	# while narg >= 5:
+	#  narg--
+	#  rsp[narg] = args[narg]
+	# if narg > 3:
+	#  r9 = args[3]
+	# if narg > 2:
+	#  r8 = args[2]
+	# if narg > 1:
+	#  rdx = args[1]
+	# if narg > 0:
+	#  rcx = args[0]
+	movq narg$(%rbp), %rcx
+argN:
+	cmpq $5, %rcx
+	jl arg4
+	dec %rcx
+	movq args$(%rbp), %rax
+	movq (%rax,%rcx,8), %rax
+	movq %rax, (%rsp,%rcx,8)
+	jmp argN
+arg4:
+	cmpq $4, %rcx
+	jl arg3
+	movq args$(%rbp), %rax
+	movq 24(%rax), %rax
+	movq %rax, %r9
+arg3:
+	cmpq $3, %rcx
+	jl arg2
+	movq args$(%rbp), %rax
+	movq 16(%rax), %rax
+	movq %rax, %r8
+arg2:
+	cmpq $2, %rcx
+	jl arg1
+	movq args$(%rbp), %rax
+	movq 8(%rax), %rax
+	movq %rax, %rdx
+arg1:
+	cmpq $1, %rcx
+	jl docall
+	movq args$(%rbp), %rax
+	movq (%rax), %rax
+	movq %rax, %rcx
+docall:
+	call *p$(%rbp)
+	# rbp still marks the frame, so this drops every argument slot at once
+	movq %rbp, %rsp
+	popq %rbp
+	.cfi_def_cfa 7, 8
+	ret
+	.cfi_endproc
diff --git a/autoload/mingw_x86_call.s b/autoload/mingw_x86_call.s
new file mode 100644
index 0000000..d19d84c
--- /dev/null
+++ b/autoload/mingw_x86_call.s
@@ -0,0 +1,38 @@
+	.text
+	.global	_mingw_x86_call
+	.def	_mingw_x86_call;	.scl	2;	.type	32;	.endef
+# intptr_t mingw_x86_call(FUNCTION p, long narg, INTPTR_T* args)
+#
+# Pushes args[narg-1] .. args[0] (so args[0] ends up lowest on the stack),
+# calls p, and returns p's eax.  esp is restored from ebp afterwards, so the
+# pushed arguments are discarded whether or not p popped them itself.
+# The leading underscore is the 32-bit C symbol decoration.
+args$ = 16
+narg$ = 12
+p$ = 8
+_mingw_x86_call:
+	.cfi_startproc
+	pushl %ebp
+	.cfi_def_cfa_offset 8
+	.cfi_offset 5, -8
+	movl %esp, %ebp
+	.cfi_def_cfa_register 5
+	# while narg > 0:
+	#   push args[--narg]
+	movl narg$(%ebp), %ecx
+argN:
+	cmpl $0, %ecx
+	jle docall
+	dec %ecx
+	movl args$(%ebp), %eax
+	pushl (%eax,%ecx,4)
+	jmp argN
+docall:
+	call *p$(%ebp)
+	# reset esp from the frame pointer instead of counting the pushed words
+	movl %ebp, %esp
+	popl %ebp
+	.cfi_restore 5
+	.cfi_def_cfa 4, 4
+	ret
+	.cfi_endproc
diff --git a/autoload/msc_x64_call.asm b/autoload/msc_x64_call.asm
new file mode 100644
index 0000000..d5744b3
--- /dev/null
+++ b/autoload/msc_x64_call.asm
@@ -0,0 +1,89 @@
+PUBLIC msc_x64_call
+
+_TEXT SEGMENT
+; intptr_t msc_x64_call(FUNCTION p, long narg, INTPTR_T* args)
+;
+; Calls p with args[0..narg-1] passed as integer/pointer arguments under the
+; Microsoft x64 calling convention and returns p's rax.
+; In:    rcx = p, edx = narg (32-bit), r8 = args
+; Out:   rax = value returned by p
+; Note:  floating-point arguments are not supported (xmm0-xmm3 are never loaded).
+; The three parameters are spilled to the home space the caller reserved,
+; which sits at [rbp+16], [rbp+24] and [rbp+32] once rbp has been pushed.
+args$ = 32
+narg$ = 24
+p$ = 16
+msc_x64_call proc frame
+	push rbp
+	.pushreg rbp
+	mov rbp, rsp
+	.setframe rbp, 0
+	.endprolog
+	mov p$[rbp], rcx
+	; narg is a 32-bit long, so the upper half of rdx is undefined on entry:
+	; sign-extend it, then treat a negative count as "no arguments".
+	movsxd rdx, edx
+	xor eax, eax
+	test rdx, rdx
+	cmovs rdx, rax
+	mov narg$[rbp], rdx
+	mov args$[rbp], r8
+	; stack size is at least 32 bytes (shadow space), aligned to 16 bytes
+	; cutting corners with (4 + narg * 2) * 8
+	lea rdx, [rdx*2+4]
+	lea rdx, [rdx*8]
+	sub rsp, rdx
+	; while narg >= 5:
+	;  narg--
+	;  rsp[narg] = args[narg]
+	; if narg > 3:
+	;  r9 = args[3]
+	; if narg > 2:
+	;  r8 = args[2]
+	; if narg > 1:
+	;  rdx = args[1]
+	; if narg > 0:
+	;  rcx = args[0]
+	mov rcx, narg$[rbp]
+argN:
+	cmp rcx, 5
+	jl arg4
+	dec rcx
+	mov rax, args$[rbp]
+	mov rax, [rax+rcx*8]
+	mov [rsp+rcx*8], rax
+	jmp argN
+arg4:
+	cmp rcx, 4
+	jl arg3
+	mov rax, args$[rbp]
+	mov rax, [rax+24]
+	mov r9, rax
+arg3:
+	cmp rcx, 3
+	jl arg2
+	mov rax, args$[rbp]
+	mov rax, [rax+16]
+	mov r8, rax
+arg2:
+	cmp rcx, 2
+	jl arg1
+	mov rax, args$[rbp]
+	mov rax, [rax+8]
+	mov rdx, rax
+arg1:
+	cmp rcx, 1
+	jl docall
+	mov rax, args$[rbp]
+	mov rax, [rax]
+	mov rcx, rax
+docall:
+	call qword ptr p$[rbp]
+	; rbp still marks the frame, so this drops every argument slot at once
+	lea rsp, [rbp]
+	pop rbp
+	ret
+msc_x64_call endp
+_TEXT ENDS
+
+end
diff --git a/autoload/msc_x86_call.asm b/autoload/msc_x86_call.asm
new file mode 100644
index 0000000..6c88c81
--- /dev/null
+++ b/autoload/msc_x86_call.asm
@@ -0,0 +1,33 @@
+.686P
+.model flat
+
+; the leading underscore matches the decoration 32-bit __cdecl C symbols get
+PUBLIC _msc_x86_call
+
+_TEXT SEGMENT
+; intptr_t msc_x86_call(FUNCTION p, long narg, INTPTR_T* args) -- pushes args[narg-1]..args[0], calls p, returns p's eax
+args$ = 16		; [ebp+16] = args
+narg$ = 12		; [ebp+12] = narg
+p$ = 8			; [ebp+8]  = p
+_msc_x86_call proc
+	push ebp
+	mov ebp, esp			; ebp = frame base, used below to restore esp
+	; while narg > 0:
+	;   push args[--narg]   (last argument first, so args[0] ends up lowest)
+	mov ecx, narg$[ebp]		; ecx = number of arguments still to push
+argN:
+	cmp ecx, 0
+	jle docall			; zero or negative count: nothing (more) to push
+	dec ecx
+	mov eax, args$[ebp]		; eax = args
+	push [eax+ecx*4]		; push args[ecx]
+	jmp argN
+docall:
+	call dword ptr p$[ebp]		; result is left in eax for our caller
+	mov esp, ebp			; discard pushed arguments whether or not p popped them
+	pop ebp
+	ret
+_msc_x86_call endp
+_TEXT ENDS
+
+end
	diff --git a/autoload/Makefile.msc b/autoload/Makefile.msc
	index d0907f2..ef13346 100644
	--- a/autoload/Makefile.msc
	+++ b/autoload/Makefile.msc
	@@ -1,5 +1,20 @@
	+!if "$(ARCH)" == "x64"
	+ASMFILE=msc_x64_call
	+!elseif "$(ARCH)" == "x86"
	+ASMFILE=msc_x86_call
	+!else
	+!error set ARCH=x64\|x86
	+!endif
	+
	all : libcallex.dll
	@echo done

	-libcallex.dll : libcallex.cxx
	- cl /LD /EHsc libcallex.cxx libffi.lib
	+libcallex.dll : libcallex.cxx $(ASMFILE).obj
	+ cl /LD /EHsc libcallex.cxx $(ASMFILE).obj
	+
	+msc_x64_call.obj : msc_x64_call.asm
	+ ml64 /c msc_x64_call.asm
	+
	+msc_x86_call.obj : msc_x86_call.asm
	+ ml /c msc_x86_call.asm
	+
	diff --git a/autoload/Makefile.w32 b/autoload/Makefile.w32
	index dd3f22d..0404bdd 100644
	--- a/autoload/Makefile.w32
	+++ b/autoload/Makefile.w32
	@@ -1,5 +1,16 @@
	+
	+ifeq ($(ARCH), x64)
	+ASMFILE=mingw_x64_call.s
	+else
	+ifeq ($(ARCH), x86)
	+ASMFILE=mingw_x86_call.s
	+else
	+$(error set ARCH=x64\|x86)
	+endif
	+endif
	+
	all : libcallex.dll
	@echo done

	-libcallex.dll : libcallex.cxx
	- g++ -static-libgcc -static-libstdc++ -shared -o libcallex.dll libcallex.cxx -lffi
	+libcallex.dll : libcallex.cxx $(ASMFILE)
	+ g++ -static -static-libgcc -static-libstdc++ -shared -o libcallex.dll libcallex.cxx $(ASMFILE)
	diff --git a/autoload/libcallex.cxx b/autoload/libcallex.cxx
	index 6f667cb..3ca98ce 100644
	--- a/autoload/libcallex.cxx
	+++ b/autoload/libcallex.cxx
	@@ -85,86 +85,23 @@ const char* libcallex_call(const char* context) {
	}
	narg++;
	}
	-#if defined(_WIN64) && defined(_MVC_VER)
	- // XXX: NOT TESTED
	- // XXX: replace push to mov
	- // at lease 32 byte, aligned to 16 byte
	- INTPTR_T stackroom = 32;
	- if (narg > 4)
	- stackroom += 16 * (narg % 2);
	- _asm sub rsp, stackroom
	- for (unsigned long n = narg; n > 4; n--) {
	- INTPTR_T a_ = args[n - 1];
	- _asm push a_
	- }
	- if (narg > 3) {
	- INTPTR_T a_ = args[3];
	- _asm mov a_, r9
	- }
	- if (narg > 2) {
	- INTPTR_T a_ = args[2];
	- _asm mov a_, r8
	- }
	- if (narg > 1) {
	- INTPTR_T a_ = args[1];
	- _asm mov a_, rdx
	- }
	- if (narg > 0) {
	- INTPTR_T a_ = args[0];
	- _asm mov a_, rcx
	- }
	- _asm {
	- call p_
	- mov r_, rax
	- }
	- if (narg > 4) {
	- INTPTR_T a_ = (narg - 4) * sizeof(void *);
	- _asm add rsp, a_
	- }
	- _asm add rsp, stackroom
	-#elif defined(_WIN32) && defined(_MVC_VER)
	- for (unsigned long n = 0; n < narg; n++) {
	- INTPTR_T a_ = args[narg-n-1];
	- _asm {
	- mov eax, a_
	- push eax
	- }
	- }
	- _asm {
	- call p_
	- mov r_, eax
	- }
	-#elif defined(_WIN64) && defined(___GNUC__)
	- // XXX: NOT TESTED
	- // XXX: replace push to mov
	- // at lease 32 byte, aligned to 16 byte
	- INTPTR_T stackroom = 32;
	- if (narg > 4)
	- stackroom += 16 * (narg % 2);
	- __asm__ ("subq %0, %%rsp"::"r"(stackroom));
	- for (unsigned long n = narg; n > 4; n--)
	- __asm__ ("pushq %0"::"r"(args[n-1]));
	- if (narg > 3) __asm__ ("movq %0, %%r9"::"r"(args[3]));
	- if (narg > 2) __asm__ ("movq %0, %%r8"::"r"(args[2]));
	- if (narg > 1) __asm__ ("movq %0, %%rdx"::"r"(args[1]));
	- if (narg > 0) __asm__ ("movq %0, %%rcx"::"r"(args[0]));
	- __asm__ ("call %0":"=r"(r_):"r"(p_));
	- if (narg > 4)
	- __asm__ ("addq %0, %%rsp"::"r"((narg - 4) * sizeof(void*)));
	- __asm__ ("addq %0, %%rsp"::"r"(stackroom));
	+#if defined(_WIN64) && defined(_MSC_VER)
	+ // NOTE: Vim's Number is 32bit. We can not handle 64bit pointer as Number.
	+ // FIXME: double is not supported
	+ extern intptr_t msc_x64_call(FUNCTION p, long narg, INTPTR_T* args);
	+ r_ = msc_x64_call(p_, narg, args);
	+#elif defined(_WIN32) && defined(_MSC_VER)
	+ // FIXME: double is not supported
	+ extern intptr_t msc_x86_call(FUNCTION p, long narg, INTPTR_T* args);
	+ r_ = msc_x86_call(p_, narg, args);
	+#elif defined(_WIN64) && defined(__GNUC__)
	+ // FIXME: double is not supported
	+ extern intptr_t mingw_x64_call(FUNCTION p, long narg, INTPTR_T* args);
	+ r_ = mingw_x64_call(p_, narg, args);
	#elif defined(_WIN32) && defined(__GNUC__)
	- for (unsigned long n = 0; n < narg; n++) {
	- INTPTR_T a_ = args[narg-n-1];
	- __asm__ (
	- "push %0"
	- ::"r"(a_)
	- );
	- }
	- __asm__ (
	- "call %0"
	- :"=r"(r_)
	- :"r"(p_)
	- );
	+ // FIXME: double is not supported
	+ extern intptr_t mingw_x86_call(FUNCTION p, long narg, INTPTR_T* args);
	+ r_ = mingw_x86_call(p_, narg, args);
	#elif defined(__linux__) && defined(__x86_64__) && defined(__GNUC__)
	for (unsigned long n = narg; n > 6; n--)
	__asm__ ("pushq %0"::"r"(args[n-1]));
	diff --git a/autoload/libcallex.dll b/autoload/libcallex.dll
	index cf4d871..ba4a068 100644
	Binary files a/autoload/libcallex.dll and b/autoload/libcallex.dll differ
	diff --git a/autoload/mingw_x64_call.s b/autoload/mingw_x64_call.s
	new file mode 100644
	index 0000000..b069e3b
	--- /dev/null
	+++ b/autoload/mingw_x64_call.s
	@@ -0,0 +1,69 @@
	+ .text
	+ .global mingw_x64_call
	+ .def mingw_x64_call
	+# intptr_t mingw_x64_call(FUNCTION p, long narg, INTPTR_T* args)
	+args$ = 32
	+narg$ = 24
	+p$ = 16
	+mingw_x64_call:
	+ .cfi_startproc
	+ pushq %rbp
	+ movq %rsp, %rbp
	+ movq %rcx, p$(%rbp)
	+ movq %rdx, narg$(%rbp)
	+ movq %r8, args$(%rbp)
	+ # stack size is at least 32 bytes, aligned to 16 bytes
	+ # cutting corners with (4 + narg * 2) * 8
	+ leaq 4(,%rdx,2), %rdx
	+ leaq (,%rdx,8), %rdx
	+ sub %rdx, %rsp
	+ # while narg >= 5:
	+ # narg--
	+ # rsp[narg] = args[narg]
	+ # if narg > 3:
	+ # r9 = args[3]
	+ # if narg > 2:
	+ # r8 = args[2]
	+ # if narg > 1:
	+ # rdx = args[1]
	+ # if narg > 0:
	+ # rcx = args[0]
	+ movq narg$(%rbp), %rcx
	+argN:
	+ cmpq $5, %rcx
	+ jl arg4
	+ dec %rcx
	+ movq args$(%rbp), %rax
	+ movq (%rax,%rcx,8), %rax
	+ movq %rax, (%rsp,%rcx,8)
	+ jmp argN
	+arg4:
	+ cmpq $4, %rcx
	+ jl arg3
	+ movq args$(%rbp), %rax
	+ movq 24(%rax), %rax
	+ movq %rax, %r9
	+arg3:
	+ cmpq $3, %rcx
	+ jl arg2
	+ movq args$(%rbp), %rax
	+ movq 16(%rax), %rax
	+ movq %rax, %r8
	+arg2:
	+ cmpq $2, %rcx
	+ jl arg1
	+ movq args$(%rbp), %rax
	+ movq 8(%rax), %rax
	+ movq %rax, %rdx
	+arg1:
	+ cmpq $1, %rcx
	+ jl docall
	+ movq args$(%rbp), %rax
	+ movq (%rax), %rax
	+ movq %rax, %rcx
	+docall:
	+ call *p$(%rbp)
	+ movq %rbp, %rsp
	+ popq %rbp
	+ ret
	+ .cfi_endproc
	diff --git a/autoload/mingw_x86_call.s b/autoload/mingw_x86_call.s
	new file mode 100644
	index 0000000..d19d84c
	--- /dev/null
	+++ b/autoload/mingw_x86_call.s
	@@ -0,0 +1,27 @@
	+ .text
	+ .global _mingw_x86_call
	+ .def _mingw_x86_call
	+# intptr_t mingw_x86_call(FUNCTION p, long narg, INTPTR_T* args)
	+args$ = 16
	+narg$ = 12
	+p$ = 8
	+_mingw_x86_call:
	+ .cfi_startproc
	+ pushl %ebp
	+ movl %esp, %ebp
	+ # while narg > 0:
	+ # push args[--narg]
	+ movl narg$(%ebp), %ecx
	+argN:
	+ cmpl $0, %ecx
	+ jle docall
	+ dec %ecx
	+ movl args$(%ebp), %eax
	+ pushl (%eax,%ecx,4)
	+ jmp argN
	+docall:
	+ call *p$(%ebp)
	+ movl %ebp, %esp
	+ popl %ebp
	+ ret
	+ .cfi_endproc
	diff --git a/autoload/msc_x64_call.asm b/autoload/msc_x64_call.asm
	new file mode 100644
	index 0000000..d5744b3
	--- /dev/null
	+++ b/autoload/msc_x64_call.asm
	@@ -0,0 +1,71 @@
	+PUBLIC msc_x64_call
	+
	+_TEXT SEGMENT
	+; intptr_t msc_x64_call(FUNCTION p, long narg, INTPTR_T* args)
	+args$ = 32
	+narg$ = 24
	+p$ = 16
	+msc_x64_call proc
	+ push rbp
	+ mov rbp, rsp
	+ mov p$[rbp], rcx
	+ mov narg$[rbp], rdx
	+ mov args$[rbp], r8
	+ ; stack size is at least 32 bytes, aligned to 16 bytes
	+ ; cutting corners with (4 + narg * 2) * 8
	+ lea rdx, [rdx*2+4]
	+ lea rdx, [rdx*8]
	+ sub rsp, rdx
	+ ; while narg >= 5:
	+ ; narg--
	+ ; rsp[narg] = args[narg]
	+ ; if narg > 3:
	+ ; r9 = args[3]
	+ ; if narg > 2:
	+ ; r8 = args[2]
	+ ; if narg > 1:
	+ ; rdx = args[1]
	+ ; if narg > 0:
	+ ; rcx = args[0]
	+ mov rcx, narg$[rbp]
	+argN:
	+ cmp rcx, 5
	+ jl arg4
	+ dec rcx
	+ mov rax, args$[rbp]
	+ mov rax, [rax+rcx*8]
	+ mov [rsp+rcx*8], rax
	+ jmp argN
	+arg4:
	+ cmp rcx, 4
	+ jl arg3
	+ mov rax, args$[rbp]
	+ mov rax, [rax+24]
	+ mov r9, rax
	+arg3:
	+ cmp rcx, 3
	+ jl arg2
	+ mov rax, args$[rbp]
	+ mov rax, [rax+16]
	+ mov r8, rax
	+arg2:
	+ cmp rcx, 2
	+ jl arg1
	+ mov rax, args$[rbp]
	+ mov rax, [rax+8]
	+ mov rdx, rax
	+arg1:
	+ cmp rcx, 1
	+ jl docall
	+ mov rax, args$[rbp]
	+ mov rax, [rax]
	+ mov rcx, rax
	+docall:
	+ call qword ptr p$[rbp]
	+ mov rsp, rbp
	+ pop rbp
	+ ret
	+msc_x64_call endp
	+_TEXT ENDS
	+
	+end
	diff --git a/autoload/msc_x86_call.asm b/autoload/msc_x86_call.asm
	new file mode 100644
	index 0000000..6c88c81
	--- /dev/null
	+++ b/autoload/msc_x86_call.asm
	@@ -0,0 +1,33 @@
	+.686P
	+.model flat
	+
	+; why need underscore?
	+PUBLIC _msc_x86_call
	+
	+_TEXT SEGMENT
	+; intptr_t msc_x86_call(FUNCTION p, long narg, INTPTR_T* args)
	+args$ = 16
	+narg$ = 12
	+p$ = 8
	+_msc_x86_call proc
	+ push ebp
	+ mov ebp, esp
	+ ; while narg > 0:
	+ ; push args[--narg]
	+ mov ecx, narg$[ebp]
	+argN:
	+ cmp ecx, 0
	+ jle docall
	+ dec ecx
	+ mov eax, args$[ebp]
	+ push [eax+ecx*4]
	+ jmp argN
	+docall:
	+ call dword ptr p$[ebp]
	+ mov esp, ebp
	+ pop ebp
	+ ret
	+_msc_x86_call endp
	+_TEXT ENDS
	+
	+end