Last active March 25, 2024 06:16
A multi-threaded web server in Assembly x86


A minimalist yet multi-threaded HTTP server coded in pure Assembly x86, designed to output a "Hello world" message in HTML format.

The server employs a pool of threads that consume connections from a shared queue. The queue is protected by a spinlock-based mutex, and idle threads sleep on a futex that acts as a condition variable.


  • Linux/AMD64 (tested in Docker and Ubuntu)
  • NASM 2.15.05
  • GNU ld 2.38


$ nasm -f elf -o server.o server.asm
$ ld -m elf_i386 -o server server.o
$ ./server

Listening to the port 3000


$ nasm -f elf64 -o server.o server.asm
$ ld -o server server.o
$ ./server

Listening to the port 3000


  • GDB 12.1
  • strace 5.16

To debug, assemble with debug symbols by adding the -g option to the nasm command, link as usual, then:

$ gdb server


For using strace:

$ strace ./server

execve("./server", ["./server"], 0x7ffdc96aed48 /* 34 vars */) = 0
[ Process PID=1996977 runs in 32 bit mode. ]
clone(child_stack=0xf7fb9ff8, flags=CLONE_VM|CLONE_FS|CLONE_FILES|CLONE_SIGHAND|CLONE_PARENT|CLONE_THREAD|CLONE_IOstrace: Process 1996978 attached
) = 1996978
[pid 1996978] futex(0x804a034, FUTEX_WAIT_PRIVATE, 0, NULL <unfinished ...>
[pid 1996977] mmap2(NULL, 4194304, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS|MAP_GROWSDOWN, 1, 0) = 0xf76ba000
[pid 1996977] clone(child_stack=0xf7ab9ff8, flags=CLONE_VM|CLONE_FS|CLONE_FILES|CLONE_SIGHAND|CLONE_PARENT|CLONE_THREAD|CLONE_IOstrace: Process 1996979 attached
) = 1996979
[pid 1996977] mmap2(NULL, 4194304, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS|MAP_GROWSDOWN, 2, 0 <unfinished ...>
[pid 1996979] futex(0x804a034, FUTEX_WAIT_PRIVATE, 0, NULL <unfinished ...>
[pid 1996977] <... mmap2 resumed>)      = 0xf71ba000
[pid 1996977] clone(child_stack=0xf75b9ff8, flags=CLONE_VM|CLONE_FS|CLONE_FILES|CLONE_SIGHAND|CLONE_PARENT|CLONE_THREAD|CLONE_IOstrace: Process 1996980 attached
) = 1996980
[pid 1996980] futex(0x804a034, FUTEX_WAIT_PRIVATE, 0, NULL <unfinished ...>
[pid 1996977] mmap2(NULL, 4194304, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS|MAP_GROWSDOWN, 3, 0) = 0xf6cba000
[pid 1996977] clone(child_stack=0xf70b9ff8, flags=CLONE_VM|CLONE_FS|CLONE_FILES|CLONE_SIGHAND|CLONE_PARENT|CLONE_THREAD|CLONE_IOstrace: Process 1996981 attached
) = 1996981
[pid 1996981] futex(0x804a034, FUTEX_WAIT_PRIVATE, 0, NULL <unfinished ...>
[pid 1996977] mmap2(NULL, 4194304, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS|MAP_GROWSDOWN, 4, 0) = 0xf67ba000
[pid 1996977] clone(child_stack=0xf6bb9ff8, flags=CLONE_VM|CLONE_FS|CLONE_FILES|CLONE_SIGHAND|CLONE_PARENT|CLONE_THREAD|CLONE_IOstrace: Process 1996982 attached
) = 1996982
[pid 1996982] futex(0x804a034, FUTEX_WAIT_PRIVATE, 0, NULL <unfinished ...>
[pid 1996977] socket(AF_INET, SOCK_STREAM, IPPROTO_IP) = 3
[pid 1996977] bind(3, {sa_family=AF_INET, sin_port=htons(3000), sin_addr=inet_addr("0.0.0.0")}, 16) = 0
[pid 1996977] listen(3, 2)              = 0
[pid 1996977] write(1, "Listening to the port 3000\n\0", 28Listening to the port 3000
) = 28
[pid 1996977] accept4(3, NULL, NULL, 0

GDB tricks

p (<cast>) <variable>
x/d $register
i r
i r <register>


All my experiments on ASM x86 are located in this repository.

# Build/debug image for the assembly server (amd64 userland).
FROM --platform=linux/amd64 ubuntu
# Refresh the package index and install in ONE layer so a cached
# `apt-get update` layer can never be paired with a newer install list.
RUN apt-get update \
 && apt-get install -y make binutils build-essential nasm gdb strace \
 && rm -rf /var/lib/apt/lists/*
; This HTTP server (32-bit x86 build, `int 0x80` syscalls) uses a pool of worker threads.
; When a new connection is accepted, its file descriptor (clientfd) is enqueued.
; The queue is an array with front/rear indices, guarded by a spinlock mutex and a futex-based condition variable.
; Each worker thread sleeps on the futex until a new connection is enqueued.
global _start

; ----------------------------------------------------------------------
; Build-time constants for the 32-bit (i386, `int 0x80`) build.
; ----------------------------------------------------------------------

; Syscall numbers (i386 table)
%define SYS_futex       240
%define SYS_mmap2       192     ; used to allocate thread stacks
%define SYS_clone       120     ; used to create threads
%define SYS_socket      359
%define SYS_bind        361
%define SYS_listen      363
%define SYS_accept4     364
%define SYS_write       4
%define SYS_close       6
%define SYS_exit        1
%define SYS_exit_group  252     ; terminates every thread of the process

; Misc constants
%define STDOUT          1
%define QUEUE_SIZE      10      ; capacity of the connection queue, in entries

; Socket constants
%define AF_INET         0x2
%define SOCK_STREAM     0x1     ; was used by _start but never defined
%define SOCK_PROTOCOL   0x0
%define SIN_ZERO        0x0
%define IP_ADDRESS      0x0     ; 0.0.0.0
%define PORT            0xB80B  ; 3000, byte-swapped so memory holds network order
%define BACKLOG         0x2

; Threading constants
%define STACK_SIZE      (4096 * 1024)   ; 4MB per thread stack
%define PROT_READ       0x1
%define PROT_WRITE      0x2
%define MAP_GROWSDOWN   0x100
%define MAP_ANONYMOUS   0x0020  ; no file descriptor involved
%define MAP_PRIVATE     0x0002  ; do not share across processes
%define CLONE_VM        0x00000100
%define CLONE_FS        0x00000200
%define CLONE_FILES     0x00000400
%define CLONE_SIGHAND   0x00000800
%define CLONE_PARENT    0x00008000
%define CLONE_THREAD    0x00010000
%define CLONE_IO        0x80000000
; Kept on a single line: a trailing `\` here would splice the following
; comment line into the macro and leave THREAD_FLAGS empty.
%define THREAD_FLAGS    (CLONE_VM | CLONE_FS | CLONE_FILES | CLONE_SIGHAND | CLONE_PARENT | CLONE_THREAD | CLONE_IO)

; Futex constants
%define FUTEX_WAIT          0
%define FUTEX_WAKE          1
%define FUTEX_PRIVATE_FLAG  128 ; was used by the futex helpers but never defined
; ----------------------------------------------------------------------
; Shared state. All threads see the same copies of these variables.
; ----------------------------------------------------------------------
section .data

queue:      times QUEUE_SIZE dd 0   ; connection slots, all zero at start
front:      dd 0                    ; index of the oldest queued connection
rear:       dd 0                    ; number of queued connections / next free index
mutex:      dd 1                    ; spinlock word: 1 = free, 0 = held
condvar:    dd 0                    ; futex word the idle workers sleep on

section .bss

sockfd:     resd 1                  ; listening socket file descriptor
section .text

%define POOL_SIZE 5                     ; number of worker threads to spawn

listenMsg:      db "Listening to the port 3000", 0xA
listenMsgLen:   equ $ - listenMsg       ; length of the printable text only
                db 0                    ; terminator kept in memory, never written out

; ==========================
; ======== _start ==========
; ==========================
; Process entry point (i386, `int 0x80` syscalls).
;   1. spawns POOL_SIZE worker threads running _thandle
;   2. creates, binds and listens on a TCP socket (IP_ADDRESS:PORT)
;   3. accepts connections forever and hands each client fd to _enqueue
; Never returns. Any failing setup syscall jumps to _error.
_start:
        xor     edi, edi                ; edi = workers created so far
.pool:
        mov     ebx, _thandle           ; ebx = thread entry point for _pthread
        call    _pthread
        inc     edi
        cmp     edi, POOL_SIZE
        jne     .pool

.socket:
        ; socket(int family, int type, int proto)
        mov     ebx, AF_INET
        mov     ecx, SOCK_STREAM
        mov     edx, SOCK_PROTOCOL      ; protocol argument was previously left unset
        mov     eax, SYS_socket
        int     0x80
        test    eax, eax
        js      _error
        mov     [sockfd], eax

        ; Build a full 16-byte sockaddr_in on the stack (pushed last-field-first):
        ;   +0 family(2)  +2 port(2)  +4 addr(4)  +8 sin_zero(8)
        push    dword SIN_ZERO          ; sin_zero, upper half
        push    dword SIN_ZERO          ; sin_zero, lower half
        push    dword IP_ADDRESS
        push    word PORT
        push    word AF_INET

        ; bind(int fd, struct sockaddr *addr, int addrlen)
        mov     ebx, [sockfd]
        mov     ecx, esp                ; ecx -> sockaddr_in built above
        mov     edx, 16
        mov     eax, SYS_bind
        int     0x80
        add     esp, 16                 ; release exactly the 16 bytes pushed
        test    eax, eax
        js      _error

        ; listen(int fd, int backlog)
        mov     ebx, [sockfd]
        mov     ecx, BACKLOG
        mov     eax, SYS_listen
        int     0x80
        test    eax, eax
        js      _error

        ; announce readiness on STDOUT
        mov     esi, listenMsg
        mov     edi, listenMsgLen
        call    _print

.accept:
        ; accept4(int fd, struct sockaddr *addr, int *addrlen, int flags)
        ; blocks until a client connects; peer address is not requested
        mov     ebx, [sockfd]
        xor     ecx, ecx
        xor     edx, edx
        xor     esi, esi
        mov     eax, SYS_accept4
        int     0x80
        test    eax, eax
        js      .accept                 ; failed accept: nothing to enqueue, try again
        mov     edi, eax                ; edi = client fd, the argument of _enqueue
        call    _enqueue
        jmp     .accept
; ============================
; ======== _thandle ==========
; ============================
; Worker thread body. Loops forever:
;   - while the queue is empty, sleeps in _wait_condvar
;   - otherwise dequeues one client fd and serves it with _handle
; Takes no arguments and never returns.
_thandle:
        cmp     dword [rear], 0         ; unlocked peek at the queue size
        je      .wait
        call    _dequeue                ; edi = client fd, or 0 if the queue was empty
        test    edi, edi
        jz      _thandle                ; another worker took the last entry first:
                                        ; re-check instead of serving fd 0
.handle_task:
        push    edi                     ; stack argument for _handle
        call    _handle
        add     esp, 4                  ; drop the argument
        jmp     _thandle
.wait:
        call    _wait_condvar           ; sleep on the futex until woken
        jmp     _thandle
response:       db `HTTP/1.1 200 OK\r\nContent-Length: 22\r\n\r\n<h1>Hello, World!</h1>`
responseLen:    equ $ - response        ; headers + 22-byte body, without the NUL below
                db 0                    ; terminator kept in memory, never sent

; ===========================
; ======== _handle ==========
; ===========================
; Sends the fixed HTTP response to a client and closes the connection.
; In:    [esp + 4] = client socket fd (pushed by the caller, who also pops it)
; Out:   nothing meaningful (eax = result of close)
; Clobb: eax, ebx, ecx, edx
_handle:
        mov     ebx, [esp + 4]          ; ebx = client fd

        ; write(fd, response, responseLen)
        mov     ecx, response
        mov     edx, responseLen
        mov     eax, SYS_write
        int     0x80

        ; close(fd) -- ebx still holds the fd, int 0x80 only changes eax
        mov     eax, SYS_close
        int     0x80
        ret
; ==========================
; ======== _print ==========
; ==========================
; Writes a buffer to STDOUT.
; In:    esi = address of the text, edi = number of bytes to write
; Clobb: eax, ebx, ecx, edx
_print:
        mov     ebx, STDOUT
        mov     ecx, esi
        mov     edx, edi
        mov     eax, SYS_write
        int     0x80
        ret
error:          db "An error occurred"
errorLen:       equ $ - error           ; printable text only, without the NUL below
                db 0                    ; terminator kept in memory, never written out

; ==========================
; ======== _error ==========
; ==========================
; Fatal-error exit: prints the message above on STDOUT, then ends the whole
; process (all threads) with exit status 1. Reached via `jmp`/`js`; never returns.
_error:
        mov     ebx, STDOUT
        mov     ecx, error
        mov     edx, errorLen
        mov     eax, SYS_write
        int     0x80

        ; exit_group(1): terminates all threads
        mov     ebx, 1
        mov     eax, SYS_exit_group
        int     0x80
; ============================
; ======== _pthread ==========
; ============================
; Creates a thread with its own freshly mapped 4MB stack.
; In:    ebx = address of the function the new thread starts in (_thandle)
; Out:   eax = thread id in the caller; the new thread never "returns" here
; Clobb: eax, ebx, ecx, edx, esi (edi is left untouched for the caller's counter)
;
; How the new thread starts: the entry address is stored at the top of the new
; stack, so the `ret` below, executed by the child on that stack, pops it and
; jumps there. In the parent the same `ret` returns to the caller.
; NOTE(review): mmap2's fd/offset arguments (edi/ebp) are whatever the caller
; left there; Linux ignores fd for MAP_ANONYMOUS -- confirm if ported elsewhere.
_pthread:
        push    ebx                     ; keep the entry point while ebx carries syscall args

        ; mmap2(addr, len, prot, flags, fd, pgoffset)
        xor     ebx, ebx                ; addr = NULL, kernel picks the address
        mov     ecx, STACK_SIZE
        mov     edx, PROT_READ | PROT_WRITE
        mov     esi, MAP_PRIVATE | MAP_ANONYMOUS | MAP_GROWSDOWN
        mov     eax, SYS_mmap2
        int     0x80
        cmp     eax, -4095              ; errors are -errno in [-4095, -1]; valid
        jae     _error                  ; addresses may look negative, so compare unsigned

        ; clone(flags, child_stack)
        lea     ecx, [eax + STACK_SIZE - 8] ; child stack pointer, near the top of the mapping
        pop     dword [ecx]             ; entry point becomes the child's "return address"
        mov     ebx, CLONE_VM | CLONE_FS | CLONE_FILES | CLONE_SIGHAND | CLONE_PARENT | CLONE_THREAD | CLONE_IO
        mov     eax, SYS_clone
        int     0x80
        test    eax, eax
        js      _error                  ; parent only: clone failed
        ret                             ; parent: back to caller / child: jump to entry point
; ============================
; ======== _enqueue ==========
; ============================
; Appends a client connection to the queue and wakes a sleeping worker.
; In:    edi = client socket fd
; Clobb: eax, ebx, plus whatever _emit_signal clobbers (edi included)
; If the queue already holds QUEUE_SIZE entries the connection is closed
; instead of being stored; writing past the array would overwrite
; front/rear/mutex, which follow it in .data.
_enqueue:
        call    _lock_mutex
        mov     ebx, [rear]             ; ebx = next free slot
        cmp     ebx, QUEUE_SIZE
        jae     .full
        mov     dword [queue + ebx * 4], edi
        inc     dword [rear]
        call    _emit_signal            ; wake a worker sleeping on the futex
        call    _unlock_mutex
        ret
.full:
        call    _unlock_mutex
        mov     ebx, edi                ; close(clientfd): reject the connection
        mov     eax, SYS_close
        int     0x80
        ret
; ============================
; ======== _dequeue ==========
; ============================
; Removes the oldest connection from the queue.
; Out:   edi = client socket fd, or 0 when the queue is empty
; Clobb: eax (via _lock_mutex), ebx, edx
; The queue is a plain array: the element at [front] is taken and every later
; element is moved one slot down, so `front` itself never changes.
; The shift is bounded by `rear` rather than by looking for a zero entry, so it
; never reads past the stored elements.
_dequeue:
        call    _lock_mutex
        xor     edi, edi                ; default result: 0 = nothing dequeued
        mov     ebx, [front]
        cmp     ebx, [rear]
        je      .empty
        mov     edi, dword [queue + ebx * 4]    ; edi = oldest connection
.shift:
        inc     ebx                     ; ebx = index of the next element to move down
        cmp     ebx, [rear]
        jae     .return                 ; no stored elements left to move
        mov     edx, dword [queue + ebx * 4]
        mov     dword [queue + ebx * 4 - 4], edx
        jmp     .shift
.return:
        mov     dword [queue + ebx * 4 - 4], 0  ; clear the slot vacated by the shift
        dec     dword [rear]
        call    _unlock_mutex
        ret
.empty:
        call    _unlock_mutex           ; edi is already 0
        ret
; ===============================
; ======== _lock_mutex ==========
; ===============================
; Acquires the spinlock stored in `mutex` (1 = free, 0 = held).
; Spins until the lock is obtained.
; Clobb: eax, flags
_lock_mutex:
        xor     eax, eax
        xchg    eax, [mutex]            ; atomically store 0 and fetch the old value
        test    eax, eax
        jnz     .done                   ; old value was non-zero: lock was free, now ours
        pause                           ; spin-wait hint, then retry
        jmp     _lock_mutex
.done:
        ret
; =================================
; ======== _unlock_mutex ==========
; =================================
; Releases the spinlock by storing 1 (the "free" value) into `mutex`.
; Clobbers no registers.
_unlock_mutex:
        mov     dword [mutex], 1
        ret
; =================================
; ======== _wait_condvar ==========
; =================================
; Puts the calling thread to sleep on the futex word `condvar` until another
; thread issues a wake on it (see _emit_signal).
; Returns immediately if `condvar` already holds 1.
; Any non-zero futex result is treated as fatal and jumps to _error.
; Clobb: eax, ebx, ecx, edx, esi, edi
_wait_condvar:
        mov     ebx, condvar            ; 1st arg: address of the futex word
        cmp     dword [ebx], 1
        je      .done                   ; already signalled, no need to sleep
        mov     ecx, FUTEX_WAIT | FUTEX_PRIVATE_FLAG ; 2nd arg: operation
        mov     edx, 0                  ; 3rd arg: sleep only while the word is still 0
        xor     esi, esi                ; 4th arg: no timeout
        xor     edi, edi                ; 5th arg: unused
        mov     eax, SYS_futex
        int     0x80
        test    eax, eax
        jnz     _error
.done:
        ret
; ================================
; ======== _emit_signal ==========
; ================================
; Wakes one thread sleeping on the futex word `condvar` (see _wait_condvar).
; futex args: uaddr | futex_op | val | timeout (unused) | uaddr2 (unused)
; Clobb: eax, ebx, ecx, edx, esi, edi
_emit_signal:
        mov     ebx, condvar
        mov     ecx, FUTEX_WAKE | FUTEX_PRIVATE_FLAG
        mov     edx, 1                  ; val = number of waiters to wake; the old
                                        ; value 0 asked for "none" per the futex docs
        xor     esi, esi
        xor     edi, edi
        mov     eax, SYS_futex
        int     0x80
        ret
; This HTTP server (x86-64 build) uses a pool of worker threads.
; When a new connection is accepted, its file descriptor (clientfd) is enqueued.
; The queue is an array with front/rear indices, guarded by a spinlock mutex and a futex-based condition variable.
; Each worker thread sleeps on the futex until a new connection is enqueued.
global _start

; ----------------------------------------------------------------------
; Build-time constants for the 64-bit (x86-64, `syscall`) build.
; ----------------------------------------------------------------------

; Syscall numbers (x86-64 table)
%define SYS_futex       202
%define SYS_mmap        9       ; used to allocate thread stacks
%define SYS_clone       56      ; used to create threads
%define SYS_socket      41
%define SYS_bind        49
%define SYS_listen      50
%define SYS_accept4     288
%define SYS_write       1
%define SYS_close       3
%define SYS_exit        60
%define SYS_exit_group  231     ; terminates every thread of the process

; Misc constants
%define STDOUT          1
%define QUEUE_SIZE      10      ; capacity of the connection queue, in entries

; Socket constants
%define AF_INET         0x2
%define SOCK_STREAM     0x1     ; was used by _start but never defined
%define SOCK_PROTOCOL   0x0
%define SIN_ZERO        0x0
%define IP_ADDRESS      0x0     ; 0.0.0.0
%define PORT            0xB80B  ; 3000, byte-swapped so memory holds network order
%define BACKLOG         0x2

; Threading constants
%define STACK_SIZE      (4096 * 1024)   ; 4MB per thread stack
%define PROT_READ       0x1
%define PROT_WRITE      0x2
%define MAP_GROWSDOWN   0x100
%define MAP_ANONYMOUS   0x0020  ; no file descriptor involved
%define MAP_PRIVATE     0x0002  ; do not share across processes
%define CLONE_VM        0x00000100
%define CLONE_FS        0x00000200
%define CLONE_FILES     0x00000400
%define CLONE_SIGHAND   0x00000800
%define CLONE_PARENT    0x00008000
%define CLONE_THREAD    0x00010000
%define CLONE_IO        0x80000000
; Kept on a single line: a trailing `\` here would splice the following
; comment line into the macro and leave THREAD_FLAGS empty.
%define THREAD_FLAGS    (CLONE_VM | CLONE_FS | CLONE_FILES | CLONE_SIGHAND | CLONE_PARENT | CLONE_THREAD | CLONE_IO)

; Futex constants
%define FUTEX_WAIT          0
%define FUTEX_WAKE          1
%define FUTEX_PRIVATE_FLAG  128 ; was used by the futex helpers but never defined
; ----------------------------------------------------------------------
; Shared state. All threads see the same copies of these variables.
; ----------------------------------------------------------------------
section .data

queue:      times QUEUE_SIZE dq 0   ; connection slots, all zero at start
front:      dq 0                    ; index of the oldest queued connection
rear:       dq 0                    ; number of queued connections / next free index
mutex:      dq 1                    ; spinlock word: 1 = free, 0 = held
condvar:    dq 0                    ; futex word the idle workers sleep on

section .bss

sockfd:     resq 1                  ; listening socket file descriptor
section .text

%define POOL_SIZE 5                     ; number of worker threads to spawn

listenMsg:      db "Listening to the port 3000", 0xA
listenMsgLen:   equ $ - listenMsg       ; length of the printable text only
                db 0                    ; terminator kept in memory, never written out

; ==========================
; ======== _start ==========
; ==========================
; Process entry point (x86-64, `syscall` ABI: nr in rax, args in
; rdi, rsi, rdx, r10, r8, r9; the kernel clobbers rcx and r11).
;   1. spawns POOL_SIZE worker threads running _thandle
;   2. creates, binds and listens on a TCP socket (IP_ADDRESS:PORT)
;   3. accepts connections forever and hands each client fd to _enqueue
; Never returns. Any failing setup syscall jumps to _error.
_start:
        xor     r8d, r8d                ; r8 = workers created so far
.pool:
        mov     rdi, _thandle           ; rdi = thread entry point for _pthread
        call    _pthread
        inc     r8
        cmp     r8, POOL_SIZE
        jne     .pool

.socket:
        ; socket(int family, int type, int proto)
        mov     rdi, AF_INET
        mov     rsi, SOCK_STREAM
        mov     rdx, SOCK_PROTOCOL      ; protocol argument was previously left unset
        mov     rax, SYS_socket
        syscall
        test    rax, rax
        js      _error
        mov     [sockfd], rax

        ; Build a 16-byte sockaddr_in on the stack with explicit stores
        ; (64-bit pushes are 8 bytes wide and would not produce this layout):
        ;   +0 family(2)  +2 port(2)  +4 addr(4)  +8 sin_zero(8)
        sub     rsp, 16
        mov     word  [rsp],     AF_INET
        mov     word  [rsp + 2], PORT
        mov     dword [rsp + 4], IP_ADDRESS
        mov     qword [rsp + 8], SIN_ZERO

        ; bind(int fd, struct sockaddr *addr, int addrlen)
        mov     rdi, [sockfd]
        mov     rsi, rsp                ; rsi -> sockaddr_in built above
        mov     rdx, 16
        mov     rax, SYS_bind
        syscall
        add     rsp, 16                 ; release exactly the 16 bytes reserved
        test    rax, rax
        js      _error

        ; listen(int fd, int backlog)
        mov     rdi, [sockfd]
        mov     rsi, BACKLOG
        mov     rax, SYS_listen
        syscall
        test    rax, rax
        js      _error

        ; announce readiness on STDOUT
        mov     r10, listenMsg
        mov     r8, listenMsgLen
        call    _print

.accept:
        ; accept4(int fd, struct sockaddr *addr, int *addrlen, int flags)
        ; blocks until a client connects; peer address is not requested
        mov     rdi, [sockfd]
        xor     esi, esi
        xor     edx, edx
        xor     r10d, r10d
        mov     rax, SYS_accept4
        syscall
        test    rax, rax
        js      .accept                 ; failed accept: nothing to enqueue, try again
        mov     r8, rax                 ; r8 = client fd, the argument of _enqueue
        call    _enqueue
        jmp     .accept
; ============================
; ======== _thandle ==========
; ============================
; Worker thread body. Loops forever:
;   - while the queue is empty, sleeps in _wait_condvar
;   - otherwise dequeues one client fd and serves it with _handle
; Takes no arguments and never returns.
_thandle:
        cmp     dword [rear], 0         ; unlocked peek at the queue size (low dword)
        je      .wait
        call    _dequeue                ; r8 = client fd, or 0 if the queue was empty
        test    r8, r8
        jz      _thandle                ; another worker took the last entry first:
                                        ; re-check instead of serving fd 0
.handle_task:
        push    r8                      ; stack argument for _handle
        call    _handle
        add     rsp, 8                  ; drop the argument
        jmp     _thandle
.wait:
        call    _wait_condvar           ; sleep on the futex until woken
        jmp     _thandle
response:       db `HTTP/1.1 200 OK\r\nContent-Length: 22\r\n\r\n<h1>Hello, World!</h1>`
responseLen:    equ $ - response        ; headers + 22-byte body, without the NUL below
                db 0                    ; terminator kept in memory, never sent

; ===========================
; ======== _handle ==========
; ===========================
; Sends the fixed HTTP response to a client and closes the connection.
; In:    [rsp + 8] = client socket fd (pushed by the caller, who also pops it)
; Out:   nothing meaningful (rax = result of close)
; Clobb: rax, rdi, rsi, rdx, plus rcx/r11 (clobbered by `syscall`)
_handle:
        mov     rdi, [rsp + 8]          ; rdi = client fd

        ; write(fd, response, responseLen)
        mov     rsi, response
        mov     rdx, responseLen
        mov     rax, SYS_write
        syscall

        ; close(fd) -- rdi still holds the fd after the write
        mov     rax, SYS_close
        syscall
        ret
; ==========================
; ======== _print ==========
; ==========================
; Writes a buffer to STDOUT.
; In:    r10 = address of the text, r8 = number of bytes to write
; Clobb: rax, rdi, rsi, rdx, plus rcx/r11 (clobbered by `syscall`)
_print:
        mov     rdi, STDOUT
        mov     rsi, r10
        mov     rdx, r8
        mov     rax, SYS_write
        syscall
        ret
error:          db "An error occurred"
errorLen:       equ $ - error           ; printable text only, without the NUL below
                db 0                    ; terminator kept in memory, never written out

; ==========================
; ======== _error ==========
; ==========================
; Fatal-error exit: prints the message above on STDOUT, then ends the whole
; process (all threads) with exit status 1. Reached via `jmp`/`js`; never returns.
_error:
        mov     rdi, STDOUT
        mov     rsi, error
        mov     rdx, errorLen
        mov     rax, SYS_write
        syscall

        ; exit_group(1): terminates all threads
        mov     rdi, 1
        mov     rax, SYS_exit_group
        syscall
; ============================
; ======== _pthread ==========
; ============================
; Creates a thread with its own freshly mapped 4MB stack.
; In:    rdi = address of the function the new thread starts in (_thandle)
; Out:   rax = thread id in the caller; the new thread never "returns" here
; Clobb: rax, rdi, rsi, rdx, r10, rcx, r11 (r8 is left untouched for the
;        caller's counter)
;
; How the new thread starts: the entry address is stored at the top of the new
; stack, so the `ret` below, executed by the child on that stack, pops it and
; jumps there. In the parent the same `ret` returns to the caller.
; NOTE(review): mmap's fd/offset arguments (r8/r9) are whatever the caller
; left there; Linux ignores fd for MAP_ANONYMOUS -- confirm if ported elsewhere.
_pthread:
        push    rdi                     ; keep the entry point while rdi carries syscall args

        ; mmap(addr, len, prot, flags, fd, offset)
        xor     edi, edi                ; addr = NULL, kernel picks the address
        mov     rsi, STACK_SIZE
        mov     rdx, PROT_READ | PROT_WRITE
        mov     r10, MAP_PRIVATE | MAP_ANONYMOUS | MAP_GROWSDOWN
        mov     rax, SYS_mmap
        syscall
        cmp     rax, -4095              ; errors are -errno in [-4095, -1]
        jae     _error                  ; unsigned compare catches exactly that range

        ; clone(flags, child_stack)
        lea     rsi, [rax + STACK_SIZE - 8] ; child stack pointer, near the top of the mapping
        pop     qword [rsi]             ; entry point becomes the child's "return address"
        mov     edi, CLONE_VM | CLONE_FS | CLONE_FILES | CLONE_SIGHAND | CLONE_PARENT | CLONE_THREAD | CLONE_IO
        mov     rax, SYS_clone
        syscall
        test    rax, rax
        js      _error                  ; parent only: clone failed
        ret                             ; parent: back to caller / child: jump to entry point
; ============================
; ======== _enqueue ==========
; ============================
; Appends a client connection to the queue and wakes a sleeping worker.
; In:    r8 = client socket fd
; Clobb: rax, rdi, plus whatever _emit_signal clobbers (r8 included)
; If the queue already holds QUEUE_SIZE entries the connection is closed
; instead of being stored; writing past the array would overwrite
; front/rear/mutex, which follow it in .data.
_enqueue:
        call    _lock_mutex
        mov     rdi, [rear]             ; rdi = next free slot
        cmp     rdi, QUEUE_SIZE
        jae     .full
        mov     qword [queue + rdi * 8], r8
        inc     qword [rear]
        call    _emit_signal            ; wake a worker sleeping on the futex
        call    _unlock_mutex
        ret
.full:
        call    _unlock_mutex
        mov     rdi, r8                 ; close(clientfd): reject the connection
        mov     rax, SYS_close
        syscall
        ret
; ============================
; ======== _dequeue ==========
; ============================
; Removes the oldest connection from the queue.
; Out:   r8 = client socket fd, or 0 when the queue is empty
; Clobb: rax (via _lock_mutex), rdi, rdx
; The queue is a plain array: the element at [front] is taken and every later
; element is moved one slot down, so `front` itself never changes.
; The shift is bounded by `rear` rather than by looking for a zero entry, so it
; never reads past the stored elements.
_dequeue:
        call    _lock_mutex
        xor     r8d, r8d                ; default result: 0 = nothing dequeued
        mov     rdi, [front]
        cmp     rdi, [rear]
        je      .empty
        mov     r8, qword [queue + rdi * 8]     ; r8 = oldest connection
.shift:
        inc     rdi                     ; rdi = index of the next element to move down
        cmp     rdi, [rear]
        jae     .return                 ; no stored elements left to move
        mov     rdx, qword [queue + rdi * 8]
        mov     qword [queue + rdi * 8 - 8], rdx
        jmp     .shift
.return:
        mov     qword [queue + rdi * 8 - 8], 0  ; clear the slot vacated by the shift
        dec     qword [rear]
        call    _unlock_mutex
        ret
.empty:
        call    _unlock_mutex           ; r8 is already 0
        ret
; ===============================
; ======== _lock_mutex ==========
; ===============================
; Acquires the spinlock stored in `mutex` (1 = free, 0 = held).
; Spins until the lock is obtained.
; Clobb: rax, flags
_lock_mutex:
        xor     eax, eax                ; rax = 0
        xchg    rax, [mutex]            ; atomically store 0 and fetch the old value
        test    rax, rax
        jnz     .done                   ; old value was non-zero: lock was free, now ours
        pause                           ; spin-wait hint, then retry
        jmp     _lock_mutex
.done:
        ret
; =================================
; ======== _unlock_mutex ==========
; =================================
; Releases the spinlock by storing 1 (the "free" value) into `mutex`.
; Clobbers no registers.
_unlock_mutex:
        mov     qword [mutex], 1
        ret
; =================================
; ======== _wait_condvar ==========
; =================================
; Puts the calling thread to sleep on the futex word `condvar` until another
; thread issues a wake on it (see _emit_signal).
; Any non-zero futex result is treated as fatal and jumps to _error.
; Clobb: rax, rdi, rsi, rdx, r10, r8, plus rcx/r11 (clobbered by `syscall`)
_wait_condvar:
        mov     rdi, condvar            ; 1st arg: address of the futex word
        mov     rsi, FUTEX_WAIT | FUTEX_PRIVATE_FLAG ; 2nd arg: operation
        mov     rdx, 0                  ; 3rd arg: sleep only while the word is still 0
        xor     r10, r10                ; 4th arg: no timeout
        xor     r8, r8                  ; 5th arg: unused
        mov     rax, SYS_futex
        syscall
        test    rax, rax
        jnz     _error
.done:
        ret
; ================================
; ======== _emit_signal ==========
; ================================
; Wakes one thread sleeping on the futex word `condvar` (see _wait_condvar).
; futex args: uaddr | futex_op | val | timeout (unused) | uaddr2 (unused)
; Clobb: rax, rdi, rsi, rdx, r10, r8, plus rcx/r11 (clobbered by `syscall`)
_emit_signal:
        mov     rdi, condvar
        mov     rsi, FUTEX_WAKE | FUTEX_PRIVATE_FLAG
        mov     rdx, 1                  ; val = number of waiters to wake; the old
                                        ; value 0 asked for "none" per the futex docs
        xor     r10, r10
        xor     r8, r8
        mov     rax, SYS_futex
        syscall
        ret
