Skip to content

Instantly share code, notes, and snippets.

@talawahtech
Last active August 21, 2024 12:35
Show Gist options
  • Save talawahtech/ce2fe1f6a3e3851d15e912e0a4e93734 to your computer and use it in GitHub Desktop.
Save talawahtech/ce2fe1f6a3e3851d15e912e0a4e93734 to your computer and use it in GitHub Desktop.
CloudFormation template for "Extreme HTTP Performance Tuning" post: https://talawah.io/blog/extreme-http-performance-tuning-one-point-two-million/
AWSTemplateFormatVersion: '2010-09-09'
Description: Extreme Performance Tuning Benchmark Environment
Parameters:
AmiId:
Type: AWS::SSM::Parameter::Value<AWS::EC2::Image::Id>
Default: '/aws/service/ami-amazon-linux-latest/amzn2-ami-hvm-x86_64-gp2'
InstanceKeyPair:
Type: AWS::EC2::KeyPair::KeyName
InstanceSecurityGroup:
Type: AWS::EC2::SecurityGroup::Id
InstanceSubnet:
Type: AWS::EC2::Subnet::Id
InstanceVolumeSize:
Type: Number
Default: 8
Resources:
Client:
Type: AWS::EC2::Instance
Properties:
InstanceType: 'c5n.4xlarge'
Tags:
- Key: 'Name'
Value: 'extreme-client'
- Key: 'Role' # Used by cloud-init script to conditionally apply changes to only the client or server
Value: 'client'
LaunchTemplate:
LaunchTemplateId: !Ref 'LaunchTemplate'
Version: !GetAtt 'LaunchTemplate.LatestVersionNumber'
Server:
Type: AWS::EC2::Instance
Properties:
InstanceType: 'c5n.xlarge'
Tags:
- Key: 'Name'
Value: 'extreme-server'
- Key: 'Role' # Used by cloud-init script to conditionally apply changes to only the client or server
Value: 'server'
LaunchTemplate:
LaunchTemplateId: !Ref 'LaunchTemplate'
Version: !GetAtt 'LaunchTemplate.LatestVersionNumber'
ClusterPlacementGroup:
Type: AWS::EC2::PlacementGroup
Properties:
Strategy: cluster
# Allows 'aws ec2 describe-tags' to be called from the cloud-init script so it can differentiate client from server
Ec2Role:
Type: AWS::IAM::Role
Properties:
Path: /
Policies:
- PolicyName: 'AllowInstanceLogs'
PolicyDocument:
Version: '2012-10-17'
Statement:
- Effect: Allow
Action: [ 'ec2:DescribeTags' ]
Resource: '*'
AssumeRolePolicyDocument:
Statement:
- Effect: Allow
Principal:
Service: ['ec2.amazonaws.com']
Action: ['sts:AssumeRole']
Ec2InstanceProfile:
Type: AWS::IAM::InstanceProfile
Properties:
Path: /
Roles: [!Ref 'Ec2Role']
LaunchTemplate:
Type: AWS::EC2::LaunchTemplate
Properties:
LaunchTemplateName: !Ref 'AWS::StackName'
LaunchTemplateData:
ImageId: !Ref 'AmiId'
KeyName: !Ref 'InstanceKeyPair'
IamInstanceProfile:
Arn: !GetAtt 'Ec2InstanceProfile.Arn'
Placement:
GroupName: !Ref 'ClusterPlacementGroup'
NetworkInterfaces:
- DeviceIndex: 0
Ipv6AddressCount: 0 # Ensure that we don't get assigned any IPv6 addresses, even if it is the default for the subnet
SubnetId: !Ref 'InstanceSubnet'
Groups:
- !Ref 'InstanceSecurityGroup'
BlockDeviceMappings:
- DeviceName: '/dev/xvda'
Ebs:
VolumeSize: !Ref 'InstanceVolumeSize'
VolumeType: 'gp3'
UserData:
Fn::Base64: !Sub |
Content-Type: multipart/mixed; boundary="==BOUNDARY=="
MIME-Version: 1.0
--==BOUNDARY==
Content-Type: text/cloud-config; charset="us-ascii"
Content-Disposition: attachment; filename="cloud-config.txt"
# Automatically reboot after cloud-init completes to apply kernel param changes
power_state:
mode: reboot
message: Rebooting to apply new kernel params
timeout: 10
condition: True
bootcmd:
# These commands run on every boot, not just the first boot
#### Disable iptables
- modprobe -rv ip_tables
##### ENA driver configuration. Disable generic receive offloading
- ethtool -K eth0 gro off
##### ENA driver configuration. Enable adaptive IRQ coalescing (server only)
- export INSTANCE_ID=$(curl -s http://169.254.169.254/latest/meta-data/instance-id)
- echo INSTANCE_ID = ${!INSTANCE_ID}
- export INSTANCE_ROLE=$(aws ec2 describe-tags --region ${AWS::Region} --filters "Name=resource-id,Values=${!INSTANCE_ID}" "Name=key,Values=Role" --output text | cut -f5)
- echo INSTANCE_ROLE = ${!INSTANCE_ROLE}
- if [ "${!INSTANCE_ROLE}" == "server" ]; then ethtool -C eth0 adaptive-rx on; fi
- if [ "${!INSTANCE_ROLE}" == "server" ]; then ethtool -C eth0 tx-usecs 256; fi
##### Disable irqbalance and fix IRQs to cpus. Assumes # of irqs/queues = # of cpus!!!
## Note ${!} is the CF escape sequence for the bash equivalent and ${!!} is needed to get a literal ${!}
## sleep to give irqbalance time to shutdown before manually setting the values
- systemctl stop irqbalance.service
- echo sleeping
- sleep 5
- export IRQS=($(grep eth0 /proc/interrupts | awk '{print $1}' | tr -d :))
- for i in ${!!IRQS[@]}; do echo $i > /proc/irq/${!IRQS[i]}/smp_affinity_list; done;
- echo irq affinity
- for i in ${!!IRQS[@]}; do cat /proc/irq/${!IRQS[i]}/smp_affinity_list; done;
##### Setup Transmit Packet Steering (XPS) to map queue x to cpu x for outgoing packets. Assumes # of queues = # of cpus!!!
## A hex bitmap is used in this case, not the cpu id so we raise 2 to the power of i and convert it to hex
## Note ${!} is the CF escape sequence for the bash equivalent and ${!!} is needed to get a literal ${!}
- export TXQUEUES=($(ls -1qdv /sys/class/net/eth0/queues/tx-*))
- for i in ${!!TXQUEUES[@]}; do printf '%x' $((2**i)) > ${!TXQUEUES[i]}/xps_cpus; done;
- echo 'xps_cpus'
- for i in ${!!TXQUEUES[@]}; do cat ${!TXQUEUES[i]}/xps_cpus; done;
## Stop dhclient and set address lifetime to "forever"
- dhclient -x -pf /var/run/dhclient-eth0.pid
- dhclient -x -pf /var/run/dhclient6-eth0.pid
- ip addr change $( ip -4 addr show dev eth0 | grep 'inet' | awk '{ print $2 " brd " $4 " scope global"}') dev eth0 valid_lft forever preferred_lft forever
packages:
- git
- gcc
- make
- htop
- iperf3
- dstat
- pcp-system-tools
- perf
- iproute-tc
--==BOUNDARY==
Content-Type: text/x-shellscript; charset="us-ascii"
Content-Disposition: attachment; filename="user-data-script.txt"
#!/bin/bash
# Configure sysctls
cat > /etc/sysctl.d/90-extreme.conf <<- EOF
vm.swappiness=0
vm.dirty_ratio=80
net.core.somaxconn=2048
net.ipv4.tcp_max_syn_backlog=10000
net.core.busy_poll=1
net.core.default_qdisc=noqueue
net.ipv4.tcp_congestion_control=reno
EOF
# Reload sysctl to pick up new configs
sysctl -p
# Disable ssm agent. It doesn't really affect throughput, but any network activity can affect p99 and stdev for latency
systemctl stop amazon-ssm-agent
systemctl disable amazon-ssm-agent
# Install docker and stress-ng from amazon-linux-extras
amazon-linux-extras enable -y docker testing
yum install -y docker stress-ng
# Add the ec2-user and to the docker group so you can execute Docker commands without using sudo
usermod -a -G docker ec2-user
# Configure and start docker with iptables support disabled
mkdir -p /etc/systemd/system/docker.service.d/
cat > /etc/systemd/system/docker.service.d/startup_options.conf <<- EOF
[Service]
ExecStart=
ExecStart=/usr/bin/dockerd -H fd:// --bridge=none --iptables=false --ip-forward=false --live-restore
EOF
systemctl daemon-reload
systemctl enable docker
systemctl start docker
# Build (t)wrk
# Note that the luajit-devel package comes from the amazon-linux-extras repo for BCC
amazon-linux-extras enable BCC
yum clean metadata
yum install -y openssl11-devel luajit-devel-2.1.0
cd /home/ec2-user/
git clone https://github.com/talawahtech/wrk --single-branch --branch twrk twrk
cd twrk
make WITH_LUAJIT=/usr WITH_OPENSSL=/usr CFLAGS="-I /usr/include/luajit-2.1"
mv twrk /usr/local/bin/
chown -R ec2-user:ec2-user /home/ec2-user/twrk/
# Build and run the libreactor (round 20) docker container on the server
cd /home/ec2-user/
git clone https://github.com/TechEmpower/FrameworkBenchmarks --branch R20 --single-branch
chown -R ec2-user:ec2-user /home/ec2-user/FrameworkBenchmarks/
cd FrameworkBenchmarks/frameworks/C/libreactor/
docker build . -f libreactor.dockerfile --network host -t libreactor
docker build . -f libreactor-server.dockerfile --network host -t libreactor-server
# Install Flamegraph tools
cd /home/ec2-user/
git clone https://github.com/brendangregg/FlameGraph
chown -R ec2-user:ec2-user /home/ec2-user/FlameGraph/
# Download custom palette.map
wget -q https://gist.githubusercontent.com/talawahtech/b043e2dbf12af746de06b9b86c1a8b80/raw/ -O palette.map
chown ec2-user:ec2-user /home/ec2-user/palette.map
# Download network monitor script
wget -q https://gist.githubusercontent.com/talawahtech/de78601f1201d9586ac19fff420024b8/raw/ -O netmonitor.sh
chmod a+x netmonitor.sh
mv netmonitor.sh /usr/local/bin/
#### Set kernel params to disable speculative execution mitigations. Requires a reboot to take effect, which is handled above
sed -i 's/^GRUB_CMDLINE_LINUX_DEFAULT="/&nospectre_v1 nospectre_v2 pti=off mds=off tsx_async_abort=off /' /etc/default/grub
grub2-mkconfig -o /boot/grub2/grub.cfg
#### Disable syscall auditing (but otherwise leave auditd functioning).
echo "-a never,task" > /etc/audit/rules.d/disable-syscall-auditing.rules
/sbin/augenrules --load
--==BOUNDARY==--
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment