SEJeff/etc_limits.d_solana-limits.conf.j2

## etc_limits.d_solana-limits.conf.j2
$ cat roles/solana/templates/solana-limits.conf.j2
# {{ ansible_managed }}
# solana-ledger-tool may have to be run by hand when the network is in bad
# enough shape to need a manual restart, and it requires 500k open files.
# The "-" type applies the value as both the soft and the hard limit.
solana           -      nofile  1000000

## etc_sysctl.d_solana-tuning.conf.j2
$ cat roles/solana/templates/solana-tuning.conf.j2
## {{ ansible_managed }}
# Kernel tuning applied through sysctl.d for the Solana validator.
# Taken from: https://github.com/mvines/sosh/tree/5a48d8481408b7f33355862d3f3e32d485853598#tuning

# Increase UDP buffer size: the default and maximum values for both the
# rmem and wmem settings are all raised to 134217728 (128 * 1024 * 1024).
net.core.rmem_default = 134217728
net.core.rmem_max = 134217728
net.core.wmem_default = 134217728
net.core.wmem_max = 134217728

# Increase memory mapped files limit
vm.max_map_count = 2000000

# Previously, the system_monitor_service in solana recommended some very
# bad limits that broke quic and caused the validator to emit vote-only
# blocks. A page from the Shinobi Systems folks warns against following
# that advice and setting net.core.optmem_max to zero:
#    https://stakeview.app/poor_validator.html
#
# This is discussed in this solana issue:
#    https://github.com/solana-labs/solana/issues/33789
#
# And was fixed in this PR:
#    https://github.com/solana-labs/solana/pull/34149
net.core.optmem_max = 20480
net.core.netdev_max_backlog = 8192

## solana-group-vars.yml
# heavily cut down for brevity

# Value passed to --limit-ledger-size by the solana.service.j2 template.
# Default is 200m (200 million) per:
#    https://github.com/solana-labs/solana/blob/19454bf56efb41cebd354a942d3b63235541292a/ledger/src/blockstore_cleanup_service.rs#L34
#
# This cuts the disk usage of the ledger in half and reduces io a bit.
solana_limit_ledger_size: 100_000_000

# Each entry of this mapping is rendered by the solana.service.j2 template
# as one Environment="NAME=value" line.
solana_service_environment:
  # Look at the service logs and just silence one crate at a time using this syntax:
  #    https://docs.rs/env_logger/latest/env_logger/#enabling-logging
  #
  # Set the default log level to "info" but the solana_metrics crate to "warn" since
  # the metric logs are both excessively verbose and generally useless for
  # troubleshooting issues.
  RUST_LOG: info,solana_metrics=warn
  # Get the actual backtrace if the validator panics and dies
  RUST_BACKTRACE: 1

## solana.service.j2
$ cat roles/solana/templates/solana.service.j2
# {{ ansible_managed }}
{#- some fun hidden cli args in: https://github.com/solana-labs/solana/blob/master/validator/src/cli.rs #}

[Unit]
Description=Solana validator
After=network.target

[Service]
# Run the validator as the solana user and group.
User=solana
Group=solana
# The command line below is assembled from template variables and is a single
# logical line joined by trailing backslashes, ending at "--log -".
# A voting node is given its vote account; any other node runs with
# --no-voting and the full RPC API. Known validators, entrypoints and the
# free-form solana_extra_args each contribute one argument line per list item.
# --limit-ledger-size is always passed; its value is appended only when
# solana_limit_ledger_size is set.
ExecStart=/usr/local/bin/solana-validator \
{% if solana_voting %}
    --identity {{ solana_identity_keypair_path }} \
    --vote-account {{ solana_vote_account_keypair_path }} \
{% else %}
    --no-voting \
    --identity {{ solana_identity_keypair_path }} \
{# Enable RPC requests #}
    --full-rpc-api \
{% endif %}
{% for validator in solana_known_validators %}
    --known-validator {{ validator }} \
{% endfor %}
{% for entrypoint in solana_bootstrap_nodes %}
    --entrypoint {{ entrypoint }} \
{% endfor %}
    --ledger {{ solana_ledger_path }} \
    --rpc-port {{ solana_rpc_port }} \
    --dynamic-port-range {{ solana_dynamic_port_range }} \
    --expected-genesis-hash {{ solana_expected_genesis_hash }} \
    --expected-shred-version {{ solana_expected_shred_version }} \
    --wal-recovery-mode {{ solana_wal_recovery_mode }} \
    --use-snapshot-archives-at-startup when-newest \
    --incremental-snapshots \
{# Do not publish rpc port and address to p2p #}
    --private-rpc \
{# NOTE(review): solana_use_snapshot turns OFF genesis and snapshot fetching - presumably a local snapshot is already in place; confirm #}
{% if solana_use_snapshot %}
    --no-genesis-fetch --no-snapshot-fetch \
{% endif %}
{% for arg in solana_extra_args %}
    {{ arg }} \
{% endfor %}
{# https://docs.solanalabs.com/operations/guides/validator-start#limiting-ledger-size-to-conserve-disk-space #}
    --limit-ledger-size{% if solana_limit_ledger_size %} {{ solana_limit_ledger_size }}{% endif %} \
{# log to stdout for the systemd journal #}
    --log -

##### Graceful Restart #####
# Wait for the node to create a snapshot and not to be the leader for a given amount of
# time before gracefully stopping. This ensures minimal skipped slots. If the node is currently
# marked unhealthy and `--skip-health-check` is not passed, it won't exit until it is healthy.
# The stop timeout is set to twice the min-idle-time (5 minutes -> 10min) to ensure a
# snapshot is created.
#
# The exit command has to be pointed at the ledger directory of the running
# validator, so it uses the same solana_ledger_path as ExecStart. The leading
# "-" tells systemd to ignore a failure of the exit command itself.
#
# See:
#    https://docs.solanalabs.com/operations/best-practices/general#restarting-your-validator
TimeoutStopSec=10min
ExecStop=-/usr/local/bin/solana-validator --ledger {{ solana_ledger_path }} exit \
    --min-idle-time 5 \
    --skip-health-check

# One Environment= line per entry of solana_service_environment, in sorted order.
{% for name, value in solana_service_environment.items() | sort
%}Environment="{{ name }}={{ value }}"
{% endfor %}

# From: https://docs.solanalabs.com/operations/setup-a-validator#linux
LimitNOFILE=1000000
# Always restart the validator when it exits, waiting 10 seconds in between.
Restart=always
RestartSec=10s

# Start the service as part of the normal multi-user boot.
[Install]
WantedBy=multi-user.target
	$ cat roles/solana/templates/solana-limits.conf.j2
	# {{ ansible_managed }}
	# solana-ledger-tool may have to be run by hand when the network is in bad
	# enough shape to need a manual restart, and it requires 500k open files.
	# The "-" type applies the value as both the soft and the hard limit.
	solana - nofile 1000000
	$ cat roles/solana/templates/solana-tuning.conf.j2
	## {{ ansible_managed }}
	# Kernel tuning applied through sysctl.d for the Solana validator.
	# Taken from: https://github.com/mvines/sosh/tree/5a48d8481408b7f33355862d3f3e32d485853598#tuning

	# Increase UDP buffer size: the default and maximum values for both the
	# rmem and wmem settings are all raised to 134217728 (128 * 1024 * 1024).
	net.core.rmem_default = 134217728
	net.core.rmem_max = 134217728
	net.core.wmem_default = 134217728
	net.core.wmem_max = 134217728

	# Increase memory mapped files limit
	vm.max_map_count = 2000000

	# Previously, the system_monitor_service in solana recommended some very
	# bad limits that broke quic and caused the validator to emit vote-only
	# blocks. A page from the Shinobi Systems folks warns against following
	# that advice and setting net.core.optmem_max to zero:
	# https://stakeview.app/poor_validator.html
	#
	# This is discussed in this solana issue:
	# https://github.com/solana-labs/solana/issues/33789
	#
	# And was fixed in this PR:
	# https://github.com/solana-labs/solana/pull/34149
	net.core.optmem_max = 20480
	net.core.netdev_max_backlog = 8192
	# heavily cut down for brevity

	# Value passed to --limit-ledger-size by the solana.service.j2 template.
	# Default is 200m (200 million) per:
	# https://github.com/solana-labs/solana/blob/19454bf56efb41cebd354a942d3b63235541292a/ledger/src/blockstore_cleanup_service.rs#L34
	#
	# This cuts the disk usage of the ledger in half and reduces io a bit.
	solana_limit_ledger_size: 100_000_000

	# The entries must be nested under the key so that it is a mapping; the
	# service template iterates over its items to emit Environment= lines.
	solana_service_environment:
	  # Look at the service logs and just silence one crate at a time using this syntax:
	  # https://docs.rs/env_logger/latest/env_logger/#enabling-logging
	  #
	  # Set the default log level to "info" but the solana_metrics crate to "warn" since
	  # the metric logs are both excessively verbose and generally useless for issues.
	  RUST_LOG: info,solana_metrics=warn
	  # Get the actual backtrace if the validator panics and dies
	  RUST_BACKTRACE: 1
	$ cat roles/solana/templates/solana.service.j2
	# {{ ansible_managed }}
	{#- some fun hidden cli args in: https://github.com/solana-labs/solana/blob/master/validator/src/cli.rs #}

	[Unit]
	Description=Solana validator
	After=network.target

	[Service]
	# Run the validator as the solana user and group.
	User=solana
	Group=solana
	# The command line below is assembled from template variables and is a single
	# logical line joined by trailing backslashes, ending at "--log -".
	# A voting node is given its vote account; any other node runs with
	# --no-voting and the full RPC API.
	ExecStart=/usr/local/bin/solana-validator \
	{% if solana_voting %}
	--identity {{ solana_identity_keypair_path }} \
	--vote-account {{ solana_vote_account_keypair_path }} \
	{% else %}
	--no-voting \
	--identity {{ solana_identity_keypair_path }} \
	{# Enable RPC requests #}
	--full-rpc-api \
	{% endif %}
	{% for validator in solana_known_validators %}
	--known-validator {{ validator }} \
	{% endfor %}
	{% for entrypoint in solana_bootstrap_nodes %}
	--entrypoint {{ entrypoint }} \
	{% endfor %}
	--ledger {{ solana_ledger_path }} \
	--rpc-port {{ solana_rpc_port }} \
	--dynamic-port-range {{ solana_dynamic_port_range }} \
	--expected-genesis-hash {{ solana_expected_genesis_hash }} \
	--expected-shred-version {{ solana_expected_shred_version }} \
	--wal-recovery-mode {{ solana_wal_recovery_mode }} \
	--use-snapshot-archives-at-startup when-newest \
	--incremental-snapshots \
	{# Do not publish rpc port and address to p2p #}
	--private-rpc \
	{% if solana_use_snapshot %}
	--no-genesis-fetch --no-snapshot-fetch \
	{% endif %}
	{% for arg in solana_extra_args %}
	{{ arg }} \
	{% endfor %}
	{# https://docs.solanalabs.com/operations/guides/validator-start#limiting-ledger-size-to-conserve-disk-space #}
	--limit-ledger-size{% if solana_limit_ledger_size %} {{ solana_limit_ledger_size }}{% endif %} \
	{# log to stdout for the systemd journal #}
	--log -

	##### Graceful Restart #####
	# Wait for the node to create a snapshot and not to be the leader for a given amount of
	# time before gracefully stopping. This ensures minimal skipped slots. If the node is currently
	# marked unhealthy and `--skip-health-check` is not passed, it won't exit until it is healthy.
	# The stop timeout is set to twice the min-idle-time (5 minutes -> 10min) to ensure a
	# snapshot is created.
	#
	# The exit command has to be pointed at the ledger directory of the running
	# validator, so it uses the same solana_ledger_path as ExecStart. The leading
	# "-" tells systemd to ignore a failure of the exit command itself.
	#
	# See:
	# https://docs.solanalabs.com/operations/best-practices/general#restarting-your-validator
	TimeoutStopSec=10min
	ExecStop=-/usr/local/bin/solana-validator --ledger {{ solana_ledger_path }} exit \
	--min-idle-time 5 \
	--skip-health-check

	# One Environment= line per entry of solana_service_environment, in sorted order.
	{% for name, value in solana_service_environment.items() | sort
	%}Environment="{{ name }}={{ value }}"
	{% endfor %}

	# From: https://docs.solanalabs.com/operations/setup-a-validator#linux
	LimitNOFILE=1000000
	# Always restart the validator when it exits, waiting 10 seconds in between.
	Restart=always
	RestartSec=10s

	# Start the service as part of the normal multi-user boot.
	[Install]
	WantedBy=multi-user.target