@nh2
Created January 19, 2024 18:33
Open-sourcing Benaco's NixOS Ceph service module
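
For reference, a minimal sketch of how a host configuration might import and
enable this module. All values are hypothetical, and the module path, the
`pkgs.ceph-benaco` package, and `../helpers.nix` are assumed to exist in the
consuming repository:

{ ... }:
{
  imports = [ ./ceph-benaco.nix ];  # hypothetical path to the module below

  services.ceph-benaco = {
    enable = true;
    fsid = "aaaaaaaa-bbbb-cccc-dddd-eeeeeeeeeeee";
    initialMonitors = [ { hostname = "node1"; ipAddress = "10.0.0.1"; } ];
    mdsNodes = [ { hostname = "node1"; ipAddress = "10.0.0.1"; } ];
    publicNetworks = [ "10.0.0.0/24" ];
    adminKeyring = "/secrets/ceph.client.admin.keyring";
    monitor = {
      enable = true;
      initialKeyring = "/secrets/ceph.mon.keyring";
      nodeName = "node1";
      bindAddr = "10.0.0.1";
      advertisedPublicAddr = "10.0.0.1";
    };
    osdBindAddr = "10.0.0.1";
    osdAdvertisedPublicAddr = "10.0.0.1";
  };
}

The module itself: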
{ config, lib, pkgs, ... }:
with lib;
let
cfg = config.services.ceph-benaco;
inherit (pkgs.callPackage ../helpers.nix {}) ensureUnitExists;
in
{
###### interface
options = {
services.ceph-benaco = {
enable = mkEnableOption "Ceph distributed filesystem";
package = mkOption {
type = types.package;
default = pkgs.ceph-benaco;
defaultText = literalExpression "pkgs.ceph-benaco";
description = "Ceph package to use.";
};
fsid = mkOption {
type = types.str;
description = "Unique cluster identifier.";
};
clusterName = mkOption {
type = types.str;
description = "Cluster name.";
default = "ceph";
};
initialMonitors = mkOption {
type = types.listOf (types.submodule {
options = {
hostname = mkOption {
type = types.str;
description = "Initial monitor hostname.";
};
ipAddress = mkOption {
type = types.str;
description = "Initial monitor IP address.";
};
};
});
description = "Initial monitors.";
};
mdsNodes = mkOption {
type = types.listOf (types.submodule {
options = {
hostname = mkOption {
type = types.str;
description = "MDS hostname.";
};
ipAddress = mkOption {
type = types.str;
description = "MDS IP address.";
};
};
});
description = "MDS nodes.";
};
publicNetworks = mkOption {
type = types.listOf types.str;
description = "Public network(s) of the cluster.";
};
adminKeyring = mkOption {
type = types.path;
description = "Ceph admin keyring to install on the machine.";
};
monitor = {
enable = mkEnableOption "Activate a Ceph monitor on this machine.";
initialKeyring = mkOption {
type = types.path;
description = "Keyring file to use when initializing a new monitor";
example = "/path/to/ceph.mon.keyring";
};
nodeName = mkOption {
type = types.str;
description = "Ceph monitor node name.";
example = "node1";
};
bindAddr = mkOption {
type = types.str;
description = "IP address that the OSDs shall bind to.";
example = "10.0.0.1";
};
advertisedPublicAddr = mkOption {
type = types.str;
description = "IP address that the monitor shall advertise.";
example = "10.0.0.1";
};
};
manager = {
enable = mkEnableOption "Activate a Ceph manager on this machine.";
nodeName = mkOption {
type = types.str;
description = "Ceph manager node name.";
example = "node1";
};
};
osdBindAddr = mkOption {
type = types.str;
description = "IP address that the OSDs shall bind to.";
example = "10.0.0.1";
};
osdAdvertisedPublicAddr = mkOption {
type = types.str;
description = "IP address that the OSDs shall advertise.";
example = "10.0.0.1";
};
osds = mkOption {
default = {};
example = {
osd1 = {
enable = true;
bootstrapKeyring = "/path/to/ceph.client.bootstrap-osd.keyring";
id = 1;
uuid = "11111111-1111-1111-1111-111111111111";
blockDevice = "/dev/sdb";
blockDeviceUdevRuleMatcher = ''KERNEL=="sdb"'';
};
osd2 = {
enable = true;
bootstrapKeyring = "/path/to/ceph.client.bootstrap-osd.keyring";
id = 2;
uuid = "22222222-2222-2222-2222-222222222222";
blockDevice = "/dev/sdc";
blockDeviceUdevRuleMatcher = ''KERNEL=="sdc"'';
};
};
description = ''
This option allows you to define multiple Ceph OSDs.
A common idiom is to use one OSD per physical hard drive.
Note that the OSD names given as attributes of this key
are NOT what Ceph calls OSD IDs (those are defined by the
'services.ceph-benaco.osds.*.id' fields); rather, each name
is an identifier unique to the current machine, used solely
to name the systemd service for that OSD.
'';
type = types.attrsOf (types.submodule {
options = {
enable = mkEnableOption "Activate a Ceph OSD on this machine.";
bootstrapKeyring = mkOption {
type = types.path;
description = "Ceph OSD bootstrap keyring.";
example = "/path/to/ceph.client.bootstrap-osd.keyring";
};
id = mkOption {
type = types.int;
description = "The ID of this OSD. Must be unique in the Ceph cluster.";
example = 1;
};
uuid = mkOption {
type = types.str;
description = "The UUID of this OSD. Must be unique in the Ceph cluster.";
example = "abcdef12-abcd-1234-abcd-1234567890ab";
};
systemdExtraRequiresAfter = mkOption {
type = types.listOf types.str;
default = [];
description = ''
Add the specified systemd units to the "requires" and "after"
lists of the systemd service of this OSD.
Useful, for example, to decrypt the underlying block devices with LUKS first.
NixOS modules allow overriding those lists from outside, but for that
the names of the systemd services for the OSDs would need to be known;
this option is a convenience so they don't have to be.
'';
example = [ "decrypt-my-disk.service" ];
};
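# For the LUKS use case mentioned above, a hypothetical value could be
# [ "systemd-cryptsetup@mydisk.service" ], the unit systemd generates
# for a LUKS device named `mydisk`.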
skipZap = mkOption {
type = types.bool;
default = false;
description = ''
Whether to skip the zapping of the OSD device on initial OSD
installation.
Skipping is needed because <command>ceph-volume</command> cannot
zap device-mapper devices:
<link xlink:href="https://tracker.ceph.com/issues/24504" />
In that case you need to wipe the device manually.
In the common case of placing the OSD on a cryptsetup LUKS device
(which is a device-mapper device), re-creating the encryption
from scratch with a new key zaps anything anyway, in which case
zapping can be skipped here.
'';
};
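# With skipZap, the manual wipe mentioned above can be done with e.g.
# `wipefs --all /dev/mapper/mydisk-decrypted` (hypothetical device path).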
blockDevice = mkOption {
type = types.str;
description = "The block device used to store the OSD.";
example = "/dev/sdb";
};
blockDeviceUdevRuleMatcher = mkOption {
type = types.str;
description = ''
A udev rule matcher matching the block device used to store the OSD.
It will be spliced into the udev rule that is used to grant the
ceph user access permissions to the device.
This is a matcher instead of just a device name to allow flexibility:
normal disks can easily be matched with <code>KERNEL=="sda1"</code>, but
device-mapper devices may not; for example, decrypted cryptsetup LUKS
devices have a less useful <code>KERNEL=="dm-4"</code> and may better be
matched using <code>ENV{DM_NAME}=="mydisk-decrypted"</code>.
'';
example = ''KERNEL=="sdb"'';
};
dbBlockDevice = mkOption {
type = types.nullOr types.str;
default = null;
description = ''
The block device used to store the OSD's BlueStore DB device.
Put this on a faster device than <option>blockDevice</option> to improve performance.
See <link xlink:href="http://docs.ceph.com/docs/master/rados/configuration/bluestore-config-ref/" />
for details.
'';
example = "/dev/sdc";
};
dbBlockDeviceUdevRuleMatcher = mkOption {
type = types.nullOr types.str;
default = null;
description = ''
Like <option>blockDeviceUdevRuleMatcher</option> but for the
<option>dbBlockDevice</option>.
'';
example = ''KERNEL=="sdc"'';
};
};
});
};
mds = {
enable = mkEnableOption "Activate a Ceph MDS on this machine.";
nodeName = mkOption {
type = types.str;
description = "Ceph MDS node name.";
example = "node1";
};
listenAddr = mkOption {
type = types.str;
description = "IP address that the MDS shall advertise.";
example = "10.0.0.1";
};
};
extraConfig = mkOption {
type = types.str;
default = "";
description = ''
Additional ceph.conf settings.
See the sample file for inspiration:
<link xlink:href="https://github.com/ceph/ceph/blob/master/src/sample.ceph.conf" />
'';
};
};
};
###### implementation
config = let
monDir = "/var/lib/ceph/mon/${cfg.clusterName}-${cfg.monitor.nodeName}";
mgrDir = "/var/lib/ceph/mgr/${cfg.clusterName}-${cfg.manager.nodeName}";
mdsDir = "/var/lib/ceph/mds/${cfg.clusterName}-${cfg.mds.nodeName}";
# File permissions for things in locations that are wiped at boot
# (e.g. /run or its /var/run symlink).
ensureTransientCephDirs = ''
install -m 770 -o ${config.users.users.ceph.name} -g ${config.users.groups.ceph.name} -d /var/run/ceph
'';
# File permissions from cluster deployed with ceph-deploy.
ensureCephDirs = ''
install -m 3770 -o ${config.users.users.ceph.name} -g ${config.users.groups.ceph.name} -d /var/log/ceph
install -m 770 -o ${config.users.users.ceph.name} -g ${config.users.groups.ceph.name} -d /var/run/ceph
install -m 750 -o ${config.users.users.ceph.name} -g ${config.users.groups.ceph.name} -d /var/lib/ceph
install -m 755 -o ${config.users.users.ceph.name} -g ${config.users.groups.ceph.name} -d /var/lib/ceph/mon
install -m 755 -o ${config.users.users.ceph.name} -g ${config.users.groups.ceph.name} -d /var/lib/ceph/mgr
install -m 755 -o ${config.users.users.ceph.name} -g ${config.users.groups.ceph.name} -d /var/lib/ceph/osd
'';
# Utilities called by Ceph device health scraping, see:
# https://docs.ceph.com/en/latest/rados/operations/devices/#enabling-monitoring
# As per https://github.com/ceph/ceph-container/pull/1490/commits/c49e821599965ae92a88b2c78077ee03c4405895,
# both the OSDs and the `mon` need this.
# Ceph calls these utilities with `sudo`. That requires sudoers entries.
# Sudoers entries require absolute path; that exact (nix store) path needs to
# be used by Ceph, so it needs to be given to the systemd unit via `path`.
# This is why we pair each `sudoersExtraRule` with the `package` to put onto
# that `path`.
#
# Entries are based on:
# https://github.com/ceph/ceph/blob/a2f5a3c1dbfa4dce41e25da4f029a8fdb8c8d864/sudoers.d/ceph-smartctl
cephMonitoringSudoersCommandsAndPackages = [
{
package = pkgs.smartmontools;
sudoersExtraRule = { # entry for `security.sudo.extraRules`
users = [ config.users.users.ceph.name ];
commands = [{
command = "${lib.getBin pkgs.smartmontools}/bin/smartctl -x --json=o /dev/*";
options = [ "NOPASSWD" ];
}];
};
}
{
package = pkgs.nvme-cli;
sudoersExtraRule = { # entry for `security.sudo.extraRules`
users = [ config.users.users.ceph.name ];
commands = [{
command = "${lib.getBin pkgs.nvme-cli}/bin/nvme * smart-log-add --json /dev/*";
options = [ "NOPASSWD" ];
}];
};
}
];
cephDeviceHealthMonitoringPathsOrPackages = with pkgs; [
# Contains `sudo`. Ceph wraps this around the other health check programs.
# Cannot use `pkgs.sudo` because that one is not SUID, see:
# https://discourse.nixos.org/t/sudo-uid-issues/9133
"/run/wrappers" # `systemd.services.<name>.path` adds the `bin/` subdir of this
] ++ map ({ package, ... }: package) cephMonitoringSudoersCommandsAndPackages;
makeCephOsdSetupSystemdService = localOsdServiceName: osdConfig:
let
osdExistenceFile = "/var/lib/ceph/osd/.${toString osdConfig.id}.${osdConfig.uuid}.nix-existence";
in
mkIf osdConfig.enable {
description = "Initialize Ceph OSD";
requires = osdConfig.systemdExtraRequiresAfter;
after = osdConfig.systemdExtraRequiresAfter;
path = with pkgs; [
# The following are currently missing in Ceph's wrapping, see https://github.com/NixOS/nixpkgs/issues/147801#issue-1065600852
util-linux # for `lsblk`
lvm2 # for `lvs`
];
# TODO Use `udevadm trigger --settle` instead of the separate `udevadm settle`
# once that feature is available to us with systemd >= 238;
# see https://github.com/systemd/systemd/commit/792cc203a67edb201073351f5c766fce3d5eab45
preStart = ''
set -x
${ensureCephDirs}
install -m 755 -o ${config.users.users.ceph.name} -g ${config.users.groups.ceph.name} -d /var/lib/ceph/bootstrap-osd
# `install` is not atomic, see
# https://lists.gnu.org/archive/html/bug-coreutils/2010-02/msg00243.html
# so use `mktemp` + `mv` to make it atomic.
TMPFILE=$(mktemp --tmpdir=/var/lib/ceph/bootstrap-osd/)
install -o ${config.users.users.ceph.name} -g ${config.users.groups.ceph.name} ${osdConfig.bootstrapKeyring} "$TMPFILE"
mv "$TMPFILE" /var/lib/ceph/bootstrap-osd/ceph.keyring
# Trigger udev rules for permissions of block devices and wait for them to settle.
udevadm trigger --name-match=${osdConfig.blockDevice}
'' + lib.optionalString (osdConfig.dbBlockDevice != null) ''
udevadm trigger --name-match=${osdConfig.dbBlockDevice}
'' +
''
udevadm settle
'' + (optionalString (!osdConfig.skipZap) (
''
# Zap OSD block devices, otherwise `ceph-osd` below will try to fsck if there's some old
# ceph data on the block device (see https://tracker.ceph.com/issues/24099).
${cfg.package}/bin/ceph-volume lvm zap ${osdConfig.blockDevice}
'' + lib.optionalString (osdConfig.dbBlockDevice != null) ''
${cfg.package}/bin/ceph-volume lvm zap ${osdConfig.dbBlockDevice}
''
));
script = ''
set -euo pipefail
set -x
until [ -f /etc/ceph/ceph.client.admin.keyring ]
do
sleep 1
done
OSD_SECRET=$(${cfg.package}/bin/ceph-authtool --gen-print-key)
echo "{\"cephx_secret\": \"$OSD_SECRET\"}" | \
${cfg.package}/bin/ceph osd new ${osdConfig.uuid} ${toString osdConfig.id} -i - \
-n client.bootstrap-osd -k ${osdConfig.bootstrapKeyring}
mkdir -p /var/lib/ceph/osd/${cfg.clusterName}-${toString osdConfig.id}
ln -s ${osdConfig.blockDevice} /var/lib/ceph/osd/${cfg.clusterName}-${toString osdConfig.id}/block
'' + lib.optionalString (osdConfig.dbBlockDevice != null) ''
ln -s ${osdConfig.dbBlockDevice} /var/lib/ceph/osd/${cfg.clusterName}-${toString osdConfig.id}/block.db
'' +
''
${cfg.package}/bin/ceph-authtool --create-keyring /var/lib/ceph/osd/${cfg.clusterName}-${toString osdConfig.id}/keyring \
--name osd.${toString osdConfig.id} --add-key "$OSD_SECRET"
${cfg.package}/bin/ceph-osd --cluster ${cfg.clusterName} -i ${toString osdConfig.id} --mkfs --osd-uuid ${osdConfig.uuid} --setuser ${config.users.users.ceph.name} --setgroup ${config.users.groups.ceph.name} --osd-objectstore bluestore
touch ${osdExistenceFile}
'';
serviceConfig = {
Type = "oneshot";
RemainAfterExit = true;
PermissionsStartOnly = true; # only run the script as ceph, preStart as root
User = config.users.users.ceph.name;
Group = config.users.groups.ceph.name;
};
unitConfig = {
ConditionPathExists = "!${osdExistenceFile}";
};
};
makeCephOsdSystemdService = localOsdServiceName: osdConfig: mkIf osdConfig.enable {
description = "Ceph OSD";
# Note we do not have to add `osdConfig.systemdExtraRequiresAfter` here because
# that's already a dependency of our dependency `ceph-osd-setup-*`.
requires = [
(ensureUnitExists config "ceph-osd-setup-${localOsdServiceName}.service")
];
requiredBy = [ "multi-user.target" ];
after = [
"network.target"
"local-fs.target"
"time-sync.target"
(ensureUnitExists config "ceph-osd-setup-${localOsdServiceName}.service")
];
wants = [
"network.target"
"local-fs.target"
"time-sync.target"
];
path = [
# TODO: use wrapProgram in the ceph package for this in the future
pkgs.getopt
]
++ cephDeviceHealthMonitoringPathsOrPackages
;
restartTriggers = [ config.environment.etc."ceph/ceph.conf".source ];
preStart = ''
${ensureTransientCephDirs}
${lib.getLib cfg.package}/libexec/ceph/ceph-osd-prestart.sh --cluster ${cfg.clusterName} --id ${toString osdConfig.id}
'';
serviceConfig = {
LimitNOFILE="1048576";
LimitNPROC="1048576";
ExecStart=''
${cfg.package}/bin/ceph-osd -f --cluster ${cfg.clusterName} --id ${toString osdConfig.id} --setuser ${config.users.users.ceph.name} --setgroup ${config.users.groups.ceph.name} "--public_bind_addr=${cfg.osdBindAddr}" "--public_addr=${cfg.osdAdvertisedPublicAddr}"
'';
ExecReload=''
${pkgs.coreutils}/bin/kill -HUP $MAINPID
'';
Restart="on-failure";
ProtectHome="true";
ProtectSystem="full";
PrivateTmp="true";
TasksMax="infinity";
# StartLimitBurst="3";
};
# startLimitIntervalSec = 30 * 60;
};
in mkIf cfg.enable {
environment.systemPackages = [ cfg.package ];
networking.firewall = {
allowedTCPPorts = [
# Ceph runs outside of the VPN because it is very data-heavy and
# would cause packet loss there.
# We enable only msgr-v2 because it provides its own on-wire encryption.
3300 # ceph msgr-v2
];
allowedTCPPortRanges = [
{ from = 6800; to = 7300; } # https://docs.ceph.com/en/pacific/rados/configuration/network-config-ref/
];
};
# Reminder of how `ceph.conf` works:
#
# * Ceph upstream docs now recommend using underscores instead of spaces.
# * Options in more specific sections like `[mon]` override those in less
# specific sections like `[global]`. But all options can be written into all sections,
# and an option has the same name, no matter in which section it is written.
# Thus, put options in `[global]`, and only use a different section
# if you want to override an option you've set in `[global]`.
#
# Sample: https://github.com/ceph/ceph/blob/master/src/sample.ceph.conf
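# Illustrative example of the override rule (hypothetical values, not
# part of the config below):
#   [global]
#   osd_scrub_min_interval = 345600
#   [osd]
#   osd_scrub_min_interval = 86400
# results in OSD daemons using 86400 while all other daemons use 345600.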
environment.etc."ceph/ceph.conf".text =
let commaSep = builtins.concatStringsSep ",";
in ''
[global]
fsid = ${cfg.fsid}
mon_initial_members = ${commaSep (map (mon: mon.hostname) cfg.initialMonitors)}
mon_host = ${commaSep (map (mon: mon.ipAddress) cfg.initialMonitors)}
# Ceph clusters go into WARN health mode until the following setting
# is made strict by setting it to `false`.
# See: https://docs.ceph.com/en/latest/security/CVE-2021-20288/#recommendations
# As of writing, this setting is not documented outside of the CVE note :(
#
# While for new clusters the warning no longer seems to appear, it still
# appears in our existing clusters unless this option is set, see:
# https://tracker.ceph.com/issues/53751#note-7
auth_allow_insecure_global_id_reclaim = false
# Disable dirfrag prefetch on MDS restart to prevent out-of-memory after
# many files were opened.
# Note this option has no effect on Ceph < 15, because it doesn't exist there.
# TODO: Remove this once we're on a Ceph version that includes this default,
# see https://github.com/ceph/ceph/pull/44667.
# This is assuming that the commit fixes existing clusters, see
# https://github.com/ceph/ceph/pull/44667#issuecomment-1036103397
# If it doesn't this can only be removed once we have no existing
# cluster with the old default.
mds_oft_prefetch_dirfrags = false
# Disable sleep between HDD recovery operations, otherwise recovery
# will take forever when small objects (e.g. CephFS files) are on HDD.
# See https://tracker.ceph.com/issues/23595#note-12
osd_recovery_sleep_hdd = 0.0
# Increase scrub intervals by 4x:
# since we store many small files on HDD, and scrubbing apparently
# iterates over all objects, we have no chance to complete scrubs
# at the default intervals.
#
# (This was written when we had 400M files across 30 HDDs.)
# Change this back once we have reduced our number of files per disk.
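# (For reference, assuming the usual upstream defaults of 86400 s (1 day)
# for osd_scrub_min_interval and 604800 s (7 days) for the other two:
# 4 * 86400 = 345600 and 4 * 604800 = 2419200.)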
osd_scrub_min_interval = 345600
osd_scrub_max_interval = 2419200
osd_deep_scrub_interval = 2419200
public_network = ${commaSep cfg.publicNetworks}
auth_cluster_required = cephx
auth_service_required = cephx
auth_client_required = cephx
# Enforce on-wire transport encryption.
ms_cluster_mode = secure
ms_service_mode = secure
ms_client_mode = secure
${cfg.extraConfig}
'';
environment.etc."ceph/ceph.client.admin.keyring" = {
source = cfg.adminKeyring;
mode = "0600";
# Make ceph own this keyring so that it can use it to get keys for its daemons.
user = "ceph";
group = "ceph";
};
users.users.ceph = {
isNormalUser = false;
isSystemUser = true;
# TODO: Legacy UID / GID chosen from before we configured the UID declaratively.
# In the future, we should change this whole module to use
# `config.ids.uids.ceph`, like the upstream nixpkgs Ceph module does.
# Switching away from `nogroup` would also be good as described there.
# For both cases, we'll have to `chown` all relevant existing files on
# deployments, such as `/var/lib/ceph`, and log files.
uid = 1001;
group = config.users.groups.nogroup.name;
};
users.groups.ceph = {
# TODO: Same TODO as above for the `uid`.
gid = 499;
};
# Allow ceph daemons (which run as user ceph) to collect device health metrics.
security.sudo.extraRules =
map ({ sudoersExtraRule, ... }: sudoersExtraRule) cephMonitoringSudoersCommandsAndPackages;
# The udevadm trigger/settle in `makeCephOsdSetupSystemdService` waits for these rules to be applied.
services.udev.extraRules =
lib.concatStringsSep "\n" (
lib.mapAttrsToList (_localOsdServiceName: osdConfig:
''
SUBSYSTEM=="block", ${osdConfig.blockDeviceUdevRuleMatcher}, OWNER="${config.users.users.ceph.name}", GROUP="${config.users.groups.ceph.name}", MODE="0660"
''
+ lib.optionalString (osdConfig.dbBlockDeviceUdevRuleMatcher != null) (
''
SUBSYSTEM=="block", ${osdConfig.dbBlockDeviceUdevRuleMatcher}, OWNER="${config.users.users.ceph.name}", GROUP="${config.users.groups.ceph.name}", MODE="0660"
''
)
) cfg.osds
);
systemd.services = {
ceph-mon-setup = mkIf cfg.monitor.enable {
description = "Initialize ceph monitor";
preStart = ensureCephDirs;
script = let
# `--addv` seems currently required to get msgr-v2 working, see:
# https://tracker.ceph.com/issues/53751#note-11
monmapNodes = builtins.concatStringsSep " " (lib.concatMap (mon: [ "--addv" mon.hostname "[v2:${mon.ipAddress}:3300,v1:${mon.ipAddress}:6789]" ]) cfg.initialMonitors);
# Monitors cannot simply be changed in config, one has to update the monmap, see note [replacing-ceph-monmap-ips-for-existing-cluster]
in ''
set -euo pipefail
rm -rf "${monDir}" # Start from scratch.
echo "Initializing monitor."
MONMAP_DIR=`mktemp -d`
${cfg.package}/bin/monmaptool --create ${monmapNodes} --fsid ${cfg.fsid} "$MONMAP_DIR/monmap"
${cfg.package}/bin/ceph-mon --cluster ${cfg.clusterName} --mkfs -i ${cfg.monitor.nodeName} --monmap "$MONMAP_DIR/monmap" --keyring ${cfg.monitor.initialKeyring}
rm -r "$MONMAP_DIR"
touch ${monDir}/done
'';
serviceConfig = {
Type = "oneshot";
RemainAfterExit = true;
PermissionsStartOnly = true; # only run the script as ceph
User = config.users.users.ceph.name;
Group = config.users.groups.ceph.name;
};
unitConfig = {
ConditionPathExists = "!${monDir}/done";
};
};
ceph-mon = mkIf cfg.monitor.enable {
description = "Ceph monitor";
requires = [ (ensureUnitExists config "ceph-mon-setup.service") ];
requiredBy = [ "multi-user.target" ];
after = [ "network.target" "local-fs.target" "time-sync.target" (ensureUnitExists config "ceph-mon-setup.service") ];
wants = [ "network.target" "local-fs.target" "time-sync.target" ];
restartTriggers = [ config.environment.etc."ceph/ceph.conf".source ];
path = cephDeviceHealthMonitoringPathsOrPackages;
preStart = ensureTransientCephDirs;
serviceConfig = {
LimitNOFILE="1048576";
LimitNPROC="1048576";
ExecStart=''
${cfg.package}/bin/ceph-mon -f --cluster ${cfg.clusterName} --id ${cfg.monitor.nodeName} --setuser ${config.users.users.ceph.name} --setgroup ${config.users.groups.ceph.name} "--public_bind_addr=${cfg.monitor.bindAddr}" "--public_addr=${cfg.monitor.advertisedPublicAddr}"
'';
ExecReload=''
${pkgs.coreutils}/bin/kill -HUP $MAINPID
'';
PrivateDevices="yes";
ProtectHome="true";
ProtectSystem="full";
PrivateTmp="true";
TasksMax="infinity";
Restart="on-failure";
# StartLimitBurst="5";
RestartSec="10";
};
# startLimitIntervalSec = 30 * 60;
};
ceph-mgr-setup = mkIf cfg.manager.enable {
description = "Initialize Ceph manager";
preStart = ensureCephDirs;
script = ''
set -euo pipefail
mkdir -p ${mgrDir}
until [ -f /etc/ceph/ceph.client.admin.keyring ]
do
sleep 1
done
${cfg.package}/bin/ceph auth get-or-create mgr.${cfg.manager.nodeName} mon 'allow profile mgr' mds 'allow *' osd 'allow *' -o ${mgrDir}/keyring
touch "${mgrDir}/.nix_done"
'';
serviceConfig = {
Type = "oneshot";
RemainAfterExit = true;
PermissionsStartOnly = true; # only run the script as ceph
User = config.users.users.ceph.name;
Group = config.users.groups.ceph.name;
};
unitConfig = {
ConditionPathExists = "!${mgrDir}/.nix_done";
};
};
ceph-mgr = mkIf cfg.manager.enable {
description = "Ceph manager";
requires = [ (ensureUnitExists config "ceph-mgr-setup.service") ];
requiredBy = [ "multi-user.target" ];
after = [ "network.target" "local-fs.target" "time-sync.target" (ensureUnitExists config "ceph-mgr-setup.service") ];
wants = [ "network.target" "local-fs.target" "time-sync.target" ];
restartTriggers = [ config.environment.etc."ceph/ceph.conf".source ];
preStart = ensureTransientCephDirs;
serviceConfig = {
LimitNOFILE="1048576";
LimitNPROC="1048576";
ExecStart=''
${cfg.package}/bin/ceph-mgr -f --cluster ${cfg.clusterName} --id ${cfg.manager.nodeName} --setuser ${config.users.users.ceph.name} --setgroup ${config.users.groups.ceph.name}
'';
ExecReload=''
${pkgs.coreutils}/bin/kill -HUP $MAINPID
'';
Restart="on-failure";
RestartSec=10;
# StartLimitBurst="3";
};
# startLimitIntervalSec = 30 * 60;
};
ceph-mds-setup = mkIf cfg.mds.enable {
description = "Initialize Ceph MDS";
preStart = ensureCephDirs;
script = ''
set -euo pipefail
mkdir -p ${mdsDir}
until [ -f /etc/ceph/ceph.client.admin.keyring ]
do
sleep 1
done
${cfg.package}/bin/ceph auth get-or-create mds.${cfg.mds.nodeName} osd 'allow rwx' mds 'allow' mon 'allow profile mds' -o ${mdsDir}/keyring
touch "${mdsDir}/.nix_done"
'';
serviceConfig = {
Type = "oneshot";
RemainAfterExit = true;
PermissionsStartOnly = true; # only run the script as ceph
User = config.users.users.ceph.name;
Group = config.users.groups.ceph.name;
};
unitConfig = {
ConditionPathExists = "!${mdsDir}/.nix_done";
};
};
ceph-mds = mkIf cfg.mds.enable {
description = "Ceph MDS";
requires = [ (ensureUnitExists config "ceph-mds-setup.service") ];
requiredBy = [ "multi-user.target" ];
after = [ "network.target" "local-fs.target" "time-sync.target" (ensureUnitExists config "ceph-mds-setup.service") ];
wants = [ "network.target" "local-fs.target" "time-sync.target" ];
restartTriggers = [ config.environment.etc."ceph/ceph.conf".source ];
preStart = ensureTransientCephDirs;
serviceConfig = {
LimitNOFILE="1048576";
LimitNPROC="1048576";
ExecStart=''
${cfg.package}/bin/ceph-mds -f --cluster ${cfg.clusterName} --id ${cfg.mds.nodeName} --setuser ${config.users.users.ceph.name} --setgroup ${config.users.groups.ceph.name} "--public_addr=${cfg.mds.listenAddr}"
'';
ExecReload=''
${pkgs.coreutils}/bin/kill -HUP $MAINPID
'';
Restart="on-failure";
# StartLimitBurst="3";
};
# startLimitIntervalSec = 30 * 60;
};
}
# Make one setup service and one daemon service for each configured OSD.
// lib.mapAttrs' (localOsdServiceName: osdConfig: nameValuePair "ceph-osd-setup-${localOsdServiceName}" (makeCephOsdSetupSystemdService localOsdServiceName osdConfig)) cfg.osds
// lib.mapAttrs' (localOsdServiceName: osdConfig: nameValuePair "ceph-osd-${localOsdServiceName}" (makeCephOsdSystemdService localOsdServiceName osdConfig)) cfg.osds;
};
}