Create a gist now

Instantly share code, notes, and snippets.

@marcan /linux.sh
Last active Sep 17, 2017

What would you like to do?
Linux kernel initialization, translated to bash
#!/boot/bzImage
# Linux kernel userspace initialization code, translated to bash
# (Minus floppy disk handling, because seriously, it's 2017.)
# Not 100% accurate, but gives you a good idea of how kernel init works
# GPLv2, Copyright 2017 Hector Martin <marcan@marcan.st>
# Based on Linux 4.10-rc2.
# Note: pretend chroot is a builtin and affects the current process
# Note: kernel actually uses major/minor device numbers instead of device name
# strings in a few places, but I simplified it by using strings
# everywhere even though that is not completely accurate.
# Print a message and then loop forever; this function never returns.
# Arguments: all arguments, joined by spaces into a single output line
# Outputs:   the joined message on stdout
panic() {
  local message="$*"
  echo "$message"
  # Nothing to return to: idle in one-second steps indefinitely.
  until false; do
    sleep 1
  done
}
## https://github.com/torvalds/linux/blob/v4.10-rc2/init/do_mounts.c#L363
# Try to mount one root filesystem candidate on /root and cd into it.
# Globals:   rootflags (read)    - filesystem-specific mount options, may be empty
#            major, minor (read) - only interpolated into the log message
# Arguments: $1 - device to mount
#            $2 - filesystem type to try
# Outputs:   a "VFS: Mounted root" line on stdout when the mount succeeds
# Returns:   mount's exit status if the mount fails; otherwise the status of
#            the final echo
do_mount_root() {
  local device=$1 fstype=$2
  local -a mount_opts=()
  # rootflags is mount data, so it has to travel via -o; as bare words mount
  # would treat it as extra positional arguments.
  if [ -n "$rootflags" ]; then
    mount_opts=(-o "$rootflags")
  fi
  mount -t "$fstype" "$device" /root "${mount_opts[@]}" || return $?
  cd /root
  echo "VFS: Mounted root ($fstype filesystem) on device $major:$minor"
}
## https://github.com/torvalds/linux/blob/v4.10-rc2/init/do_mounts.c#L381
# Mount the root device by trying each candidate filesystem type in turn.
# Globals:   rootfstype (read/written) - comma-separated list of types to try;
#              defaults to built_in_filesystem_types when empty
#            built_in_filesystem_types (read)
#            root_device_name (read)   - only used in the error message
# Arguments: $1 - device to mount
# Outputs:   diagnostics on stdout when mounting fails
# Returns:   0 as soon as one filesystem type mounts. On any other outcome it
#            calls panic, which is not expected to return.
mount_block_root() {
  local device=$1
  local fs ret
  if [ -z "$rootfstype" ]; then
    rootfstype=$built_in_filesystem_types
  fi
  # Intentionally unquoted: the comma-separated list is split into words.
  for fs in ${rootfstype//,/ }; do
    do_mount_root "$device" "$fs"
    ret=$?
    case $ret in
      0)
        # Mounted: stop here instead of falling into the error arm below.
        return 0
        ;;
      13 | 22) # EACCES or EINVAL
        # Wrong filesystem type for this device; try the next one.
        ;;
      *)
        echo "VFS: Cannot open root device \"$root_device_name\" or $device: error $ret"
        echo "Please append a correct \"root=\" boot option; here are the available partitions:"
        printk_all_partitions
        panic "VFS: Unable to mount root fs on $device"
        ;;
    esac
  done
  # Every candidate type was rejected.
  echo "List of all partitions:"
  printk_all_partitions
  echo "No filesystem could mount root, tried: ${rootfstype//,/ }"
  panic "VFS: Unable to mount root fs on $device"
}
## https://github.com/torvalds/linux/blob/v4.10-rc2/init/do_mounts.c#L512
# Mount the real root filesystem named by $root.
# Globals:   root (read/written) - root device; rewritten to /dev/fd0 when the
#              NFS mount fails
# Outputs:   diagnostics on stdout
# Returns:   the status of mount_nfs_root's success path, or of
#            mount_block_root otherwise
mount_root() {
  if [ "$root" = "/dev/nfs" ]; then
    mount_nfs_root && return
    echo "VFS: Unable to mount root fs via NFS, trying floppy."
    root=/dev/fd0
  fi
  if [ "$root" = "/dev/fd0" ]; then
    # floppy switching nonsense (deliberately not translated)
    : # placeholder: a branch holding only a comment is a bash syntax error
  fi
  # This is really a mknod, as the kernel is working with the device number
  cp -a "$root" /dev/root || echo "Failed to create /dev/root: $?"
  mount_block_root /dev/root
}
## https://github.com/torvalds/linux/blob/v4.10-rc2/init/do_mounts_rd.c#L185
# Copy a ramdisk image into /dev/ram, decompressing it when it is gzip data.
# Arguments: $1 - path of the image file
# Returns:   gzip's status when decompression works, otherwise the status of
#            the plain-copy fallback
rd_load_image() {
  local image=$1
  # Supports more compression algorithms in practice
  # Quoted so an image path containing whitespace is not split or rejected.
  gzip -d <"$image" >/dev/ram || cat -- "$image" >/dev/ram
  # Bunch of nonsense special casing for floppies skipped
  # Everyone but S/390 gets a cute spinner here...
}
## https://github.com/torvalds/linux/blob/v4.10-rc2/init/do_mounts_initrd.c#L119
# Load /initrd.image into the ramdisk and, unless the ramdisk is itself the
# requested root, run the legacy "change_root" initrd sequence.
# Globals:   root (read) - requested root device
# Outputs:   progress messages on stdout
# Returns:   0 when the initrd was handled here (the real root has been mounted
#            via mount_root), 1 when the caller still has to mount root
initrd_load() {
  local ret
  mknod /dev/ram b 1 0
  if rd_load_image /initrd.image && [ "$root" != "/dev/ram0" ]; then
    ## https://github.com/torvalds/linux/blob/v4.10-rc2/init/do_mounts_initrd.c#L51
    # This is the deprecated "change_root" mechanism; see Documentation/initrd.txt for details.
    # In this mode, the initrd should contain /linuxrc and it is *not* responsible for mounting the rootfs.
    rm /initrd.image
    mknod /dev/root.old b 1 0
    # mount initrd on rootfs' /root
    mount_block_root /dev/root.old
    mkdir /old
    cd /old
    # try loading default modules from initrd
    load_default_modules
    # Subshell: the console redirection, cwd and root changes made for
    # /linuxrc must not leak into the rest of this function.
    (
      exec </dev/console >&0 2>&0
      cd /root
      mount --move . /
      chroot .
      setsid /linuxrc
    )
    # move initrd to rootfs' /old
    mount --move .. .
    # switch root and cwd back to / of rootfs
    chroot ..
    cd /
    mount_root
    echo -n "Trying to move old root to /initrd ... "
    mount --move /old /root/initrd
    ret=$?
    if [ "$ret" = 0 ]; then
      echo "okay"
    else
      if [ "$ret" = 2 ]; then # ENOENT
        echo "/initrd does not exist. Ignored."
      else
        echo "failed"
      fi
      echo "Unmounting old root"
      umount -l /old
      echo -n "Trying to free ramdisk memory ... "
      if blockdev --flushbufs /dev/root.old; then
        echo "okay"
      else
        echo "failed"
      fi
    fi
    return 0
  else
    # Otherwise, if root=/dev/ram0, this is the "new" "pivot_root" initrd mechanism.
    # The initrd is just mounted like any other root FS and $init is called in it.
    # See Documentation/initrd.txt for what the initrd has to do in this case.
    # Note that this is obsolete too in the more recent initramfs case.
    rm /initrd.image
    return 1
  fi
}
## https://github.com/torvalds/linux/blob/v4.10-rc2/init/do_mounts.c#L549
# Mount the root filesystem (directly, via initrd, or after waiting for the
# device) and then make it the root of the current process.
# Globals:   rootdelay (read)           - optional seconds to sleep first
#            root (read)                - requested root device
#            root_wait (read)           - when non-empty, wait for $root to appear
#            root_device_name (written) - $root, with a leading /dev/ stripped
#              unless it is an mtd*/ubi* name
# Outputs:   progress messages on stdout
prepare_namespace() {
  if [ -n "$rootdelay" ]; then
    echo "Waiting $rootdelay sec before mounting root device..."
    sleep "$rootdelay"
  fi
  wait_for_device_probe # wait for devices
  md_run_setup # md-raid autoconfig: https://github.com/torvalds/linux/blob/v4.10-rc2/init/do_mounts_md.c#L303
  if [ -n "$root" ]; then
    root_device_name="$root"
    case "$root" in
      mtd* | ubi*)
        # These names are handed to the mount code as-is; no initrd, no waiting.
        mount_block_root "$root"
        mount -t devtmpfs devtmpfs dev # only if CONFIG_DEVTMPFS_MOUNT
        mount --move . /
        chroot .
        return
        ;;
    esac
    root_device_name="${root##/dev/}"
  fi
  if ! initrd_load; then
    if [ -n "$root_wait" ]; then
      echo "Waiting for root device $root..."
      # "$root" is quoted so a path with whitespace is tested as one path
      # rather than making the test error out and end the wait early.
      while ! driver_probe_done || [ ! -e "$root" ]; do
        sleep 1
      done
    fi
    mount_root
  fi
  mount -t devtmpfs devtmpfs dev # only if CONFIG_DEVTMPFS_MOUNT
  mount --move . /
  chroot .
}
## https://github.com/torvalds/linux/blob/v4.10-rc2/init/initramfs.c#L608
# Fill the initial rootfs: create the default skeleton, unpack the built-in
# initramfs, then unpack (or save as /initrd.image) an externally supplied one.
# Outputs: progress messages on stdout when /dev/initrd exists
populate_rootfs() {
  ## https://github.com/torvalds/linux/blob/v4.10-rc2/scripts/gen_initramfs_list.sh#L50
  ## OR (if initramfs disabled): https://github.com/torvalds/linux/blob/v4.10-rc2/init/noinitramfs.c#L28
  # default initramfs
  cd /
  mkdir /dev
  mknod /dev/console c 5 1
  mkdir /root

  # additional kernel built-in initramfs contents (not a real device)
  cpio -i < /dev/internal_initramfs

  # note: /dev/initrd isn't a real device but represents the initrd memory
  # /initrd.image is a real file on rootfs
  # Nothing more to do when no external image was supplied.
  [ -e /dev/initrd ] || return 0

  echo "Trying to unpack rootfs image as initramfs..."
  # actual kernel code for cpio can deal with compression & concatenation
  cpio -i < /dev/initrd || {
    # Not a cpio archive: keep the raw image around as a file instead.
    echo "rootfs image is not an initramfs; looks like an initrd"
    cp /dev/initrd /initrd.image
  }
  free_initrd # gets rid of /dev/initrd: https://github.com/torvalds/linux/blob/v4.10-rc2/init/initramfs.c#L527
  # Try loading default modules from initramfs. This gives
  # us a chance to load before device_initcalls.
  load_default_modules
}
## https://github.com/torvalds/linux/blob/v4.10-rc2/init/main.c#L952
# Top-level init sequence: populate rootfs, mount the real root if the
# initramfs has no init of its own, then exec the first init that works.
# Globals:   rdinit (read/written) - init inside the initramfs; defaults to
#              /init and is cleared when that file does not exist
#            init (read)           - explicitly requested init program
# Outputs:   warnings and failure messages on stdout
# Returns:   does not return when an init program is exec'd; otherwise ends in
#            panic
kernel_init() {
  # A non-interactive bash exits when exec cannot run its target. execfail
  # makes exec return a failure status instead, so the messages and the
  # fallback chain below are actually reachable.
  shopt -s execfail
  ## https://github.com/torvalds/linux/blob/v4.10-rc2/init/main.c#L999
  early_kernel_init
  # Note: at this point, as part of basic VFS init, a rootfs (special tmpfs) is mounted at /
  ## this is an initcall, called here: https://github.com/torvalds/linux/blob/v4.10-rc2/init/main.c#L873
  ## declared here: https://github.com/torvalds/linux/blob/v4.10-rc2/init/initramfs.c#L658
  populate_rootfs
  more_kernel_init
  # Open the /dev/console on the rootfs, this should never fail
  exec </dev/console >&0 2>&0 || echo "Warning: unable to open an initial console."
  # check if there is an early userspace init. If yes, let it do all the work
  if [ -z "$rdinit" ]; then
    rdinit=/init
  fi
  if [ ! -e "$rdinit" ]; then
    rdinit=
    # Mount root, the whole shebang.
    # Only done if there is *no* $rdinit (/init) in the initramfs!
    prepare_namespace
  fi
  # Ok, we have completed the initial bootup, and
  # we're essentially up and running. Get rid of the
  # initmem segments and start the user-mode stuff..
  #
  # rootfs is available now, try loading the public keys
  # and default modules
  integrity_load_keys
  load_default_modules
  late_kernel_init
  if [ -n "$rdinit" ]; then
    # If present in the initramfs, $rdinit (/init) is responsible
    # for *everything*, and this is the modern way of doing things.
    # To find out what $rdinit has to do in that case, read
    # Documentation/filesystems/ramfs-rootfs-initramfs.txt
    exec "$rdinit"
    echo "Failed to execute $rdinit (error $?)"
  fi
  if [ -n "$init" ]; then
    # This could be the real /sbin/init, or an initrd /sbin/init.
    exec "$init"
    echo "Requested init $init failed (error $?)"
  fi
  # Each exec only returns (non-zero) if that program could not be started.
  exec /sbin/init || exec /etc/init || exec /bin/init || exec /bin/sh
  panic "No working init found. Try passing init= option to kernel. See Linux Documentation/admin-guide/init.rst for guidance."
}
# Script entry point: run the translated init sequence.
kernel_init

copumpkin commented Jan 9, 2017 edited

Nice! It might be interesting to link back from comments here to relevant parts of the actual source, so people can follow the correspondence more directly. Might also be a lot of work though 😄

Owner

marcan commented Jan 9, 2017

@copumpkin good point, I added some links back to the functions :)

f3rdy commented Jan 11, 2017

This is valuable teaching. Thanks for that!

nonchip commented Jan 23, 2017

btw the chroot functionality you're assuming is actually a thing (called pivot_root) and used by early inits to mount the real root after running the initrd.

Owner

marcan commented Feb 3, 2017 edited

@nonchip not quite. pivot_root is a separate system call that affects the current mount namespace and all processes sharing it, while chroot only affects the current process. pivot_root is usually used in conjunction with chroot to ensure that the current working directory and root are correctly set. When I write chroot above I really do mean the good old chroot() system call. The problem is that it needs to affect the current process (the hypothetical shell, i.e. it needs to be built-in) while the traditional UNIX chroot command spawns a subprocess/subshell.

See https://github.com/torvalds/linux/blob/v4.10-rc2/fs/namespace.c#L3035 for more details on what exactly pivot_root does. It's very different from chroot (and it also only works on initrd/regular mounts, not on rootfs).

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment