#!/bin/sh
set -eu

# Emit a progress message on stderr, tagged with "[init]".
# Arguments: every argument is joined with spaces into a single message.
log() {
    {
        echo "[init] $*"
    } 1>&2
}

# Report a fatal boot error on stderr and hand control to a shell.
# Arguments: every argument is joined with spaces into the error message.
# The current process is replaced by `sh` via exec, so when that exec
# succeeds this function never returns to its caller.
panic() {
    {
        echo "initramfs panic: $*"
    } 1>&2
    exec sh
}

# Run `mount` with exactly the given arguments; on failure call panic
# with a message that echoes those arguments.
mount_or_panic() {
    if ! mount "$@"; then
        panic "mount failed: $*"
    fi
}

# Mount a device, retrying for as long as the mount keeps failing.
# Arguments:
#   $1 - device (source) to mount
#   $2 - mount point
#   $3 - filesystem type, handed to `mount -t`
#   $4 - option string, handed to `mount -o`
# Returns 0 as soon as one attempt succeeds. Attempts are separated by
# `sleep 0.2`; after the 50th failed attempt panic is called.
# The working variables (dev, target, fstype, opts, i) are plain
# assignments, not function-local.
mount_retry() {
    dev="$1"
    target="$2"
    fstype="$3"
    opts="$4"
    i=0

    until mount -o "$opts" -t "$fstype" "$dev" "$target"; do
        i=$((i + 1))
        if [ "$i" -ge 50 ]; then
            panic "Timed out mounting $dev on $target"
        fi
        sleep 0.2
    done

    return 0
}

# Mount a writable overlay over a directory of the new root.
# Arguments:
#   $1 - absolute path inside the new root (e.g. /etc); anything that
#        does not start with "/" is rejected via panic.
# Layout:
#   lowerdir = /newroot<dir>
#   upperdir = /newroot/data<dir>-overlay/upper
#   workdir  = /newroot/data<dir>-overlay/work
# upperdir and workdir sit side by side under the same state directory,
# because overlayfs needs workdir on the same filesystem as upperdir.
# Any failure (bad argument, mkdir, mount) goes through panic so the
# rescue shell is reached instead of the script dying under `set -e`.
mount_data_overlay() {
    dir="$1"

    case "$dir" in
        /*) ;;
        *) panic "overlay dir must be absolute: $dir" ;;
    esac

    lower="/newroot$dir"
    state="/newroot/data${dir}-overlay"
    upper="$state/upper"
    work="$state/work"

    # Only try to create the lower directory when it is missing.
    if [ ! -d "$lower" ]; then
        mkdir -p "$lower" || panic "failed to create overlay lowerdir $lower"
    fi

    mkdir -p "$upper" "$work" \
        || panic "failed to create overlay state dirs under $state"

    log "Mounting overlay for $dir"

    mount_or_panic -t overlay overlay \
        -o "lowerdir=$lower,upperdir=$upper,workdir=$work" \
        "$lower"
}

# Block until a path exists.
# Arguments:
#   $1 - path to poll with `[ -e ]`
# Polls with `sleep 0.2` between checks; on the 50th check that still
# finds nothing, panic is called.
wait_for_path() {
    path="$1"
    i=0

    until [ -e "$path" ]; do
        i=$((i + 1))
        if [ "$i" -ge 50 ]; then
            panic "Timed out waiting for $path"
        fi
        sleep 0.2
    done
}

# Print the value of the first KEY=VALUE word on the kernel command line.
# Arguments:
#   $1 - key to look for (matched literally, before the first "=")
#   $2 - optional file to read instead of /proc/cmdline
# Outputs: the value (possibly empty) followed by a newline.
# Returns: 0 when the key was found, 1 otherwise (including when the
#          file is unreadable). Bare flags without "=" never match.
get_cmdline_arg() {
    key="$1"
    cmdline_file="${2:-/proc/cmdline}"

    [ -r "$cmdline_file" ] || return 1
    cmdline="$(cat "$cmdline_file")" || return 1

    # The command line is split on whitespace on purpose, but its words
    # must not be treated as glob patterns (e.g. "foo=*"). Disable
    # pathname expansion for the loop and restore the previous state.
    case "$-" in
        *f*) restore_glob=0 ;;
        *) restore_glob=1 ;;
    esac
    set -f

    found=1
    for arg in $cmdline; do
        case "$arg" in
            "$key"=*)
                # printf, not echo: the value may look like an echo option.
                printf '%s\n' "${arg#"$key"=}"
                found=0
                break
                ;;
        esac
    done

    if [ "$restore_glob" -eq 1 ]; then
        set +f
    fi

    return "$found"
}

# Look up one KEY=VALUE entry in a uevent-style file (such as
# /sys/class/block/*/uevent) without spawning grep/cut.
# Arguments:
#   $1 - file to read
#   $2 - key to look for
# Outputs: the value of the first matching key, followed by a newline.
# Returns: 0 when the key was found, 1 when the file is missing or the
#          key is absent.
get_uevent_value() {
    file="$1"
    want_key="$2"

    [ -f "$file" ] || return 1

    # `|| [ -n "$k" ]` keeps the final line when the file does not end
    # with a newline (read reports failure but still fills in k and v).
    while IFS='=' read -r k v || [ -n "$k" ]; do
        if [ "$k" = "$want_key" ]; then
            # printf, not echo: the value may look like an echo option.
            printf '%s\n' "$v"
            return 0
        fi
    done < "$file"

    return 1
}

# Print /dev/<name> for the first entry under /sys/class/block that is a
# partition (has a "partition" file) and whose uevent PARTNAME equals
# the requested label.
# Arguments:
#   $1 - PARTNAME to match exactly
# Returns: 0 when a match was printed, 1 when nothing matched.
find_first_part_by_partname() {
    want_label="$1"

    for p in /sys/class/block/*; do
        if [ -f "$p/partition" ]; then
            # A missing PARTNAME simply yields an empty string here.
            partname="$(get_uevent_value "$p/uevent" PARTNAME || true)"
            if [ "$partname" = "$want_label" ]; then
                devname="$(basename "$p")"
                echo "/dev/$devname"
                return 0
            fi
        fi
    done

    return 1
}

# Wait until every named partition can be found by PARTNAME.
# Arguments:
#   $1   - number of one-second waits to allow (default 3)
#   $2.. - PARTNAMEs that must all be present
# Returns: 0 once all names resolve, 1 when they still do not after the
#          last wait.
# The partitions are checked once up front and once after every sleep,
# so a device that shows up during the final second is still noticed.
wait_for_partnames() {
    timeout="${1:-3}"
    # Tolerate being called with no arguments at all.
    if [ "$#" -gt 0 ]; then
        shift
    fi

    i=0
    while :; do
        all_found=1
        for name in "$@"; do
            if ! find_first_part_by_partname "$name" >/dev/null; then
                all_found=0
                break
            fi
        done

        if [ "$all_found" -eq 1 ]; then
            return 0
        fi
        if [ "$i" -ge "$timeout" ]; then
            return 1
        fi

        sleep 1
        i=$((i + 1))
        log "Still waiting for $* to populate($i)"
    done
}

# Print /dev/<name> for the first partition under /sys/class/block whose
# uevent PARTUUID equals the requested value.
# Arguments:
#   $1 - PARTUUID to match (plain string comparison, so case matters)
# Returns: 0 when a match was printed, 1 when nothing matched.
find_part_by_partuuid() {
    want="$1"

    for p in /sys/class/block/*; do
        # Whole disks have no "partition" file; skip them.
        if [ ! -f "$p/partition" ]; then
            continue
        fi

        partuuid="$(get_uevent_value "$p/uevent" PARTUUID || true)"
        if [ "$partuuid" = "$want" ]; then
            echo "/dev/$(basename "$p")"
            return 0
        fi
    done

    return 1
}

# Print the name of the disk that owns a partition.
# The partition's /sys/class/block entry is resolved with `readlink -f`
# and the name of the directory containing the result is reported.
# Examples:
#   sda2      -> sda
#   mmcblk0p2 -> mmcblk0
# Arguments:
#   $1 - partition device name (no /dev/ prefix)
# Returns: 0 on success, 1 when the path cannot be resolved.
parent_disk_name_for_part() {
    part_devname="$1"

    if ! real="$(readlink -f "/sys/class/block/$part_devname")"; then
        return 1
    fi

    if ! parent="$(basename "$(dirname "$real")")"; then
        return 1
    fi

    echo "$parent"
    return 0
}

# Find a partition on the same disk as a given partition, by GPT PARTNAME.
# Arguments:
#   $1 - path of a known partition (e.g. /dev/sda2); only its basename
#        is used
#   $2 - PARTNAME of the sibling to look for
#   $3 - optional sysfs block directory (default /sys/class/block)
# Outputs: /dev/<name> of the first matching sibling.
# Returns: 0 when a sibling was printed, 1 when the parent disk cannot
#          be determined or no partition on it matches.
find_sibling_part_on_same_disk() {
    part_path="$1"
    want_label="$2"
    block_root="${3:-/sys/class/block}"

    part_devname="$(basename "$part_path")"
    disk_devname="$(parent_disk_name_for_part "$part_devname")" || return 1
    [ -n "$disk_devname" ] || return 1

    # Walk the disk's own children instead of globbing "<disk>*" in the
    # flat class directory: a name prefix also matches partitions of
    # other disks (sda* matches sdaa1, nvme0n1* matches nvme0n10p1).
    # Partitions appear as subdirectories of their disk, which is the
    # same layout parent_disk_name_for_part relies on.
    for p in "$block_root/$disk_devname"/*; do
        [ -f "$p/partition" ] || continue

        partname="$(get_uevent_value "$p/uevent" PARTNAME || true)"
        [ "$partname" = "$want_label" ] || continue

        echo "/dev/$(basename "$p")"
        return 0
    done

    return 1
}

# Resolve the preferred root device from sysfs by PARTUUID.
# Arguments:
#   $1 - PARTUUID of the preferred root partition; may be empty
# Outputs: whatever find_part_by_partuuid prints for that PARTUUID.
# Returns: 1 when the argument is empty, otherwise the status of
#          find_part_by_partuuid. PARTUUID is the only lookup performed.
resolve_preferred_root() {
    pref_root="$1"

    if [ -z "$pref_root" ]; then
        return 1
    fi

    find_part_by_partuuid "$pref_root"
}

# Map a boot slot to the PARTNAME of its root filesystem.
# Arguments:
#   $1 - slot identifier
# Outputs: "rootfsB" for slot "B" or "b"; "rootfsA" for anything else,
#          including an empty slot.
wanted_root_labels_for_slot() {
    slot="$1"

    if [ "$slot" = "B" ] || [ "$slot" = "b" ]; then
        echo "rootfsB"
    else
        echo "rootfsA"
    fi
}

# Locate a root partition for a slot by PARTNAME when no preferred root
# was resolved.
# Arguments:
#   $1 - slot identifier, translated by wanted_root_labels_for_slot
# Outputs: the device path of the first label that resolves.
# Returns: 0 when a device was printed, 1 when no label resolved.
find_fallback_root_for_slot() {
    slot="$1"

    for label in $(wanted_root_labels_for_slot "$slot"); do
        dev="$(find_first_part_by_partname "$label" || true)"
        # Nothing found for this label: try the next one.
        [ -n "$dev" ] || continue

        echo "$dev"
        return 0
    done

    return 1
}

# --- Early environment setup ---------------------------------------------
# Create the mount points and mount devtmpfs, proc, sysfs and a tmpfs for
# /run. Every mount goes through mount_or_panic, so a failure calls panic.
mkdir -p /dev /proc /sys /run
mount_or_panic -t devtmpfs devtmpfs /dev
mount_or_panic -t proc proc /proc
mount_or_panic -t sysfs sysfs /sys
mount_or_panic -t tmpfs tmpfs /run

mkdir -p /sys/fs/bpf

# Mount a bpf filesystem at /sys/fs/bpf unless something is already
# mounted there.
if ! mountpoint -q /sys/fs/bpf; then
    mount_or_panic -t bpf bpffs /sys/fs/bpf
fi

# Change mount propagation: /sys and /run recursively shared,
# /sys/fs/bpf shared.
mount_or_panic --make-rshared /sys
mount_or_panic --make-rshared /run
mount_or_panic --make-shared /sys/fs/bpf

# Write 1 to the kernel printk sysctl.
# NOTE(review): presumably to quiet console logging during boot — confirm.
echo 1 > /proc/sys/kernel/printk

mkdir -p /dev/pts
mount_or_panic -t devpts devpts /dev/pts

# Optional early fan kick. Do not fail boot if this path is not ready yet.
# Only attempted when the pwm1 file is writable; a failed write is ignored.
if [ -w /sys/class/hwmon/hwmon0/pwm1 ]; then
    echo 100 > /sys/class/hwmon/hwmon0/pwm1 || true
fi

# Report the first field of /proc/uptime as the time spent so far.
log "Booting kernel took $(cut -d' ' -f1 /proc/uptime) seconds."

# --- Locate the root and data partitions ---------------------------------
# Load /etc/build-info into this shell.
# A POSIX shell aborts outright when `.` cannot find its file, so the
# `|| panic` on the source line alone would never run for a missing file.
# Check readability first so that case also reaches the rescue shell.
[ -r /etc/build-info ] || panic "failed to source /etc/build-info"
. /etc/build-info || panic "failed to source /etc/build-info"

# Allow up to five one-second waits for both root slots and the data
# partition to show up by PARTNAME.
wait_for_partnames 5 rootfsA rootfsB data || panic "failed to wait for fs"

# Boot parameters from the kernel command line; each is empty when absent.
# NOTE(review): ROOT_CMD is not read again in this script — confirm
# whether it is still needed.
ROOT_CMD="$(get_cmdline_arg root || true)"
BOOT_PART="$(get_cmdline_arg bootpart || true)"
PREFERRED_PARTUUID="$(get_cmdline_arg pref_root || true)"

# First choice: the partition whose PARTUUID was passed as pref_root=.
ROOT_DEV="$(resolve_preferred_root "$PREFERRED_PARTUUID" || true)"
if [ -n "$ROOT_DEV" ]; then
    log "Using preferred root device: $ROOT_DEV"
fi

# Fallback: the root partition for the requested slot, found by PARTNAME.
# When no slot was requested the lookup uses rootfsA, so record slot A.
if [ -z "$ROOT_DEV" ]; then
    ROOT_DEV="$(find_fallback_root_for_slot "$BOOT_PART" || true)"
    if [ -n "$ROOT_DEV" ]; then
        if [ -z "$BOOT_PART" ]; then
            BOOT_PART=A
        fi
        log "Preferred root not found. Falling back to first valid root device: $ROOT_DEV"
    fi
fi

[ -n "$ROOT_DEV" ] || panic "no usable root device found"

# The data partition must live on the same disk as the chosen root.
DATA_DEV="$(find_sibling_part_on_same_disk "$ROOT_DEV" data || true)"

[ -n "$DATA_DEV" ] || panic "no data partition found on same disk as $ROOT_DEV"

# sysfs knows the partitions; wait until their /dev nodes exist as well.
wait_for_path "$ROOT_DEV"
wait_for_path "$DATA_DEV"

# --- Check the data filesystem before mounting it ------------------------
# e2fsck's exit status is a bit mask: 0 means clean, 1 and 2 mean errors
# were found AND corrected, 4 and above mean errors remain or e2fsck
# itself failed. Only a status of 4 or more is a failure; treating 1 as
# failure would panic right after a successful repair.
fsck_rc=0
e2fsck -p "$DATA_DEV" || fsck_rc=$?
if [ "$fsck_rc" -ge 4 ]; then
    log "Auto fsck failed, forcing repair"
    fsck_rc=0
    e2fsck -y "$DATA_DEV" || fsck_rc=$?
    [ "$fsck_rc" -lt 4 ] || panic "fsck failed on $DATA_DEV"
fi

# --- Assemble the new root under /newroot --------------------------------
# Create the mount point (and data/var directories beneath it) in the
# initramfs.
mkdir -p /newroot
mkdir -p /newroot/data
mkdir -p /newroot/var

# Root filesystem is mounted read-only, the data partition read-write.
# Both are ext4 and both go through mount_retry.
mount_retry "$ROOT_DEV" /newroot ext4 ro
mount_retry "$DATA_DEV" /newroot/data ext4 rw

# Back /var with the data partition: bind-mount data/var over /newroot/var
# and mark that mount recursively shared.
mkdir -p /newroot/data/var
mount_or_panic --bind /newroot/data/var /newroot/var
mount_or_panic --make-rshared /newroot/var

# Overlay /etc and /opt/cni so writes land on the data partition
# (upper/work directories live under /newroot/data, see mount_data_overlay).
mount_data_overlay /etc
mount_data_overlay /opt/cni

# --- Publish the alternate (inactive) root slot --------------------------
# Decide which slot is the alternate from the PARTNAME of the root that
# was actually selected. Comparing BOOT_PART against the literal "A"
# breaks when bootpart= is empty or lowercase, or when the preferred root
# was used: the "alternate" could then be the active root itself.
root_label="$(get_uevent_value "/sys/class/block/$(basename "$ROOT_DEV")/uevent" PARTNAME || true)"
case "$root_label" in
    rootfsA) alt_label=rootfsB ;;
    rootfsB) alt_label=rootfsA ;;
    *)
        # Root has some other PARTNAME: fall back to the requested slot,
        # read the same way wanted_root_labels_for_slot reads it
        # (B or b means slot B, anything else slot A).
        case "$BOOT_PART" in
            B|b) alt_label=rootfsA ;;
            *) alt_label=rootfsB ;;
        esac
        ;;
esac

ALT_PART="$(find_sibling_part_on_same_disk "$ROOT_DEV" "$alt_label" || true)"

# Never point the alternate-slot link at the active root.
if [ -n "$ALT_PART" ] && [ "$ALT_PART" != "$ROOT_DEV" ]; then
    ln -sf "$ALT_PART" /dev/mksaltpart
fi

# --- Record boot state and hand over to the real init --------------------
# Write the selected slot and devices to /run/monok8s/boot-state.env as
# KEY=VALUE lines. /run is moved into the new root below, so the file
# stays available at the same path after the switch.
mkdir -p /run/monok8s
cat > /run/monok8s/boot-state.env <<EOF
BOOT_PART=$BOOT_PART
ROOT_DEV=$ROOT_DEV
DATA_DEV=$DATA_DEV
EOF

# NOTE(review): presumably a mount point for the alternate rootfs used
# later by the booted system — confirm.
mkdir -p /run/altrootfs

# Move the already-mounted filesystems into the new root.
mount_or_panic --move /dev /newroot/dev
mount_or_panic --move /proc /newroot/proc
mount_or_panic --move /sys /newroot/sys
mount_or_panic --move /run /newroot/run

log "Switching root to $ROOT_DEV (data: $DATA_DEV, slot: $BOOT_PART)"
# exec replaces this process with switch_root, which runs /sbin/init
# from /newroot.
exec switch_root /newroot /sbin/init

# NOTE(review): a non-interactive POSIX shell exits when exec fails, so
# this line may never run — confirm against the shell used as /bin/sh.
panic "switch_root returned unexpectedly"
