From e86b3b3383732061db879acc053b1022bbf9122b76db3c1036c2e4f0c3638dae Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E6=96=9F=E9=85=8C=20=E9=B5=AC=E5=85=84?=
Date: Tue, 28 Apr 2026 03:56:27 +0800
Subject: [PATCH] Added supervised-init.sh to retry ctl init

---
Review note: the supervised-init.sh hunk was amended so that INT/TERM end the
script instead of only dropping the lock; the blob id on its index line
predates that amendment. Context-line indentation was restored by convention.

 alpine/rootfs-extra/etc/local.d/monok8s.start |  2 +-
 .../usr/lib/monok8s/lib/supervised-init.sh    | 67 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 clitools/pkg/bootstrap/runner.go              |  4 +-
 3 files changed, 70 insertions(+), 3 deletions(-)
 create mode 100755 alpine/rootfs-extra/usr/lib/monok8s/lib/supervised-init.sh

diff --git a/alpine/rootfs-extra/etc/local.d/monok8s.start b/alpine/rootfs-extra/etc/local.d/monok8s.start
index 4e76376..30428c9 100755
--- a/alpine/rootfs-extra/etc/local.d/monok8s.start
+++ b/alpine/rootfs-extra/etc/local.d/monok8s.start
@@ -38,4 +38,4 @@ if [ -n "$K8S_MINOR" ]; then
         "$MIGRATION_STATE_DIR/k8s/$K8S_MINOR"
 fi
 
-/usr/local/bin/ctl init --env-file "$CONFIG_DIR/cluster.env" >>/var/log/monok8s/bootstrap.log 2>&1 &
+/usr/lib/monok8s/lib/supervised-init.sh &
diff --git a/alpine/rootfs-extra/usr/lib/monok8s/lib/supervised-init.sh b/alpine/rootfs-extra/usr/lib/monok8s/lib/supervised-init.sh
new file mode 100755
index 0000000..ff0a1b1
--- /dev/null
+++ b/alpine/rootfs-extra/usr/lib/monok8s/lib/supervised-init.sh
@@ -0,0 +1,67 @@
+#!/bin/sh
+# Supervises "ctl init": re-runs it with increasing back-off until it succeeds.
+# Progress is appended to $LOG; the failure count is persisted in $STATE_DIR.
+set -eu
+
+CONFIG_DIR=/opt/monok8s/config
+LOG=/var/log/monok8s/bootstrap.log
+STATE_DIR=/run/monok8s
+FAIL_COUNT_FILE="$STATE_DIR/bootstrap-fail-count"
+LOCK_DIR="$STATE_DIR/supervised-init.lock"
+
+# For debugging: while this file exists, ctl init is not started.
+HOLD_FILE="$CONFIG_DIR/bootstrap.hold"
+
+mkdir -p "$STATE_DIR" /var/log/monok8s
+
+# The lock directory is a single-instance guard: mkdir fails if it exists.
+if ! mkdir "$LOCK_DIR" 2>/dev/null; then
+    echo "[$(date -Is)] supervised-init already running" >> "$LOG"
+    exit 0
+fi
+
+# Release the lock exactly once, on exit. A trapped INT/TERM does not end a
+# POSIX shell by itself, so those handlers exit explicitly; otherwise the loop
+# below would keep running after the lock had already been removed.
+trap 'rmdir "$LOCK_DIR" 2>/dev/null || :' EXIT
+trap 'exit 130' INT
+trap 'exit 143' TERM
+
+fail_count=0
+if [ -f "$FAIL_COUNT_FILE" ]; then
+    fail_count="$(cat "$FAIL_COUNT_FILE" 2>/dev/null || echo 0)"
+    # Discard anything that is not a plain non-negative integer.
+    case "$fail_count" in
+        ''|*[!0-9]*) fail_count=0 ;;
+    esac
+fi
+
+while true; do
+    if [ -f "$HOLD_FILE" ]; then
+        echo "[$(date -Is)] bootstrap held by $HOLD_FILE" >> "$LOG"
+        sleep 300
+        continue
+    fi
+
+    echo "[$(date -Is)] starting ctl init" >> "$LOG"
+
+    if /usr/local/bin/ctl init --env-file "$CONFIG_DIR/cluster.env" >> "$LOG" 2>&1; then
+        echo "[$(date -Is)] ctl init succeeded" >> "$LOG"
+        rm -f "$FAIL_COUNT_FILE"
+        exit 0
+    fi
+
+    fail_count=$((fail_count + 1))
+    echo "$fail_count" > "$FAIL_COUNT_FILE"
+
+    echo "[$(date -Is)] ctl init failed, count=$fail_count" >> "$LOG"
+
+    # Back off: 10s, 30s, 60s, 120s, then 300s between further attempts.
+    case "$fail_count" in
+        1) sleep 10 ;;
+        2) sleep 30 ;;
+        3) sleep 60 ;;
+        4) sleep 120 ;;
+        *) sleep 300 ;;
+    esac
+done
diff --git a/clitools/pkg/bootstrap/runner.go b/clitools/pkg/bootstrap/runner.go
index ab2af02..18797d3 100644
--- a/clitools/pkg/bootstrap/runner.go
+++ b/clitools/pkg/bootstrap/runner.go
@@ -70,7 +70,7 @@ func NewRunner(cfg *monov1alpha1.MonoKSConfig) *Runner {
 			{
 				RegKey: "EngageControlGate",
 				Name:   "Engage the control gate",
-				Desc:   "Prevents agent polling resources prematurely",
+				Desc:   "Prevents agent watching resources prematurely",
 			},
 			{
 				RegKey: "StartCRIO",
@@ -165,7 +165,7 @@ func NewRunner(cfg *monov1alpha1.MonoKSConfig) *Runner {
 			{
 				RegKey: "ReleaseControlGate",
 				Name:   "Release the control gate",
-				Desc:   "Allow agent to start polling resources",
+				Desc:   "Allow agent to start watching resources",
 			},
 		},
 	}