Added supervised-init.sh to retry ctl init

This commit is contained in:
2026-04-28 03:56:27 +08:00
parent 7b31a1dec3
commit e86b3b3383
3 changed files with 60 additions and 3 deletions

View File

@@ -38,4 +38,4 @@ if [ -n "$K8S_MINOR" ]; then
"$MIGRATION_STATE_DIR/k8s/$K8S_MINOR"
fi
/usr/local/bin/ctl init --env-file "$CONFIG_DIR/cluster.env" >>/var/log/monok8s/bootstrap.log 2>&1 &
/usr/lib/monok8s/lib/supervised-init.sh &

View File

@@ -0,0 +1,57 @@
#!/bin/sh
# supervised-init.sh - run `ctl init` repeatedly until it succeeds.
#
# Behaviour:
#   * Only one instance runs at a time; a lock directory under $STATE_DIR is
#     used as the mutex (mkdir is atomic). A second instance logs a message
#     and exits 0.
#   * While $HOLD_FILE exists no attempt is made; the script re-checks every
#     300 seconds.
#   * After each failed attempt the failure count is persisted to
#     $FAIL_COUNT_FILE and the script sleeps 10/30/60/120 seconds for the
#     first four failures and 300 seconds from the fifth failure on.
#   * On success the failure count file is removed and the script exits 0.
#
# All output, including that of `ctl init`, is appended to $LOG.
set -eu

CONFIG_DIR=/opt/monok8s/config
LOG=/var/log/monok8s/bootstrap.log
STATE_DIR=/run/monok8s
FAIL_COUNT_FILE="$STATE_DIR/bootstrap-fail-count"
LOCK_DIR="$STATE_DIR/supervised-init.lock"
# For debugging: while this file exists, bootstrap attempts are paused.
HOLD_FILE="$CONFIG_DIR/bootstrap.hold"

# Append a timestamped line to $LOG.
#   $* - message text
log() {
  printf '[%s] %s\n' "$(date -Is)" "$*" >> "$LOG"
}

# Remove the lock directory. Tolerates the directory already being gone so
# the EXIT trap never fails or changes the script's exit path.
release_lock() {
  rmdir "$LOCK_DIR" 2>/dev/null || true
}

# Print the number of seconds to wait after the given number of failures.
#   $1 - consecutive failure count (positive integer)
backoff_seconds() {
  case "$1" in
    1) echo 10 ;;
    2) echo 30 ;;
    3) echo 60 ;;
    4) echo 120 ;;
    *) echo 300 ;;
  esac
}

mkdir -p "$STATE_DIR" /var/log/monok8s

if ! mkdir "$LOCK_DIR" 2>/dev/null; then
  log "supervised-init already running"
  exit 0
fi

# The traps are installed only after the lock is acquired, so an instance
# that lost the race above can never remove the winner's lock.
#
# INT/TERM must terminate the script: a signal trap whose action does not
# exit lets execution resume afterwards, which would leave the retry loop
# running with the lock already released. Exiting from the signal trap runs
# the EXIT trap, so the lock is released exactly once.
# Note: the shell defers a signal trap until the current foreground command
# (`ctl init` or `sleep`) has finished.
trap release_lock EXIT
trap 'exit 130' INT
trap 'exit 143' TERM

# Restore the failure count left by an earlier run, if it is usable.
fail_count=0
if [ -f "$FAIL_COUNT_FILE" ]; then
  fail_count="$(cat "$FAIL_COUNT_FILE" 2>/dev/null || echo 0)"
  case "$fail_count" in
    '' | *[!0-9]*) fail_count=0 ;;
  esac
  # Drop leading zeros: shell arithmetic reads a leading 0 as octal, so a
  # value such as "08" would abort the script and "010" would count as 8.
  while [ "${#fail_count}" -gt 1 ] && [ "${fail_count#0}" != "$fail_count" ]; do
    fail_count=${fail_count#0}
  done
fi

while true; do
  if [ -f "$HOLD_FILE" ]; then
    log "bootstrap held by $HOLD_FILE"
    sleep 300
    continue
  fi

  log "starting ctl init"
  if /usr/local/bin/ctl init --env-file "$CONFIG_DIR/cluster.env" >> "$LOG" 2>&1; then
    log "ctl init succeeded"
    rm -f "$FAIL_COUNT_FILE"
    exit 0
  fi

  fail_count=$((fail_count + 1))
  echo "$fail_count" > "$FAIL_COUNT_FILE"
  log "ctl init failed, count=$fail_count"

  sleep "$(backoff_seconds "$fail_count")"
done

View File

@@ -70,7 +70,7 @@ func NewRunner(cfg *monov1alpha1.MonoKSConfig) *Runner {
{
RegKey: "EngageControlGate",
Name: "Engage the control gate",
Desc: "Prevents agent polling resources prematurely",
Desc: "Prevents agent watching resources prematurely",
},
{
RegKey: "StartCRIO",
@@ -165,7 +165,7 @@ func NewRunner(cfg *monov1alpha1.MonoKSConfig) *Runner {
{
RegKey: "ReleaseControlGate",
Name: "Release the control gate",
Desc: "Allow agent to start polling resources",
Desc: "Allow agent to start watching resources",
},
},
}