detect local cluster states

2026-03-29 22:26:54 +08:00
parent ecceff225f
commit 03a5e5bedb
7 changed files with 524 additions and 16 deletions


@@ -2,18 +2,462 @@ package node
import (
"context"
"errors"
"fmt"
"net"
"os"
"strings"
"time"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/client-go/discovery"
"k8s.io/client-go/kubernetes"
"k8s.io/client-go/tools/clientcmd"
"k8s.io/klog/v2"
)
const (
adminKubeconfigPath = "/etc/kubernetes/admin.conf"
kubeletKubeconfigPath = "/etc/kubernetes/kubelet.conf"
)
func DetectLocalClusterState(ctx context.Context, nctx *NodeContext) error {
_ = ctx
if nctx == nil {
return fmt.Errorf("node context is nil")
}
_, err := os.Stat(adminKubeconfigPath)
hasAdmin := err == nil
if err != nil && !errors.Is(err, os.ErrNotExist) {
return fmt.Errorf("stat admin kubeconfig: %w", err)
}
_, err = os.Stat(kubeletKubeconfigPath)
hasKubelet := err == nil
if err != nil && !errors.Is(err, os.ErrNotExist) {
return fmt.Errorf("stat kubelet kubeconfig: %w", err)
}
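// Classify membership from which kubeconfigs survive on disk: neither means a
// fresh node, kubelet-only an existing worker, both an existing control
// plane, and admin-only a suspicious partial state.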
state := LocalClusterState{
HasAdminKubeconfig: hasAdmin,
HasKubeletKubeconfig: hasKubelet,
}
switch {
case !hasAdmin && !hasKubelet:
state.MembershipKind = LocalMembershipFresh
case !hasAdmin && hasKubelet:
state.MembershipKind = LocalMembershipExistingWorker
case hasAdmin && hasKubelet:
state.MembershipKind = LocalMembershipExistingControlPlane
case hasAdmin && !hasKubelet:
state.MembershipKind = LocalMembershipPartial
default:
return fmt.Errorf("unreachable local cluster state")
}
nctx.LocalClusterState = &state
return nil
}
func WaitForExistingClusterIfNeeded(ctx context.Context, nctx *NodeContext) error {
if nctx == nil || nctx.LocalClusterState == nil {
return errors.New("local cluster state has not been detected yet")
}
switch nctx.LocalClusterState.MembershipKind {
case LocalMembershipFresh:
return nil
case LocalMembershipExistingWorker:
if err := StartKubelet(ctx, nctx); err != nil {
return fmt.Errorf("start kubelet: %w", err)
}
return nil
case LocalMembershipExistingControlPlane:
if err := StartKubelet(ctx, nctx); err != nil {
return fmt.Errorf("start kubelet: %w", err)
}
// Existing local control-plane state: wait for local apiserver if this
// machine is meant to be a control-plane node.
if strings.TrimSpace(nctx.Config.Spec.ClusterRole) == "control-plane" {
if err := waitForLocalAPIServer(ctx, nctx, 2*time.Minute); err != nil {
return fmt.Errorf("wait for local apiserver: %w", err)
}
if err := waitForAPIViaKubeconfig(ctx, adminKubeconfigPath, 2*time.Minute); err != nil {
return fmt.Errorf("wait for admin api: %w", err)
}
}
return nil
case LocalMembershipPartial:
// Be strict here. Partial state is suspicious.
return fmt.Errorf("partial local cluster state detected: admin=%t kubelet=%t",
nctx.LocalClusterState.HasAdminKubeconfig,
nctx.LocalClusterState.HasKubeletKubeconfig,
)
default:
return fmt.Errorf("unknown local membership kind %q", nctx.LocalClusterState.MembershipKind)
}
}
func CheckForVersionSkew(ctx context.Context, nctx *NodeContext) error {
role := strings.TrimSpace(nctx.Config.Spec.ClusterRole)
wantVersion := normalizeKubeVersion(strings.TrimSpace(nctx.Config.Spec.KubernetesVersion))
if wantVersion == "" {
return errors.New("spec.kubernetesVersion is required")
}
switch nctx.LocalClusterState.MembershipKind {
case LocalMembershipFresh:
// Fresh node has no existing cluster membership to compare against.
return nil
case LocalMembershipPartial:
return fmt.Errorf("cannot check version skew with partial local cluster state")
}
versionKubeconfig := chooseVersionKubeconfig(nctx.LocalClusterState)
if versionKubeconfig == "" {
return fmt.Errorf("no kubeconfig available for version detection")
}
currentVersion, err := getServerVersion(ctx, versionKubeconfig)
if err != nil {
if role == "control-plane" {
return fmt.Errorf("existing control-plane state found, but cluster version could not be determined: %w", err)
}
// Worker path stays permissive.
nctx.BootstrapState.UnsupportedWorkerVersionSkew = true
nctx.BootstrapState.VersionSkewReason = "cluster version could not be determined"
return nil
}
nctx.BootstrapState.DetectedClusterVersion = currentVersion
switch role {
case "control-plane":
if !isSupportedControlPlaneSkew(currentVersion, wantVersion) {
return fmt.Errorf(
"unsupported control-plane version skew: cluster=%s node=%s",
currentVersion, wantVersion,
)
}
case "worker":
if !isSupportedWorkerSkew(currentVersion, wantVersion) {
nctx.BootstrapState.UnsupportedWorkerVersionSkew = true
nctx.BootstrapState.VersionSkewReason = fmt.Sprintf(
"unsupported worker version skew: cluster=%s node=%s",
currentVersion, wantVersion,
)
}
default:
return fmt.Errorf("unsupported cluster role %q", role)
}
return nil
}
func ClassifyBootstrapAction(ctx context.Context, nctx *NodeContext) error {
_ = ctx
role := strings.TrimSpace(nctx.Config.Spec.ClusterRole)
initControlPlane := nctx.Config.Spec.InitControlPlane
wantVersion := normalizeKubeVersion(strings.TrimSpace(nctx.Config.Spec.KubernetesVersion))
if wantVersion == "" {
return errors.New("spec.kubernetesVersion is required")
}
nctx.BootstrapState = &BootstrapState{}
switch role {
case "worker":
switch nctx.LocalClusterState.MembershipKind {
case LocalMembershipFresh:
nctx.BootstrapState.Action = BootstrapActionJoinWorker
return nil
case LocalMembershipExistingWorker:
if nctx.BootstrapState.DetectedClusterVersion == "" {
nctx.BootstrapState.Action = BootstrapActionReconcileWorker
return nil
}
if versionEq(nctx.BootstrapState.DetectedClusterVersion, wantVersion) {
nctx.BootstrapState.Action = BootstrapActionReconcileWorker
} else {
nctx.BootstrapState.Action = BootstrapActionUpgradeWorker
}
return nil
case LocalMembershipExistingControlPlane, LocalMembershipPartial:
return fmt.Errorf("local state %q is invalid for worker role", nctx.LocalClusterState.MembershipKind)
default:
return fmt.Errorf("unknown local membership kind %q", nctx.LocalClusterState.MembershipKind)
}
case "control-plane":
switch nctx.LocalClusterState.MembershipKind {
case LocalMembershipFresh:
if initControlPlane {
nctx.BootstrapState.Action = BootstrapActionInitControlPlane
} else {
nctx.BootstrapState.Action = BootstrapActionJoinControlPlane
}
return nil
case LocalMembershipExistingControlPlane:
if nctx.BootstrapState.DetectedClusterVersion == "" {
return errors.New("existing control-plane state found, but detected cluster version is empty")
}
if versionEq(nctx.BootstrapState.DetectedClusterVersion, wantVersion) {
nctx.BootstrapState.Action = BootstrapActionReconcileControlPlane
} else {
nctx.BootstrapState.Action = BootstrapActionUpgradeControlPlane
}
return nil
case LocalMembershipExistingWorker:
return fmt.Errorf("local state %q is invalid for control-plane role", nctx.LocalClusterState.MembershipKind)
case LocalMembershipPartial:
return fmt.Errorf("partial local cluster state is invalid for control-plane role")
default:
return fmt.Errorf("unknown local membership kind %q", nctx.LocalClusterState.MembershipKind)
}
default:
return fmt.Errorf("unsupported cluster role %q", role)
}
}
func InitControlPlane(ctx context.Context, nctx *NodeContext) error {
if nctx == nil {
return errors.New("node context is nil")
}
if nctx.Config == nil {
return errors.New("node config is nil")
}
if strings.TrimSpace(nctx.Config.Spec.ClusterRole) != "control-plane" {
return fmt.Errorf("init control-plane called for non-control-plane role %q", nctx.Config.Spec.ClusterRole)
}
if !nctx.Config.Spec.InitControlPlane {
return errors.New("init control-plane called but spec.initControlPlane is false")
}
if nctx.BootstrapState.Action != BootstrapActionInitControlPlane {
return fmt.Errorf("init control-plane called with bootstrap action %q", nctx.BootstrapState.Action)
}
// Fresh init only. Existing control-plane recovery/wait belongs elsewhere.
switch nctx.LocalClusterState.MembershipKind {
case LocalMembershipFresh:
// continue
default:
return fmt.Errorf("init control-plane requires fresh local state, got %q", nctx.LocalClusterState.MembershipKind)
}
// Example:
// if err := RunKubeadmInit(ctx, nctx); err != nil {
// return fmt.Errorf("kubeadm init: %w", err)
// }
return nil
}
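// A possible shape for that init step, sketched here since the real runner is
// out of scope for this commit: shell out to kubeadm with a rendered config.
// The config path and the RunKubeadmInit name are assumptions, and promoting
// this to real code needs an "os/exec" import.
//
// Example:
//	func RunKubeadmInit(ctx context.Context, nctx *NodeContext) error {
//		// Assumes GenerateKubeadmConfig has rendered the config to this path.
//		const kubeadmConfigPath = "/etc/kubernetes/monoks-kubeadm.yaml"
//		cmd := exec.CommandContext(ctx, "kubeadm", "init", "--config", kubeadmConfigPath)
//		cmd.Stdout = os.Stdout
//		cmd.Stderr = os.Stderr
//		if err := cmd.Run(); err != nil {
//			return fmt.Errorf("kubeadm init: %w", err)
//		}
//		return nil
//	}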
func waitForLocalAPIServer(ctx context.Context, nctx *NodeContext, timeout time.Duration) error {
addr := strings.TrimSpace(nctx.Config.Spec.APIServerAdvertiseAddress)
if addr == "" {
return errors.New("spec.apiServerAdvertiseAddress is required for local control-plane wait")
}
d := net.Dialer{Timeout: 2 * time.Second}
deadlineCtx, cancel := context.WithTimeout(ctx, timeout)
defer cancel()
target := net.JoinHostPort(addr, "6443")
ticker := time.NewTicker(2 * time.Second)
defer ticker.Stop()
for {
conn, err := d.DialContext(deadlineCtx, "tcp", target)
if err == nil {
_ = conn.Close()
return nil
}
select {
case <-deadlineCtx.Done():
return fmt.Errorf("apiserver %s did not become reachable within %s", target, timeout)
case <-ticker.C:
}
}
}
func waitForAPIViaKubeconfig(ctx context.Context, kubeconfigPath string, timeout time.Duration) error {
deadlineCtx, cancel := context.WithTimeout(ctx, timeout)
defer cancel()
ticker := time.NewTicker(2 * time.Second)
defer ticker.Stop()
for {
_, err := getServerVersion(deadlineCtx, kubeconfigPath)
if err == nil {
return nil
}
select {
case <-deadlineCtx.Done():
return fmt.Errorf("api via kubeconfig %s did not become reachable within %s", kubeconfigPath, timeout)
case <-ticker.C:
}
}
}
func getServerVersion(ctx context.Context, kubeconfigPath string) (string, error) {
restCfg, err := clientcmd.BuildConfigFromFlags("", kubeconfigPath)
if err != nil {
return "", fmt.Errorf("build kubeconfig %s: %w", kubeconfigPath, err)
}
// Keep this short. This is a probe, not a long-running client.
restCfg.Timeout = 5 * time.Second
clientset, err := kubernetes.NewForConfig(restCfg)
if err != nil {
return "", fmt.Errorf("create clientset: %w", err)
}
disc := clientset.Discovery()
return discoverServerVersion(ctx, disc)
}
func discoverServerVersion(ctx context.Context, disc discovery.DiscoveryInterface) (string, error) {
info, err := disc.ServerVersion()
if err != nil {
return "", err
}
if info == nil || strings.TrimSpace(info.GitVersion) == "" {
return "", errors.New("server version is empty")
}
return normalizeKubeVersion(info.GitVersion), nil
}
type kubeVersion struct {
Major int
Minor int
Patch int
}
func parseKubeVersion(s string) (kubeVersion, error) {
s = strings.TrimSpace(s)
s = strings.TrimPrefix(s, "v")
var v kubeVersion
// Accepts "1.29" or "1.29.3". Sscanf reports a non-nil error for the
// two-component form (it stops at the missing patch), so only the count of
// scanned items decides validity here.
n, _ := fmt.Sscanf(s, "%d.%d.%d", &v.Major, &v.Minor, &v.Patch)
if n < 2 {
return kubeVersion{}, fmt.Errorf("invalid kubernetes version %q", s)
}
return v, nil
}
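// Worked examples for the accepted forms:
//
//	parseKubeVersion("v1.29")  // kubeVersion{Major: 1, Minor: 29, Patch: 0}
//	parseKubeVersion("1.29.3") // kubeVersion{Major: 1, Minor: 29, Patch: 3}
//	parseKubeVersion("v1")     // error: invalid kubernetes version "1"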
// Control-plane: keep this strict.
// Accept same version, or a one-minor step where the node binary is newer than the current cluster.
// That covers normal control-plane upgrade flow but blocks nonsense.
func isSupportedControlPlaneSkew(clusterVersion, nodeVersion string) bool {
cv, err := parseKubeVersion(clusterVersion)
if err != nil {
return false
}
nv, err := parseKubeVersion(nodeVersion)
if err != nil {
return false
}
if cv.Major != nv.Major {
return false
}
if cv.Minor == nv.Minor {
return true
}
if nv.Minor == cv.Minor+1 {
return true
}
return false
}
// Worker: kubelet generally must not be newer than the apiserver.
// Older kubelets are allowed within supported skew range.
// Unsupported worker skew should still proceed (the worker path stays
// permissive), so this helper only classifies support status; callers must
// not treat an unsupported result as fatal.
func isSupportedWorkerSkew(clusterVersion, nodeVersion string) bool {
cv, err := parseKubeVersion(clusterVersion)
if err != nil {
return false
}
nv, err := parseKubeVersion(nodeVersion)
if err != nil {
return false
}
if cv.Major != nv.Major {
return false
}
// kubelet newer than apiserver => unsupported
if nv.Minor > cv.Minor {
return false
}
// kubelet up to 3 minors older than apiserver => supported
if cv.Minor-nv.Minor <= 3 {
return true
}
return false
}
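// Worked examples for both skew helpers, matching the policies above:
//
//	isSupportedControlPlaneSkew("v1.29.3", "v1.30.0") // true: one-minor upgrade step
//	isSupportedControlPlaneSkew("v1.29.3", "v1.31.0") // false: two-minor jump
//	isSupportedWorkerSkew("v1.29.3", "v1.26.0")       // true: kubelet three minors older
//	isSupportedWorkerSkew("v1.29.3", "v1.30.0")       // false: kubelet newer than apiserver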
// This should not try to taint the node directly here.
// Just record intent and let a later reconcile step apply the taint.
func markUnsupportedWorkerVersionSkew(nctx *NodeContext, clusterVersion, nodeVersion string) {
// BootstrapState is the state carrier the skew check above already uses.
if nctx.BootstrapState == nil {
nctx.BootstrapState = &BootstrapState{}
}
nctx.BootstrapState.UnsupportedWorkerVersionSkew = true
nctx.BootstrapState.VersionSkewReason = fmt.Sprintf(
"unsupported worker version skew: cluster=%s node=%s", clusterVersion, nodeVersion,
)
}
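// A sketch of the later reconcile step that applies the recorded taint,
// assuming client-go as already imported above; the taint key, function name,
// and nodeName parameter are illustrative, and corev1 ("k8s.io/api/core/v1")
// would need importing.
//
// Example:
//	func applyUnsupportedSkewTaint(ctx context.Context, clientset kubernetes.Interface, nodeName string) error {
//		node, err := clientset.CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{})
//		if err != nil {
//			return err
//		}
//		// Real code should skip the append if the taint is already present.
//		node.Spec.Taints = append(node.Spec.Taints, corev1.Taint{
//			Key:    "monoks.io/unsupported-version-skew",
//			Effect: corev1.TaintEffectNoSchedule,
//		})
//		_, err = clientset.CoreV1().Nodes().Update(ctx, node, metav1.UpdateOptions{})
//		return err
//	}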
// Optional helper if you want to probe readiness later through the API.
// Keeping this here in case you want a very cheap liveness call elsewhere.
func apiServerReady(ctx context.Context, kubeconfigPath string) error {
restCfg, err := clientcmd.BuildConfigFromFlags("", kubeconfigPath)
if err != nil {
return err
}
restCfg.Timeout = 5 * time.Second
clientset, err := kubernetes.NewForConfig(restCfg)
if err != nil {
return err
}
_, err = clientset.CoreV1().Namespaces().Get(ctx, metav1.NamespaceSystem, metav1.GetOptions{})
return err
}
func ValidateRequiredImagesPresent(ctx context.Context, n *NodeContext) error {
if n.Config.Spec.SkipImageCheck {
klog.Infof("skipping image check (skipImageCheck=true)")
return nil
@@ -65,6 +509,32 @@ func checkImagePresent(ctx context.Context, n *NodeContext, image string) error
}
return nil
}
func chooseVersionKubeconfig(state *LocalClusterState) string {
if state.HasAdminKubeconfig {
return adminKubeconfigPath
}
if state.HasKubeletKubeconfig {
return kubeletKubeconfigPath
}
return ""
}
func versionEq(a, b string) bool {
return normalizeKubeVersion(a) == normalizeKubeVersion(b)
}
func normalizeKubeVersion(v string) string {
v = strings.TrimSpace(v)
if v == "" {
return ""
}
if !strings.HasPrefix(v, "v") {
v = "v" + v
}
return v
}
func GenerateKubeadmConfig(context.Context, *NodeContext) error {
klog.Info("generate_kubeadm_config: TODO render kubeadm v1beta4 config from MonoKSConfig")
return nil
}
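// One possible shape for that rendering, assuming text/template over spec
// fields this file already reads (KubernetesVersion, APIServerAdvertiseAddress);
// the template itself is a sketch, not the committed format.
//
// Example:
//	const kubeadmConfigTemplate = `apiVersion: kubeadm.k8s.io/v1beta4
//	kind: InitConfiguration
//	localAPIEndpoint:
//	  advertiseAddress: {{ .APIServerAdvertiseAddress }}
//	---
//	apiVersion: kubeadm.k8s.io/v1beta4
//	kind: ClusterConfiguration
//	kubernetesVersion: {{ .KubernetesVersion }}
//	`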