Fixed some race conditions

This commit is contained in:
2026-04-06 05:18:06 +08:00
parent 50d9440e0a
commit d662162921
4 changed files with 259 additions and 87 deletions

View File

@@ -3,12 +3,14 @@ package osupgrade
import (
"context"
"fmt"
"strings"
apierrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/runtime/schema"
"k8s.io/client-go/util/retry"
"k8s.io/klog/v2"
monov1alpha1 "example.com/monok8s/pkg/apis/monok8s/v1alpha1"
@@ -30,13 +32,11 @@ func ensureProgressHeartbeat(ctx context.Context, clients *kube.Clients,
namespace string, nodeName string,
osu *monov1alpha1.OSUpgrade,
) (*monov1alpha1.OSUpgradeProgress, error) {
name := fmt.Sprintf("%s-%s", osu.Name, nodeName)
now := metav1.Now()
currentVersion := buildinfo.KubeVersion
targetVersion := ""
if osu.Status != nil {
targetVersion = osu.Status.ResolvedVersion
}
@@ -74,39 +74,129 @@ func ensureProgressHeartbeat(ctx context.Context, clients *kube.Clients,
return nil, fmt.Errorf("create OSUpgradeProgress %s/%s: %w", namespace, name, err)
}
existing, err := getProgress(ctx, clients, osup_gvr, namespace, name)
var out *monov1alpha1.OSUpgradeProgress
err = retry.RetryOnConflict(retry.DefaultRetry, func() error {
existing, err := getProgress(ctx, clients, osup_gvr, namespace, name)
if err != nil {
return fmt.Errorf("get existing OSUpgradeProgress %s/%s: %w", namespace, name, err)
}
// Keep spec aligned with source and node.
existing.Spec.NodeName = nodeName
existing.Spec.SourceRef.Name = osu.Name
existing, err = updateProgressSpec(ctx, clients, osup_gvr, existing)
if err != nil {
if isUnknownUpdateResult(err) {
latest, getErr := getProgress(ctx, clients, osup_gvr, namespace, name)
if getErr == nil {
out = latest
}
}
return fmt.Errorf("update OSUpgradeProgress spec %s/%s: %w", namespace, name, err)
}
if existing.Status == nil {
existing.Status = &monov1alpha1.OSUpgradeProgressStatus{}
}
existing.Status.CurrentVersion = currentVersion
existing.Status.TargetVersion = targetVersion
existing.Status.LastUpdatedAt = &now
if existing.Status.Phase == "" {
existing.Status.Phase = monov1alpha1.OSUpgradeProgressPhasePending
}
if existing.Status.Message == "" {
existing.Status.Message = "acknowledged"
}
existing, err = updateProgressStatus(ctx, clients, osup_gvr, existing)
if err != nil {
if isUnknownUpdateResult(err) {
latest, getErr := getProgress(ctx, clients, osup_gvr, namespace, name)
if getErr == nil {
out = latest
}
}
return fmt.Errorf("update OSUpgradeProgress status %s/%s: %w", namespace, name, err)
}
out = existing
return nil
})
if err != nil {
return nil, fmt.Errorf("get existing OSUpgradeProgress %s/%s: %w", namespace, name, err)
if out != nil {
return out, nil
}
return nil, err
}
// Spec should remain aligned with the source and node.
existing.Spec.NodeName = nodeName
existing.Spec.SourceRef.Name = osu.Name
klog.InfoS("updated osupgradeprogress", "name", out.Name, "namespace", out.Namespace)
return out, nil
}
if existing, err = updateProgressSpec(ctx, clients, osup_gvr, existing); err != nil {
return nil, fmt.Errorf("update OSUpgradeProgress spec %s/%s: %w", namespace, name, err)
// updateProgressRobust applies mutate to the OSUpgradeProgress identified by
// namespace/name and submits the result through updateProgressStatus, with the
// whole read-modify-write wrapped in retry.RetryOnConflict(retry.DefaultRetry).
//
// Every attempt re-reads the object with getProgress, makes sure Status is
// non-nil, and only then calls mutate, so mutate always sees a freshly fetched
// copy and runs once per attempt.
//
// Returns the object handed back by updateProgressStatus on success. When the
// write fails with an error that isUnknownUpdateResult accepts, the object is
// read once more and, if that read succeeds, returned together with the error.
// On any other failure the returned object is whatever out held (nil unless a
// previous branch assigned it).
func updateProgressRobust(
ctx context.Context,
clients *kube.Clients,
namespace string,
name string,
mutate func(*monov1alpha1.OSUpgradeProgress),
) (*monov1alpha1.OSUpgradeProgress, error) {
// out is written from inside the retry closure and read after it returns.
var out *monov1alpha1.OSUpgradeProgress
err := retry.RetryOnConflict(retry.DefaultRetry, func() error {
// Fetch the current object on every attempt instead of reusing a caller copy.
current, err := getProgress(ctx, clients, osup_gvr, namespace, name)
if err != nil {
return err
}
// Guarantee mutate can dereference Status without its own nil check.
if current.Status == nil {
current.Status = &monov1alpha1.OSUpgradeProgressStatus{}
}
mutate(current)
updated, err := updateProgressStatus(ctx, clients, osup_gvr, current)
if err != nil {
if isUnknownUpdateResult(err) {
// The write may or may not have been applied; re-read so the caller can
// inspect the stored state. A failed re-read is dropped and out is left
// unchanged.
latest, getErr := getProgress(ctx, clients, osup_gvr, namespace, name)
if getErr == nil {
out = latest
}
}
// The update error is returned unchanged for RetryOnConflict to inspect.
return err
}
out = updated
return nil
})
if err != nil && out != nil {
// Unknown-result case: caller gets latest known server state plus error.
return out, err
}
// NOTE(review): the next two lines reference `existing`, which is never
// declared in this function; presumably they are removed-side diff lines that
// this page interleaves without +/- markers — confirm against the real file.
if existing.Status == nil {
existing.Status = &monov1alpha1.OSUpgradeProgressStatus{}
// Same values as the guarded return above; the guard changes nothing about
// what is returned.
return out, err
}
// isUnknownUpdateResult reports whether err belongs to the set of failures the
// callers in this file follow up with a re-read of the object: an API error
// matched by apierrors.IsTimeout, IsServerTimeout or IsTooManyRequests, or any
// error whose lower-cased message contains one of the timeout / connection-loss
// phrases listed below. A nil err yields false.
func isUnknownUpdateResult(err error) bool {
if err == nil {
return false
}
// NOTE(review): the lines from here to the apierrors check use `existing`,
// `currentVersion`, `targetVersion` and `now`, none of which are declared in
// this function; presumably they are removed-side diff lines interleaved by
// the page rendering — confirm against the real file.
existing.Status.CurrentVersion = currentVersion
existing.Status.TargetVersion = targetVersion
existing.Status.LastUpdatedAt = &now
// Only set phase/message if they are still empty, so later real state machine
// updates are not clobbered by the heartbeat.
if existing.Status.Phase == "" {
existing.Status.Phase = monov1alpha1.OSUpgradeProgressPhasePending
// Structured API status checks are tried first.
if apierrors.IsTimeout(err) ||
apierrors.IsServerTimeout(err) ||
apierrors.IsTooManyRequests(err) {
return true
}
// NOTE(review): the next five lines again reference `existing`, `clients` and
// `namespace`, which are not in scope here; presumably removed-side diff
// residue — confirm.
if existing, err = updateProgressStatus(ctx, clients, osup_gvr, existing); err != nil {
return nil, fmt.Errorf("update OSUpgradeProgress status %s/%s: %w", namespace, name, err)
}
klog.InfoS("updated osupgradeprogress", "name", existing.Name, "namespace", existing.Namespace)
return existing, nil
// Fallback: case-insensitive substring match on the error text.
msg := strings.ToLower(err.Error())
// NOTE(review): "etcdserver: request timed out" contains "request timed out",
// so the third check can never be the deciding one.
return strings.Contains(msg, "request timed out") ||
strings.Contains(msg, "context deadline exceeded") ||
strings.Contains(msg, "etcdserver: request timed out") ||
strings.Contains(msg, "connection reset by peer") ||
strings.Contains(msg, "http2: client connection lost")
}
func createProgress(
@@ -199,17 +289,18 @@ func failProgress(
action string,
cause error,
) error {
now := metav1.Now()
_, err := updateProgressRobust(ctx, clients, osup.Namespace, osup.Name, func(cur *monov1alpha1.OSUpgradeProgress) {
now := metav1.Now()
if osup.Status == nil {
osup.Status = &monov1alpha1.OSUpgradeProgressStatus{}
}
if cur.Status == nil {
cur.Status = &monov1alpha1.OSUpgradeProgressStatus{}
}
osup.Status.LastUpdatedAt = &now
osup.Status.Message = fmt.Sprintf("%s: %v", action, cause)
osup.Status.Phase = monov1alpha1.OSUpgradeProgressPhaseFailed
if _, err := updateProgressStatus(ctx, clients, osup_gvr, osup); err != nil {
cur.Status.LastUpdatedAt = &now
cur.Status.Message = fmt.Sprintf("%s: %v", action, cause)
cur.Status.Phase = monov1alpha1.OSUpgradeProgressPhaseFailed
})
if err != nil {
klog.ErrorS(err, "failed to update osupgradeprogress status after error",
"action", action,
"name", osup.Name,
@@ -226,18 +317,18 @@ func markProgressCompleted(
osup *monov1alpha1.OSUpgradeProgress,
message string,
) error {
now := metav1.Now()
_, err := updateProgressRobust(ctx, clients, osup.Namespace, osup.Name, func(cur *monov1alpha1.OSUpgradeProgress) {
now := metav1.Now()
if osup.Status == nil {
osup.Status = &monov1alpha1.OSUpgradeProgressStatus{}
}
if cur.Status == nil {
cur.Status = &monov1alpha1.OSUpgradeProgressStatus{}
}
osup.Status.Phase = monov1alpha1.OSUpgradeProgressPhaseCompleted
osup.Status.Message = message
osup.Status.LastUpdatedAt = &now
osup.Status.CompletedAt = &now
_, err := updateProgressStatus(ctx, clients, osup_gvr, osup)
cur.Status.Phase = monov1alpha1.OSUpgradeProgressPhaseCompleted
cur.Status.Message = message
cur.Status.LastUpdatedAt = &now
cur.Status.CompletedAt = &now
})
if err != nil {
return fmt.Errorf("mark progress completed: %w", err)
}