Refine controller template and probe listeners

This commit is contained in:
2026-04-27 00:28:25 +08:00
parent 8fae920fc8
commit d7c2dac944
20 changed files with 780 additions and 217 deletions

1
.gitignore vendored
View File

@@ -1,3 +1,4 @@
.DS_Store
clitools/bin clitools/bin
packages/ packages/
out/ out/

View File

@@ -44,3 +44,12 @@ func addKnownTypes(scheme *runtime.Scheme) error {
metav1.AddToGroupVersion(scheme, SchemeGroupVersion) metav1.AddToGroupVersion(scheme, SchemeGroupVersion)
return nil return nil
} }
// NodeAgentLabels returns the canonical app.kubernetes.io recommended-label
// set stamped onto node-agent resources, so the DaemonSet applier and the
// controller's pod anti-affinity selector agree on one label source.
// NodeAgentName and NodeControlName are package-level constants declared
// elsewhere in this package.
func NodeAgentLabels() map[string]string {
return map[string]string{
"app.kubernetes.io/name": NodeAgentName,
"app.kubernetes.io/component": "agent",
"app.kubernetes.io/part-of": "monok8s",
"app.kubernetes.io/managed-by": NodeControlName,
}
}

View File

@@ -133,6 +133,12 @@ type OSUpgradeProgressStatus struct {
CurrentStep int32 `json:"currentStep,omitempty" yaml:"currentStep,omitempty"` CurrentStep int32 `json:"currentStep,omitempty" yaml:"currentStep,omitempty"`
CurrentFrom string `json:"currentFrom,omitempty" yaml:"currentFrom,omitempty"` CurrentFrom string `json:"currentFrom,omitempty" yaml:"currentFrom,omitempty"`
CurrentTo string `json:"currentTo,omitempty" yaml:"currentTo,omitempty"` CurrentTo string `json:"currentTo,omitempty" yaml:"currentTo,omitempty"`
// ObservedRetryNonce records the last retryNonce value the agent accepted.
// When spec.retryNonce is changed by the user and differs from this value,
// the agent may retry a failed upgrade.
// +optional
ObservedRetryNonce string `json:"observedRetryNonce,omitempty"`
} }
func (osu OSUpgrade) StatusPhase() string { func (osu OSUpgrade) StatusPhase() string {
@@ -142,3 +148,11 @@ func (osu OSUpgrade) StatusPhase() string {
} }
return phase return phase
} }
// StatusPhase returns the progress object's current phase as a plain string,
// or "" when Status has not been populated yet (Status is a pointer and may
// be nil on freshly created objects). Mirrors OSUpgrade.StatusPhase so log
// call sites can use one nil-safe accessor instead of a helper function.
func (osup OSUpgradeProgress) StatusPhase() string {
phase := ""
if osup.Status != nil {
phase = string(osup.Status.Phase)
}
return phase
}

View File

@@ -152,14 +152,11 @@ func watchOnce(
if !targetsNode(item, nodeName) { if !targetsNode(item, nodeName) {
continue continue
} }
if !shouldHandle(item) {
continue
}
klog.InfoS("found existing osupgradeprogress", klog.InfoS("found existing osupgradeprogress",
"name", item.Name, "name", item.Name,
"node", nodeName, "node", nodeName,
"phase", progressPhase(item.Status), "phase", item.StatusPhase(),
"resourceVersion", item.ResourceVersion, "resourceVersion", item.ResourceVersion,
) )
@@ -227,20 +224,11 @@ func watchOnce(
if !targetsNode(osup, nodeName) { if !targetsNode(osup, nodeName) {
continue continue
} }
if !shouldHandle(osup) {
klog.V(2).InfoS("skipping osupgradeprogress due to phase",
"name", osup.Name,
"node", nodeName,
"phase", progressPhase(osup.Status),
"eventType", evt.Type,
)
continue
}
klog.InfoS("received osupgradeprogress event", klog.V(4).InfoS("received osupgradeprogress event",
"name", osup.Name, "name", osup.Name,
"node", nodeName, "node", nodeName,
"phase", progressPhase(osup.Status), "phase", osup.StatusPhase(),
"eventType", evt.Type, "eventType", evt.Type,
"resourceVersion", osup.ResourceVersion, "resourceVersion", osup.ResourceVersion,
) )
@@ -262,28 +250,3 @@ func targetsNode(osup *monov1alpha1.OSUpgradeProgress, nodeName string) bool {
} }
return osup.Spec.NodeName == nodeName return osup.Spec.NodeName == nodeName
} }
func shouldHandle(osup *monov1alpha1.OSUpgradeProgress) bool {
if osup == nil {
return false
}
if osup.Status == nil {
return false
}
switch osup.Status.Phase {
case "",
monov1alpha1.OSUpgradeProgressPhasePending:
return true
default:
return false
}
}
func progressPhase(st *monov1alpha1.OSUpgradeProgressStatus) string {
if st == nil {
return ""
}
return string(st.Phase)
}

View File

@@ -3,6 +3,7 @@ package controller
import ( import (
"context" "context"
"errors" "errors"
"fmt"
"net" "net"
"net/http" "net/http"
"os" "os"
@@ -57,7 +58,7 @@ func NewCmdController(flags *genericclioptions.ConfigFlags) *cobra.Command {
}() }()
go func() { go func() {
httpErrCh <- httpListen(ctx, clients, conf) httpErrCh <- listenAndServe(ctx, clients, conf)
}() }()
select { select {
@@ -92,63 +93,125 @@ func NewCmdController(flags *genericclioptions.ConfigFlags) *cobra.Command {
return cmd return cmd
} }
func httpListen(ctx context.Context, clients *kube.Clients, conf ServerConfig) error { func listenAndServe(ctx context.Context, clients *kube.Clients, conf ServerConfig) error {
address, port := "", "8443"
addr := net.JoinHostPort(address, port)
nodeName := os.Getenv("NODE_NAME") nodeName := os.Getenv("NODE_NAME")
server := mkscontroller.NewServer(ctx, clients, conf.Namespace, nodeName) controllerServer := mkscontroller.NewServer(ctx, clients, conf.Namespace, nodeName)
s := &http.Server{ healthMux := http.NewServeMux()
Addr: addr, healthMux.HandleFunc("/healthz", func(w http.ResponseWriter, r *http.Request) {
Handler: server, w.WriteHeader(http.StatusOK)
_, _ = w.Write([]byte("ok\n"))
})
healthMux.HandleFunc("/readyz", func(w http.ResponseWriter, r *http.Request) {
w.WriteHeader(http.StatusOK)
_, _ = w.Write([]byte("ok\n"))
})
healthAddr := net.JoinHostPort("", "8080")
controllerAddr := net.JoinHostPort("", "8443")
healthHTTPServer := &http.Server{
Addr: healthAddr,
Handler: healthMux,
IdleTimeout: 90 * time.Second,
ReadTimeout: 10 * time.Second,
WriteTimeout: 10 * time.Second,
MaxHeaderBytes: 1 << 20,
}
controllerHTTPServer := &http.Server{
Addr: controllerAddr,
Handler: controllerServer,
IdleTimeout: 90 * time.Second, IdleTimeout: 90 * time.Second,
ReadTimeout: 4 * time.Minute, ReadTimeout: 4 * time.Minute,
WriteTimeout: 4 * time.Minute, WriteTimeout: 4 * time.Minute,
MaxHeaderBytes: 1 << 20, MaxHeaderBytes: 1 << 20,
} }
serverErrCh := make(chan error, 1) serverErrCh := make(chan error, 2)
go func() { go func() {
if conf.TLSCertFile != "" { klog.InfoS("starting health HTTP server", "addr", healthAddr)
klog.InfoS("starting HTTPS server",
"addr", addr, err := healthHTTPServer.ListenAndServe()
"certFile", conf.TLSCertFile, if err != nil && !errors.Is(err, http.ErrServerClosed) {
"keyFile", conf.TLSPrivateKeyFile, serverErrCh <- fmt.Errorf("health HTTP server: %w", err)
)
serverErrCh <- s.ListenAndServeTLS(conf.TLSCertFile, conf.TLSPrivateKeyFile)
return return
} }
klog.InfoS("starting HTTP server", "addr", addr) serverErrCh <- nil
serverErrCh <- s.ListenAndServe() }()
go func() {
if conf.TLSCertFile != "" {
klog.InfoS("starting controller HTTPS server",
"addr", controllerAddr,
"certFile", conf.TLSCertFile,
"keyFile", conf.TLSPrivateKeyFile,
)
err := controllerHTTPServer.ListenAndServeTLS(conf.TLSCertFile, conf.TLSPrivateKeyFile)
if err != nil && !errors.Is(err, http.ErrServerClosed) {
serverErrCh <- fmt.Errorf("controller HTTPS server: %w", err)
return
}
serverErrCh <- nil
return
}
klog.InfoS("starting controller HTTP server", "addr", controllerAddr)
err := controllerHTTPServer.ListenAndServe()
if err != nil && !errors.Is(err, http.ErrServerClosed) {
serverErrCh <- fmt.Errorf("controller HTTP server: %w", err)
return
}
serverErrCh <- nil
}() }()
select { select {
case <-ctx.Done(): case <-ctx.Done():
klog.InfoS("shutting down HTTP server", "addr", addr) klog.InfoS("shutting down HTTP servers",
"healthAddr", healthAddr,
"controllerAddr", controllerAddr,
)
shutdownCtx, cancel := context.WithTimeout(context.Background(), 10*time.Second) shutdownCtx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
defer cancel() defer cancel()
err := s.Shutdown(shutdownCtx) var errs []error
if err != nil {
return err if err := healthHTTPServer.Shutdown(shutdownCtx); err != nil {
errs = append(errs, fmt.Errorf("shutdown health HTTP server: %w", err))
} }
err = <-serverErrCh if err := controllerHTTPServer.Shutdown(shutdownCtx); err != nil {
if err != nil && !errors.Is(err, http.ErrServerClosed) { errs = append(errs, fmt.Errorf("shutdown controller HTTP server: %w", err))
return err
} }
for i := 0; i < 2; i++ {
if err := <-serverErrCh; err != nil {
errs = append(errs, err)
}
}
if len(errs) > 0 {
return errors.Join(errs...)
}
return context.Canceled return context.Canceled
case err := <-serverErrCh: case err := <-serverErrCh:
if err != nil && !errors.Is(err, http.ErrServerClosed) { if err != nil {
klog.ErrorS(err, "HTTP server failed") klog.ErrorS(err, "HTTP server failed")
return err return err
} }
return nil
// One server exited cleanly unexpectedly. Treat that as failure because
// the process should keep both servers alive until ctx is canceled.
return fmt.Errorf("HTTP server exited unexpectedly")
} }
} }

View File

@@ -42,23 +42,6 @@ func NewCmdCreate(flags *genericclioptions.ConfigFlags) *cobra.Command {
return err return err
}, },
}, },
&cobra.Command{
Use: "controller",
Short: "Print controller deployment template",
RunE: func(cmd *cobra.Command, _ []string) error {
ns, _, err := flags.ToRawKubeConfigLoader().Namespace()
if err != nil {
return err
}
out, err := render.RenderControllerDeployments(ns)
if err != nil {
return err
}
_, err = fmt.Fprint(cmd.OutOrStdout(), out)
return err
},
},
) )
var authorizedKeysPath string var authorizedKeysPath string
@@ -90,6 +73,38 @@ func NewCmdCreate(flags *genericclioptions.ConfigFlags) *cobra.Command {
sshdcmd.Flags().StringVar(&authorizedKeysPath, "authkeys", "", "path to authorized_keys file") sshdcmd.Flags().StringVar(&authorizedKeysPath, "authkeys", "", "path to authorized_keys file")
cmd.AddCommand(&sshdcmd) cmd.AddCommand(&sshdcmd)
cconf := render.ControllerConf{}
controllercmd := cobra.Command{
Use: "controller",
Short: "Print controller deployment template",
RunE: func(cmd *cobra.Command, _ []string) error {
ns, _, err := flags.ToRawKubeConfigLoader().Namespace()
if err != nil {
return err
}
cconf.Namespace = ns
out, err := render.RenderControllerDeployments(cconf)
if err != nil {
return err
}
_, err = fmt.Fprint(cmd.OutOrStdout(), out)
return err
},
}
controllercmd.Flags().StringVar(
&cconf.Image,
"image",
"",
"Controller image, including optional registry and tag",
)
cmd.AddCommand(&controllercmd)
return cmd return cmd
} }

View File

@@ -19,8 +19,10 @@ import (
func init() { func init() {
klog.InitFlags(nil) klog.InitFlags(nil)
_ = flag.Set("logtostderr", "true")
if os.Getenv("DEBUG") != "" { if os.Getenv("DEBUG") != "" {
_ = flag.Set("v", "4") // debug level _ = flag.Set("v", "4")
} else { } else {
_ = flag.Set("v", "0") _ = flag.Set("v", "0")
} }
@@ -39,7 +41,11 @@ func NewRootCmd() *cobra.Command {
}, },
} }
// Expose klog stdlib flags through Cobra/pflag.
cmd.PersistentFlags().AddGoFlagSet(flag.CommandLine)
flags.AddFlags(cmd.PersistentFlags()) flags.AddFlags(cmd.PersistentFlags())
cmd.AddCommand( cmd.AddCommand(
versioncmd.NewCmdVersion(), versioncmd.NewCmdVersion(),
initcmd.NewCmdInit(flags), initcmd.NewCmdInit(flags),
@@ -49,5 +55,6 @@ func NewRootCmd() *cobra.Command {
controllercmd.NewCmdController(flags), controllercmd.NewCmdController(flags),
internalcmd.NewCmdInternal(), internalcmd.NewCmdInternal(),
) )
return cmd return cmd
} }

View File

@@ -65,13 +65,20 @@ func handleOSUpgradeProgressLocked(
} }
if osup.Spec.NodeName != nodeName { if osup.Spec.NodeName != nodeName {
klog.V(4).InfoS("skipping osupgradeprogress due to nodeName mismatch",
"name", osup.Name,
"node", nodeName,
"target", osup.Spec.NodeName,
)
return nil return nil
} }
if osup.Status.Phase != "" && if !shouldProcessProgress(osup) {
osup.Status.Phase != monov1alpha1.OSUpgradeProgressPhasePending && klog.V(2).InfoS("skipping osupgradeprogress due to phase",
osup.Status.Phase != monov1alpha1.OSUpgradeProgressPhaseDownloading { "name", osup.Name,
// tune this logic however you want "node", nodeName,
"phase", osup.StatusPhase(),
)
return nil return nil
} }
@@ -124,7 +131,9 @@ func handleOSUpgradeProgressLocked(
now := metav1.Now() now := metav1.Now()
cur.Status.CurrentVersion = buildinfo.KubeVersion cur.Status.CurrentVersion = buildinfo.KubeVersion
cur.Status.TargetVersion = plan.ResolvedTarget cur.Status.TargetVersion = plan.ResolvedTarget
cur.Status.PlannedPath = plannedPath(plan)
cur.Status.Phase = monov1alpha1.OSUpgradeProgressPhaseDownloading cur.Status.Phase = monov1alpha1.OSUpgradeProgressPhaseDownloading
cur.Status.ObservedRetryNonce = cur.Spec.RetryNonce
cur.Status.Message = fmt.Sprintf("downloading image: %s", first.URL) cur.Status.Message = fmt.Sprintf("downloading image: %s", first.URL)
cur.Status.LastUpdatedAt = &now cur.Status.LastUpdatedAt = &now
}) })
@@ -238,6 +247,26 @@ func handleOSUpgradeProgressLocked(
select {} select {}
} }
// shouldProcessProgress reports whether the agent should act on the given
// OSUpgradeProgress. Objects with no phase yet, or in the Pending phase, are
// always processed. Failed objects are reprocessed only when the user has
// changed spec.retryNonce away from the nonce recorded in
// status.observedRetryNonce (an explicit retry request). Every other phase —
// including nil objects and objects without a status — is left alone.
func shouldProcessProgress(osup *monov1alpha1.OSUpgradeProgress) bool {
	if osup == nil || osup.Status == nil {
		return false
	}
	phase := osup.Status.Phase
	if phase == "" || phase == monov1alpha1.OSUpgradeProgressPhasePending {
		return true
	}
	if phase == monov1alpha1.OSUpgradeProgressPhaseFailed {
		// Retry only when the user bumped the nonce since the last failure.
		return osup.Spec.RetryNonce != osup.Status.ObservedRetryNonce
	}
	return false
}
func triggerReboot() error { func triggerReboot() error {
_ = os.WriteFile("/proc/sysrq-trigger", []byte("s\n"), 0) _ = os.WriteFile("/proc/sysrq-trigger", []byte("s\n"), 0)
_ = os.WriteFile("/proc/sysrq-trigger", []byte("u\n"), 0) _ = os.WriteFile("/proc/sysrq-trigger", []byte("u\n"), 0)

View File

@@ -335,6 +335,14 @@ func lowestPatchInMinor(versions []Version, major, minor int) (Version, bool) {
return Version{}, false return Version{}, false
} }
// plannedPath collects the version string of every image along the plan's
// upgrade path, preserving order. Always returns a non-nil slice (empty,
// not nil, when the path has no steps) so callers can serialize it directly.
func plannedPath(plan *Plan) []string {
	versions := make([]string, 0, len(plan.Path))
	for _, img := range plan.Path {
		versions = append(versions, img.Version)
	}
	return versions
}
func versionsToStrings(vs []Version) []string { func versionsToStrings(vs []Version) []string {
out := make([]string, 0, len(vs)) out := make([]string, 0, len(vs))
for _, v := range vs { for _, v := range vs {

View File

@@ -209,6 +209,7 @@ func failProgress(
cur.Status = &monov1alpha1.OSUpgradeProgressStatus{} cur.Status = &monov1alpha1.OSUpgradeProgressStatus{}
} }
cur.Status.ObservedRetryNonce = cur.Spec.RetryNonce
cur.Status.LastUpdatedAt = &now cur.Status.LastUpdatedAt = &now
cur.Status.Message = fmt.Sprintf("%s: %v", action, cause) cur.Status.Message = fmt.Sprintf("%s: %v", action, cause)
cur.Status.Phase = monov1alpha1.OSUpgradeProgressPhaseFailed cur.Status.Phase = monov1alpha1.OSUpgradeProgressPhaseFailed
@@ -237,6 +238,7 @@ func markProgressCompleted(
cur.Status = &monov1alpha1.OSUpgradeProgressStatus{} cur.Status = &monov1alpha1.OSUpgradeProgressStatus{}
} }
cur.Status.ObservedRetryNonce = cur.Spec.RetryNonce
cur.Status.Phase = monov1alpha1.OSUpgradeProgressPhaseCompleted cur.Status.Phase = monov1alpha1.OSUpgradeProgressPhaseCompleted
cur.Status.Message = message cur.Status.Message = message
cur.Status.CurrentVersion = osup.Status.CurrentVersion cur.Status.CurrentVersion = osup.Status.CurrentVersion

View File

@@ -73,17 +73,13 @@ func (s *Server) Initialize() {
ws.Consumes(restful.MIME_JSON) ws.Consumes(restful.MIME_JSON)
ws.Produces(restful.MIME_JSON) ws.Produces(restful.MIME_JSON)
ws.Route(ws.GET("/healthz").To(s.queryHealthz). ws.Route(ws.GET("/status").To(s.queryStatus).
Doc("Return basic controller status")) Doc("Return basic controller status"))
// Stub for now
ws.Route(ws.GET("/readyz").To(s.queryHealthz).
Doc("Stub for now"))
s.restfulCont.Add(ws) s.restfulCont.Add(ws)
} }
func (s *Server) queryHealthz(request *restful.Request, response *restful.Response) { func (s *Server) queryStatus(request *restful.Request, response *restful.Response) {
resp := StatusResponse{ resp := StatusResponse{
OK: true, OK: true,
Service: "monok8s-controller", Service: "monok8s-controller",

View File

@@ -20,7 +20,6 @@ import (
) )
const ( const (
controlAgentNodeSelectorValue = "true"
controlAgentImage = "localhost/monok8s/node-control:dev" controlAgentImage = "localhost/monok8s/node-control:dev"
kubeconfig = "/etc/kubernetes/admin.conf" kubeconfig = "/etc/kubernetes/admin.conf"
) )
@@ -265,12 +264,7 @@ func applyNodeAgentClusterRoleBinding(ctx context.Context, kubeClient kubernetes
func applyNodeAgentDaemonSet(ctx context.Context, kubeClient kubernetes.Interface, namespace string, labels map[string]string) error { func applyNodeAgentDaemonSet(ctx context.Context, kubeClient kubernetes.Interface, namespace string, labels map[string]string) error {
privileged := true privileged := true
dsLabels := map[string]string{ dsLabels := monov1alpha1.NodeAgentLabels()
"app.kubernetes.io/name": monov1alpha1.NodeAgentName,
"app.kubernetes.io/component": "agent",
"app.kubernetes.io/part-of": "monok8s",
"app.kubernetes.io/managed-by": monov1alpha1.NodeControlName,
}
want := &appsv1.DaemonSet{ want := &appsv1.DaemonSet{
ObjectMeta: metav1.ObjectMeta{ ObjectMeta: metav1.ObjectMeta{
@@ -294,7 +288,7 @@ func applyNodeAgentDaemonSet(ctx context.Context, kubeClient kubernetes.Interfac
HostPID: true, HostPID: true,
DNSPolicy: corev1.DNSClusterFirstWithHostNet, DNSPolicy: corev1.DNSClusterFirstWithHostNet,
NodeSelector: map[string]string{ NodeSelector: map[string]string{
monov1alpha1.NodeControlKey: controlAgentNodeSelectorValue, monov1alpha1.NodeControlKey: "true",
}, },
Tolerations: []corev1.Toleration{ Tolerations: []corev1.Toleration{
{Operator: corev1.TolerationOpExists}, {Operator: corev1.TolerationOpExists},

View File

@@ -61,7 +61,7 @@ func ApplyLocalNodeMetadataIfPossible(ctx context.Context, nctx *NodeContext) er
// Additional Labels // Additional Labels
if spec.EnableNodeControl { if spec.EnableNodeControl {
node.Labels[monov1alpah1.NodeControlKey] = controlAgentNodeSelectorValue node.Labels[monov1alpah1.NodeControlKey] = "true"
} }
_, err = client.CoreV1().Nodes().Update(ctx, node, metav1.UpdateOptions{}) _, err = client.CoreV1().Nodes().Update(ctx, node, metav1.UpdateOptions{})

View File

@@ -14,13 +14,20 @@ import (
monov1alpha1 "example.com/monok8s/pkg/apis/monok8s/v1alpha1" monov1alpha1 "example.com/monok8s/pkg/apis/monok8s/v1alpha1"
buildinfo "example.com/monok8s/pkg/buildinfo" buildinfo "example.com/monok8s/pkg/buildinfo"
templates "example.com/monok8s/pkg/templates"
) )
func RenderControllerDeployments(namespace string) (string, error) { type ControllerConf struct {
vals := templates.LoadTemplateValuesFromEnv() Namespace string
Image string
Labels map[string]string
}
labels := map[string]string{ func RenderControllerDeployments(conf ControllerConf) (string, error) {
if conf.Namespace == "" {
return "", fmt.Errorf("namespace is required")
}
conf.Labels = map[string]string{
"app.kubernetes.io/name": monov1alpha1.ControllerName, "app.kubernetes.io/name": monov1alpha1.ControllerName,
"app.kubernetes.io/component": "controller", "app.kubernetes.io/component": "controller",
"app.kubernetes.io/part-of": "monok8s", "app.kubernetes.io/part-of": "monok8s",
@@ -28,10 +35,10 @@ func RenderControllerDeployments(namespace string) (string, error) {
} }
objs := []runtime.Object{ objs := []runtime.Object{
buildControllerServiceAccount(namespace, labels), buildControllerServiceAccount(conf),
buildControllerClusterRole(labels), buildControllerClusterRole(conf),
buildControllerClusterRoleBinding(namespace, labels), buildControllerClusterRoleBinding(conf),
buildControllerDeployment(vals, namespace, labels), buildControllerDeployment(conf),
} }
s := runtime.NewScheme() s := runtime.NewScheme()
@@ -57,7 +64,7 @@ func RenderControllerDeployments(namespace string) (string, error) {
return buf.String(), nil return buf.String(), nil
} }
func buildControllerServiceAccount(namespace string, labels map[string]string) *corev1.ServiceAccount { func buildControllerServiceAccount(conf ControllerConf) *corev1.ServiceAccount {
automount := true automount := true
@@ -68,14 +75,14 @@ func buildControllerServiceAccount(namespace string, labels map[string]string) *
}, },
ObjectMeta: metav1.ObjectMeta{ ObjectMeta: metav1.ObjectMeta{
Name: monov1alpha1.ControllerName, Name: monov1alpha1.ControllerName,
Namespace: namespace, Namespace: conf.Namespace,
Labels: labels, Labels: conf.Labels,
}, },
AutomountServiceAccountToken: &automount, AutomountServiceAccountToken: &automount,
} }
} }
func buildControllerClusterRole(labels map[string]string) *rbacv1.ClusterRole { func buildControllerClusterRole(conf ControllerConf) *rbacv1.ClusterRole {
wantRules := []rbacv1.PolicyRule{ wantRules := []rbacv1.PolicyRule{
{ {
APIGroups: []string{monov1alpha1.Group}, APIGroups: []string{monov1alpha1.Group},
@@ -111,19 +118,19 @@ func buildControllerClusterRole(labels map[string]string) *rbacv1.ClusterRole {
}, },
ObjectMeta: metav1.ObjectMeta{ ObjectMeta: metav1.ObjectMeta{
Name: monov1alpha1.ControllerName, Name: monov1alpha1.ControllerName,
Labels: labels, Labels: conf.Labels,
}, },
Rules: wantRules, Rules: wantRules,
} }
} }
func buildControllerClusterRoleBinding(namespace string, labels map[string]string) *rbacv1.ClusterRoleBinding { func buildControllerClusterRoleBinding(conf ControllerConf) *rbacv1.ClusterRoleBinding {
wantSubjects := []rbacv1.Subject{ wantSubjects := []rbacv1.Subject{
{ {
Kind: "ServiceAccount", Kind: "ServiceAccount",
Name: monov1alpha1.ControllerName, Name: monov1alpha1.ControllerName,
Namespace: namespace, Namespace: conf.Namespace,
}, },
} }
@@ -140,14 +147,14 @@ func buildControllerClusterRoleBinding(namespace string, labels map[string]strin
}, },
ObjectMeta: metav1.ObjectMeta{ ObjectMeta: metav1.ObjectMeta{
Name: monov1alpha1.ControllerName, Name: monov1alpha1.ControllerName,
Labels: labels, Labels: conf.Labels,
}, },
Subjects: wantSubjects, Subjects: wantSubjects,
RoleRef: wantRoleRef, RoleRef: wantRoleRef,
} }
} }
func buildControllerDeployment(tVals templates.TemplateValues, namespace string, labels map[string]string) *appsv1.Deployment { func buildControllerDeployment(conf ControllerConf) *appsv1.Deployment {
replicas := int32(1) replicas := int32(1)
selectorLabels := map[string]string{ selectorLabels := map[string]string{
@@ -155,10 +162,13 @@ func buildControllerDeployment(tVals templates.TemplateValues, namespace string,
"app.kubernetes.io/component": "controller", "app.kubernetes.io/component": "controller",
} }
podLabels := mergeStringMaps(labels, selectorLabels) podLabels := mergeStringMaps(conf.Labels, selectorLabels)
runAsNonRoot := true runAsNonRoot := true
allowPrivilegeEscalation := false allowPrivilegeEscalation := false
userGroup := int64(65532)
image, pullPolicy := controllerImage(conf)
return &appsv1.Deployment{ return &appsv1.Deployment{
TypeMeta: metav1.TypeMeta{ TypeMeta: metav1.TypeMeta{
@@ -167,8 +177,8 @@ func buildControllerDeployment(tVals templates.TemplateValues, namespace string,
}, },
ObjectMeta: metav1.ObjectMeta{ ObjectMeta: metav1.ObjectMeta{
Name: monov1alpha1.ControllerName, Name: monov1alpha1.ControllerName,
Namespace: namespace, Namespace: conf.Namespace,
Labels: labels, Labels: conf.Labels,
}, },
Spec: appsv1.DeploymentSpec{ Spec: appsv1.DeploymentSpec{
Replicas: &replicas, Replicas: &replicas,
@@ -184,12 +194,12 @@ func buildControllerDeployment(tVals templates.TemplateValues, namespace string,
Containers: []corev1.Container{ Containers: []corev1.Container{
{ {
Name: "controller", Name: "controller",
Image: fmt.Sprintf("localhost/monok8s/node-control:%s", buildinfo.Version), Image: image,
ImagePullPolicy: corev1.PullIfNotPresent, ImagePullPolicy: pullPolicy,
Args: []string{ Args: []string{
"controller", "controller",
"--namespace", "--namespace",
namespace, conf.Namespace,
}, },
Env: []corev1.EnvVar{ Env: []corev1.EnvVar{
{ {
@@ -239,6 +249,10 @@ func buildControllerDeployment(tVals templates.TemplateValues, namespace string,
Port: intstr.FromString("http"), Port: intstr.FromString("http"),
}, },
}, },
InitialDelaySeconds: 5,
PeriodSeconds: 60,
TimeoutSeconds: 2,
FailureThreshold: 3,
}, },
ReadinessProbe: &corev1.Probe{ ReadinessProbe: &corev1.Probe{
ProbeHandler: corev1.ProbeHandler{ ProbeHandler: corev1.ProbeHandler{
@@ -247,13 +261,64 @@ func buildControllerDeployment(tVals templates.TemplateValues, namespace string,
Port: intstr.FromString("http"), Port: intstr.FromString("http"),
}, },
}, },
InitialDelaySeconds: 2,
PeriodSeconds: 5,
TimeoutSeconds: 2,
FailureThreshold: 3,
}, },
SecurityContext: &corev1.SecurityContext{ SecurityContext: &corev1.SecurityContext{
RunAsNonRoot: &runAsNonRoot, RunAsNonRoot: &runAsNonRoot,
RunAsUser: &userGroup,
RunAsGroup: &userGroup,
AllowPrivilegeEscalation: &allowPrivilegeEscalation, AllowPrivilegeEscalation: &allowPrivilegeEscalation,
}, },
}, },
}, },
NodeSelector: controllerNodeSelector(conf),
Affinity: controllerAffinity(conf),
},
},
},
}
}
func controllerImage(conf ControllerConf) (string, corev1.PullPolicy) {
if conf.Image != "" {
return conf.Image, corev1.PullIfNotPresent
}
return fmt.Sprintf("localhost/monok8s/node-control:%s", buildinfo.Version), corev1.PullNever
}
func controllerNodeSelector(conf ControllerConf) map[string]string {
if conf.Image != "" {
return nil
}
// Local image exists on managed nodes only.
return map[string]string{
monov1alpha1.NodeControlKey: "true",
}
}
func controllerAffinity(conf ControllerConf) *corev1.Affinity {
// Local image exists only on managed nodes, so in that mode we already use
// NodeSelector and should not fight placement with anti-affinity.
if conf.Image == "" {
return nil
}
return &corev1.Affinity{
PodAntiAffinity: &corev1.PodAntiAffinity{
PreferredDuringSchedulingIgnoredDuringExecution: []corev1.WeightedPodAffinityTerm{
{
Weight: 100,
PodAffinityTerm: corev1.PodAffinityTerm{
TopologyKey: corev1.LabelHostname,
LabelSelector: &metav1.LabelSelector{
MatchLabels: monov1alpha1.NodeAgentLabels(),
},
},
}, },
}, },
}, },

View File

@@ -160,6 +160,7 @@ func buildSSHDDeployment(
Labels: podLabels, Labels: podLabels,
}, },
Spec: corev1.PodSpec{ Spec: corev1.PodSpec{
HostPID: true,
NodeSelector: selectorLabels, NodeSelector: selectorLabels,
Containers: []corev1.Container{ Containers: []corev1.Container{
{ {
@@ -215,24 +216,20 @@ exec /usr/sbin/sshd \
corev1.ResourceMemory: resource.MustParse("128Mi"), corev1.ResourceMemory: resource.MustParse("128Mi"),
}, },
}, },
VolumeMounts: []corev1.VolumeMount{ VolumeMounts: append(
[]corev1.VolumeMount{
{ {
Name: "authorized-keys", Name: "authorized-keys",
MountPath: "/authorized-keys", MountPath: "/authorized-keys",
ReadOnly: true, ReadOnly: true,
}, },
{
Name: "host-etc",
MountPath: "/host/etc",
}, },
{ buildHostRootVolumeMounts()...,
Name: "host-var", ),
MountPath: "/host/var",
}, },
}, },
}, Volumes: append(
}, []corev1.Volume{
Volumes: []corev1.Volume{
{ {
Name: "authorized-keys", Name: "authorized-keys",
VolumeSource: corev1.VolumeSource{ VolumeSource: corev1.VolumeSource{
@@ -244,31 +241,85 @@ exec /usr/sbin/sshd \
}, },
}, },
}, },
{
Name: "host-etc",
VolumeSource: corev1.VolumeSource{
HostPath: &corev1.HostPathVolumeSource{
Path: "/etc",
Type: ptrHostPathType(corev1.HostPathDirectory),
},
},
},
{
Name: "host-var",
VolumeSource: corev1.VolumeSource{
HostPath: &corev1.HostPathVolumeSource{
Path: "/var",
Type: ptrHostPathType(corev1.HostPathDirectory),
},
},
},
}, },
buildHostRootVolumes()...,
),
}, },
}, },
}, },
} }
} }
// buildHostRootVolumeMounts returns the container-side mounts that expose the
// host filesystem under /host for the recovery SSHD pod, enabling
// `chroot /host ...` style debugging. The mount list must stay in sync with
// the volume names produced by buildHostRootVolumes. Read-only is used for
// the directories holding binaries and libraries; state directories
// (etc, run, proc, sys, dev, var) stay writable for recovery actions.
func buildHostRootVolumeMounts() []corev1.VolumeMount {
paths := []struct {
name string
mountPath string
readOnly bool
}{
{"host-bin", "/host/bin", true},
{"host-sbin", "/host/sbin", true},
{"host-lib", "/host/lib", true},
{"host-usr", "/host/usr", true},
{"host-etc", "/host/etc", false},
{"host-run", "/host/run", false},
{"host-proc", "/host/proc", false},
{"host-sys", "/host/sys", false},
{"host-dev", "/host/dev", false},
{"host-var", "/host/var", false},
}
mounts := make([]corev1.VolumeMount, 0, len(paths))
for _, p := range paths {
mounts = append(mounts, corev1.VolumeMount{
Name: p.name,
MountPath: p.mountPath,
ReadOnly: p.readOnly,
})
}
return mounts
}
// buildHostRootVolumes returns the hostPath volumes backing the /host mounts
// created by buildHostRootVolumeMounts; the entry names in the two functions
// must match one-to-one. Every path uses HostPathDirectory, so pod startup
// fails fast if a directory is missing on the node rather than silently
// creating it.
func buildHostRootVolumes() []corev1.Volume {
hostPathDir := corev1.HostPathDirectory
paths := []struct {
name string
path string
}{
{"host-bin", "/bin"},
{"host-sbin", "/sbin"},
{"host-lib", "/lib"},
{"host-usr", "/usr"},
{"host-etc", "/etc"},
{"host-run", "/run"},
{"host-proc", "/proc"},
{"host-sys", "/sys"},
{"host-dev", "/dev"},
// /var is an rbind mount in monok8s and may be private.
// Mount the real backing path instead.
{"host-var", "/data/var"},
}
volumes := make([]corev1.Volume, 0, len(paths))
for _, p := range paths {
volumes = append(volumes, corev1.Volume{
Name: p.name,
VolumeSource: corev1.VolumeSource{
HostPath: &corev1.HostPathVolumeSource{
Path: p.path,
Type: &hostPathDir,
},
},
})
}
return volumes
}
func ptrInt32(v int32) *int32 { func ptrInt32(v int32) *int32 {
return &v return &v
} }

View File

@@ -84,7 +84,7 @@ func LoadTemplateValuesFromEnv() TemplateValues {
v := defaultTemplateValues() v := defaultTemplateValues()
v.Hostname = getenvDefault("MKS_HOSTNAME", v.Hostname) v.Hostname = getenvDefault("MKS_HOSTNAME", v.Hostname)
v.NodeName = getenvDefault("MKS_NODE_NAME", v.Hostname) v.NodeName = getenvDefault("MKS_NODE_NAME", getenvDefault("NODE_NAME", v.Hostname))
v.KubernetesVersion = getenvDefault("MKS_KUBERNETES_VERSION", v.KubernetesVersion) v.KubernetesVersion = getenvDefault("MKS_KUBERNETES_VERSION", v.KubernetesVersion)
v.ClusterName = getenvDefault("MKS_CLUSTER_NAME", v.ClusterName) v.ClusterName = getenvDefault("MKS_CLUSTER_NAME", v.ClusterName)

6
devtools/serve-images.sh Executable file
View File

@@ -0,0 +1,6 @@
#!/bin/bash
# Serve the built image artifacts from ../out over plain HTTP on port 8000,
# matching the default BASE_URL (http://localhost:8000) used by the
# make-upgrade script.
#
# Bug fixed: the shebang was "#/bin/bash" (missing "!"), which is an ordinary
# comment — the script's interpreter was left to the caller's shell.
set -euo pipefail

SCRIPT_DIR="$(dirname "${BASH_SOURCE[0]}")"
OUT_DIR="$(realpath "$SCRIPT_DIR"/../out/)"

python3 -m http.server 8000 --bind 0.0.0.0 --directory "$OUT_DIR"

View File

@@ -1,15 +1,29 @@
#!/bin/bash #!/bin/bash
SCRIPT_DIR="$(dirname "${BASH_SOURCE[0]}")" SCRIPT_DIR="$(dirname "${BASH_SOURCE[0]}")"
OUT_DIR="$( realpath "$SCRIPT_DIR"/../out/ )" OUT_DIR="$(realpath "$SCRIPT_DIR"/../out/)"
set -e set -e
BASE_URL="http://localhost:8000" DEFAULT_BASE_URL="http://localhost:8000"
TARGET_VERSION="v$1" DEFAULT_TARGET_VERSION="v1.34.1"
STABLE_VERSION="v1.34.6" STABLE_VERSION="v1.34.1"
NAME="my-upgrade-1" NAME="my-upgrade-1"
if [ -r /dev/tty ]; then
printf "Enter the base url (%s): " "$DEFAULT_BASE_URL" > /dev/tty
read -r BASE_URL < /dev/tty
printf "Enter the target version (%s): " "$DEFAULT_TARGET_VERSION" > /dev/tty
read -r TARGET_VERSION < /dev/tty
else
echo "No TTY available for interactive input" >&2
exit 1
fi
BASE_URL="${BASE_URL:-$DEFAULT_BASE_URL}"
TARGET_VERSION="${TARGET_VERSION:-$DEFAULT_TARGET_VERSION}"
echo "apiVersion: monok8s.io/v1alpha1" echo "apiVersion: monok8s.io/v1alpha1"
echo "kind: OSUpgrade" echo "kind: OSUpgrade"
echo "metadata:" echo "metadata:"

204
docs/installing-ssh-pod.md Normal file
View File

@@ -0,0 +1,204 @@
# Installing the recovery SSHD pod
This page explains how to install a temporary SSH server pod for break-glass recovery.
Use this when normal Kubernetes access is degraded — for example, when the API server certificate has expired or rotated and you need to retrieve updated host-side credentials.
The SSHD pod is intended for recovery and debugging only. Remove it when you are done.
## What this does
The recovery pod starts an SSH server on the selected node and authorizes your local SSH public key.
The pod also mounts selected host paths under `/host`, so you can inspect the host filesystem and run host-side recovery commands through `chroot`.
For example:
```sh
chroot /host /bin/sh -lc 'rc-status'
chroot /host /bin/sh -lc 'rc-service crio status'
chroot /host /bin/sh -lc 'rc-service kubelet status'
```
## Requirements
You need:
- A working `kubectl` connection to the cluster.
- Access to the `node-agent` DaemonSet in the `mono-system` namespace.
- A local SSH public key, usually `~/.ssh/id_rsa.pub` or `~/.ssh/id_ed25519.pub`.
Use a public key file only. Do not pass your private key.
## Generate the SSHD manifest
To print the recovery SSHD manifest:
```bash
kubectl exec -i -n mono-system ds/node-agent -- \
ctl create sshd --authkeys /dev/stdin < ~/.ssh/id_rsa.pub
```
This reads your local public key and places it into the generated pod's `authorized_keys`.
If you use Ed25519 keys, use:
```bash
kubectl exec -i -n mono-system ds/node-agent -- \
ctl create sshd --authkeys /dev/stdin < ~/.ssh/id_ed25519.pub
```
## Generate and apply the manifest
To create the recovery SSHD resources in one step:
```bash
kubectl exec -i -n mono-system ds/node-agent -- \
ctl create sshd --authkeys /dev/stdin < ~/.ssh/id_rsa.pub \
| kubectl apply -f -
```
For Ed25519:
```bash
kubectl exec -i -n mono-system ds/node-agent -- \
ctl create sshd --authkeys /dev/stdin < ~/.ssh/id_ed25519.pub \
| kubectl apply -f -
```
## Why `-i` is used instead of `-it`
Use `-i`, not `-it`, when piping the SSH public key.
The `-t` option allocates a pseudo-TTY. A pseudo-TTY can modify piped input, which is not what you want when passing an SSH public key through stdin.
Correct:
```bash
kubectl exec -i -n mono-system ds/node-agent -- \
ctl create sshd --authkeys /dev/stdin < ~/.ssh/id_rsa.pub
```
Avoid:
```bash
kubectl exec -it -n mono-system ds/node-agent -- \
ctl create sshd --authkeys /dev/stdin < ~/.ssh/id_rsa.pub
```
## Check that the pod is running
After applying the manifest, check the pod:
```bash
kubectl get pods -n mono-system -l app.kubernetes.io/name=sshd
```
Check the service:
```bash
kubectl get svc -n mono-system -l app.kubernetes.io/name=sshd
```
If the pod does not start, inspect it:
```bash
kubectl describe pod -n mono-system -l app.kubernetes.io/name=sshd
```
## Connect through SSH
The exact SSH command depends on how the generated service exposes the pod.
If the service uses a NodePort such as `30022`, connect with:
```bash
ssh -p 30022 root@<node-ip>
```
Replace `<node-ip>` with the node's reachable IP address.
## Access the host environment
Inside the SSH session, the host filesystem is available under `/host`.
Useful checks:
```sh
ls -la /host
chroot /host /bin/sh -lc 'rc-status'
chroot /host /bin/sh -lc 'rc-service crio status'
chroot /host /bin/sh -lc 'rc-service kubelet status'
```
Restart CRI-O:
```sh
chroot /host /bin/sh -lc 'rc-service crio restart'
```
Restart kubelet:
```sh
chroot /host /bin/sh -lc 'rc-service kubelet restart'
```
You can also inspect host processes from the pod because the recovery pod uses the host PID namespace:
```sh
ps aux | grep -E 'kubelet|crio'
```
## Notes for monok8s host mounts
The recovery pod does not mount host `/` directly.
On monok8s, `/` and `/var` may be private mounts. Mounting them directly as host paths can fail with errors such as:
```text
path "/" is mounted on "/" but it is not a shared or slave mount
```
or:
```text
path "/var" is mounted on "/var" but it is not a shared or slave mount
```
Instead, the recovery pod assembles a minimal host root under `/host` from individual host paths.
For `/var`, it uses the backing path:
```text
/data/var -> /host/var
```
This avoids the private bind-mount issue.
## Remove the recovery pod
When recovery is complete, remove the generated resources.
If the resources use the default SSHD labels:
```bash
kubectl delete deployment -n mono-system -l app.kubernetes.io/name=sshd
kubectl delete service -n mono-system -l app.kubernetes.io/name=sshd
kubectl delete configmap -n mono-system -l app.kubernetes.io/name=sshd
```
If your generated manifest uses a fixed resource name, you can also remove them by name:
```bash
kubectl delete deployment -n mono-system sshd
kubectl delete service -n mono-system sshd
kubectl delete configmap -n mono-system sshd-authorized-keys
```
## Security warning
This pod is powerful.
It runs with root-level recovery access and can inspect or modify host files through `/host`. Treat it as a temporary break-glass tool, not a normal service.
Do not leave it running after recovery.

View File

@@ -1,19 +1,54 @@
## Upgrade process # OS OTA Upgrades
We use an agent to watch the OSUpgrade CRD to handle this. Our image versions follows upstream. MonoK8s upgrades are driven through two custom resources:
To issue an upgrade. Simply use - `OSUpgrade`: the user-facing upgrade request.
- `OSUpgradeProgress`: the per-node upgrade state watched and executed by the node agent.
The node agent does the actual upgrade work. It watches `OSUpgradeProgress` resources assigned to its node, downloads the selected image, writes it to the inactive rootfs partition, updates status, and reboots when ready.
The controller is optional but strongly recommended. It watches `OSUpgrade` resources and creates the matching `OSUpgradeProgress` resources for the target nodes.
## Install the controller
By default, each managed node only runs the node agent. The node agent does **not** watch `OSUpgrade` directly; it only watches `OSUpgradeProgress`.
You can create `OSUpgradeProgress` resources by hand, but normal users should not need to. Install the controller instead, then create `OSUpgrade` resources.
Install the controller from the existing node-agent image:
```bash
kubectl exec -i -n mono-system ds/node-agent -- \
ctl create controller --image REPO/IMAGE:TAG | kubectl apply -f -
```
### `--image`
`--image` is optional.
If omitted, the generated Deployment uses the local controller image that is already shipped with managed nodes. In that mode, the controller Deployment is scheduled only onto managed nodes because the image is expected to exist locally.
If provided, the generated Deployment uses that image directly. This is useful when you host the controller image in your own registry.
There is no official public image repository yet, so external controller images must currently be managed by the operator.
## Create an upgrade
Create an `OSUpgrade` resource to request an upgrade:
```bash
kubectl apply -f upgrade.yaml kubectl apply -f upgrade.yaml
```
Example:
Example yaml
```yaml ```yaml
apiVersion: monok8s.io/v1alpha1 apiVersion: monok8s.io/v1alpha1
kind: OSUpgrade kind: OSUpgrade
metadata: metadata:
name: "my-ugrade-2" name: my-upgrade-2
spec: spec:
version: "v1.35.3" version: v1.35.3
nodeSelector: {} nodeSelector: {}
catalog: catalog:
inline: | inline: |
@@ -34,24 +69,61 @@ spec:
- version: v1.35.1 - version: v1.35.1
url: http://localhost:8000/rootfs.ext4.zst url: http://localhost:8000/rootfs.ext4.zst
checksum: sha256:99af82a263deca44ad91d21d684f0fa944d5d0456a1da540f1c644f8aa59b14b checksum: sha256:99af82a263deca44ad91d21d684f0fa944d5d0456a1da540f1c644f8aa59b14b
size: 1858076672 # expanded image size in bytes, use "zstd -lv image.zst to check" size: 1858076672 # expanded image size in bytes; check with: zstd -lv image.zst
blocked: blocked:
- v1.34.0 - v1.34.0
``` ```
catalog also accepts URL or ConfigMap※ ### `spec.version`
`spec.version` is the requested target version.
It may be either:
- an explicit version, such as `v1.35.3`
- `stable`, if the catalog defines a `stable` version
### `spec.nodeSelector`
`spec.nodeSelector` selects the nodes that should receive the upgrade.
An empty selector means all eligible managed nodes.
### `spec.catalog`
The catalog tells the agent where to find available OS images.
The catalog can be provided inline:
```yaml ```yaml
catalog: catalog:
URL: https://example.com/images.yaml inline: |
stable: v1.35.1
catalog: images:
ConfigMap: images-cm - version: v1.35.1
url: https://example.invalid/images/monok8s-v1.35.1.img.zst
checksum: sha256:abc
size: 1858076672
``` ```
※ ConfigMap requires additional RBAC permissions which is not enabled by default. You can edit It can also be loaded from a URL:
the node-agent's ClusterRole and add `configmaps: get` to allow this.
```yaml
catalog:
url: https://example.com/images.yaml
```
Or from a ConfigMap:
```yaml
catalog:
configMap: images-cm
```
ConfigMap catalogs require extra RBAC. This permission is not enabled by default. To use a ConfigMap catalog, edit the relevant ClusterRole and allow `get` on `configmaps`.
Catalog content should look like this:
Contents should look like this
```yaml ```yaml
stable: v1.35.1 stable: v1.35.1
images: images:
@@ -70,64 +142,114 @@ images:
- version: v1.35.1 - version: v1.35.1
url: http://localhost:8000/rootfs.ext4.zst url: http://localhost:8000/rootfs.ext4.zst
checksum: sha256:99af82a263deca44ad91d21d684f0fa944d5d0456a1da540f1c644f8aa59b14b checksum: sha256:99af82a263deca44ad91d21d684f0fa944d5d0456a1da540f1c644f8aa59b14b
size: 1858076672 # expanded image size in bytes, use "zstd -lv image.zst to check" size: 1858076672 # expanded image size in bytes; check with: zstd -lv image.zst
blocked: blocked:
- v1.34.0 - v1.34.0
``` ```
### Monitoring the upgrades ## Monitor upgrades
kubectl get osugrades List upgrade requests:
```
NAME DESIRED RESOLVED PHASE TARGETS OK FAIL AGE ```bash
my-upgrade-3 stable v1.35.4 RollingOut 3 1 0 1m kubectl get osupgrades
my-upgrade-2 v1.35.3 v1.35.3 Accepted 2 0 0 1m
my-downgrade-1 v1.33.2 v1.33.2 Rejected 2 0 2 1m
``` ```
kubectl get osupgradeprogress Example output:
```text
NAME DESIRED RESOLVED PHASE
my-upgrade-3 stable v1.35.4 Pending
my-upgrade-2 v1.35.3 v1.35.3 Accepted
my-downgrade-1 v1.33.2 v1.33.2 Rejected
``` ```
List per-node progress:
```bash
kubectl get osupgradeprogresses
```
Example output:
```text
NAME NODE SOURCE CURRENT TARGET STATUS NAME NODE SOURCE CURRENT TARGET STATUS
osupgrade-abc123f node-1 my-upgrade-2 v1.34.1 v1.35.3 downloading osupgrade-abc123f node-1 my-upgrade-2 v1.34.1 v1.35.3 Downloading
osupgrade-cde456g node-2 my-upgrade-2 v1.35.3 v1.35.3 completed osupgrade-cde456g node-2 my-upgrade-2 v1.35.3 v1.35.3 Completed
``` ```
Inspect one node's progress:
```bash
kubectl describe osupgradeprogress osupgrade-abc123f kubectl describe osupgradeprogress osupgrade-abc123f
```
Example resource:
```yaml ```yaml
apiVersion: monok8s.io/v1alpha1 apiVersion: monok8s.io/v1alpha1
kind: OSUpgradeProgress kind: OSUpgradeProgress
metadata: metadata:
name: "osupgrade-abc123f" name: osupgrade-abc123f
spec: spec:
sourceRef: sourceRef:
name: my-upgrade-2 name: my-upgrade-2
nodeName: node-1 nodeName: node-1
status: status:
currentVersion: "v1.34.1" currentVersion: v1.34.1
targetVersion: "v1.35.3" targetVersion: v1.35.3
phase: Downloading phase: Downloading
startedAt: null startedAt: null
completedAt: null completedAt: null
lastUpdatedAt: null lastUpdatedAt: null
retryCount: 0 retryCount: 0
inactivePartition: "B" inactivePartition: B
failureReason: "" failureReason: ""
message: "" message: ""
``` ```
## Retry a failed upgrade
If an upgrade fails, for example because the image download failed, edit `spec.retryNonce` on the affected `OSUpgradeProgress` resource.
Any changed value is enough. The field is only used to tell the node agent that the user intentionally requested a retry.
Example:
```bash
kubectl patch osupgradeprogress osupgrade-abc123f \
--type merge \
-p '{"spec":{"retryNonce":"retry-1"}}'
```
If the same node fails again and you want to retry again, change the nonce to a new value:
```bash
kubectl patch osupgradeprogress osupgrade-abc123f \
--type merge \
-p '{"spec":{"retryNonce":"retry-2"}}'
```
## Development notes ## Development notes
### Flashing manually into partition B ### Flash an image manually into partition B
**Use nmap ncat**. Otherwise we'll have all kinds of fabulous issues sending it. Use nmap's `ncat`. Other tools may work, but they are more likely to cause annoying stream or connection behavior.
Sending side On the sending machine:
```
pv "out/rootfs.ext4.zst" | ncat 10.0.0.10 1234 --send-only ```bash
pv out/rootfs.ext4.zst | ncat 10.0.0.10 1234 --send-only
``` ```
Receiving side On the receiving machine:
```
ncat -l 1234 --recv-only | zstd -d -c | dd of=/dev/sda3 bs=4M status=progress && sync && echo "SUCCESS" ```bash
ncat -l 1234 --recv-only | \
zstd -d -c | \
dd of=/dev/sda3 bs=4M status=progress && \
sync && \
echo "SUCCESS"
``` ```
Be careful with the target partition. The example writes to `/dev/sda3`, which is assumed to be rootfs B in that setup. Verify the partition layout before running this on real hardware.