Skip to content

Commit

Permalink
Merge pull request #416 from klihub/fixes/config/startup-error-propag…
Browse files Browse the repository at this point in the history
…ation

resmgr,agent: propagate startup config error back to CR.
  • Loading branch information
fmuyassarov authored Nov 25, 2024
2 parents ed3e6fa + a441a0f commit 47c4b11
Show file tree
Hide file tree
Showing 5 changed files with 56 additions and 17 deletions.
13 changes: 9 additions & 4 deletions pkg/agent/agent.go
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,7 @@ func TemplateConfigInterface() ConfigInterface {
}

// NotifyFn is a function to call when the effective configuration changes.
type NotifyFn func(cfg interface{}) error
type NotifyFn func(cfg interface{}) (bool, error)

var (
// Our logger instance for the agent.
Expand Down Expand Up @@ -544,12 +544,17 @@ func (a *Agent) updateConfig(cfg metav1.Object) {
}
}

err := a.notifyFn(cfg)
fatal, err := a.notifyFn(cfg)
a.patchConfigStatus(a.currentCfg, cfg, err)

if err != nil {
log.Errorf("failed to apply configuration: %v", err)
if fatal {
log.Fatalf("failed to apply configuration: %v", err)
} else {
log.Errorf("failed to apply configuration: %v", err)
}
}

a.patchConfigStatus(a.currentCfg, cfg, err)
a.currentCfg = cfg
}

Expand Down
14 changes: 7 additions & 7 deletions pkg/resmgr/resource-manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -128,17 +128,17 @@ func (m *resmgr) Start() error {
return nil
}

func (m *resmgr) updateConfig(newCfg interface{}) error {
func (m *resmgr) updateConfig(newCfg interface{}) (bool, error) {
if newCfg == nil {
return fmt.Errorf("can't run without effective configuration...")
return false, fmt.Errorf("can't run without effective configuration...")
}

cfg, ok := newCfg.(cfgapi.ResmgrConfig)
if !ok {
if !m.running {
log.Fatalf("got initial configuration of unexpected type %T", newCfg)
return true, fmt.Errorf("got initial configuration of unexpected type %T", newCfg)
} else {
return fmt.Errorf("got configuration of unexpected type %T", newCfg)
return false, fmt.Errorf("got configuration of unexpected type %T", newCfg)
}
}

Expand All @@ -151,17 +151,17 @@ func (m *resmgr) updateConfig(newCfg interface{}) error {
log.InfoBlock(" <initial config> ", "%s", dump)

if err := m.start(cfg); err != nil {
log.Fatalf("failed to start with initial configuration: %v", err)
return true, fmt.Errorf("failed to start with initial configuration: %v", err)
}

m.running = true
return nil
return false, nil
}

log.Infof("configuration update %s (generation %d):", meta.GetName(), meta.GetGeneration())
log.InfoBlock(" <updated config> ", "%s", dump)

return m.reconfigure(cfg)
return false, m.reconfigure(cfg)
}

// Start resource management once we acquired initial configuration.
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,13 @@
helm-terminate
CONFIG_GROUP="group.test"

cleanup() {
vm-command "kubectl delete -n kube-system topologyawarepolicies.config.nri/default" || :
vm-command "kubectl delete -n kube-system topologyawarepolicies.config.nri/$CONFIG_GROUP" || :
vm-command "kubectl label nodes --all config.nri/group-" || :
helm-terminate || :
}

cleanup
helm_config=$(instantiate helm-config.yaml) helm-launch topology-aware

sleep 1
Expand All @@ -12,6 +21,7 @@ vm-command "kubectl wait -n kube-system topologyawarepolicies/default \
error "expected initial Success status"
}

# verify propagation of errors back to source CR
vm-put-file $(RESERVED_CPU=750x instantiate custom-config.yaml) broken-config.yaml
vm-command "kubectl apply -f broken-config.yaml"

Expand All @@ -26,3 +36,19 @@ vm-command "kubectl wait -n kube-system topologyawarepolicies/default \
}

helm-terminate

# verify propagation of initial configuration errors back to source CR
vm-put-file $(CONFIG_NAME="$CONFIG_GROUP" RESERVED_CPU=750x instantiate custom-config.yaml) \
broken-group-config.yaml
vm-command "kubectl apply -f broken-group-config.yaml" || \
error "failed to install broken group config"
vm-command "kubectl label nodes --all config.nri/group=test" || \
error "failed to label nodes for group config"

expect_error=1 launch_timeout=5s helm_config=$(instantiate helm-config.yaml) helm-launch topology-aware
get-config-node-status-error topologyawarepolicies/$CONFIG_GROUP | \
grep "failed to parse" | grep -q 750x || {
error "expected initial error not found in status"
}

cleanup
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
apiVersion: config.nri/v1alpha1
kind: TopologyAwarePolicy
metadata:
name: default
name: ${CONFIG_NAME:-default}
namespace: kube-system
spec:
pinCPU: true
Expand Down
16 changes: 12 additions & 4 deletions test/e2e/run.sh
Original file line number Diff line number Diff line change
Expand Up @@ -353,6 +353,7 @@ helm-launch() { # script API
# default: 20s
# cfgresource: config custom resource to wait for node status to change in
# default: balloonspolicies/default or topologyawarepolicies/default
# expect_error: don't exit, expect availability error
#
# Example:
# helm_config=$(instantiate helm-config.yaml) helm-launch balloons
Expand All @@ -361,6 +362,7 @@ helm-launch() { # script API
local helm_config="${helm_config:-$TEST_DIR/helm-config.yaml}"
local ds_name="${daemonset_name:-}" ctr_name="${container_name:-nri-resource-policy-$1}"
local helm_name="${helm_name:-test}" timeout="${launch_timeout:-20s}"
local expect_error="${expect_error:-0}"
local plugin="$1" cfgresource=${cfgresource:-} cfgstatus
local deadline
shift
Expand Down Expand Up @@ -403,8 +405,15 @@ helm-launch() { # script API

deadline=$(deadline-for-timeout $timeout)
vm-command "kubectl wait -n kube-system ds/${ds_name} --timeout=$timeout \
--for=jsonpath='{.status.numberAvailable}'=1" || \
error "Timeout while waiting daemonset/${ds_name} to be available"
--for=jsonpath='{.status.numberAvailable}'=1"

if [ "$COMMAND_STATUS" != "0" ]; then
if [ "$expect_error" != "1" ]; then
error "Timeout while waiting daemonset/${ds_name} to be available"
else
return 1
fi
fi

timeout=$(timeout-for-deadline $deadline)
timeout=$timeout wait-config-node-status $cfgresource
Expand All @@ -413,7 +422,6 @@ helm-launch() { # script API
if [ "$result" != "Success" ]; then
reason=$(get-config-node-status-error $cfgresource)
error "Plugin $plugin configuration failed: $reason"
return 1
fi

vm-start-log-collection -n kube-system ds/$ds_name -c $ctr_name
Expand Down Expand Up @@ -502,7 +510,7 @@ get-config-node-status-result() {
get-config-node-status-error() {
local resource="$1" node="$(get-hostname-for-vm)"
vm-command-q "kubectl get -n kube-system $resource \
-ojsonpath=\"{.status.nodes['$node'].error}\""
-ojsonpath=\"{.status.nodes['$node'].errors}\""
}

wait-config-node-status() {
Expand Down

0 comments on commit 47c4b11

Please sign in to comment.