Skip to content

Commit

Permalink
DRA kubelet: refactor gRPC call timeouts
Browse files Browse the repository at this point in the history
Some of the E2E node tests were flaky. Their timeout apparently was chosen
under the assumption that kubelet would retry immediately after a failed gRPC
call, with a factor of 2 as safety margin. But according to
kubernetes@0449cef,
kubelet has a different, higher retry period of 90 seconds, which was exactly
the test timeout. The test timeout has to be higher than that.

As the tests don't use the gRPC call timeout anymore, it can be made
private. While at it, the name and documentation gets updated.
  • Loading branch information
pohly committed Jul 22, 2024
1 parent 357a292 commit d11b58e
Show file tree
Hide file tree
Showing 5 changed files with 66 additions and 65 deletions.
8 changes: 3 additions & 5 deletions pkg/kubelet/cm/dra/plugin/client.go
Original file line number Diff line number Diff line change
Expand Up @@ -33,8 +33,6 @@ import (
drapb "k8s.io/kubelet/pkg/apis/dra/v1alpha4"
)

const PluginClientTimeout = 45 * time.Second

// NewDRAPluginClient returns a wrapper around those gRPC methods of a DRA
// driver kubelet plugin which need to be called by kubelet. The wrapper
// handles gRPC connection management and logging. Connections are reused
Expand All @@ -60,7 +58,7 @@ type Plugin struct {
conn *grpc.ClientConn
endpoint string
highestSupportedVersion *utilversion.Version
clientTimeout time.Duration
clientCallTimeout time.Duration
}

func (p *Plugin) getOrCreateGRPCConn() (*grpc.ClientConn, error) {
Expand Down Expand Up @@ -116,7 +114,7 @@ func (p *Plugin) NodePrepareResources(
return nil, err
}

ctx, cancel := context.WithTimeout(ctx, p.clientTimeout)
ctx, cancel := context.WithTimeout(ctx, p.clientCallTimeout)
defer cancel()

nodeClient := drapb.NewNodeClient(conn)
Expand All @@ -138,7 +136,7 @@ func (p *Plugin) NodeUnprepareResources(
return nil, err
}

ctx, cancel := context.WithTimeout(ctx, p.clientTimeout)
ctx, cancel := context.WithTimeout(ctx, p.clientCallTimeout)
defer cancel()

nodeClient := drapb.NewNodeClient(conn)
Expand Down
6 changes: 3 additions & 3 deletions pkg/kubelet/cm/dra/plugin/client_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -243,9 +243,9 @@ func TestNodeUnprepareResources(t *testing.T) {
defer teardown()

p := &Plugin{
backgroundCtx: ctx,
endpoint: addr,
clientTimeout: PluginClientTimeout,
backgroundCtx: ctx,
endpoint: addr,
clientCallTimeout: defaultClientCallTimeout,
}

conn, err := p.getOrCreateGRPCConn()
Expand Down
13 changes: 11 additions & 2 deletions pkg/kubelet/cm/dra/plugin/plugin.go
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,15 @@ import (
"k8s.io/kubernetes/pkg/kubelet/pluginmanager/cache"
)

// defaultClientCallTimeout is the default amount of time that a DRA driver has
// to respond to any of the gRPC calls. kubelet uses this value by passing nil
// to RegisterPlugin. Some tests use a different, usually shorter timeout to
// speed up testing.
//
// This is half of the kubelet retry period (according to
// https://github.com/kubernetes/kubernetes/commit/0449cef8fd5217d394c5cd331d852bd50983e6b3).
const defaultClientCallTimeout = 45 * time.Second

// RegistrationHandler is the handler which is fed to the pluginwatcher API.
type RegistrationHandler struct {
// backgroundCtx is used for all future activities of the handler.
Expand Down Expand Up @@ -144,7 +153,7 @@ func (h *RegistrationHandler) RegisterPlugin(pluginName string, endpoint string,

var timeout time.Duration
if pluginClientTimeout == nil {
timeout = PluginClientTimeout
timeout = defaultClientCallTimeout
} else {
timeout = *pluginClientTimeout
}
Expand All @@ -157,7 +166,7 @@ func (h *RegistrationHandler) RegisterPlugin(pluginName string, endpoint string,
conn: nil,
endpoint: endpoint,
highestSupportedVersion: highestSupportedVersion,
clientTimeout: timeout,
clientCallTimeout: timeout,
}

// Storing endpoint of newly registered DRA Plugin into the map, where plugin name will be the key
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -137,26 +137,6 @@ func classWithConfig(name, driver, attribute string) *resourceapi.DeviceClass {
return class
}

// generate a DeviceClass object with the given name and the node selector
// that selects nodes with the region label set to either "west" or "east".
func classWithSuitableNodes(name, driver string) *resourceapi.DeviceClass {
class := class(name, driver)
class.Spec.SuitableNodes = &v1.NodeSelector{
NodeSelectorTerms: []v1.NodeSelectorTerm{
{
MatchExpressions: []v1.NodeSelectorRequirement{
{
Key: regionKey,
Operator: v1.NodeSelectorOpIn,
Values: []string{region1, region2},
},
},
},
},
}
return class
}

// generate a ResourceClaim object with the given name and device requests.
func claimWithRequests(name string, constraints []resourceapi.DeviceConstraint, requests ...resourceapi.DeviceRequest) *resourceapi.ResourceClaim {
return &resourceapi.ResourceClaim{
Expand Down
Loading

0 comments on commit d11b58e

Please sign in to comment.