balancer: automatically stop producers on subchannel state changes (#…
dfawley authored Sep 30, 2024
1 parent 941102b commit ca4865d
Showing 9 changed files with 133 additions and 320 deletions.
13 changes: 9 additions & 4 deletions balancer/balancer.go
@@ -142,8 +142,11 @@ type SubConn interface {
Connect()
// GetOrBuildProducer returns a reference to the existing Producer for this
// ProducerBuilder in this SubConn, or, if one does not currently exist,
// creates a new one and returns it. Returns a close function which must
// be called when the Producer is no longer needed.
// creates a new one and returns it. Returns a close function which may be
// called when the Producer is no longer needed. Otherwise the producer
// will automatically be closed upon connection loss or subchannel close.
// Should only be called on a SubConn in state Ready. Otherwise the
// producer will be unable to create streams.
GetOrBuildProducer(ProducerBuilder) (p Producer, close func())
// Shutdown shuts down the SubConn gracefully. Any started RPCs will be
// allowed to complete. No future calls should be made on the SubConn.
@@ -452,8 +455,10 @@ type ProducerBuilder interface {
// Build creates a Producer. The first parameter is always a
// grpc.ClientConnInterface (a type to allow creating RPCs/streams on the
// associated SubConn), but is declared as `any` to avoid a dependency
// cycle. Should also return a close function that will be called when all
// references to the Producer have been given up.
// cycle. Build also returns a close function that will be called when all
// references to the Producer have been given up for a SubConn, or when a
// connectivity state change occurs on the SubConn. The close function
// should always block until all asynchronous cleanup work is completed.
Build(grpcClientConnInterface any) (p Producer, close func())
}

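The comments above change the producer lifecycle contract: the close function returned by `GetOrBuildProducer` becomes optional, and the method should only be called on a SubConn that is Ready. Below is a minimal sketch, under assumed names and listener wiring (none of this is part of the commit), of how a balancer's StateListener might use the API under the new rules.

```go
package example

import (
	"google.golang.org/grpc/balancer"
	"google.golang.org/grpc/connectivity"
)

// onSubConnState builds a producer only once the SubConn reports Ready.
// Under the new contract the returned close function is optional: even if
// the balancer never calls it, the producer is closed automatically when the
// subchannel loses its connection or shuts down.
func onSubConnState(sc balancer.SubConn, state balancer.SubConnState, pb balancer.ProducerBuilder) {
	if state.ConnectivityState != connectivity.Ready {
		// Nothing to clean up here: any producer previously built on this
		// SubConn has already been closed on the state change.
		return
	}
	// The SubConn is Ready, so the producer can create streams immediately.
	producer, closeFn := sc.GetOrBuildProducer(pb)
	_ = producer // a real balancer would typically hand this to its picker
	_ = closeFn  // may be called early, but is no longer required
}
```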
22 changes: 12 additions & 10 deletions balancer/weightedroundrobin/balancer.go
@@ -526,17 +526,21 @@ func (w *weightedSubConn) updateConfig(cfg *lbConfig) {
w.cfg = cfg
w.mu.Unlock()

newPeriod := cfg.OOBReportingPeriod
if cfg.EnableOOBLoadReport == oldCfg.EnableOOBLoadReport &&
newPeriod == oldCfg.OOBReportingPeriod {
cfg.OOBReportingPeriod == oldCfg.OOBReportingPeriod {
// Load reporting wasn't enabled before or after, or load reporting was
// enabled before and after, and had the same period. (Note that with
// load reporting disabled, OOBReportingPeriod is always 0.)
return
}
// (Optionally stop and) start the listener to use the new config's
// settings for OOB reporting.
if w.connectivityState == connectivity.Ready {
// (Re)start the listener to use the new config's settings for OOB
// reporting.
w.updateORCAListener(cfg)
}
}

func (w *weightedSubConn) updateORCAListener(cfg *lbConfig) {
if w.stopORCAListener != nil {
w.stopORCAListener()
}
@@ -545,9 +549,9 @@ func (w *weightedSubConn) updateConfig(cfg *lbConfig) {
return
}
if w.logger.V(2) {
w.logger.Infof("Registering ORCA listener for %v with interval %v", w.SubConn, newPeriod)
w.logger.Infof("Registering ORCA listener for %v with interval %v", w.SubConn, cfg.OOBReportingPeriod)
}
opts := orca.OOBListenerOptions{ReportInterval: time.Duration(newPeriod)}
opts := orca.OOBListenerOptions{ReportInterval: time.Duration(cfg.OOBReportingPeriod)}
w.stopORCAListener = orca.RegisterOOBListener(w.SubConn, w, opts)
}

@@ -569,11 +573,9 @@ func (w *weightedSubConn) updateConnectivityState(cs connectivity.State) connect
w.mu.Lock()
w.nonEmptySince = time.Time{}
w.lastUpdated = time.Time{}
cfg := w.cfg
w.mu.Unlock()
case connectivity.Shutdown:
if w.stopORCAListener != nil {
w.stopORCAListener()
}
w.updateORCAListener(cfg)
}

oldCS := w.connectivityState
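The weighted round robin change above follows the same pattern: the ORCA listener is (re)started only when the connection reports Ready, and nothing is stopped explicitly on Shutdown because the underlying producer is now torn down automatically. A rough sketch of that pattern, with illustrative type and field names rather than the real weightedSubConn:

```go
package example

import "google.golang.org/grpc/connectivity"

// watchedConn sketches the state handling adopted above: a watcher (such as
// an ORCA OOB listener) is restarted only on Ready, and no explicit stop is
// needed on Shutdown or connection loss. Names are illustrative assumptions.
type watchedConn struct {
	startWatcher func() (stop func()) // starts a watcher and returns its stop func
	stopWatcher  func()               // stops the currently running watcher, if any
}

func (w *watchedConn) onStateChange(cs connectivity.State) {
	if cs != connectivity.Ready {
		// No cleanup required: the producer backing the watcher is closed
		// automatically when the subchannel leaves Ready or shuts down.
		return
	}
	if w.stopWatcher != nil {
		// Restart so the watcher picks up the current configuration.
		w.stopWatcher()
	}
	w.stopWatcher = w.startWatcher()
}
```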
49 changes: 30 additions & 19 deletions balancer_wrapper.go
@@ -24,12 +24,14 @@ import (
"sync"

"google.golang.org/grpc/balancer"
"google.golang.org/grpc/codes"
"google.golang.org/grpc/connectivity"
"google.golang.org/grpc/internal"
"google.golang.org/grpc/internal/balancer/gracefulswitch"
"google.golang.org/grpc/internal/channelz"
"google.golang.org/grpc/internal/grpcsync"
"google.golang.org/grpc/resolver"
"google.golang.org/grpc/status"
)

var setConnectedAddress = internal.SetConnectedAddress.(func(*balancer.SubConnState, resolver.Address))
@@ -256,17 +258,20 @@ type acBalancerWrapper struct {
ccb *ccBalancerWrapper // read-only
stateListener func(balancer.SubConnState)

mu sync.Mutex
producers map[balancer.ProducerBuilder]*refCountedProducer
producersMu sync.Mutex
producers map[balancer.ProducerBuilder]*refCountedProducer
}

// updateState is invoked by grpc to push a subConn state update to the
// underlying balancer.
func (acbw *acBalancerWrapper) updateState(s connectivity.State, curAddr resolver.Address, err error, readyChan chan struct{}) {
func (acbw *acBalancerWrapper) updateState(s connectivity.State, curAddr resolver.Address, err error) {
acbw.ccb.serializer.TrySchedule(func(ctx context.Context) {
if ctx.Err() != nil || acbw.ccb.balancer == nil {
return
}
// Invalidate all producers on any state change.
acbw.closeProducers()

// Even though it is optional for balancers, gracefulswitch ensures
// opts.StateListener is set, so this cannot ever be nil.
// TODO: delete this comment when UpdateSubConnState is removed.
@@ -275,15 +280,6 @@ func (acbw *acBalancerWrapper) updateState(s connectivity.State, curAddr resolve
setConnectedAddress(&scs, curAddr)
}
acbw.stateListener(scs)
acbw.ac.mu.Lock()
defer acbw.ac.mu.Unlock()
if s == connectivity.Ready {
// When changing states to READY, close stateReadyChan. Wait until
// after we notify the LB policy's listener(s) in order to prevent
// ac.getTransport() from unblocking before the LB policy starts
// tracking the subchannel as READY.
close(readyChan)
}
})
}

@@ -300,16 +296,18 @@ func (acbw *acBalancerWrapper) Connect() {
}

func (acbw *acBalancerWrapper) Shutdown() {
acbw.closeProducers()
acbw.ccb.cc.removeAddrConn(acbw.ac, errConnDrain)
}

// NewStream begins a streaming RPC on the addrConn. If the addrConn is not
// ready, blocks until it is or ctx expires. Returns an error when the context
// expires or the addrConn is shut down.
func (acbw *acBalancerWrapper) NewStream(ctx context.Context, desc *StreamDesc, method string, opts ...CallOption) (ClientStream, error) {
transport, err := acbw.ac.getTransport(ctx)
if err != nil {
return nil, err
transport := acbw.ac.getReadyTransport()
if transport == nil {
return nil, status.Errorf(codes.Unavailable, "SubConn state is not Ready")

}
return newNonRetryClientStream(ctx, desc, method, transport, acbw.ac, opts...)
}
@@ -334,8 +332,8 @@ type refCountedProducer struct {
}

func (acbw *acBalancerWrapper) GetOrBuildProducer(pb balancer.ProducerBuilder) (balancer.Producer, func()) {
acbw.mu.Lock()
defer acbw.mu.Unlock()
acbw.producersMu.Lock()
defer acbw.producersMu.Unlock()

// Look up existing producer from this builder.
pData := acbw.producers[pb]
@@ -352,13 +350,26 @@ func (acbw *acBalancerWrapper) GetOrBuildProducer(pb balancer.ProducerBuilder) (
// and delete the refCountedProducer from the map if the total reference
// count goes to zero.
unref := func() {
acbw.mu.Lock()
acbw.producersMu.Lock()
// If closeProducers has already closed this producer instance, refs is
// set to 0, so the check after decrementing will never pass, and the
// producer will not be double-closed.
pData.refs--
if pData.refs == 0 {
defer pData.close() // Run outside the acbw mutex
delete(acbw.producers, pb)
}
acbw.mu.Unlock()
acbw.producersMu.Unlock()
}
return pData.producer, grpcsync.OnceFunc(unref)
}

func (acbw *acBalancerWrapper) closeProducers() {
acbw.producersMu.Lock()
defer acbw.producersMu.Unlock()
for pb, pData := range acbw.producers {
pData.refs = 0
pData.close()
delete(acbw.producers, pb)
}
}
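
The new closeProducers path interacts with the existing per-producer reference counting: force-closing zeroes refs so a later unref from a balancer cannot close the producer a second time. A self-contained sketch of that interplay, using a string-keyed map and illustrative names instead of the real ProducerBuilder-keyed map:

```go
package example

import "sync"

// producerEntry mirrors refCountedProducer in spirit: a shared resource with
// a reference count and a close function.
type producerEntry struct {
	refs  int
	close func()
}

// producerSet sketches the acBalancerWrapper bookkeeping; the string key and
// names are illustrative assumptions.
type producerSet struct {
	mu      sync.Mutex
	entries map[string]*producerEntry
}

// getOrAdd returns the shared entry for key, creating it if needed, plus an
// unref function. The real code additionally wraps unref in grpcsync.OnceFunc
// so a single holder cannot decrement the count twice.
func (s *producerSet) getOrAdd(key string, closeFn func()) (*producerEntry, func()) {
	s.mu.Lock()
	if s.entries == nil {
		s.entries = make(map[string]*producerEntry)
	}
	e := s.entries[key]
	if e == nil {
		e = &producerEntry{close: closeFn}
		s.entries[key] = e
	}
	e.refs++
	s.mu.Unlock()

	return e, func() {
		s.mu.Lock()
		// If closeAll already ran, refs was zeroed and the entry closed, so
		// this decrement goes negative and the check below never passes:
		// the entry cannot be closed a second time.
		e.refs--
		if e.refs == 0 {
			defer e.close() // runs after the explicit Unlock below, outside the lock
			delete(s.entries, key)
		}
		s.mu.Unlock()
	}
}

// closeAll force-closes every entry, as closeProducers does on any
// subchannel state change or shutdown.
func (s *producerSet) closeAll() {
	s.mu.Lock()
	defer s.mu.Unlock()
	for key, e := range s.entries {
		e.refs = 0
		e.close()
		delete(s.entries, key)
	}
}
```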
53 changes: 9 additions & 44 deletions clientconn.go
@@ -825,14 +825,13 @@ func (cc *ClientConn) newAddrConnLocked(addrs []resolver.Address, opts balancer.
}

ac := &addrConn{
state: connectivity.Idle,
cc: cc,
addrs: copyAddresses(addrs),
scopts: opts,
dopts: cc.dopts,
channelz: channelz.RegisterSubChannel(cc.channelz, ""),
resetBackoff: make(chan struct{}),
stateReadyChan: make(chan struct{}),
state: connectivity.Idle,
cc: cc,
addrs: copyAddresses(addrs),
scopts: opts,
dopts: cc.dopts,
channelz: channelz.RegisterSubChannel(cc.channelz, ""),
resetBackoff: make(chan struct{}),
}
ac.ctx, ac.cancel = context.WithCancel(cc.ctx)
// Start with our address set to the first address; this may be updated if
@@ -1179,8 +1178,7 @@ type addrConn struct {
addrs []resolver.Address // All addresses that the resolver resolved to.

// Use updateConnectivityState for updating addrConn's connectivity state.
state connectivity.State
stateReadyChan chan struct{} // closed and recreated on every READY state change.
state connectivity.State

backoffIdx int // Needs to be stateful for resetConnectBackoff.
resetBackoff chan struct{}
@@ -1193,22 +1191,14 @@ func (ac *addrConn) updateConnectivityState(s connectivity.State, lastErr error)
if ac.state == s {
return
}
if ac.state == connectivity.Ready {
// When leaving ready, re-create the ready channel.
ac.stateReadyChan = make(chan struct{})
}
if s == connectivity.Shutdown {
// Wake any producer waiting to create a stream on the transport.
close(ac.stateReadyChan)
}
ac.state = s
ac.channelz.ChannelMetrics.State.Store(&s)
if lastErr == nil {
channelz.Infof(logger, ac.channelz, "Subchannel Connectivity change to %v", s)
} else {
channelz.Infof(logger, ac.channelz, "Subchannel Connectivity change to %v, last error: %s", s, lastErr)
}
ac.acbw.updateState(s, ac.curAddr, lastErr, ac.stateReadyChan)
ac.acbw.updateState(s, ac.curAddr, lastErr)
}

// adjustParams updates parameters used to create transports upon
@@ -1512,31 +1502,6 @@ func (ac *addrConn) getReadyTransport() transport.ClientTransport {
return nil
}

// getTransport waits until the addrconn is ready and returns the transport.
// If the context expires first, returns an appropriate status. If the
// addrConn is stopped first, returns an Unavailable status error.
func (ac *addrConn) getTransport(ctx context.Context) (transport.ClientTransport, error) {
for ctx.Err() == nil {
ac.mu.Lock()
t, state, readyChan := ac.transport, ac.state, ac.stateReadyChan
ac.mu.Unlock()
if state == connectivity.Shutdown {
// Return an error immediately in only this case since a connection
// will never occur.
return nil, status.Errorf(codes.Unavailable, "SubConn shutting down")
}

select {
case <-ctx.Done():
case <-readyChan:
if state == connectivity.Ready {
return t, nil
}
}
}
return nil, status.FromContextError(ctx.Err()).Err()
}

// tearDown starts to tear down the addrConn.
//
// Note that tearDown doesn't remove ac from ac.cc.conns, so the addrConn struct
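With getTransport and stateReadyChan removed, a stream created through the SubConn wrapper no longer blocks waiting for readiness; it fails fast with an Unavailable status. Below is a rough sketch, with an assumed method name and stream descriptor, of how producer-side code calling NewStream on the ClientConnInterface passed to Build might observe this:

```go
package example

import (
	"context"

	"google.golang.org/grpc"
	"google.golang.org/grpc/codes"
	"google.golang.org/grpc/status"
)

// startWatchStream creates a stream on the ClientConnInterface handed to
// ProducerBuilder.Build. The service, method, and stream names are
// illustrative assumptions.
func startWatchStream(ctx context.Context, cci grpc.ClientConnInterface) (grpc.ClientStream, error) {
	desc := &grpc.StreamDesc{StreamName: "Watch", ServerStreams: true}
	stream, err := cci.NewStream(ctx, desc, "/example.Service/Watch")
	if status.Code(err) == codes.Unavailable {
		// The SubConn is not currently Ready. There is no point in blocking
		// or polling: the producer will be closed on the state change and
		// rebuilt (and this call retried) when the subchannel is Ready again.
		return nil, err
	}
	return stream, err
}
```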
11 changes: 4 additions & 7 deletions interop/orcalb.go
@@ -46,9 +46,8 @@ func (orcabb) Name() string {
}

type orcab struct {
cc balancer.ClientConn
sc balancer.SubConn
cancelWatch func()
cc balancer.ClientConn
sc balancer.SubConn

reportMu sync.Mutex
report *v3orcapb.OrcaLoadReport
@@ -70,7 +69,6 @@ func (o *orcab) UpdateClientConnState(s balancer.ClientConnState) error {
o.cc.UpdateState(balancer.State{ConnectivityState: connectivity.TransientFailure, Picker: base.NewErrPicker(fmt.Errorf("error creating subconn: %v", err))})
return nil
}
o.cancelWatch = orca.RegisterOOBListener(o.sc, o, orca.OOBListenerOptions{ReportInterval: time.Second})
o.sc.Connect()
o.cc.UpdateState(balancer.State{ConnectivityState: connectivity.Connecting, Picker: base.NewErrPicker(balancer.ErrNoSubConnAvailable)})
return nil
@@ -89,6 +87,7 @@ func (o *orcab) UpdateSubConnState(sc balancer.SubConn, state balancer.SubConnSt
func (o *orcab) updateSubConnState(state balancer.SubConnState) {
switch state.ConnectivityState {
case connectivity.Ready:
orca.RegisterOOBListener(o.sc, o, orca.OOBListenerOptions{ReportInterval: time.Second})
o.cc.UpdateState(balancer.State{ConnectivityState: connectivity.Ready, Picker: &orcaPicker{o: o}})
case connectivity.TransientFailure:
o.cc.UpdateState(balancer.State{ConnectivityState: connectivity.TransientFailure, Picker: base.NewErrPicker(fmt.Errorf("all subchannels in transient failure: %v", state.ConnectionError))})
@@ -102,9 +101,7 @@ func (o *orcab) updateSubConnState(state balancer.SubConnState) {
}
}

func (o *orcab) Close() {
o.cancelWatch()
}
func (o *orcab) Close() {}

func (o *orcab) OnLoadReport(r *v3orcapb.OrcaLoadReport) {
o.reportMu.Lock()
19 changes: 11 additions & 8 deletions orca/producer.go
@@ -46,6 +46,12 @@ func (*producerBuilder) Build(cci any) (balancer.Producer, func()) {
backoff: internal.DefaultBackoffFunc,
}
return p, func() {
p.mu.Lock()
if p.stop != nil {
p.stop()
p.stop = nil
}
p.mu.Unlock()
<-p.stopped
}
}
@@ -67,19 +73,16 @@ type OOBListenerOptions struct {
ReportInterval time.Duration
}

// RegisterOOBListener registers an out-of-band load report listener on sc.
// Any OOBListener may only be registered once per subchannel at a time. The
// returned stop function must be called when no longer needed. Do not
// RegisterOOBListener registers an out-of-band load report listener on a Ready
// sc. Any OOBListener may only be registered once per subchannel at a time.
// The returned stop function must be called when no longer needed. Do not
// register a single OOBListener more than once per SubConn.
func RegisterOOBListener(sc balancer.SubConn, l OOBListener, opts OOBListenerOptions) (stop func()) {
pr, closeFn := sc.GetOrBuildProducer(producerBuilderSingleton)
p := pr.(*producer)

p.registerListener(l, opts.ReportInterval)

// TODO: When we can register for SubConn state updates, automatically call
// stop() on SHUTDOWN.

// If stop is called multiple times, prevent it from having any effect on
// subsequent calls.
return grpcsync.OnceFunc(func() {
@@ -96,13 +99,13 @@ type producer struct {
// is incremented when stream errors occur and is reset when the stream
// reports a result.
backoff func(int) time.Duration
stopped chan struct{} // closed when the run goroutine exits

mu sync.Mutex
intervals map[time.Duration]int // map from interval time to count of listeners requesting that time
listeners map[OOBListener]struct{} // set of registered listeners
minInterval time.Duration
stop func() // stops the current run goroutine
stopped chan struct{} // closed when the run goroutine exits
stop func() // stops the current run goroutine
}

// registerListener adds the listener and its requested report interval to the
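The producer itself now owns its shutdown: Build's close function stops the run goroutine and then blocks on a stopped channel, matching the new requirement that close wait for asynchronous cleanup to finish. A compact sketch of that pattern (the producer body is a placeholder, not the real ORCA stream loop):

```go
package example

import (
	"context"
	"sync"

	"google.golang.org/grpc/balancer"
)

// exampleProducer sketches the lifecycle the ORCA producer now follows:
// Build starts a background goroutine, and the returned close function
// stops it and then blocks until it has fully exited.
type exampleProducer struct {
	mu      sync.Mutex
	stop    func()        // cancels the run goroutine
	stopped chan struct{} // closed when the run goroutine exits
}

type exampleBuilder struct{}

func (exampleBuilder) Build(cci any) (balancer.Producer, func()) {
	ctx, cancel := context.WithCancel(context.Background())
	p := &exampleProducer{stop: cancel, stopped: make(chan struct{})}
	go func() {
		defer close(p.stopped)
		<-ctx.Done() // placeholder for the real stream-handling loop
	}()
	return p, func() {
		p.mu.Lock()
		if p.stop != nil {
			p.stop()
			p.stop = nil
		}
		p.mu.Unlock()
		<-p.stopped // block until asynchronous cleanup is complete
	}
}
```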
(Diffs for the remaining 3 changed files are not shown.)
