From 925c61e84b5d9ccaabe2af8bad574e3aaf565d97 Mon Sep 17 00:00:00 2001
From: Jille Timmermans
Date: Wed, 9 Sep 2020 11:48:26 +0100
Subject: [PATCH] Make leadership failovers transparent for clients

---
 README.md                | 42 +++++++++++++++++++++++----
 rafterrors/rafterrors.go | 61 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 97 insertions(+), 6 deletions(-)
 create mode 100644 rafterrors/rafterrors.go

diff --git a/README.md b/README.md
index 829d9dd..bdfbcf9 100644
--- a/README.md
+++ b/README.md
@@ -1,7 +1,5 @@
# raft-grpc-leader-rpc

-[![Godoc](https://godoc.org/github.com/Jille/raft-grpc-leader-rpc/leaderhealth?status.svg)](https://godoc.org/github.com/Jille/raft-grpc-leader-rpc/leaderhealth)
-
Send gRPCs to your [Raft](https://github.com/hashicorp/raft) leader.

It connects to all your Raft nodes and uses [client-side health checks](https://github.com/grpc/proposal/blob/master/A17-client-side-health-checking.md) to only send RPCs to the master.

@@ -10,6 +8,8 @@
During leader elections you'll see errors; make sure your client can handle those.

## Server side

+[![Godoc](https://godoc.org/github.com/Jille/raft-grpc-leader-rpc/leaderhealth?status.svg)](https://godoc.org/github.com/Jille/raft-grpc-leader-rpc/leaderhealth)
+
Add this to your server:

```go
@@ -37,13 +37,17 @@
Add this to your client:

```go
import _ "google.golang.org/grpc/health"

-c := `{"healthCheckConfig": {"serviceName": "your-service-name-or-an-empty-string"}, "loadBalancingConfig": [ { "round_robin": {} } ]}`
+c := `{"healthCheckConfig": {"serviceName": "quis.RaftLeader"}, "loadBalancingConfig": [ { "round_robin": {} } ]}`
conn, err := grpc.Dial("dns://all-your-raft-nodes.example.com", grpc.WithDefaultServiceConfig(c))
```

-Pick any of the service names you registered on the server (possibly the empty string if you used that).
+Instead of `quis.RaftLeader` you can also pick any of the service names you registered with leaderhealth.Setup().

-You'll need to create a DNS entry that points to all your Raft nodes. If you don't feel like doing that, you can use this instead:
+You'll need to create a DNS entry that points to all your Raft nodes.
+
+### No DNS entry?
+
+If you don't feel like doing that, you can use this instead:

```go
import _ "github.com/Jille/grpc-multi-resolver"

conn, err := grpc.Dial("multi:///127.0.0.1:50051,127.0.0.1:50052,127.0.0.1:50053", grpc.WithDefaultServiceConfig(c))
```

@@ -55,4 +59,30 @@
### Wait for Ready

-I recommend enabling [Wait for Ready](https://github.com/grpc/grpc/blob/master/doc/wait-for-ready.md) by adding `grpc.WithDefaultCallOption(grpc.WithWaitForReady(true))` to your grpc.Dial(). This lets gRPC wait for a connection to the leader rather than immediately failing it if the leader is currently unknown. The deadline is still honored.
+I recommend enabling [Wait for Ready](https://github.com/grpc/grpc/blob/master/doc/wait-for-ready.md) by adding `grpc.WithDefaultCallOptions(grpc.WaitForReady(true))` to your grpc.Dial(). This lets gRPC wait for a connection to the leader rather than immediately failing the RPC if the leader is currently unknown. The deadline is still honored.
+
+When you get errors like `connection active but health check failed.`, this is what you want to enable.
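To make the interaction between Wait for Ready and deadlines concrete, here is a rough sketch of a client that dials with the health-check service config from above and then issues an RPC with a timeout. The `pb` package and its `Example` service are hypothetical placeholders, and `grpc.WithInsecure()` is only there to keep the sketch self-contained:

```go
package main

import (
	"context"
	"log"
	"time"

	"google.golang.org/grpc"
	_ "google.golang.org/grpc/health" // enables client-side health checking

	pb "example.com/yourapp/proto" // hypothetical generated gRPC package
)

func main() {
	c := `{"healthCheckConfig": {"serviceName": "quis.RaftLeader"}, "loadBalancingConfig": [ { "round_robin": {} } ]}`
	conn, err := grpc.Dial("dns://all-your-raft-nodes.example.com",
		grpc.WithDefaultServiceConfig(c),
		grpc.WithInsecure(), // example only; use real transport credentials
		grpc.WithDefaultCallOptions(grpc.WaitForReady(true)))
	if err != nil {
		log.Fatalf("dial: %v", err)
	}
	defer conn.Close()

	// With Wait for Ready the RPC blocks until a healthy leader is found,
	// but never longer than this deadline.
	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
	defer cancel()
	if _, err := pb.NewExampleClient(conn).Get(ctx, &pb.GetRequest{}); err != nil {
		log.Printf("RPC failed: %v", err)
	}
}
```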
+## Automatic retries
+
+You can use https://godoc.org/github.com/grpc-ecosystem/go-grpc-middleware/retry to transparently retry failures without the client code having to know about it.
+
+You'll want to enable Wait for Ready as well, or retries won't be very transparent to your clients.
+
+Add this to your client:
+
+```go
+import grpc_retry "github.com/grpc-ecosystem/go-grpc-middleware/retry"
+
+retryOpts := []grpc_retry.CallOption{
+	grpc_retry.WithBackoff(grpc_retry.BackoffExponential(100 * time.Millisecond)),
+	grpc_retry.WithMax(5), // Give up after 5 retries.
+}
+grpc.Dial(..., grpc.WithUnaryInterceptor(grpc_retry.UnaryClientInterceptor(retryOpts...)))
+```
+
+Your server will need some more modifications: each of your RPCs needs to return an appropriate gRPC status code.
+
+[![Godoc](https://godoc.org/github.com/Jille/raft-grpc-leader-rpc/rafterrors?status.svg)](https://godoc.org/github.com/Jille/raft-grpc-leader-rpc/rafterrors)
+
+Make sure to read rafterrors' documentation to know when to use MarkRetriable vs MarkUnretriable; there's a pitfall.

diff --git a/rafterrors/rafterrors.go b/rafterrors/rafterrors.go
new file mode 100644
index 0000000..3e9e1e5
--- /dev/null
+++ b/rafterrors/rafterrors.go
@@ -0,0 +1,61 @@
+// Package rafterrors annotates Raft errors with gRPC status codes.
+//
+// Use MarkRetriable/MarkUnretriable to add a gRPC status code.
+//
+// Use MarkRetriable for atomic operations like Apply, ApplyLog, Barrier and changing configuration/voters.
+//
+// Use MarkUnretriable if your own application already made changes that it didn't roll back, and for Restore.
+// Restore performs multiple operations, and an error could come from the first or the second; it isn't safe to tell them apart.
+package rafterrors
+
+import (
+	"github.com/hashicorp/raft"
+	"google.golang.org/grpc/codes"
+	"google.golang.org/grpc/status"
+)
+
+// MarkRetriable annotates a Raft error with a gRPC status code, given that the entire operation is retriable.
+func MarkRetriable(err error) error {
+	return status.Error(RetriableCode(err), err.Error())
+}
+
+// MarkUnretriable annotates a Raft error with a gRPC status code, given that the entire operation is not retriable.
+func MarkUnretriable(err error) error {
+	return status.Error(UnretriableCode(err), err.Error())
+}
+
+// RetriableCode returns a gRPC status code for a given Raft error, given that the entire operation is retriable.
+func RetriableCode(err error) codes.Code {
+	return code(err, true)
+}
+
+// UnretriableCode returns a gRPC status code for a given Raft error, given that the entire operation is not retriable.
+func UnretriableCode(err error) codes.Code {
+	return code(err, false)
+}
+
+func code(err error, retriable bool) codes.Code {
+	switch err {
+	case raft.ErrLeader, raft.ErrNotLeader, raft.ErrLeadershipLost, raft.ErrRaftShutdown, raft.ErrLeadershipTransferInProgress:
+		if retriable {
+			return codes.Unavailable
+		}
+		return codes.Unknown
+	case raft.ErrAbortedByRestore:
+		return codes.Aborted
+	case raft.ErrEnqueueTimeout:
+		if retriable {
+			return codes.Unavailable
+		}
+		// DeadlineExceeded is generally considered not safe to retry, because (part of) the mutation might already have been applied.
+		// In hashicorp/raft there is one place where ErrEnqueueTimeout doesn't mean that nothing happened: Restore does two actions (restore + noop), and if the latter failed the restore might still have gone through.
+		// So sadly we can't return a more retriable error code here.
+		return codes.DeadlineExceeded
+	case raft.ErrNothingNewToSnapshot, raft.ErrCantBootstrap:
+		return codes.FailedPrecondition
+	case raft.ErrUnsupportedProtocol:
+		return codes.Unimplemented
+	default:
+		return codes.Unknown
+	}
+}
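For illustration, here is roughly what "return an appropriate gRPC status code" looks like in a server RPC handler built on top of this package. The `wordTracker` service and the `pb` message types are hypothetical; only `raft` and `rafterrors` come from the packages discussed above:

```go
package main

import (
	"context"
	"time"

	"github.com/Jille/raft-grpc-leader-rpc/rafterrors"
	"github.com/hashicorp/raft"

	pb "example.com/yourapp/proto" // hypothetical generated gRPC package
)

// wordTracker is a hypothetical service whose writes go through Raft.
type wordTracker struct {
	raft *raft.Raft
}

func (w *wordTracker) AddWord(ctx context.Context, req *pb.AddWordRequest) (*pb.AddWordResponse, error) {
	f := w.raft.Apply([]byte(req.GetWord()), time.Second)
	if err := f.Error(); err != nil {
		// Apply is atomic: either the whole entry was committed or nothing was,
		// so the client may safely retry, possibly against a new leader.
		return nil, rafterrors.MarkRetriable(err)
	}
	return &pb.AddWordResponse{}, nil
}
```

Combined with the retry interceptor above, a leader failover then shows up to clients as little more than a slightly slower RPC.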