From 925c61e84b5d9ccaabe2af8bad574e3aaf565d97 Mon Sep 17 00:00:00 2001
From: Jille Timmermans
Date: Wed, 9 Sep 2020 11:48:26 +0100
Subject: [PATCH] Make leadership failovers transparent for clients

---
 README.md                | 42 +++++++++++++++++++++++----
 rafterrors/rafterrors.go | 61 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 97 insertions(+), 6 deletions(-)
 create mode 100644 rafterrors/rafterrors.go

diff --git a/README.md b/README.md
index 829d9dd..bdfbcf9 100644
--- a/README.md
+++ b/README.md
@@ -1,7 +1,5 @@
# raft-grpc-leader-rpc

-[![Godoc](https://godoc.org/github.com/Jille/raft-grpc-leader-rpc/leaderhealth?status.svg)](https://godoc.org/github.com/Jille/raft-grpc-leader-rpc/leaderhealth)
-
Send gRPCs to your [Raft](https://github.com/hashicorp/raft) leader.

It connects to all your Raft nodes and uses [client-side health checks](https://github.com/grpc/proposal/blob/master/A17-client-side-health-checking.md) to only send RPCs to the master.

@@ -10,6 +8,8 @@
During leader elections you'll see errors; make sure your client can handle those.

## Server side

+[![Godoc](https://godoc.org/github.com/Jille/raft-grpc-leader-rpc/leaderhealth?status.svg)](https://godoc.org/github.com/Jille/raft-grpc-leader-rpc/leaderhealth)
+
Add this to your server:

```go
@@ -37,13 +37,17 @@
Add this to your client:

```go
import _ "google.golang.org/grpc/health"

-c := `{"healthCheckConfig": {"serviceName": "your-service-name-or-an-empty-string"}, "loadBalancingConfig": [ { "round_robin": {} } ]}`
+c := `{"healthCheckConfig": {"serviceName": "quis.RaftLeader"}, "loadBalancingConfig": [ { "round_robin": {} } ]}`
conn, err := grpc.Dial("dns://all-your-raft-nodes.example.com", grpc.WithDefaultServiceConfig(c))
```

-Pick any of the service names you registered on the server (possibly the empty string if you used that).
+Instead of `quis.RaftLeader` you can also pick any of the service names you registered with leaderhealth.Setup().

-You'll need to create a DNS entry that points to all your Raft nodes. If you don't feel like doing that, you can use this instead:
+You'll need to create a DNS entry that points to all your Raft nodes.
+
+### No DNS entry?
+
+If you don't feel like doing that, you can use this instead:

```go
import _ "github.com/Jille/grpc-multi-resolver"

conn, err := grpc.Dial("multi:///127.0.0.1:50051,127.0.0.1:50052,127.0.0.1:50053", grpc.WithDefaultServiceConfig(c))
```

@@ -55,4 +59,30 @@
### Wait for Ready

-I recommend enabling [Wait for Ready](https://github.com/grpc/grpc/blob/master/doc/wait-for-ready.md) by adding `grpc.WithDefaultCallOption(grpc.WithWaitForReady(true))` to your grpc.Dial(). This lets gRPC wait for a connection to the leader rather than immediately failing it if the leader is currently unknown. The deadline is still honored.
+I recommend enabling [Wait for Ready](https://github.com/grpc/grpc/blob/master/doc/wait-for-ready.md) by adding `grpc.WithDefaultCallOptions(grpc.WaitForReady(true))` to your grpc.Dial(). This lets gRPC wait for a connection to the leader rather than immediately failing the RPC if the leader is currently unknown. The deadline is still honored.
+
+When you get errors like `connection active but health check failed.`, this is what you want to enable.
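To make the interaction between Wait for Ready and deadlines concrete, here is a rough sketch of a client that dials with the health-check service config from above and then issues an RPC with a timeout. The `pb` package and its `Example` service are hypothetical placeholders, and `grpc.WithInsecure()` is only there to keep the sketch self-contained:

```go
package main

import (
	"context"
	"log"
	"time"

	"google.golang.org/grpc"
	_ "google.golang.org/grpc/health" // enables client-side health checking

	pb "example.com/yourapp/proto" // hypothetical generated gRPC package
)

func main() {
	c := `{"healthCheckConfig": {"serviceName": "quis.RaftLeader"}, "loadBalancingConfig": [ { "round_robin": {} } ]}`
	conn, err := grpc.Dial("dns://all-your-raft-nodes.example.com",
		grpc.WithDefaultServiceConfig(c),
		grpc.WithInsecure(), // example only; use real transport credentials
		grpc.WithDefaultCallOptions(grpc.WaitForReady(true)))
	if err != nil {
		log.Fatalf("dial: %v", err)
	}
	defer conn.Close()

	// With Wait for Ready the RPC blocks until a healthy leader is found,
	// but never longer than this deadline.
	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
	defer cancel()
	if _, err := pb.NewExampleClient(conn).Get(ctx, &pb.GetRequest{}); err != nil {
		log.Printf("RPC failed: %v", err)
	}
}
```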
+## Automatic retries
+
+You can use https://godoc.org/github.com/grpc-ecosystem/go-grpc-middleware/retry to transparently retry failures without the client code having to know about it.
+
+You'll want to enable Wait for Ready as well, or retries won't be very transparent to your clients.
+
+Add this to your client:
+
+```go
+import grpc_retry "github.com/grpc-ecosystem/go-grpc-middleware/retry"
+
+retryOpts := []grpc_retry.CallOption{
+	grpc_retry.WithBackoff(grpc_retry.BackoffExponential(100 * time.Millisecond)),
+	grpc_retry.WithMax(5), // Give up after 5 retries.
+}
+grpc.Dial(..., grpc.WithUnaryInterceptor(grpc_retry.UnaryClientInterceptor(retryOpts...)))
+```
+
+Your server will need some more modifications: each of your RPCs needs to return an appropriate gRPC status code.
+
+[![Godoc](https://godoc.org/github.com/Jille/raft-grpc-leader-rpc/rafterrors?status.svg)](https://godoc.org/github.com/Jille/raft-grpc-leader-rpc/rafterrors)
+
+Make sure to read rafterrors' documentation to know when to use MarkRetriable vs MarkUnretriable; there's a pitfall.

diff --git a/rafterrors/rafterrors.go b/rafterrors/rafterrors.go
new file mode 100644
index 0000000..3e9e1e5
--- /dev/null
+++ b/rafterrors/rafterrors.go
@@ -0,0 +1,61 @@
+// Package rafterrors annotates Raft errors with gRPC status codes.
+//
+// Use MarkRetriable/MarkUnretriable to add a gRPC status code.
+//
+// Use MarkRetriable for atomic operations like Apply, ApplyLog, Barrier and changing configuration/voters.
+//
+// Use MarkUnretriable if your own application already made changes that it didn't roll back, and for Restore.
+// Restore performs multiple operations, and an error could come from the first or the second; it isn't safe to tell them apart.
+package rafterrors
+
+import (
+	"github.com/hashicorp/raft"
+	"google.golang.org/grpc/codes"
+	"google.golang.org/grpc/status"
+)
+
+// MarkRetriable annotates a Raft error with a gRPC status code, given that the entire operation is retriable.
+func MarkRetriable(err error) error {
+	return status.Error(RetriableCode(err), err.Error())
+}
+
+// MarkUnretriable annotates a Raft error with a gRPC status code, given that the entire operation is not retriable.
+func MarkUnretriable(err error) error {
+	return status.Error(UnretriableCode(err), err.Error())
+}
+
+// RetriableCode returns a gRPC status code for a given Raft error, given that the entire operation is retriable.
+func RetriableCode(err error) codes.Code {
+	return code(err, true)
+}
+
+// UnretriableCode returns a gRPC status code for a given Raft error, given that the entire operation is not retriable.
+func UnretriableCode(err error) codes.Code {
+	return code(err, false)
+}
+
+func code(err error, retriable bool) codes.Code {
+	switch err {
+	case raft.ErrLeader, raft.ErrNotLeader, raft.ErrLeadershipLost, raft.ErrRaftShutdown, raft.ErrLeadershipTransferInProgress:
+		if retriable {
+			return codes.Unavailable
+		}
+		return codes.Unknown
+	case raft.ErrAbortedByRestore:
+		return codes.Aborted
+	case raft.ErrEnqueueTimeout:
+		if retriable {
+			return codes.Unavailable
+		}
+		// DeadlineExceeded is generally considered not safe to retry, because (part of) the mutation might already have been applied.
+		// In hashicorp/raft there is one place where ErrEnqueueTimeout doesn't mean that nothing happened: Restore does two actions (restore + noop), and if the latter failed the restore might still have gone through.
+		// So sadly we can't return a more retriable error code here.
+		return codes.DeadlineExceeded
+	case raft.ErrNothingNewToSnapshot, raft.ErrCantBootstrap:
+		return codes.FailedPrecondition
+	case raft.ErrUnsupportedProtocol:
+		return codes.Unimplemented
+	default:
+		return codes.Unknown
+	}
+}
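For illustration, here is roughly what "return an appropriate gRPC status code" looks like in a server RPC handler built on top of this package. The `wordTracker` service and the `pb` message types are hypothetical; only `raft` and `rafterrors` come from the packages discussed above:

```go
package main

import (
	"context"
	"time"

	"github.com/Jille/raft-grpc-leader-rpc/rafterrors"
	"github.com/hashicorp/raft"

	pb "example.com/yourapp/proto" // hypothetical generated gRPC package
)

// wordTracker is a hypothetical service whose writes go through Raft.
type wordTracker struct {
	raft *raft.Raft
}

func (w *wordTracker) AddWord(ctx context.Context, req *pb.AddWordRequest) (*pb.AddWordResponse, error) {
	f := w.raft.Apply([]byte(req.GetWord()), time.Second)
	if err := f.Error(); err != nil {
		// Apply is atomic: either the whole entry was committed or nothing was,
		// so the client may safely retry, possibly against a new leader.
		return nil, rafterrors.MarkRetriable(err)
	}
	return &pb.AddWordResponse{}, nil
}
```

Combined with the retry interceptor above, a leader failover then shows up to clients as little more than a slightly slower RPC.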