Added an end maintenance API which allows DRAINED hosts to be transitioned back to ACTIVE. Fixed a bug where a payload error would never be returned if a call failed due to a bad payload.

Renan DelValle 2017-09-27 12:55:50 -07:00
parent f59f0bbdc3
commit 8fe3780949
2 changed files with 91 additions and 6 deletions
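
For context, a minimal sketch of how a caller might use the new EndMaintenance call to transition DRAINED hosts back to ACTIVE. It assumes a client r that satisfies the Realis interface has already been constructed, and the import path is an assumption rather than part of this commit:

```go
package example

import (
	"fmt"
	"log"

	realis "github.com/rdelvalle/gorealis" // assumed import path for this repository
)

// endMaintenanceExample transitions the given hosts from DRAINED back to
// ACTIVE using the new EndMaintenance API. The client r is assumed to have
// been created elsewhere (e.g. from a cluster config or a scheduler URL).
func endMaintenanceExample(r realis.Realis, hosts []string) {
	// EndMaintenance mirrors DrainHosts: it returns the raw thrift response,
	// a typed result struct, and an error that now prioritizes payload errors
	// over connection errors.
	resp, result, err := r.EndMaintenance(hosts...)
	if err != nil {
		log.Fatalf("unable to end maintenance: %v", err)
	}
	if resp != nil {
		fmt.Println(resp.String())
	}
	if result != nil {
		fmt.Println(result.String())
	}
}
```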

@@ -28,7 +28,7 @@ import (
"strings"
)
var cmd, executor, url, clustersConfig, clusterName, updateId, username, password, zkUrl, drainCandidates string
var cmd, executor, url, clustersConfig, clusterName, updateId, username, password, zkUrl, hostList string
var CONNECTION_TIMEOUT = 20000
@@ -42,7 +42,7 @@ func init() {
flag.StringVar(&username, "username", "aurora", "Username to use for authorization")
flag.StringVar(&password, "password", "secret", "Password to use for authorization")
flag.StringVar(&zkUrl, "zkurl", "", "zookeeper url")
flag.StringVar(&drainCandidates, "drainCandidates", "", "Comma separated list of candidate hosts to drain")
flag.StringVar(&hostList, "hostList", "", "Comma separated list of hosts to operate on")
flag.Parse()
}
@@ -501,11 +501,11 @@ func main() {
case "drainHosts":
fmt.Println("Setting hosts to DRAINING")
if drainCandidates == "" {
if hostList == "" {
fmt.Println("No hosts specified to drain")
os.Exit(1)
}
hosts := strings.Split(drainCandidates, ",")
hosts := strings.Split(hostList, ",")
_, result, err := r.DrainHosts(hosts...)
if err != nil {
fmt.Printf("error: %+v\n", err.Error())
@@ -513,6 +513,20 @@ func main() {
}
fmt.Print(result.String())
case "endMaintenance":
fmt.Println("Setting hosts to ACTIVE")
if hostList == "" {
fmt.Println("No hosts specified to drain")
os.Exit(1)
}
hosts := strings.Split(hostList, ",")
_, result, err := r.EndMaintenance(hosts...)
if err != nil {
fmt.Printf("error: %+v\n", err.Error())
os.Exit(1)
}
fmt.Print(result.String())
default:
fmt.Println("Command not supported")
os.Exit(1)

@@ -59,6 +59,7 @@ type Realis interface {
// Admin functions
DrainHosts(hosts ...string) (*aurora.Response, *aurora.DrainHostsResult_, error)
EndMaintenance(hosts ...string) (*aurora.Response, *aurora.EndMaintenanceResult_, error)
}
type realisClient struct {
@@ -1164,7 +1165,7 @@ func (r *realisClient) DrainHosts(hosts ...string) (*aurora.Response, *aurora.Dr
var resp *aurora.Response
var result *aurora.DrainHostsResult_
var clientErr, payloadErr error
var returnErr, clientErr, payloadErr error
if len(hosts) == 0 {
return nil, nil, errors.New("no hosts provided to drain")
@@ -1206,11 +1207,81 @@ func (r *realisClient) DrainHosts(hosts ...string) (*aurora.Response, *aurora.Dr
result = resp.GetResult_().GetDrainHostsResult_()
}
// Prioritize returning a bad payload error over a client error as a bad payload error indicates
// a deeper issue
if payloadErr != nil {
returnErr = payloadErr
} else {
returnErr = clientErr
}
// Timed out on retries. Note that when we fix the unexpected errors that occur with a correct payload,
// this can become either a timeout error or a payload error
if retryErr != nil {
return resp, result, errors.Wrap(clientErr, "Unable to recover connection")
return resp, result, errors.Wrap(returnErr, "Unable to recover connection")
}
return resp, result, nil
}
func (r *realisClient) EndMaintenance(hosts ...string) (*aurora.Response, *aurora.EndMaintenanceResult_, error) {
var resp *aurora.Response
var result *aurora.EndMaintenanceResult_
var returnErr, clientErr, payloadErr error
if len(hosts) == 0 {
return nil, nil, errors.New("no hosts provided to end maintenance on")
}
hostList := aurora.NewHosts()
hostList.HostNames = make(map[string]bool)
for _, host := range hosts {
hostList.HostNames[host] = true
}
retryErr := ExponentialBackoff(defaultBackoff, func() (bool, error) {
// Send the thrift call; if we hit a thrift send error, attempt to reconnect
// and continue trying to resend the command
if resp, clientErr = r.adminClient.EndMaintenance(hostList); clientErr != nil {
// Experienced a connection error
err1 := r.ReestablishConn()
if err1 != nil {
fmt.Println("error in re-establishing connection: ", err1)
}
return false, nil
}
// If error is NOT due to connection
if _, payloadErr = response.ResponseCodeCheck(resp); payloadErr != nil {
// TODO(rdelvalle): a leader election may cause the response to have
// failed when it should have succeeded. Retry everything for now until
// we figure out a more concrete fix.
return false, nil
}
// Successful call
return true, nil
})
if resp != nil && resp.GetResult_() != nil {
result = resp.GetResult_().GetEndMaintenanceResult_()
}
// Prioritize returning a bad payload error over a client error as a bad payload error indicates
// a deeper issue
if payloadErr != nil {
returnErr = payloadErr
} else {
returnErr = clientErr
}
// Timed out on retries. Note that when we fix the unexpected errors that occur with a correct payload,
// this can become either a timeout error or a payload error
if retryErr != nil {
return resp, result, errors.Wrap(returnErr, "Unable to recover connection")
}
return resp, result, nil
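
The bug fix shared by DrainHosts and EndMaintenance comes down to which stored error gets wrapped once the retries are exhausted. A standalone sketch of that prioritization, with a hypothetical helper name that is not part of this commit:

```go
package example

import "errors"

// prioritizeErr illustrates the fix: a bad payload indicates a deeper problem
// than a failed connection, so the payload error wins when both were recorded
// during the retry loop.
func prioritizeErr(payloadErr, clientErr error) error {
	if payloadErr != nil {
		return payloadErr
	}
	return clientErr
}

func pickReturnErr() error {
	clientErr := errors.New("thrift send failed")
	payloadErr := errors.New("response code was not OK")

	// Before this commit, clientErr was always the wrapped error, so a bad
	// payload was silently reported as a connection problem.
	return prioritizeErr(payloadErr, clientErr) // returns the payload error
}
```

Returning the payload error first keeps the caller's error message pointing at the request itself rather than at the transport, which is the bad-payload fix described in the commit message.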