Added an end maintenance API which allows DRAINED hosts to be transitioned back to ACTIVE. Fixed a bug where a payload error would never be returned if a call failed due to a bad payload.

Renan DelValle 2017-09-27 12:55:50 -07:00
parent f59f0bbdc3
commit 8fe3780949
2 changed files with 91 additions and 6 deletions
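
For context, a minimal sketch of how a caller might use the new EndMaintenance call to transition DRAINED hosts back to ACTIVE. It assumes a client r that satisfies the Realis interface has already been constructed, and the import path is an assumption rather than part of this commit:

```go
package example

import (
	"fmt"
	"log"

	realis "github.com/rdelvalle/gorealis" // assumed import path for this repository
)

// endMaintenanceExample transitions the given hosts from DRAINED back to
// ACTIVE using the new EndMaintenance API. The client r is assumed to have
// been created elsewhere (e.g. from a cluster config or a scheduler URL).
func endMaintenanceExample(r realis.Realis, hosts []string) {
	// EndMaintenance mirrors DrainHosts: it returns the raw thrift response,
	// a typed result struct, and an error that now prioritizes payload errors
	// over connection errors.
	resp, result, err := r.EndMaintenance(hosts...)
	if err != nil {
		log.Fatalf("unable to end maintenance: %v", err)
	}
	if resp != nil {
		fmt.Println(resp.String())
	}
	if result != nil {
		fmt.Println(result.String())
	}
}
```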

@@ -28,7 +28,7 @@ import (
"strings"
)
var cmd, executor, url, clustersConfig, clusterName, updateId, username, password, zkUrl, drainCandidates string
var cmd, executor, url, clustersConfig, clusterName, updateId, username, password, zkUrl, hostList string
var CONNECTION_TIMEOUT = 20000
@@ -42,7 +42,7 @@ func init() {
flag.StringVar(&username, "username", "aurora", "Username to use for authorization")
flag.StringVar(&password, "password", "secret", "Password to use for authorization")
flag.StringVar(&zkUrl, "zkurl", "", "zookeeper url")
flag.StringVar(&drainCandidates, "drainCandidates", "", "Comma separated list of candidate hosts to drain")
flag.StringVar(&hostList, "hostList", "", "Comma separated list of hosts to operate on")
flag.Parse()
}
@@ -501,11 +501,11 @@ func main() {
case "drainHosts":
fmt.Println("Setting hosts to DRAINING")
if drainCandidates == "" {
if hostList == "" {
fmt.Println("No hosts specified to drain")
os.Exit(1)
}
hosts := strings.Split(drainCandidates, ",")
hosts := strings.Split(hostList, ",")
_, result, err := r.DrainHosts(hosts...)
if err != nil {
fmt.Printf("error: %+v\n", err.Error())
@@ -513,6 +513,20 @@ func main() {
}
fmt.Print(result.String())
case "endMaintenance":
fmt.Println("Setting hosts to ACTIVE")
if hostList == "" {
fmt.Println("No hosts specified to drain")
os.Exit(1)
}
hosts := strings.Split(hostList, ",")
_, result, err := r.EndMaintenance(hosts...)
if err != nil {
fmt.Printf("error: %+v\n", err.Error())
os.Exit(1)
}
fmt.Print(result.String())
default:
fmt.Println("Command not supported")
os.Exit(1)

@@ -59,6 +59,7 @@ type Realis interface {
// Admin functions
DrainHosts(hosts ...string) (*aurora.Response, *aurora.DrainHostsResult_, error)
EndMaintenance(hosts ...string) (*aurora.Response, *aurora.EndMaintenanceResult_, error)
}
type realisClient struct {
@@ -1164,7 +1165,7 @@ func (r *realisClient) DrainHosts(hosts ...string) (*aurora.Response, *aurora.Dr
var resp *aurora.Response
var result *aurora.DrainHostsResult_
var clientErr, payloadErr error
var returnErr, clientErr, payloadErr error
if len(hosts) == 0 {
return nil, nil, errors.New("no hosts provided to drain")
@@ -1206,11 +1207,81 @@ func (r *realisClient) DrainHosts(hosts ...string) (*aurora.Response, *aurora.Dr
result = resp.GetResult_().GetDrainHostsResult_()
}
// Prioritize returning a bad payload error over a client error as a bad payload error indicates
// a deeper issue
if payloadErr != nil {
returnErr = payloadErr
} else {
returnErr = clientErr
}
// Timed out on retries. Note that when we fix the unexpected errors that occur with a correct payload,
// this can become either a timeout error or a payload error
if retryErr != nil {
return resp, result, errors.Wrap(clientErr, "Unable to recover connection")
return resp, result, errors.Wrap(returnErr, "Unable to recover connection")
}
return resp, result, nil
}
func (r *realisClient) EndMaintenance(hosts ...string) (*aurora.Response, *aurora.EndMaintenanceResult_, error) {
var resp *aurora.Response
var result *aurora.EndMaintenanceResult_
var returnErr, clientErr, payloadErr error
if len(hosts) == 0 {
return nil, nil, errors.New("no hosts provided to end maintenance on")
}
hostList := aurora.NewHosts()
hostList.HostNames = make(map[string]bool)
for _, host := range hosts {
hostList.HostNames[host] = true
}
retryErr := ExponentialBackoff(defaultBackoff, func() (bool, error) {
// Send the thrift call; if we hit a thrift send error, attempt to reconnect
// and continue trying to resend the command
if resp, clientErr = r.adminClient.EndMaintenance(hostList); clientErr != nil {
// Experienced a connection error
err1 := r.ReestablishConn()
if err1 != nil {
fmt.Println("error in re-establishing connection: ", err1)
}
return false, nil
}
// If error is NOT due to connection
if _, payloadErr = response.ResponseCodeCheck(resp); payloadErr != nil {
// TODO(rdelvalle): a leader election may cause the response to have
// failed when it should have succeeded. Retry everything for now until
// we figure out a more concrete fix.
return false, nil
}
// Successful call
return true, nil
})
if resp != nil && resp.GetResult_() != nil {
result = resp.GetResult_().GetEndMaintenanceResult_()
}
// Prioritize returning a bad payload error over a client error as a bad payload error indicates
// a deeper issue
if payloadErr != nil {
returnErr = payloadErr
} else {
returnErr = clientErr
}
// Timed out on retries. Note that when we fix the unexpected errors that occur with a correct payload,
// this can become either a timeout error or a payload error
if retryErr != nil {
return resp, result, errors.Wrap(returnErr, "Unable to recover connection")
}
return resp, result, nil
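
The bug fix shared by DrainHosts and EndMaintenance comes down to which stored error gets wrapped once the retries are exhausted. A standalone sketch of that prioritization, with a hypothetical helper name that is not part of this commit:

```go
package example

import "errors"

// prioritizeErr illustrates the fix: a bad payload indicates a deeper problem
// than a failed connection, so the payload error wins when both were recorded
// during the retry loop.
func prioritizeErr(payloadErr, clientErr error) error {
	if payloadErr != nil {
		return payloadErr
	}
	return clientErr
}

func pickReturnErr() error {
	clientErr := errors.New("thrift send failed")
	payloadErr := errors.New("response code was not OK")

	// Before this commit, clientErr was always the wrapped error, so a bad
	// payload was silently reported as a connection problem.
	return prioritizeErr(payloadErr, clientErr) // returns the payload error
}
```

Returning the payload error first keeps the caller's error message pointing at the request itself rather than at the transport, which is the bad-payload fix described in the commit message.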