remove exponential backoffs from monitors
parent 7538f366ea
commit 0e12d273af

4 changed files with 229 additions and 251 deletions
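The gist of the change: the JobUpdate and Instances monitors in monitors.go no longer wrap their status checks in ExponentialBackoff; they poll on a fixed interval with an overall deadline instead, while the plain client calls in realis.go keep ExponentialBackoff but route every thrift call through CheckAndRetryConn. A minimal, self-contained sketch of the interval-plus-deadline polling shape the monitors adopt (pollUntil and its arguments are illustrative names, not gorealis APIs):

package main

import (
	"errors"
	"fmt"
	"time"
)

// pollUntil calls check every `interval` seconds until it reports done or
// `timeout` seconds elapse. It mirrors the loop shape the monitors switch to.
func pollUntil(check func() (bool, error), interval, timeout int) (bool, error) {
	ticker := time.NewTicker(time.Second * time.Duration(interval))
	defer ticker.Stop()
	timer := time.NewTimer(time.Second * time.Duration(timeout))
	defer timer.Stop()

	for {
		select {
		case <-ticker.C:
			done, err := check()
			if err != nil {
				return false, err // hard failure: stop polling immediately
			}
			if done {
				return true, nil
			}
		case <-timer.C:
			return false, errors.New("timeout") // deadline reached before success
		}
	}
}

func main() {
	polls := 0
	ok, err := pollUntil(func() (bool, error) {
		polls++
		return polls >= 3, nil // pretend the third poll observes the desired state
	}, 1, 10)
	fmt.Println(ok, err)
}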
.gitignore (vendored): 1 changed line

@@ -6,6 +6,7 @@
 # Folders
 _obj
 _test
+.idea

 # Architecture specific extensions/prefixes
 *.[568vq]
monitors.go: 130 changed lines
@@ -17,7 +17,6 @@ package realis

 import (
 	"fmt"
-	"strings"
 	"time"

 	"github.com/paypal/gorealis/gen-go/apache/aurora"
@@ -28,6 +27,7 @@ import (
 const (
 	UpdateFailed = "update failed"
 	RolledBack   = "update rolled back"
+	Timeout      = "timeout"
 )

 type Monitor struct {
@@ -41,84 +41,86 @@ func (m *Monitor) JobUpdate(updateKey aurora.JobUpdateKey, interval int, timeout
 		Key:   &updateKey,
 		Limit: 1,
 	}

+	ticker := time.NewTicker(time.Second * time.Duration(interval))
+	defer ticker.Stop()
+	timer := time.NewTimer(time.Second * time.Duration(timeout))
+	defer timer.Stop()
 	var cliErr error
 	var respDetail *aurora.Response

-	retryErr := ExponentialBackoff(*m.Client.RealisConfig().backoff, func() (bool, error) {
-		respDetail, cliErr = CheckAndRetryConn(m.Client, func() (*aurora.Response, error) {
-			return m.Client.JobUpdateDetails(updateQ)
-		})
-		if cliErr == RetryConnErr {
-			return false, nil
-		} else {
-			return false, cliErr
-		}
-		updateDetail := response.JobUpdateDetails(respDetail)
-
-		if len(updateDetail) == 0 {
-			fmt.Println("No update found")
-			return false, errors.New("No update found for " + updateKey.String())
-		}
-		status := updateDetail[0].Update.Summary.State.Status
-
-		if _, ok := aurora.ACTIVE_JOB_UPDATE_STATES[status]; !ok {
-
-			// Rolled forward is the only state in which an update has been successfully updated
-			// if we encounter an inactive state and it is not at rolled forward, update failed
-			switch status {
-			case aurora.JobUpdateStatus_ROLLED_FORWARD:
-				fmt.Println("Update succeded")
-				return true, nil
-			case aurora.JobUpdateStatus_FAILED:
-				fmt.Println("Update failed")
-				return false, errors.New(UpdateFailed)
-			case aurora.JobUpdateStatus_ROLLED_BACK:
-				fmt.Println("rolled back")
-				return false, errors.New(RolledBack)
-			default:
-				return false, nil
-			}
-		}
-		return false, nil
-	})
-	if retryErr != nil {
-		return false, errors.Wrap(cliErr, retryErr.Error())
-	}
-	return true, nil
+	timedout := false
+	for {
+		select {
+		case <-ticker.C:
+			respDetail, cliErr = m.Client.JobUpdateDetails(updateQ)
+			if cliErr != nil {
+				return false, cliErr
+			}
+
+			updateDetail := response.JobUpdateDetails(respDetail)
+
+			if len(updateDetail) == 0 {
+				fmt.Println("No update found")
+				return false, errors.New("No update found for " + updateKey.String())
+			}
+			status := updateDetail[0].Update.Summary.State.Status
+
+			if _, ok := aurora.ACTIVE_JOB_UPDATE_STATES[status]; !ok {
+
+				// Rolled forward is the only state in which an update has been successfully updated
+				// if we encounter an inactive state and it is not at rolled forward, update failed
+				switch status {
+				case aurora.JobUpdateStatus_ROLLED_FORWARD:
+					fmt.Println("Update succeded")
+					return true, nil
+				case aurora.JobUpdateStatus_FAILED:
+					fmt.Println("Update failed")
+					return false, errors.New(UpdateFailed)
+				case aurora.JobUpdateStatus_ROLLED_BACK:
+					fmt.Println("rolled back")
+					return false, errors.New(RolledBack)
+				default:
+					return false, nil
+				}
+			}
+		case <-timer.C:
+			timedout = true
+		}
+		if timedout {
+			break
+		}
+	}
+	return false, errors.New(Timeout)
 }

 func (m *Monitor) Instances(key *aurora.JobKey, instances int32, interval int, timeout int) (bool, error) {

 	var cliErr error
 	var live map[int32]bool
+	ticker := time.NewTicker(time.Second * time.Duration(interval))
+	defer ticker.Stop()
+	timer := time.NewTimer(time.Second * time.Duration(timeout))
+	defer timer.Stop()

-	retryErr := ExponentialBackoff(*m.Client.RealisConfig().backoff, func() (bool, error) {
-		live, cliErr = m.Client.GetInstanceIds(key, aurora.LIVE_STATES)
-		if strings.Contains(cliErr.Error(), ConnRefusedErr) || strings.Contains(cliErr.Error(), NoLeaderFoundErr) {
-			// TODO try this condition only if the error is connection related
-			conErr := m.Client.ReestablishConn()
-			if conErr != nil {
-				// TODO: identify specific type of connection errors
-				return false, nil
-			}
-		}
-		if cliErr != nil {
-			return false, errors.Wrap(cliErr, "Unable to communicate with Aurora")
-		}
-		if len(live) == int(instances) {
-			return true, nil
-		}
-		return false, nil
-	})
-	if cliErr != nil {
-		return false, cliErr
-	}
-	if retryErr != nil {
-		return false, retryErr
-	}
-	return true, nil
+	timedout := false
+	for {
+		select {
+		case <-ticker.C:
+			live, cliErr = m.Client.GetInstanceIds(key, aurora.LIVE_STATES)
+
+			if cliErr != nil {
+				return false, errors.Wrap(cliErr, "Unable to communicate with Aurora")
+			}
+			if len(live) == int(instances) {
+				return true, nil
+			}
+		case <-timer.C:
+			timedout = true
+		}
+		if timedout {
+			break
+		}
+	}
+	return false, errors.New(Timeout)
 }

 // Monitor host status until all hosts match the status provided. Returns a map where the value is true if the host
@@ -174,5 +176,5 @@ func (m *Monitor) HostMaintenance(hosts []string, modes []aurora.MaintenanceMode
 		hostResult[host] = false
 	}

-	return hostResult, errors.New("Timed out")
+	return hostResult, errors.New(Timeout)
 }
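For reference, a monitor is driven by an existing client. A usage sketch, under the assumption that the client comes from NewDefaultClientUsingZKUrl (shown in realis.go below) and that Monitor is built with the client as its only field; the ZooKeeper address, credentials, and job key values are placeholders:

package main

import (
	"fmt"
	"log"

	realis "github.com/paypal/gorealis"
	"github.com/paypal/gorealis/gen-go/apache/aurora"
)

func main() {
	// Placeholder ZooKeeper address and credentials.
	r, err := realis.NewDefaultClientUsingZKUrl("192.168.33.7:2181", "aurora", "secret")
	if err != nil {
		log.Fatal(err)
	}
	defer r.Close()

	monitor := &realis.Monitor{Client: r}

	// Poll every 5 seconds, for at most 50 seconds, until one live instance is seen.
	ok, err := monitor.Instances(&aurora.JobKey{
		Role:        "vagrant", // placeholder job key
		Environment: "prod",
		Name:        "hello_world",
	}, 1, 5, 50)
	fmt.Println(ok, err)
}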
realis.go: 334 changed lines
@@ -16,13 +16,13 @@
 package realis

 import (
+	"crypto/tls"
 	"encoding/base64"
+	"fmt"
 	"net/http"
 	"net/http/cookiejar"
 	"time"

-	"fmt"
-
 	"math/rand"

 	"git.apache.org/thrift.git/lib/go/thrift"
@@ -307,7 +307,7 @@ func GetDefaultClusterFromZKUrl(zkurl string) *Cluster {
 //This api would create default cluster object..
 func NewDefaultClientUsingZKUrl(zkUrl, user, passwd string) (Realis, error) {

-	fmt.Println(" zkUrl: %s", zkUrl)
+	fmt.Printf(" zkUrl: %s\n", zkUrl)
 	cluster := GetDefaultClusterFromZKUrl(zkUrl)

 	url, err := LeaderFromZK(*cluster)
@@ -369,9 +369,11 @@ func defaultTTransport(urlstr string, timeoutms int) (thrift.TTransport, error)
 	if err != nil {
 		return &thrift.THttpClient{}, errors.Wrap(err, "Error creating Cookie Jar")
 	}

+	transport := &http.Transport{
+		TLSClientConfig: &tls.Config{InsecureSkipVerify: true},
+	}
 	trans, err := thrift.NewTHttpPostClientWithOptions(urlstr+"/api",
-		thrift.THttpClientOptions{Client: &http.Client{Timeout: time.Millisecond * time.Duration(timeoutms), Jar: jar}})
+		thrift.THttpClientOptions{Client: &http.Client{Timeout: time.Millisecond * time.Duration(timeoutms), Transport: transport, Jar: jar}})

 	if err != nil {
 		return &thrift.THttpClient{}, errors.Wrap(err, "Error creating transport")
@@ -527,9 +529,10 @@ func (r *realisClient) GetInstanceIds(key *aurora.JobKey, states map[aurora.Sche
 		resp, clientErr = CheckAndRetryConn(r, func() (*aurora.Response, error) {
 			return r.client.GetTasksWithoutConfigs(taskQ)
 		})
-		if clientErr == RetryConnErr {
+		if clientErr != nil && clientErr.Error() == RetryConnErr.Error() {
 			return false, nil
-		} else {
+		}
+		if clientErr != nil {
 			return false, clientErr
 		}
 		return true, nil
@@ -559,9 +562,10 @@ func (r *realisClient) GetJobUpdateSummaries(jobUpdateQuery *aurora.JobUpdateQue
 		resp, clientErr = CheckAndRetryConn(r, func() (*aurora.Response, error) {
 			return r.readonlyClient.GetJobUpdateSummaries(jobUpdateQuery)
 		})
-		if clientErr == RetryConnErr {
+		if clientErr != nil && clientErr.Error() == RetryConnErr.Error() {
 			return false, nil
-		} else {
+		}
+		if clientErr != nil {
 			return false, clientErr
 		}
 		return true, nil
@@ -589,9 +593,10 @@ func (r *realisClient) KillInstances(key *aurora.JobKey, instances ...int32) (*a
 		resp, clientErr = CheckAndRetryConn(r, func() (*aurora.Response, error) {
 			return r.client.KillTasks(key, instanceIds)
 		})
-		if clientErr == RetryConnErr {
+		if clientErr != nil && clientErr.Error() == RetryConnErr.Error() {
 			return false, nil
-		} else {
+		}
+		if clientErr != nil {
 			return false, clientErr
 		}
 		return true, nil
@@ -623,9 +628,10 @@ func (r *realisClient) KillJob(key *aurora.JobKey) (*aurora.Response, error) {
 		resp, clientErr = CheckAndRetryConn(r, func() (*aurora.Response, error) {
 			return r.client.KillTasks(key, instanceIds)
 		})
-		if clientErr == RetryConnErr {
+		if clientErr != nil && clientErr.Error() == RetryConnErr.Error() {
 			return false, nil
-		} else {
+		}
+		if clientErr != nil {
 			return false, clientErr
 		}
 		return true, nil
@@ -647,9 +653,10 @@ func (r *realisClient) CreateJob(auroraJob Job) (*aurora.Response, error) {
 		resp, clientErr = CheckAndRetryConn(r, func() (*aurora.Response, error) {
 			return r.client.CreateJob(auroraJob.JobConfig())
 		})
-		if clientErr == RetryConnErr {
+		if clientErr != nil && clientErr.Error() == RetryConnErr.Error() {
 			return false, nil
-		} else {
+		}
+		if clientErr != nil {
 			return false, clientErr
 		}
 		return true, nil
@@ -670,9 +677,10 @@ func (r *realisClient) ScheduleCronJob(auroraJob Job) (*aurora.Response, error)
 		resp, clientErr = CheckAndRetryConn(r, func() (*aurora.Response, error) {
 			return r.client.ScheduleCronJob(auroraJob.JobConfig())
 		})
-		if clientErr == RetryConnErr {
+		if clientErr != nil && clientErr.Error() == RetryConnErr.Error() {
 			return false, nil
-		} else {
+		}
+		if clientErr != nil {
 			return false, clientErr
 		}
 		return true, nil
@@ -693,9 +701,10 @@ func (r *realisClient) DescheduleCronJob(key *aurora.JobKey) (*aurora.Response,
 		resp, clientErr = CheckAndRetryConn(r, func() (*aurora.Response, error) {
 			return r.client.DescheduleCronJob(key)
 		})
-		if clientErr == RetryConnErr {
+		if clientErr != nil && clientErr.Error() == RetryConnErr.Error() {
 			return false, nil
-		} else {
+		}
+		if clientErr != nil {
 			return false, clientErr
 		}
 		return true, nil
@@ -716,9 +725,10 @@ func (r *realisClient) StartCronJob(key *aurora.JobKey) (*aurora.Response, error
 		resp, clientErr = CheckAndRetryConn(r, func() (*aurora.Response, error) {
 			return r.client.StartCronJob(key)
 		})
-		if clientErr == RetryConnErr {
+		if clientErr != nil && clientErr.Error() == RetryConnErr.Error() {
 			return false, nil
-		} else {
+		}
+		if clientErr != nil {
 			return false, clientErr
 		}
 		return true, nil
@@ -745,9 +755,10 @@ func (r *realisClient) RestartInstances(key *aurora.JobKey, instances ...int32)
 		resp, clientErr = CheckAndRetryConn(r, func() (*aurora.Response, error) {
 			return r.client.RestartShards(key, instanceIds)
 		})
-		if clientErr == RetryConnErr {
+		if clientErr != nil && clientErr.Error() == RetryConnErr.Error() {
 			return false, nil
-		} else {
+		}
+		if clientErr != nil {
 			return false, clientErr
 		}
 		return true, nil
@@ -773,9 +784,10 @@ func (r *realisClient) RestartJob(key *aurora.JobKey) (*aurora.Response, error)
 		resp, clientErr = CheckAndRetryConn(r, func() (*aurora.Response, error) {
 			return r.client.RestartShards(key, instanceIds)
 		})
-		if clientErr == RetryConnErr {
+		if clientErr != nil && clientErr.Error() == RetryConnErr.Error() {
 			return false, nil
-		} else {
+		}
+		if clientErr != nil {
 			return false, clientErr
 		}
 		return true, nil
@@ -800,9 +812,10 @@ func (r *realisClient) StartJobUpdate(updateJob *UpdateJob, message string) (*au
 		resp, clientErr = CheckAndRetryConn(r, func() (*aurora.Response, error) {
 			return r.client.StartJobUpdate(updateJob.req, message)
 		})
-		if clientErr == RetryConnErr {
+		if clientErr != nil && clientErr.Error() == RetryConnErr.Error() {
 			return false, nil
-		} else {
+		}
+		if clientErr != nil {
 			return false, clientErr
 		}
 		return true, nil
@@ -819,20 +832,23 @@ func (r *realisClient) AbortJobUpdate(
 	updateKey aurora.JobUpdateKey,
 	message string) (*aurora.Response, error) {
 	var resp *aurora.Response
-	var err error
-	defaultBackoff := r.config.backoff
-	xerr := ExponentialBackoff(*defaultBackoff, func() (bool, error) {
-		resp, err = r.client.AbortJobUpdate(&updateKey, message)
-		if err != nil {
-			err1 := r.ReestablishConn()
-			if err1 != nil {
-				return false, nil
-			}
+	var clientErr error
+
+	retryErr := ExponentialBackoff(*r.config.backoff, func() (bool, error) {
+		resp, clientErr = CheckAndRetryConn(r, func() (*aurora.Response, error) {
+			return r.client.AbortJobUpdate(&updateKey, message)
+		})
+		if clientErr != nil && clientErr.Error() == RetryConnErr.Error() {
+			return false, nil
+		}
+		if clientErr != nil {
+			return false, clientErr
 		}
 		return true, nil
 	})
-	if err != nil {
-		return nil, errors.Wrap(err, xerr.Error()+"Error sending AbortJobUpdate command to Aurora Scheduler")
+
+	if clientErr != nil {
+		return nil, errors.Wrap(clientErr, retryErr.Error()+"Error sending AbortJobUpdate command to Aurora Scheduler")
 	}
 	return response.ResponseCodeCheck(resp)
 }
@@ -842,21 +858,23 @@ func (r *realisClient) AbortJobUpdate(
 func (r *realisClient) AddInstances(instKey aurora.InstanceKey, count int32) (*aurora.Response, error) {

 	var resp *aurora.Response
-	var err error
-	defaultBackoff := r.config.backoff
-	xerr := ExponentialBackoff(*defaultBackoff, func() (bool, error) {
-		resp, err = r.client.AddInstances(&instKey, count)
-		if err != nil {
-			err1 := r.ReestablishConn()
-			if err1 != nil {
-				return false, nil
-			}
+	var clientErr error
+
+	retryErr := ExponentialBackoff(*r.config.backoff, func() (bool, error) {
+		resp, clientErr = CheckAndRetryConn(r, func() (*aurora.Response, error) {
+			return r.client.AddInstances(&instKey, count)
+		})
+		if clientErr != nil && clientErr.Error() == RetryConnErr.Error() {
+			return false, nil
+		}
+		if clientErr != nil {
+			return false, clientErr
 		}
 		return true, nil
 	})
-	if err != nil {
-		return nil, errors.Wrap(err, xerr.Error()+"Error sending AddInstances command to Aurora Scheduler")
+
+	if clientErr != nil {
+		return nil, errors.Wrap(clientErr, retryErr.Error()+"Error sending AddInstances command to Aurora Scheduler")
 	}
 	return response.ResponseCodeCheck(resp)
@@ -888,23 +906,23 @@ func (r *realisClient) RemoveInstances(key *aurora.JobKey, count int32) (*aurora
 func (r *realisClient) GetTaskStatus(query *aurora.TaskQuery) (tasks []*aurora.ScheduledTask, e error) {

 	var resp *aurora.Response
-	var err error
-	defaultBackoff := r.config.backoff
-	xerr := ExponentialBackoff(*defaultBackoff, func() (bool, error) {
-	start:
-		resp, err = r.client.GetTasksStatus(query)
-		if err != nil {
-			err1 := r.ReestablishConn()
-			if err1 != nil {
-				return false, nil
-			}
-			goto start
+	var clientErr error
+
+	retryErr := ExponentialBackoff(*r.config.backoff, func() (bool, error) {
+		resp, clientErr = CheckAndRetryConn(r, func() (*aurora.Response, error) {
+			return r.client.GetTasksStatus(query)
+		})
+		if clientErr != nil && clientErr.Error() == RetryConnErr.Error() {
+			return false, nil
+		}
+		if clientErr != nil {
+			return false, clientErr
 		}
 		return true, nil
 	})
-	if err != nil {
-		return nil, errors.Wrap(err, xerr.Error()+"Error querying Aurora Scheduler for task status")
+
+	if clientErr != nil {
+		return nil, errors.Wrap(clientErr, retryErr.Error()+"Error querying Aurora Scheduler for task status")
 	}
 	//Check for response code..
 	if resp.GetResponseCode() != aurora.ResponseCode_OK {
@@ -917,23 +935,23 @@ func (r *realisClient) GetTaskStatus(query *aurora.TaskQuery) (tasks []*aurora.S
 // Get information about task including without a task configuration object
 func (r *realisClient) GetTasksWithoutConfigs(query *aurora.TaskQuery) (tasks []*aurora.ScheduledTask, e error) {
 	var resp *aurora.Response
-	var err error
-	defaultBackoff := r.config.backoff
-	xerr := ExponentialBackoff(*defaultBackoff, func() (bool, error) {
-	start:
-		resp, err = r.client.GetTasksWithoutConfigs(query)
-		if err != nil {
-			err1 := r.ReestablishConn()
-			if err1 != nil {
-				return false, nil
-			}
-			goto start
+	var clientErr error
+
+	retryErr := ExponentialBackoff(*r.config.backoff, func() (bool, error) {
+		resp, clientErr = CheckAndRetryConn(r, func() (*aurora.Response, error) {
+			return r.client.GetTasksWithoutConfigs(query)
+		})
+		if clientErr != nil && clientErr.Error() == RetryConnErr.Error() {
+			return false, nil
+		}
+		if clientErr != nil {
+			return false, clientErr
 		}
 		return true, nil
 	})
-	if err != nil {
-		return nil, errors.Wrap(err, xerr.Error()+"Error querying Aurora Scheduler for task status without configs")
+
+	if clientErr != nil {
+		return nil, errors.Wrap(clientErr, retryErr.Error()+"Error querying Aurora Scheduler for task status without configs")
 	}
 	//Check for response code..
 	if resp.GetResponseCode() != aurora.ResponseCode_OK {
@@ -956,21 +974,23 @@ func (r *realisClient) FetchTaskConfig(instKey aurora.InstanceKey) (*aurora.Task
 		Statuses: aurora.ACTIVE_STATES}

 	var resp *aurora.Response
-	var err error
-	defaultBackoff := r.config.backoff
-	xerr := ExponentialBackoff(*defaultBackoff, func() (bool, error) {
-		resp, err = r.client.GetTasksStatus(taskQ)
-		if err != nil {
-			err1 := r.ReestablishConn()
-			if err1 != nil {
-				return false, nil
-			}
+	var clientErr error
+
+	retryErr := ExponentialBackoff(*r.config.backoff, func() (bool, error) {
+		resp, clientErr = CheckAndRetryConn(r, func() (*aurora.Response, error) {
+			return r.client.GetTasksStatus(taskQ)
+		})
+		if clientErr != nil && clientErr.Error() == RetryConnErr.Error() {
+			return false, nil
+		}
+		if clientErr != nil {
+			return false, clientErr
 		}
 		return true, nil
 	})
-	if err != nil {
-		return nil, errors.Wrap(err, xerr.Error()+"Error querying Aurora Scheduler for task configuration")
+
+	if clientErr != nil {
+		return nil, errors.Wrap(clientErr, retryErr.Error()+"Error querying Aurora Scheduler for task configuration")
 	}
 	//Check for response code..
 	if resp.GetResponseCode() != aurora.ResponseCode_OK {
@@ -994,21 +1014,23 @@ func (r *realisClient) FetchTaskConfig(instKey aurora.InstanceKey) (*aurora.Task
 func (r *realisClient) JobUpdateDetails(updateQuery aurora.JobUpdateQuery) (*aurora.Response, error) {

 	var resp *aurora.Response
-	var err error
-	defaultBackoff := r.config.backoff
-	xerr := ExponentialBackoff(*defaultBackoff, func() (bool, error) {
-		resp, err = r.client.GetJobUpdateDetails(&updateQuery)
-		if err != nil {
-			err1 := r.ReestablishConn()
-			if err1 != nil {
-				return false, nil
-			}
+	var clientErr error
+
+	retryErr := ExponentialBackoff(*r.config.backoff, func() (bool, error) {
+		resp, clientErr = CheckAndRetryConn(r, func() (*aurora.Response, error) {
+			return r.client.GetJobUpdateDetails(&updateQuery)
+		})
+		if clientErr != nil && clientErr.Error() == RetryConnErr.Error() {
+			return false, nil
+		}
+		if clientErr != nil {
+			return false, clientErr
 		}
 		return true, nil
 	})
-	if err != nil {
-		return nil, errors.Wrap(err, xerr.Error()+"Unable to get job update details")
+
+	if clientErr != nil {
+		return nil, errors.Wrap(clientErr, retryErr.Error()+"Unable to get job update details")
 	}
 	return response.ResponseCodeCheck(resp)
@@ -1016,21 +1038,23 @@ func (r *realisClient) JobUpdateDetails(updateQuery aurora.JobUpdateQuery) (*aur

 func (r *realisClient) RollbackJobUpdate(key aurora.JobUpdateKey, message string) (*aurora.Response, error) {
 	var resp *aurora.Response
-	var err error
-	defaultBackoff := r.config.backoff
-	xerr := ExponentialBackoff(*defaultBackoff, func() (bool, error) {
-		resp, err = r.client.RollbackJobUpdate(&key, message)
-		if err != nil {
-			err1 := r.ReestablishConn()
-			if err1 != nil {
-				return false, nil
-			}
+	var clientErr error
+
+	retryErr := ExponentialBackoff(*r.config.backoff, func() (bool, error) {
+		resp, clientErr = CheckAndRetryConn(r, func() (*aurora.Response, error) {
+			return r.client.RollbackJobUpdate(&key, message)
+		})
+		if clientErr != nil && clientErr.Error() == RetryConnErr.Error() {
+			return false, nil
+		}
+		if clientErr != nil {
+			return false, clientErr
 		}
 		return true, nil
 	})
-	if err != nil {
-		return nil, errors.Wrap(err, xerr.Error()+"Unable to roll back job update")
+
+	if clientErr != nil {
+		return nil, errors.Wrap(clientErr, retryErr.Error()+"Unable to roll back job update")
 	}
 	return response.ResponseCodeCheck(resp)
 }
@@ -1042,7 +1066,7 @@ func (r *realisClient) DrainHosts(hosts ...string) (*aurora.Response, *aurora.Dr

 	var resp *aurora.Response
 	var result *aurora.DrainHostsResult_
-	var returnErr, clientErr, payloadErr error
+	var clientErr error

 	if len(hosts) == 0 {
 		return nil, nil, errors.New("no hosts provided to drain")
@@ -1054,50 +1078,27 @@ func (r *realisClient) DrainHosts(hosts ...string) (*aurora.Response, *aurora.Dr
 		drainList.HostNames[host] = true
 	}

-	retryErr := ExponentialBackoff(defaultBackoff, func() (bool, error) {
-
-		// Send thrift call, if we have a thrift send error, attempt to reconnect
-		// and continue trying to resend command
-		if resp, clientErr = r.adminClient.DrainHosts(drainList); clientErr != nil {
-			// Experienced an connection error
-			err1 := r.ReestablishConn()
-			if err1 != nil {
-				fmt.Println("error in re-establishing connection: ", err1)
-			}
-			return false, nil
-		}
-
-		// If error is NOT due to connection
-		if _, payloadErr = response.ResponseCodeCheck(resp); payloadErr != nil {
-			// TODO(rdelvalle): an leader election may cause the response to have
-			// failed when it should have succeeded. Retry everything for now until
-			// we figure out a more concrete fix.
-			return false, nil
-		}
-
-		// Successful call
-		return true, nil
-
-	})
+	retryErr := ExponentialBackoff(*r.config.backoff, func() (bool, error) {
+		resp, clientErr = CheckAndRetryConn(r, func() (*aurora.Response, error) {
+			return r.adminClient.DrainHosts(drainList)
+		})
+		if clientErr != nil && clientErr.Error() == RetryConnErr.Error() {
+			return false, nil
+		}
+		if clientErr != nil {
+			return false, clientErr
+		}
+		return true, nil
+	})
+
+	if clientErr != nil {
+		return nil, nil, errors.Wrap(clientErr, retryErr.Error()+"Unable to recover connection")
+	}

 	if resp != nil && resp.GetResult_() != nil {
 		result = resp.GetResult_().GetDrainHostsResult_()
 	}

-	// Prioritize returning a bad payload error over a client error as a bad payload error indicates
-	// a deeper issue
-	if payloadErr != nil {
-		returnErr = payloadErr
-	} else {
-		returnErr = clientErr
-	}
-
-	// Timed out on retries. *Note that when we fix the unexpected errors with a correct payload,
-	// this will can become either a timeout error or a payload error
-	if retryErr != nil {
-		return resp, result, errors.Wrap(returnErr, "Unable to recover connection")
-	}
-
 	return resp, result, nil
 }
@@ -1105,7 +1106,7 @@ func (r *realisClient) EndMaintenance(hosts ...string) (*aurora.Response, *auror

 	var resp *aurora.Response
 	var result *aurora.EndMaintenanceResult_
-	var returnErr, clientErr, payloadErr error
+	var clientErr error

 	if len(hosts) == 0 {
 		return nil, nil, errors.New("no hosts provided to end maintenance on")
@@ -1117,50 +1118,27 @@ func (r *realisClient) EndMaintenance(hosts ...string) (*aurora.Response, *auror
 		hostList.HostNames[host] = true
 	}

-	retryErr := ExponentialBackoff(defaultBackoff, func() (bool, error) {
-
-		// Send thrift call, if we have a thrift send error, attempt to reconnect
-		// and continue trying to resend command
-		if resp, clientErr = r.adminClient.EndMaintenance(hostList); clientErr != nil {
-			// Experienced an connection error
-			err1 := r.ReestablishConn()
-			if err1 != nil {
-				fmt.Println("error in re-establishing connection: ", err1)
-			}
-			return false, nil
-		}
-
-		// If error is NOT due to connection
-		if _, payloadErr = response.ResponseCodeCheck(resp); payloadErr != nil {
-			// TODO(rdelvalle): an leader election may cause the response to have
-			// failed when it should have succeeded. Retry everything for now until
-			// we figure out a more concrete fix.
-			return false, nil
-		}
-
-		// Successful call
-		return true, nil
-
-	})
+	retryErr := ExponentialBackoff(*r.config.backoff, func() (bool, error) {
+		resp, clientErr = CheckAndRetryConn(r, func() (*aurora.Response, error) {
+			return r.adminClient.EndMaintenance(hostList)
+		})
+		if clientErr != nil && clientErr.Error() == RetryConnErr.Error() {
+			return false, nil
+		}
+		if clientErr != nil {
+			return false, clientErr
+		}
+		return true, nil
+	})
+
+	if clientErr != nil {
+		return nil, nil, errors.Wrap(clientErr, retryErr.Error()+"Unable to recover connection")
+	}

 	if resp != nil && resp.GetResult_() != nil {
 		result = resp.GetResult_().GetEndMaintenanceResult_()
 	}

-	// Prioritize returning a bad payload error over a client error as a bad payload error indicates
-	// a deeper issue
-	if payloadErr != nil {
-		returnErr = payloadErr
-	} else {
-		returnErr = clientErr
-	}
-
-	// Timed out on retries. *Note that when we fix the unexpected errors with a correct payload,
-	// this will can become either a timeout error or a payload error
-	if retryErr != nil {
-		return resp, result, errors.Wrap(returnErr, "Unable to recover connection")
-	}
-
 	return resp, result, nil
 }
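A detail worth noting across the realis.go hunks above: the retry check changes from comparing error values (clientErr == RetryConnErr) to comparing messages (clientErr.Error() == RetryConnErr.Error()). The commit does not state its motivation; the two checks simply behave differently when an error carries the same text without being the identical sentinel value, as this standalone illustration (not gorealis code) shows:

package main

import (
	"errors"
	"fmt"
)

var RetryConnErr = errors.New("error occured during with aurora retrying")

func main() {
	// A distinct error value that happens to carry the same message,
	// e.g. produced by a different code path than the package sentinel.
	other := errors.New(RetryConnErr.Error())

	fmt.Println(other == RetryConnErr)                 // false: different values
	fmt.Println(other.Error() == RetryConnErr.Error()) // true: same message
}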
retry.go: 15 changed lines
@@ -37,7 +37,7 @@ var RetryConnErr = errors.New("error occured during with aurora retrying")
 // if the loop should be aborted.
 type ConditionFunc func() (done bool, err error)

-type AuroraFunc func() (resp *aurora.Response, err error)
+type AuroraThriftCall func() (resp *aurora.Response, err error)

 // ExponentialBackoff repeats a condition check with exponential backoff.
 //
@@ -70,16 +70,13 @@ func ExponentialBackoff(backoff Backoff, condition ConditionFunc) error {
 // CheckAndRetryConn function takes realis client and a trhift API function to call and returns response and error
 // If Error from the APi call is Retry able . THe functions re establishes the connection with aurora by getting the latest aurora master from zookeeper.
 // If Error is retyable return resp and RetryConnErr error.
-func CheckAndRetryConn(r Realis, aurorajob AuroraFunc) (*aurora.Response, error) {
-	resp, cliErr := aurorajob()
-	if strings.Contains(cliErr.Error(), ConnRefusedErr) || strings.Contains(cliErr.Error(), NoLeaderFoundErr) {
-		conErr := r.ReestablishConn()
-		if conErr != nil {
-			return resp, RetryConnErr
-		}
-	}
-	if resp != nil && resp.GetResponseCode() != aurora.ResponseCode_ERROR_TRANSIENT {
+func CheckAndRetryConn(r Realis, auroraCall AuroraThriftCall) (*aurora.Response, error) {
+	resp, cliErr := auroraCall()
+	if cliErr != nil && (strings.Contains(cliErr.Error(), ConnRefusedErr) || strings.Contains(cliErr.Error(), NoLeaderFoundErr)) {
+		r.ReestablishConn()
+		return resp, RetryConnErr
+	}
+	if resp != nil && resp.GetResponseCode() == aurora.ResponseCode_ERROR_TRANSIENT {
 		return resp, RetryConnErr
 	}
 	return resp, cliErr
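Taken together, the realis.go call sites follow one shape: run the thrift call through CheckAndRetryConn inside ExponentialBackoff, treat RetryConnErr as "retry on the next interval", and abort on any other error. A condensed sketch of that shape follows; callWithRetry is a hypothetical helper, assumed to live inside the realis package so that Realis, Backoff, AuroraThriftCall, CheckAndRetryConn, RetryConnErr, the aurora types, and the pkg/errors import all resolve:

// callWithRetry mirrors the retry shape used by the realisClient methods in this
// commit. r is a Realis client, backoff is the client's configured Backoff, and
// call is the thrift call being retried.
func callWithRetry(r Realis, backoff Backoff, call AuroraThriftCall) (*aurora.Response, error) {
	var resp *aurora.Response
	var clientErr error

	retryErr := ExponentialBackoff(backoff, func() (bool, error) {
		resp, clientErr = CheckAndRetryConn(r, call)
		if clientErr != nil && clientErr.Error() == RetryConnErr.Error() {
			return false, nil // connection-level problem: try again after the next backoff interval
		}
		if clientErr != nil {
			return false, clientErr // any other error aborts the retry loop
		}
		return true, nil
	})

	if clientErr != nil {
		return nil, errors.Wrap(clientErr, retryErr.Error())
	}
	return resp, nil
}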