/**
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

// Collection of monitors used to poll the scheduler until an operation
// reaches a terminal state or times out.
package realis
import (
	"time"

	"github.com/paypal/gorealis/gen-go/apache/aurora"
	"github.com/paypal/gorealis/response"
	"github.com/pkg/errors"
)
// Human-readable error messages returned when a monitor exits unsuccessfully.
const (
	UpdateFailed = "update failed"
	RolledBack   = "update rolled back"
	Timeout      = "timeout"
)
// Monitor wraps a realis client and polls the scheduler until an operation
// completes, fails, or times out.
type Monitor struct {
	Client Realis
}
// JobUpdate polls the scheduler at the given interval (in seconds) until the
// update identified by updateKey reaches a terminal state or the timeout
// (in seconds) expires.
func (m *Monitor) JobUpdate(updateKey aurora.JobUpdateKey, interval int, timeout int) (bool, error) {

	updateQ := aurora.JobUpdateQuery{
		Key:   &updateKey,
		Limit: 1,
	}

	ticker := time.NewTicker(time.Second * time.Duration(interval))
	defer ticker.Stop()
	timer := time.NewTimer(time.Second * time.Duration(timeout))
	defer timer.Stop()

	var cliErr error
	var respDetail *aurora.Response

	for {
		select {
		case <-ticker.C:
			respDetail, cliErr = m.Client.JobUpdateDetails(updateQ)
			if cliErr != nil {
				return false, cliErr
			}

			updateDetail := response.JobUpdateDetails(respDetail)

			if len(updateDetail) == 0 {
				m.Client.RealisConfig().logger.Println("No update found")
				return false, errors.New("No update found for " + updateKey.String())
			}
			status := updateDetail[0].Update.Summary.State.Status

			if _, ok := aurora.ACTIVE_JOB_UPDATE_STATES[status]; !ok {

				// ROLLED_FORWARD is the only inactive state that indicates the
				// update succeeded; any other inactive state means it did not.
				switch status {
				case aurora.JobUpdateStatus_ROLLED_FORWARD:
					m.Client.RealisConfig().logger.Println("Update succeeded")
					return true, nil
				case aurora.JobUpdateStatus_FAILED:
					m.Client.RealisConfig().logger.Println("Update failed")
					return false, errors.New(UpdateFailed)
				case aurora.JobUpdateStatus_ROLLED_BACK:
					m.Client.RealisConfig().logger.Println("Update rolled back")
					return false, errors.New(RolledBack)
				default:
					return false, nil
				}
			}
		case <-timer.C:
			return false, errors.New(Timeout)
		}
	}
}
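
// A minimal usage sketch for JobUpdate, assuming an existing realis client r
// and the aurora.JobUpdateKey key of an update started elsewhere (both names
// are hypothetical, not part of this package):
//
//	monitor := &realis.Monitor{Client: r}
//	ok, err := monitor.JobUpdate(key, 5, 240) // poll every 5s, give up after 240s
//	if err != nil || !ok {
//		// the update failed, was rolled back, or timed out
//	}
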
// Instances polls the scheduler at the given interval (in seconds) until the
// job identified by key reaches the expected number of live instances or the
// timeout (in seconds) expires.
func (m *Monitor) Instances(key *aurora.JobKey, instances int32, interval int, timeout int) (bool, error) {

	var cliErr error
	var live map[int32]bool

	ticker := time.NewTicker(time.Second * time.Duration(interval))
	defer ticker.Stop()
	timer := time.NewTimer(time.Second * time.Duration(timeout))
	defer timer.Stop()

	for {
		select {
		case <-ticker.C:
			live, cliErr = m.Client.GetInstanceIds(key, aurora.LIVE_STATES)
			if cliErr != nil {
				return false, errors.Wrap(cliErr, "Unable to communicate with Aurora")
			}
			if len(live) == int(instances) {
				return true, nil
			}
		case <-timer.C:
			return false, errors.New(Timeout)
		}
	}
}
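
// A minimal usage sketch for Instances, assuming an existing realis client r
// (hypothetical); the job key fields below are example values:
//
//	monitor := &realis.Monitor{Client: r}
//	key := &aurora.JobKey{Role: "role", Environment: "prod", Name: "job"}
//	ok, err := monitor.Instances(key, 4, 10, 300) // wait for 4 live instances, polling every 10s for up to 300s
//	if err != nil || !ok {
//		// the job never reached 4 live instances before the timeout
//	}
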
// HostMaintenance polls the scheduler every sleepTime seconds, up to steps
// times, until every host in hosts is in one of the desired maintenance
// modes. It returns a map where the value is true if the host was in one of
// the desired mode(s) or false if it was not as of the time the monitor exited.
func (m *Monitor) HostMaintenance(hosts []string, modes []aurora.MaintenanceMode, sleepTime, steps int) (map[string]bool, error) {

	// Transform the modes to monitor for into a set for easy lookup
	desiredMode := make(map[aurora.MaintenanceMode]struct{})
	for _, mode := range modes {
		desiredMode[mode] = struct{}{}
	}

	// Turn the slice into a host set to eliminate duplicates.
	// We also can't use a simple count because multiple modes can produce
	// multiple matches for a single host, e.g. host A transitions from ACTIVE
	// to DRAINING to DRAINED while being monitored.
	remainingHosts := make(map[string]struct{})
	for _, host := range hosts {
		remainingHosts[host] = struct{}{}
	}

	hostResult := make(map[string]bool)

	for step := 0; step < steps; step++ {
		if step != 0 {
			time.Sleep(time.Duration(sleepTime) * time.Second)
		}

		// Client call has multiple retries internally
		_, result, err := m.Client.MaintenanceStatus(hosts...)
		if err != nil {
			// Error is either a payload error or a severe connection error
			for host := range remainingHosts {
				hostResult[host] = false
			}
			return hostResult, errors.Wrap(err, "client error in monitor")
		}

		for status := range result.GetStatuses() {
			if _, ok := desiredMode[status.GetMode()]; ok {
				hostResult[status.GetHost()] = true
				delete(remainingHosts, status.GetHost())

				if len(remainingHosts) == 0 {
					return hostResult, nil
				}
			}
		}
	}

	// Mark any hosts that never reached a desired mode as failed.
	for host := range remainingHosts {
		hostResult[host] = false
	}

	return hostResult, errors.New(Timeout)
}
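
// A minimal usage sketch for HostMaintenance, assuming an existing realis
// client r that has already asked the scheduler to drain the hosts below
// (hypothetical names):
//
//	monitor := &realis.Monitor{Client: r}
//	hosts := []string{"agent1.example.com", "agent2.example.com"}
//	hostStates, err := monitor.HostMaintenance(
//		hosts, []aurora.MaintenanceMode{aurora.MaintenanceMode_DRAINED}, 5, 10)
//	if err != nil {
//		// hostStates[host] is false for every host that never reached DRAINED
//	}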