gorealis/realis_admin.go
Renan DelValle 1a15c4a5aa
V1 CreateService and StartJobUpdate Timeout signal and cleanup (#105)
* Bumped up version to 1.21.1

* Moving admin functions to a new file. They are still part of the same pointer receiver type.

* Removing dead code and fixing some comments to add a space between the slashes and the comment text.

* Adding set up and tear down to run tests script. It sets up a pod, runs all tests, and then tears down the pod.

* Added `--rm` to run tests Mac script.

* Removing cookie jar from transport layer as it's not needed.

* Changing all error messages to start with a lower case letter. Changing some messages around to be more descriptive.

* Adding an argument that allows the retry mechanism to stop if a timeout has been encountered. This is useful for mutating API calls. Only StartJobUpdate and CreateService stop at timeout by default.

* Added 2 tests for when a call goes through despite the client timing out. One is with a good payload, one is with a bad payload.

* Updating changelog with information about the error type returned.

* Adding test for duplicate metadata.

* Refactored the JobUpdateStatus monitor to use a new monitor called JobUpdateQuery. The update monitor will now continue even if it does not find an update to monitor. Furthermore, it has been optimized to return as little payload from the scheduler as possible: it uses the GetJobUpdateSummaries API instead of JobUpdateDetails, and it includes the statuses we're searching for as part of the query.

* Added documentation as to how to handle a timeout on an API request (a short sketch follows this list).

* Optimized GetInstanceIds to create a copy of the JobKey being passed down in order to avoid unexpected behavior. Instead of setting every field separately, a JobKey array is now created and passed to the query.
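
A minimal sketch of the timeout-handling pattern described above, assuming a client built
with NewRealisClient and the IsTimeout helper that the changelog's error-type note refers
to (verify both names against your gorealis version):

    resp, result, err := client.CreateService(job, settings)
    if err != nil && realis.IsTimeout(err) {
        // The scheduler may have applied the change even though the client timed
        // out. Poll for the update (e.g. with the job update monitor) before
        // retrying, so a duplicate update is not kicked off.
    }
    _, _ = resp, result
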
2019-05-05 11:46:22 -07:00

package realis

import (
	"github.com/paypal/gorealis/gen-go/apache/aurora"
	"github.com/pkg/errors"
)

// TODO(rdelvalle): Consider moving these functions to another interface. It would be a backwards-incompatible change,
// but it would add safety.

// DrainHosts sets a list of nodes to DRAINING. Nothing new can be scheduled on a DRAINING node, and any
// existing tasks on it will be killed and re-scheduled elsewhere in the cluster. Tasks from DRAINING nodes
// are not guaranteed to return to RUNNING unless there is enough capacity in the cluster to run them.
func (r *realisClient) DrainHosts(hosts ...string) (*aurora.Response, *aurora.DrainHostsResult_, error) {
	var result *aurora.DrainHostsResult_

	if len(hosts) == 0 {
		return nil, nil, errors.New("no hosts provided to drain")
	}

	drainList := aurora.NewHosts()
	drainList.HostNames = hosts

	r.logger.DebugPrintf("DrainHosts Thrift Payload: %v\n", drainList)

	resp, retryErr := r.thriftCallWithRetries(
		false,
		func() (*aurora.Response, error) {
			return r.adminClient.DrainHosts(nil, drainList)
		})
	if retryErr != nil {
		return resp, result, errors.Wrap(retryErr, "unable to recover connection")
	}

	if resp.GetResult_() != nil {
		result = resp.GetResult_().GetDrainHostsResult_()
	}

	return resp, result, nil
}
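
// Example usage (a hedged sketch): the host names below are placeholders, and the
// getters are the standard thrift-generated accessors on DrainHostsResult_ and
// HostStatus.
//
//	_, result, err := client.DrainHosts("agent-1.example.com", "agent-2.example.com")
//	if err != nil {
//		log.Fatalf("drain failed: %v", err)
//	}
//	for _, status := range result.GetStatuses() {
//		fmt.Printf("%s is now in mode %v\n", status.GetHost(), status.GetMode())
//	}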

// SLADrainHosts starts an SLA-aware drain of the given hosts.
// policy is the fallback SlaPolicy to use if a task does not define one of its own.
// After timeout seconds, tasks are forcefully drained without checking their SLA.
func (r *realisClient) SLADrainHosts(policy *aurora.SlaPolicy, timeout int64, hosts ...string) (*aurora.DrainHostsResult_, error) {
	var result *aurora.DrainHostsResult_

	if len(hosts) == 0 {
		return nil, errors.New("no hosts provided to drain")
	}

	drainList := aurora.NewHosts()
	drainList.HostNames = hosts

	r.logger.DebugPrintf("SLADrainHosts Thrift Payload: %v\n", drainList)

	resp, retryErr := r.thriftCallWithRetries(
		false,
		func() (*aurora.Response, error) {
			return r.adminClient.SlaDrainHosts(nil, drainList, policy, timeout)
		})
	if retryErr != nil {
		return result, errors.Wrap(retryErr, "unable to recover connection")
	}

	if resp.GetResult_() != nil {
		result = resp.GetResult_().GetDrainHostsResult_()
	}

	return result, nil
}
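
// Example usage (a hedged sketch): drain with a percentage-based fallback policy and a
// one-hour hard deadline. The PercentageSlaPolicy fields follow the Aurora thrift
// definition; the values here are illustrative only.
//
//	policy := &aurora.SlaPolicy{
//		PercentageSlaPolicy: &aurora.PercentageSlaPolicy{
//			Percentage:   80,
//			DurationSecs: 600,
//		},
//	}
//	result, err := client.SLADrainHosts(policy, 3600, "agent-1.example.com")
//	if err != nil {
//		log.Fatalf("SLA drain failed: %v", err)
//	}
//	_ = result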

// StartMaintenance begins the maintenance process on the given hosts, marking them as scheduled for maintenance.
func (r *realisClient) StartMaintenance(hosts ...string) (*aurora.Response, *aurora.StartMaintenanceResult_, error) {
	var result *aurora.StartMaintenanceResult_

	if len(hosts) == 0 {
		return nil, nil, errors.New("no hosts provided to start maintenance on")
	}

	hostList := aurora.NewHosts()
	hostList.HostNames = hosts

	r.logger.DebugPrintf("StartMaintenance Thrift Payload: %v\n", hostList)

	resp, retryErr := r.thriftCallWithRetries(
		false,
		func() (*aurora.Response, error) {
			return r.adminClient.StartMaintenance(nil, hostList)
		})
	if retryErr != nil {
		return resp, result, errors.Wrap(retryErr, "unable to recover connection")
	}

	if resp.GetResult_() != nil {
		result = resp.GetResult_().GetStartMaintenanceResult_()
	}

	return resp, result, nil
}

// EndMaintenance ends the maintenance process on the given hosts, returning them to the pool of schedulable nodes.
func (r *realisClient) EndMaintenance(hosts ...string) (*aurora.Response, *aurora.EndMaintenanceResult_, error) {
	var result *aurora.EndMaintenanceResult_

	if len(hosts) == 0 {
		return nil, nil, errors.New("no hosts provided to end maintenance on")
	}

	hostList := aurora.NewHosts()
	hostList.HostNames = hosts

	r.logger.DebugPrintf("EndMaintenance Thrift Payload: %v\n", hostList)

	resp, retryErr := r.thriftCallWithRetries(
		false,
		func() (*aurora.Response, error) {
			return r.adminClient.EndMaintenance(nil, hostList)
		})
	if retryErr != nil {
		return resp, result, errors.Wrap(retryErr, "unable to recover connection")
	}

	if resp.GetResult_() != nil {
		result = resp.GetResult_().GetEndMaintenanceResult_()
	}

	return resp, result, nil
}

// MaintenanceStatus returns the current maintenance mode of each of the given hosts.
func (r *realisClient) MaintenanceStatus(hosts ...string) (*aurora.Response, *aurora.MaintenanceStatusResult_, error) {
	var result *aurora.MaintenanceStatusResult_

	if len(hosts) == 0 {
		return nil, nil, errors.New("no hosts provided to get maintenance status from")
	}

	hostList := aurora.NewHosts()
	hostList.HostNames = hosts

	r.logger.DebugPrintf("MaintenanceStatus Thrift Payload: %v\n", hostList)

	// Make the thrift call. If we encounter an error sending the call, attempt to reconnect
	// and continue trying to resend the command until we run out of retries.
	resp, retryErr := r.thriftCallWithRetries(
		false,
		func() (*aurora.Response, error) {
			return r.adminClient.MaintenanceStatus(nil, hostList)
		})
	if retryErr != nil {
		return resp, result, errors.Wrap(retryErr, "unable to recover connection")
	}

	if resp.GetResult_() != nil {
		result = resp.GetResult_().GetMaintenanceStatusResult_()
	}

	return resp, result, nil
}
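
// Example usage (a hedged sketch of the full maintenance lifecycle): put a host into
// maintenance, check its mode, then release it. Error handling is elided for brevity
// and the host name is a placeholder.
//
//	host := "agent-1.example.com"
//	_, _, _ = client.StartMaintenance(host)
//	_, status, _ := client.MaintenanceStatus(host)
//	for _, s := range status.GetStatuses() {
//		fmt.Printf("%s mode: %v\n", s.GetHost(), s.GetMode())
//	}
//	_, _, _ = client.EndMaintenance(host)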

// SetQuota sets a quota aggregate for the given role.
// TODO(zircote): Currently investigating an error that is returned from thrift calls that include resources for
// `NamedPort` and `NumGpu`.
func (r *realisClient) SetQuota(role string, cpu *float64, ramMb *int64, diskMb *int64) (*aurora.Response, error) {
	quota := &aurora.ResourceAggregate{
		Resources: []*aurora.Resource{{NumCpus: cpu}, {RamMb: ramMb}, {DiskMb: diskMb}},
	}

	resp, retryErr := r.thriftCallWithRetries(
		false,
		func() (*aurora.Response, error) {
			return r.adminClient.SetQuota(nil, role, quota)
		})
	if retryErr != nil {
		return resp, errors.Wrap(retryErr, "unable to set role quota")
	}

	return resp, nil
}
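
// Example usage (a hedged sketch): grant the role "vagrant" 8 CPUs, 16 GiB of RAM and
// 64 GiB of disk. The pointer indirection mirrors the optional thrift fields; the role
// name and values are placeholders.
//
//	cpu := 8.0
//	ram := int64(16 * 1024)  // MiB
//	disk := int64(64 * 1024) // MiB
//	if _, err := client.SetQuota("vagrant", &cpu, &ram, &disk); err != nil {
//		log.Fatalf("setting quota failed: %v", err)
//	}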

// GetQuota returns the resource aggregate for the given role.
func (r *realisClient) GetQuota(role string) (*aurora.Response, error) {
	resp, retryErr := r.thriftCallWithRetries(
		false,
		func() (*aurora.Response, error) {
			return r.adminClient.GetQuota(nil, role)
		})
	if retryErr != nil {
		return resp, errors.Wrap(retryErr, "unable to get role quota")
	}

	return resp, nil
}
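
// Example usage (a hedged sketch): read back a role's quota. GetGetQuotaResult_ is
// assumed to be the thrift-generated accessor for the getQuota result; verify the
// exact getter names against your generated bindings.
//
//	resp, err := client.GetQuota("vagrant")
//	if err != nil {
//		log.Fatalf("fetching quota failed: %v", err)
//	}
//	if res := resp.GetResult_(); res != nil {
//		fmt.Printf("quota: %v\n", res.GetGetQuotaResult_().GetQuota())
//	}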

// Snapshot forces the Aurora scheduler to perform a snapshot and write it to the Mesos replicated log.
func (r *realisClient) Snapshot() error {
	_, retryErr := r.thriftCallWithRetries(
		false,
		func() (*aurora.Response, error) {
			return r.adminClient.Snapshot(nil)
		})
	if retryErr != nil {
		return errors.Wrap(retryErr, "unable to recover connection")
	}

	return nil
}

// PerformBackup forces the Aurora scheduler to write a backup of its state to a file in the backup directory.
func (r *realisClient) PerformBackup() error {
	_, retryErr := r.thriftCallWithRetries(
		false,
		func() (*aurora.Response, error) {
			return r.adminClient.PerformBackup(nil)
		})
	if retryErr != nil {
		return errors.Wrap(retryErr, "unable to recover connection")
	}

	return nil
}

// ForceImplicitTaskReconciliation forces the Aurora scheduler to run an implicit task reconciliation against Mesos.
func (r *realisClient) ForceImplicitTaskReconciliation() error {
	_, retryErr := r.thriftCallWithRetries(
		false,
		func() (*aurora.Response, error) {
			return r.adminClient.TriggerImplicitTaskReconciliation(nil)
		})
	if retryErr != nil {
		return errors.Wrap(retryErr, "unable to recover connection")
	}

	return nil
}

// ForceExplicitTaskReconciliation forces the Aurora scheduler to run an explicit task reconciliation against Mesos.
// If batchSize is non-nil, tasks are reconciled in batches of that size; it must be at least 1.
func (r *realisClient) ForceExplicitTaskReconciliation(batchSize *int32) error {
	if batchSize != nil && *batchSize < 1 {
		return errors.New("invalid batch size")
	}

	settings := aurora.NewExplicitReconciliationSettings()
	settings.BatchSize = batchSize

	_, retryErr := r.thriftCallWithRetries(
		false,
		func() (*aurora.Response, error) {
			return r.adminClient.TriggerExplicitTaskReconciliation(nil, settings)
		})
	if retryErr != nil {
		return errors.Wrap(retryErr, "unable to recover connection")
	}

	return nil
}
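
// Example usage (a hedged sketch): trigger an explicit reconciliation in batches of
// 100 tasks, falling back to an implicit one on failure. The batch size is illustrative.
//
//	batch := int32(100)
//	if err := client.ForceExplicitTaskReconciliation(&batch); err != nil {
//		log.Printf("explicit reconciliation failed: %v", err)
//		if err := client.ForceImplicitTaskReconciliation(); err != nil {
//			log.Fatalf("implicit reconciliation failed: %v", err)
//		}
//	}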