From 5099d7e6ecdee1d5566ce736a8679b10bb8ebac4 Mon Sep 17 00:00:00 2001 From: Renan DelValle Date: Fri, 14 Sep 2018 15:04:16 -0700 Subject: [PATCH] Adding force snapshot and force backup APIs (#73) * Adding force snapshot and force backup APIs. --- examples/client.go | 18 +++++++++++++++++- realis.go | 34 ++++++++++++++++++++++++++++++++++ realis_e2e_test.go | 14 +++++++++++++- 3 files changed, 64 insertions(+), 2 deletions(-) diff --git a/examples/client.go b/examples/client.go index 5eae5c4..da922a3 100644 --- a/examples/client.go +++ b/examples/client.go @@ -644,12 +644,28 @@ func main() { fmt.Println("GetJobs...role: ", role) _, result, err := r.GetJobs(role) if err != nil { - fmt.Print("error: %+v\n", err.Error()) + fmt.Printf("error: %+v\n", err.Error()) os.Exit(1) } fmt.Println("map size: ", len(result.Configs)) fmt.Println(result.String()) + case "snapshot": + fmt.Println("Forcing scheduler to write snapshot to mesos replicated log") + err := r.Snapshot() + if err != nil { + fmt.Printf("error: %+v\n", err.Error()) + os.Exit(1) + } + + case "performBackup": + fmt.Println("Writing Backup of Snapshot to file system") + err := r.PerformBackup() + if err != nil { + fmt.Printf("error: %+v\n", err.Error()) + os.Exit(1) + } + default: fmt.Println("Command not supported") os.Exit(1) diff --git a/realis.go b/realis.go index 3e4837a..b491774 100644 --- a/realis.go +++ b/realis.go @@ -75,6 +75,8 @@ type Realis interface { MaintenanceStatus(hosts ...string) (*aurora.Response, *aurora.MaintenanceStatusResult_, error) SetQuota(role string, cpu *float64, ram *int64, disk *int64) (*aurora.Response, error) GetQuota(role string) (*aurora.Response, error) + Snapshot() error + PerformBackup() error } type realisClient struct { @@ -940,6 +942,10 @@ func (r *realisClient) RollbackJobUpdate(key aurora.JobUpdateKey, message string return resp, nil } +/* Admin functions */ +// TODO(rdelvalle): Consider moving these functions to another interface. It would be a backwards incompatible change, +// but would add safety. + // Set a list of nodes to DRAINING. This means nothing will be able to be scheduled on them and any existing // tasks will be killed and re-scheduled elsewhere in the cluster. Tasks from DRAINING nodes are not guaranteed // to return to running unless there is enough capacity in the cluster to run them. @@ -1078,3 +1084,31 @@ func (r *realisClient) GetQuota(role string) (*aurora.Response, error) { return resp, retryErr } + +// Force Aurora Scheduler to perform a snapshot and write to Mesos log +func (r *realisClient) Snapshot() error { + + _, retryErr := r.thriftCallWithRetries(func() (*aurora.Response, error) { + return r.adminClient.Snapshot() + }) + + if retryErr != nil { + return errors.Wrap(retryErr, "Unable to recover connection") + } + + return nil +} + +// Force Aurora Scheduler to write backup file to a file in the backup directory +func (r *realisClient) PerformBackup() error { + + _, retryErr := r.thriftCallWithRetries(func() (*aurora.Response, error) { + return r.adminClient.PerformBackup() + }) + + if retryErr != nil { + return errors.Wrap(retryErr, "Unable to recover connection") + } + + return nil +} diff --git a/realis_e2e_test.go b/realis_e2e_test.go index afc651e..633528d 100644 --- a/realis_e2e_test.go +++ b/realis_e2e_test.go @@ -143,12 +143,24 @@ func TestRealisClient_CreateJob_Thermos(t *testing.T) { assert.True(t, success) assert.NoError(t, err) - //Fetch all obs + //Fetch all Jobs _, result, err := r.GetJobs(role) fmt.Printf("GetJobs length: %+v \n", len(result.Configs)) assert.Equal(t, len(result.Configs), 1) assert.NoError(t, err) + // Test asking the scheduler to perform a Snpshot + t.Run("TestRealisClient_Snapshot", func(t *testing.T) { + err := r.Snapshot() + assert.NoError(t, err) + }) + + // Test asking the scheduler to backup a Snapshot + t.Run("TestRealisClient_PerformBackup", func(t *testing.T) { + err := r.PerformBackup() + assert.NoError(t, err) + }) + // Tasks must exist for it to, be killed t.Run("TestRealisClient_KillJob_Thermos", func(t *testing.T) { start := time.Now()