// gorealis/realis_e2e_test.go


/**
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package realis_test
import (
"fmt"
"io/ioutil"
"os"
"sync"
"testing"
"time"
"github.com/apache/thrift/lib/go/thrift"
realis "github.com/paypal/gorealis"
"github.com/paypal/gorealis/gen-go/apache/aurora"
"github.com/paypal/gorealis/response"
"github.com/pkg/errors"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
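// Shared across every test in this suite: the client and monitor are
// initialized once in TestMain, and thermosPayload holds the Thermos
// executor configuration used to build the test jobs.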
var r realis.Realis
var monitor *realis.Monitor
var thermosPayload []byte
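// auroraURL points at the Aurora scheduler brought up by docker-compose for
// these end-to-end tests (see TestMain below).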
const auroraURL = "http://192.168.33.7:8081"
func TestMain(m *testing.M) {
var err error
// New configuration to connect to docker container
r, err = realis.NewRealisClient(realis.SchedulerUrl(auroraURL),
realis.BasicAuth("aurora", "secret"),
realis.TimeoutMS(20000))
if err != nil {
fmt.Println("Please run docker-compose up -d before running test suite")
os.Exit(1)
}
defer r.Close()
// Create monitor
monitor = &realis.Monitor{Client: r}
thermosPayload, err = ioutil.ReadFile("examples/thermos_payload.json")
if err != nil {
fmt.Println("Error reading thermos payload file: ", err)
os.Exit(1)
}
os.Exit(m.Run())
}
func TestNonExistentEndpoint(t *testing.T) {
// Reduce penalties for this test to make it quick
backoff := realis.Backoff{
Steps: 3,
Duration: 200 * time.Millisecond,
Factor: 1.0,
Jitter: 0.1}
taskQ := &aurora.TaskQuery{}
badEndpoint := "http://idontexist.local:8081/api"
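// WithRetries expects the client to exhaust its retry budget against the bad
// endpoint; FailOnLookup verifies that the FailOnPermanentErrors option still
// surfaces an error when the host cannot be resolved.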
t.Run("WithRetries", func(t *testing.T) {
// Attempt to connect to a bad endpoint
badClient, err := realis.NewRealisClient(
realis.SchedulerUrl(badEndpoint),
realis.TimeoutMS(200000),
realis.BackOff(backoff),
)
require.NoError(t, err)
require.NotNil(t, badClient)
defer badClient.Close()
_, err = badClient.GetTasksWithoutConfigs(taskQ)
// Check that we do error out of retrying
require.Error(t, err)
// Check that the underlying error was a retry error
// TODO: Consider bubbling up timeout behaving error all the way up to the user.
retryErr := realis.ToRetryCount(errors.Cause(err))
require.NotNil(t, retryErr, "error passed in is not a retry error")
assert.Equal(t, backoff.Steps, retryErr.RetryCount(), "retry count is incorrect")
})
t.Run("FailOnLookup", func(t *testing.T) {
// Attempt to connect to a bad endpoint
badClient, err := realis.NewRealisClient(
realis.SchedulerUrl(badEndpoint),
realis.TimeoutMS(200000),
realis.BackOff(backoff),
realis.FailOnPermanentErrors(),
)
require.NoError(t, err)
require.NotNil(t, badClient)
defer badClient.Close()
_, err = badClient.GetTasksWithoutConfigs(taskQ)
// Check that we do error out of retrying
require.Error(t, err)
})
}
func TestThriftBinary(t *testing.T) {
r, err := realis.NewRealisClient(realis.SchedulerUrl(auroraURL),
realis.BasicAuth("aurora", "secret"),
realis.TimeoutMS(20000),
realis.ThriftBinary())
require.NoError(t, err)
defer r.Close()
role := "all"
taskQ := &aurora.TaskQuery{
Role: &role,
}
// Perform a simple API call to test Thrift Binary
_, err = r.GetTasksWithoutConfigs(taskQ)
assert.NoError(t, err)
}
func TestThriftJSON(t *testing.T) {
r, err := realis.NewRealisClient(realis.SchedulerUrl(auroraURL),
realis.BasicAuth("aurora", "secret"),
realis.TimeoutMS(20000),
realis.ThriftJSON())
require.NoError(t, err)
defer r.Close()
role := "all"
taskQ := &aurora.TaskQuery{
Role: &role,
}
// Perform a simple API call to test Thrift JSON
_, err = r.GetTasksWithoutConfigs(taskQ)
assert.NoError(t, err)
}
func TestNoopLogger(t *testing.T) {
r, err := realis.NewRealisClient(realis.SchedulerUrl(auroraURL),
realis.BasicAuth("aurora", "secret"),
realis.SetLogger(realis.NoopLogger{}))
require.NoError(t, err)
defer r.Close()
role := "all"
taskQ := &aurora.TaskQuery{
Role: &role,
}
// Perform a simple API call to verify the client works with the NOOP logger
_, err = r.GetTasksWithoutConfigs(taskQ)
assert.NoError(t, err)
}
func TestLeaderFromZK(t *testing.T) {
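// Resolve the URL of the leading scheduler from the ZooKeeper ensemble used
// by the test environment.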
cluster := realis.GetDefaultClusterFromZKUrl("192.168.33.2:2181")
url, err := realis.LeaderFromZK(*cluster)
require.NoError(t, err)
// The address stored in ZK may differ from the one we connect to in these tests.
assert.Equal(t, "http://192.168.33.7:8081", url)
}
func TestRealisClient_ReestablishConn(t *testing.T) {
// Test that we're able to tear down the old connection and create a new one.
err := r.ReestablishConn()
assert.NoError(t, err)
}
func TestRealisClient_CreateJob_Thermos(t *testing.T) {
role := "vagrant"
job := realis.NewJob().
Environment("prod").
Role(role).
Name("create_thermos_job_test").
ExecutorName(aurora.AURORA_EXECUTOR_NAME).
ExecutorData(string(thermosPayload)).
CPU(.1).
RAM(16).
Disk(50).
IsService(true).
InstanceCount(2).
AddPorts(1)
_, err := r.CreateJob(job)
require.NoError(t, err)
// Test Instances Monitor
success, err := monitor.Instances(job.JobKey(), job.GetInstanceCount(), 1, 50)
assert.True(t, success)
assert.NoError(t, err)
// Fetch all Jobs
_, result, err := r.GetJobs(role)
assert.Len(t, result.Configs, 1)
assert.NoError(t, err)
// Test asking the scheduler to perform a Snapshot
t.Run("Snapshot", func(t *testing.T) {
err := r.Snapshot()
assert.NoError(t, err)
})
// Test asking the scheduler to backup a Snapshot
t.Run("PerformBackup", func(t *testing.T) {
err := r.PerformBackup()
assert.NoError(t, err)
})
t.Run("GetTaskStatus", func(t *testing.T) {
status, err := r.GetTaskStatus(&aurora.TaskQuery{
JobKeys: []*aurora.JobKey{job.JobKey()},
Statuses: []aurora.ScheduleStatus{aurora.ScheduleStatus_RUNNING}})
require.NoError(t, err)
assert.NotNil(t, status)
assert.Len(t, status, 2)
// TODO: Assert that assigned task matches the configuration of the task scheduled
})
t.Run("AddInstances", func(t *testing.T) {
_, err := r.AddInstances(aurora.InstanceKey{JobKey: job.JobKey(), InstanceId: 0}, 2)
require.NoError(t, err)
success, err := monitor.Instances(job.JobKey(), 4, 1, 50)
assert.True(t, success)
assert.NoError(t, err)
})
t.Run("KillInstances", func(t *testing.T) {
_, err := r.KillInstances(job.JobKey(), 2, 3)
require.NoError(t, err)
success, err := monitor.Instances(job.JobKey(), 2, 1, 50)
assert.True(t, success)
assert.NoError(t, err)
})
t.Run("RestartInstances", func(t *testing.T) {
_, err := r.RestartInstances(job.JobKey(), 0)
require.NoError(t, err)
success, err := monitor.Instances(job.JobKey(), 2, 1, 50)
assert.True(t, success)
assert.NoError(t, err)
})
t.Run("RestartJobs", func(t *testing.T) {
_, err := r.RestartJob(job.JobKey())
require.NoError(t, err)
success, err := monitor.Instances(job.JobKey(), 2, 1, 50)
assert.True(t, success)
assert.NoError(t, err)
})
// Tasks must exist for the job to be killed
t.Run("KillJob", func(t *testing.T) {
_, err := r.KillJob(job.JobKey())
require.NoError(t, err)
success, err := monitor.Instances(job.JobKey(), 0, 1, 50)
assert.True(t, success)
assert.NoError(t, err)
})
t.Run("Duplicate_Metadata", func(t *testing.T) {
job.Name("thermos_duplicate_metadata").
AddLabel("hostname", "cookie").
AddLabel("hostname", "candy").
AddLabel("hostname", "popcorn").
AddLabel("hostname", "chips").
AddLabel("chips", "chips")
_, err := r.CreateJob(job)
require.NoError(t, err)
success, err := monitor.Instances(job.JobKey(), 2, 1, 50)
assert.True(t, success)
assert.NoError(t, err)
_, err = r.KillJob(job.JobKey())
assert.NoError(t, err)
})
t.Run("Duplicate_constraints", func(t *testing.T) {
job.Name("thermos_duplicate_constraints").
AddValueConstraint("zone", false, "east", "west").
AddValueConstraint("zone", false, "east").
AddValueConstraint("zone", true, "west")
_, err := r.CreateJob(job)
require.NoError(t, err)
success, err := monitor.Instances(job.JobKey(), 2, 1, 50)
assert.True(t, success)
assert.NoError(t, err)
_, err = r.KillJob(job.JobKey())
assert.NoError(t, err)
})
t.Run("Overwrite_constraints", func(t *testing.T) {
job.Name("thermos_overwrite_constraints").
AddLimitConstraint("zone", 1).
AddValueConstraint("zone", true, "west", "east").
AddLimitConstraint("zone", 2)
_, err := r.CreateJob(job)
require.NoError(t, err)
success, err := monitor.Instances(job.JobKey(), 2, 1, 50)
assert.True(t, success)
assert.NoError(t, err)
_, err = r.KillJob(job.JobKey())
assert.NoError(t, err)
})
}
// Test configuring an executor that doesn't exist for CreateJob API
func TestRealisClient_CreateJob_ExecutorDoesNotExist(t *testing.T) {
// Create a single job
job := realis.NewJob().
Environment("prod").
Role("vagrant").
Name("executordoesntexist").
ExecutorName("idontexist").
ExecutorData("").
CPU(.25).
RAM(4).
Disk(10).
InstanceCount(1)
resp, err := r.CreateJob(job)
assert.Error(t, err)
assert.Equal(t, aurora.ResponseCode_INVALID_REQUEST, resp.GetResponseCode())
}
// Test fetching the pending reason for a task that cannot be scheduled
func TestRealisClient_GetPendingReason(t *testing.T) {
env := "prod"
role := "vagrant"
name := "pending_reason_test"
// Create a single job
job := realis.NewJob().
Environment(env).
Role(role).
Name(name).
ExecutorName(aurora.AURORA_EXECUTOR_NAME).
ExecutorData(string(thermosPayload)).
CPU(1000).
RAM(64).
Disk(100).
InstanceCount(1)
resp, err := r.CreateJob(job)
require.NoError(t, err)
assert.Equal(t, aurora.ResponseCode_OK, resp.ResponseCode)
taskQ := &aurora.TaskQuery{
Role: &role,
Environment: &env,
JobName: &name,
}
reasons, err := r.GetPendingReason(taskQ)
assert.NoError(t, err)
assert.Len(t, reasons, 1)
_, err = r.KillJob(job.JobKey())
assert.NoError(t, err)
}
func TestRealisClient_CreateService_WithPulse_Thermos(t *testing.T) {
fmt.Println("Creating service")
role := "vagrant"
job := realis.NewJob().
Environment("prod").
Role(role).
Name("create_thermos_job_test").
ExecutorName(aurora.AURORA_EXECUTOR_NAME).
ExecutorData(string(thermosPayload)).
CPU(.5).
RAM(64).
Disk(100).
IsService(true).
InstanceCount(1).
AddPorts(1).
AddLabel("currentTime", time.Now().String())
settings := realis.NewUpdateSettings()
settings.BlockIfNoPulsesAfterMs = thrift.Int32Ptr(500)
settings.UpdateGroupSize = 1
settings.MinWaitInInstanceRunningMs = 5000
settings.WaitForBatchCompletion = true
job.InstanceCount(2)
_, result, err := r.CreateService(job, settings)
require.NoError(t, err)
updateQ := aurora.JobUpdateQuery{
Key: result.GetKey(),
Limit: 1,
}
var updateDetails []*aurora.JobUpdateDetails
ticker := time.NewTicker(time.Second * 1)
timer := time.NewTimer(time.Minute * 6)
defer ticker.Stop()
defer timer.Stop()
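// Poll once per second: pulse the update so it keeps progressing, then check
// whether it has reached a terminal state. If the 6-minute timer fires first,
// abort the update, kill the job, and fail the test.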
pulseLoop:
for {
select {
case <-ticker.C:
_, err = r.PulseJobUpdate(result.GetKey())
assert.NoError(t, err, "unable to pulse job update")
respDetail, err := r.JobUpdateDetails(updateQ)
assert.NoError(t, err)
updateDetails = response.JobUpdateDetails(respDetail)
require.Len(t, updateDetails, 1, "No update found")
status := updateDetails[0].Update.Summary.State.Status
if _, ok := realis.ActiveJobUpdateStates[status]; !ok {
// ROLLED_FORWARD is the only inactive state that indicates a successful update;
// any other inactive state means the update failed.
if status == aurora.JobUpdateStatus_ROLLED_FORWARD {
fmt.Println("update succeed")
break pulseLoop
} else {
fmt.Println("update failed")
break pulseLoop
}
}
case <-timer.C:
_, err := r.AbortJobUpdate(*updateDetails[0].GetUpdate().GetSummary().GetKey(), "")
assert.NoError(t, err)
_, err = r.KillJob(job.JobKey())
assert.NoError(t, err, "timed out during pulse update test")
t.FailNow()
}
}
_, err = r.KillJob(job.JobKey())
assert.NoError(t, err)
}
// Test creating a service and exercising pause, resume, and timeout behavior on its update
func TestRealisClient_CreateService(t *testing.T) {
// Create a single job
job := realis.NewJob().
Environment("prod").
Role("vagrant").
Name("create_service_test").
ExecutorName(aurora.AURORA_EXECUTOR_NAME).
ExecutorData(string(thermosPayload)).
CPU(.25).
RAM(4).
Disk(10).
InstanceCount(3).
IsService(true)
settings := realis.NewUpdateSettings()
settings.UpdateGroupSize = 2
settings.MinWaitInInstanceRunningMs = 5000
job.InstanceCount(3)
_, result, err := r.CreateService(job, settings)
require.NoError(t, err)
assert.NotNil(t, result)
// Test pausing and resuming the job update
t.Run("PauseJobUpdate", func(t *testing.T) {
_, err = r.PauseJobUpdate(result.GetKey(), "")
assert.NoError(t, err)
})
t.Run("ResumeJobUpdate", func(t *testing.T) {
_, err = r.ResumeJobUpdate(result.GetKey(), "")
assert.NoError(t, err)
})
var ok bool
var mErr error
if ok, mErr = monitor.JobUpdate(*result.GetKey(), 5, 240); !ok || mErr != nil {
// Update may already be in a terminal state so don't check for error
_, err := r.AbortJobUpdate(*result.GetKey(), "Monitor timed out.")
_, err = r.KillJob(job.JobKey())
assert.NoError(t, err)
}
require.NoError(t, mErr)
assert.True(t, ok)
// Kill the test task after confirming it came up fine
_, err = r.KillJob(job.JobKey())
assert.NoError(t, err)
success, err := monitor.Instances(job.JobKey(), 0, 1, 50)
require.NoError(t, err)
assert.True(t, success)
// Create a client which will timeout and close the connection before receiving an answer
timeoutClient, err := realis.NewRealisClient(
realis.SchedulerUrl(auroraURL),
realis.BasicAuth("aurora", "secret"),
realis.TimeoutMS(5),
)
require.NoError(t, err)
defer timeoutClient.Close()
// Test case where the http connection times out.
t.Run("TimeoutError", func(t *testing.T) {
job.Name("createService_timeout")
// Make sure a timeout error is returned
_, _, err = timeoutClient.CreateService(job, settings)
require.Error(t, err)
assert.True(t, realis.IsTimeout(err))
updateReceivedQuery := aurora.JobUpdateQuery{
Role: &job.JobKey().Role,
JobKey: job.JobKey(),
UpdateStatuses: aurora.ACTIVE_JOB_UPDATE_STATES,
Limit: 1}
updateSummaries, err := monitor.JobUpdateQuery(updateReceivedQuery, time.Second*1, time.Second*50)
require.NoError(t, err)
require.Len(t, updateSummaries, 1)
r.AbortJobUpdate(*updateSummaries[0].Key, "Cleaning up")
_, err = r.KillJob(job.JobKey())
assert.NoError(t, err)
})
// Test case where the http connection times out.
t.Run("TimeoutError_BadPayload", func(t *testing.T) {
// Illegal payload
job.InstanceCount(-1)
job.Name("createService_timeout_bad_payload")
// Make sure a timeout error is returned
_, _, err = timeoutClient.CreateService(job, settings)
require.Error(t, err)
assert.True(t, realis.IsTimeout(err))
summary, err := r.GetJobUpdateSummaries(
&aurora.JobUpdateQuery{
Role: &job.JobKey().Role,
JobKey: job.JobKey(),
UpdateStatuses: aurora.ACTIVE_JOB_UPDATE_STATES})
assert.NoError(t, err)
// Payload should have been rejected, no update should exist
require.Len(t, summary.GetResult_().GetGetJobUpdateSummariesResult_().GetUpdateSummaries(), 0)
})
}
// Test configuring an executor that doesn't exist for the CreateService API
func TestRealisClient_CreateService_ExecutorDoesNotExist(t *testing.T) {
// Create a single job
job := realis.NewJob().
Environment("prod").
Role("vagrant").
Name("executordoesntexist").
ExecutorName("idontexist").
ExecutorData("").
CPU(.25).
RAM(4).
Disk(10).
InstanceCount(1)
settings := realis.NewUpdateSettings()
job.InstanceCount(3)
resp, result, err := r.CreateService(job, settings)
require.Error(t, err)
assert.Nil(t, result)
require.Equal(t, aurora.ResponseCode_INVALID_REQUEST, resp.GetResponseCode())
}
func TestRealisClient_ScheduleCronJob_Thermos(t *testing.T) {
thermosCronPayload, err := ioutil.ReadFile("examples/thermos_cron_payload.json")
assert.NoError(t, err)
job := realis.NewJob().
Environment("prod").
Role("vagrant").
Name("cronsched_job_test").
ExecutorName(aurora.AURORA_EXECUTOR_NAME).
ExecutorData(string(thermosCronPayload)).
CPU(1).
RAM(64).
Disk(100).
IsService(true).
InstanceCount(1).
AddPorts(1).
CronSchedule("* * * * *").
IsService(false)
_, err = r.ScheduleCronJob(job)
require.NoError(t, err)
t.Run("Start", func(t *testing.T) {
_, err := r.StartCronJob(job.JobKey())
require.NoError(t, err)
})
t.Run("Deschedule", func(t *testing.T) {
_, err := r.DescheduleCronJob(job.JobKey())
require.NoError(t, err)
})
}
func TestRealisClient_StartMaintenance(t *testing.T) {
hosts := []string{"localhost"}
_, _, err := r.StartMaintenance(hosts...)
require.NoError(t, err, "unable to start maintenance")
// Monitor change to SCHEDULED mode
hostResults, err := monitor.HostMaintenance(
hosts,
[]aurora.MaintenanceMode{aurora.MaintenanceMode_SCHEDULED},
1,
50)
assert.NoError(t, err)
assert.Equal(t, map[string]bool{"localhost": true}, hostResults)
_, _, err = r.EndMaintenance(hosts...)
require.NoError(t, err)
// Monitor change back to NONE mode
_, err = monitor.HostMaintenance(
hosts,
[]aurora.MaintenanceMode{aurora.MaintenanceMode_NONE},
5,
10)
assert.NoError(t, err)
}
func TestRealisClient_DrainHosts(t *testing.T) {
hosts := []string{"localhost"}
t.Run("DrainHosts", func(t *testing.T) {
_, _, err := r.DrainHosts(hosts...)
assert.NoError(t, err, "unable to drain host")
})
t.Run("MonitorTransitionToDrained", func(t *testing.T) {
// Monitor change to DRAINING and DRAINED mode
hostResults, err := monitor.HostMaintenance(
hosts,
[]aurora.MaintenanceMode{aurora.MaintenanceMode_DRAINED, aurora.MaintenanceMode_DRAINING},
1,
50)
require.NoError(t, err)
assert.Equal(t, map[string]bool{"localhost": true}, hostResults)
})
t.Run("MonitorNonExistentHost", func(t *testing.T) {
// Monitor change to DRAINING and DRAINED mode
hostResults, err := monitor.HostMaintenance(
append(hosts, "IMAGINARY_HOST"),
[]aurora.MaintenanceMode{aurora.MaintenanceMode_DRAINED, aurora.MaintenanceMode_DRAINING},
1,
1)
// Assert that the monitor returns a non-nil error along with a map marking the host that never transitioned
require.Error(t, err)
assert.Equal(t, map[string]bool{"localhost": true, "IMAGINARY_HOST": false}, hostResults)
})
t.Run("EndMaintenance", func(t *testing.T) {
_, _, err := r.EndMaintenance(hosts...)
require.NoError(t, err)
// Monitor change back to NONE mode
_, err = monitor.HostMaintenance(
hosts,
[]aurora.MaintenanceMode{aurora.MaintenanceMode_NONE},
5,
10)
assert.NoError(t, err)
})
}
func TestRealisClient_SLADrainHosts(t *testing.T) {
hosts := []string{"localhost"}
policy := aurora.SlaPolicy{PercentageSlaPolicy: &aurora.PercentageSlaPolicy{Percentage: 50.0}}
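// Drain with an explicit percentage SLA policy first; the nil-policy and
// empty-policy cases further down exercise the default-policy fallback.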
_, err := r.SLADrainHosts(&policy, 30, hosts...)
require.NoError(t, err, "unable to drain host with SLA policy")
// Monitor change to DRAINING and DRAINED mode
hostResults, err := monitor.HostMaintenance(
hosts,
[]aurora.MaintenanceMode{aurora.MaintenanceMode_DRAINED, aurora.MaintenanceMode_DRAINING},
1,
50)
assert.NoError(t, err)
assert.Equal(t, map[string]bool{"localhost": true}, hostResults)
_, _, err = r.EndMaintenance(hosts...)
require.NoError(t, err)
// Monitor change back to NONE mode
_, err = monitor.HostMaintenance(
hosts,
[]aurora.MaintenanceMode{aurora.MaintenanceMode_NONE},
5,
10)
assert.NoError(t, err)
// SLADrainHosts falls back to the default policy when no policy is specified
_, err = r.SLADrainHosts(nil, 30, hosts...)
require.NoError(t, err, "unable to drain host with SLA policy")
// Monitor change to DRAINING and DRAINED mode
hostResults, err = monitor.HostMaintenance(
hosts,
[]aurora.MaintenanceMode{aurora.MaintenanceMode_DRAINED, aurora.MaintenanceMode_DRAINING},
1,
50)
assert.NoError(t, err)
assert.Equal(t, map[string]bool{"localhost": true}, hostResults)
_, _, err = r.EndMaintenance(hosts...)
require.NoError(t, err)
// Monitor change back to NONE mode
_, err = monitor.HostMaintenance(
hosts,
[]aurora.MaintenanceMode{aurora.MaintenanceMode_NONE},
5,
10)
assert.NoError(t, err)
_, err = r.SLADrainHosts(&aurora.SlaPolicy{}, 30, hosts...)
require.NoError(t, err, "unable to drain host with SLA policy")
// Monitor change to DRAINING and DRAINED mode
hostResults, err = monitor.HostMaintenance(
hosts,
[]aurora.MaintenanceMode{aurora.MaintenanceMode_DRAINED, aurora.MaintenanceMode_DRAINING},
1,
50)
assert.NoError(t, err)
assert.Equal(t, map[string]bool{"localhost": true}, hostResults)
_, _, err = r.EndMaintenance(hosts...)
require.NoError(t, err)
// Monitor change back to NONE mode
_, err = monitor.HostMaintenance(
hosts,
[]aurora.MaintenanceMode{aurora.MaintenanceMode_NONE},
5,
10)
assert.NoError(t, err)
}
// Test multiple goroutines sharing a single connection
func TestRealisClient_SessionThreadSafety(t *testing.T) {
// Create a single job
job := realis.NewJob().
Environment("prod").
Role("vagrant").
Name("create_thermos_job_test_multi").
ExecutorName(aurora.AURORA_EXECUTOR_NAME).
ExecutorData(string(thermosPayload)).
CPU(.25).
RAM(4).
Disk(10).
InstanceCount(1000) // An instance count too large to ever go fully live on a test machine
_, err := r.CreateJob(job)
assert.NoError(t, err)
wg := sync.WaitGroup{}
threadCount := 20
wg.Add(threadCount)
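// Each goroutine shares the single client: the monitor is expected to time
// out (the job can never fully come up), after which every goroutine issues a
// KillJob over the same connection.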
for i := 0; i < threadCount; i++ {
// Launch multiple monitors that will poll every second
go func() {
defer wg.Done()
// The schedule status monitor should time out after 30 seconds since the instances can never all go live
success, err := monitor.ScheduleStatus(job.JobKey(), job.GetInstanceCount(), realis.LiveStates, 1, 30)
assert.False(t, success)
assert.Error(t, err)
_, err = r.KillJob(job.JobKey())
assert.NoError(t, err)
}()
}
wg.Wait()
}
// Test setting and getting the quota
func TestRealisClient_Quota(t *testing.T) {
var resp *aurora.Response
var err error
cpu := 3.5
ram := int64(20480)
disk := int64(10240)
t.Run("Set", func(t *testing.T) {
resp, err = r.SetQuota("vagrant", &cpu, &ram, &disk)
assert.NoError(t, err)
})
t.Run("Get", func(t *testing.T) {
// Test GetQuota based on previously set values
var result *aurora.GetQuotaResult_
resp, err = r.GetQuota("vagrant")
if resp.GetResult_() != nil {
result = resp.GetResult_().GetQuotaResult_
}
assert.NoError(t, err)
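// Each returned quota resource sets one of the resource fields; match on
// whichever field is present and compare it to the value set above.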
for _, res := range result.Quota.GetResources() {
switch {
case res.DiskMb != nil:
assert.Equal(t, disk, *res.DiskMb)
case res.NumCpus != nil:
assert.Equal(t, cpu, *res.NumCpus)
case res.RamMb != nil:
assert.Equal(t, ram, *res.RamMb)
}
}
})
}
func TestRealisClient_ForceImplicitTaskReconciliation(t *testing.T) {
err := r.ForceImplicitTaskReconciliation()
assert.NoError(t, err)
}
func TestRealisClient_ForceExplicitTaskReconciliation(t *testing.T) {
// Default value
err := r.ForceExplicitTaskReconciliation(nil)
assert.NoError(t, err)
// Custom batch value
err = r.ForceExplicitTaskReconciliation(thrift.Int32Ptr(32))
assert.NoError(t, err)
}
func TestRealisClient_PartitionPolicy(t *testing.T) {
role := "vagrant"
job := realis.NewJob().
Environment("prod").
Role(role).
Name("create_thermos_job_partition_policy_test").
ExecutorName(aurora.AURORA_EXECUTOR_NAME).
ExecutorData(string(thermosPayload)).
CPU(.5).
RAM(64).
Disk(100).
IsService(true).
InstanceCount(2).
PartitionPolicy(&aurora.PartitionPolicy{Reschedule: true, DelaySecs: thrift.Int64Ptr(30)})
settings := realis.NewUpdateSettings()
settings.UpdateGroupSize = 2
_, result, err := r.CreateService(job, settings)
assert.NoError(t, err)
var ok bool
var mErr error
if ok, mErr = monitor.JobUpdate(*result.GetKey(), 5, 180); !ok || mErr != nil {
// Update may already be in a terminal state so don't check for error
_, err := r.AbortJobUpdate(*result.GetKey(), "Monitor timed out.")
assert.NoError(t, err)
}
// Clean up after finishing test
r.KillJob(job.JobKey())
}
func TestAuroraJob_UpdateSlaPolicy(t *testing.T) {
tests := []struct {
name string
args aurora.SlaPolicy
}{
{
"create_service_with_sla_count_policy_test",
aurora.SlaPolicy{CountSlaPolicy: &aurora.CountSlaPolicy{Count: 1, DurationSecs: 15}},
},
{
"create_service_with_sla_percentage_policy_test",
aurora.SlaPolicy{PercentageSlaPolicy: &aurora.PercentageSlaPolicy{Percentage: 0.25, DurationSecs: 15}},
},
{
"create_service_with_sla_coordinator_policy_test",
aurora.SlaPolicy{CoordinatorSlaPolicy: &aurora.CoordinatorSlaPolicy{
CoordinatorUrl: "http://localhost/endpoint", StatusKey: "aurora_test"}},
},
}
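// Each case creates a service with a different SLA policy type (count,
// percentage, coordinator) and waits for its update to roll forward before
// cleaning up.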
role := "vagrant"
_, err := r.SetQuota(role, thrift.Float64Ptr(6.0), thrift.Int64Ptr(1024), thrift.Int64Ptr(1024))
assert.NoError(t, err)
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
// Create a single job
job := realis.NewJob().
Environment("prod").
Role(role).
Name(tt.name).
ExecutorName(aurora.AURORA_EXECUTOR_NAME).
ExecutorData(string(thermosPayload)).
CPU(.01).
RAM(2).
Disk(5).
InstanceCount(4).
IsService(true).
SlaPolicy(&tt.args).
Tier("preferred")
settings := realis.NewUpdateSettings()
settings.UpdateGroupSize = 2
settings.MinWaitInInstanceRunningMs = 5 * 1000
_, result, err := r.CreateService(job, settings)
require.NoError(t, err)
assert.NotNil(t, result)
var ok bool
var mErr error
if ok, mErr = monitor.JobUpdate(*result.GetKey(), 5, 240); !ok || mErr != nil {
// Update may already be in a terminal state so don't check for error
_, err := r.AbortJobUpdate(*result.GetKey(), "Monitor timed out.")
_, err = r.KillJob(job.JobKey())
assert.NoError(t, err)
}
assert.NoError(t, mErr)
assert.True(t, ok)
// Kill the test task after confirming it came up fine
_, err = r.KillJob(job.JobKey())
assert.NoError(t, err)
})
}
}
func TestRealisClient_UpdateStrategies(t *testing.T) {
// Create a single job
job := realis.NewJob().
Environment("prod").
Role("vagrant").
ExecutorName(aurora.AURORA_EXECUTOR_NAME).
ExecutorData(string(thermosPayload)).
CPU(.01).
RAM(4).
Disk(10).
InstanceCount(6).
IsService(true)
strategies := []struct {
UpdateJob *realis.UpdateJob
Name string
}{
{
UpdateJob: realis.NewDefaultUpdateJob(job.TaskConfig()).
QueueUpdateStrategy(aurora.QueueJobUpdateStrategy{GroupSize: 2}).
InstanceCount(6).
WatchTime(1000),
Name: "Queue",
},
{
UpdateJob: realis.NewDefaultUpdateJob(job.TaskConfig()).
BatchUpdateStrategy(aurora.BatchJobUpdateStrategy{GroupSize: 2}).
InstanceCount(6).
WatchTime(1000),
Name: "Batch",
},
{
UpdateJob: realis.NewDefaultUpdateJob(job.TaskConfig()).
VariableBatchStrategy(aurora.VariableBatchJobUpdateStrategy{GroupSizes: []int32{1, 2, 3}}).
InstanceCount(6).
WatchTime(1000),
Name: "VarBatch",
},
}
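// Start one job update per strategy (Queue, Batch, VariableBatch), abort the
// update if the monitor times out, then kill the job.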
for _, strategy := range strategies {
t.Run("TestRealisClient_UpdateStrategies_"+strategy.Name, func(t *testing.T) {
job.Name("update_strategies_" + strategy.Name)
resp, err := r.StartJobUpdate(strategy.UpdateJob, "")
assert.NoError(t, err)
assert.NotNil(t, resp)
assert.NotNil(t, resp.GetResult_())
assert.NotNil(t, resp.GetResult_().GetStartJobUpdateResult_())
assert.NotNil(t, resp.GetResult_().GetStartJobUpdateResult_().GetKey())
var ok bool
var mErr error
key := *resp.GetResult_().GetStartJobUpdateResult_().GetKey()
if ok, mErr = monitor.JobUpdate(key, 5, 240); !ok || mErr != nil {
// Update may already be in a terminal state so don't check for error
_, err := r.AbortJobUpdate(key, "Monitor timed out.")
assert.NoError(t, err)
}
_, err = r.KillJob(job.JobKey())
assert.NoError(t, err)
})
}
}
func TestRealisClient_BatchAwareAutoPause(t *testing.T) {
// Create a single job
job := realis.NewJob().
Environment("prod").
Role("vagrant").
Name("BatchAwareAutoPauseTest").
ExecutorName(aurora.AURORA_EXECUTOR_NAME).
ExecutorData(string(thermosPayload)).
CPU(.01).
RAM(4).
Disk(10).
InstanceCount(6).
IsService(true)
updateGroups := []int32{1, 2, 3}
strategy := realis.NewDefaultUpdateJob(job.TaskConfig()).
VariableBatchStrategy(aurora.VariableBatchJobUpdateStrategy{
GroupSizes: updateGroups,
AutopauseAfterBatch: true,
}).
InstanceCount(6).
WatchTime(1000)
resp, err := r.StartJobUpdate(strategy, "")
require.NoError(t, err)
require.NotNil(t, resp)
require.NotNil(t, resp.GetResult_())
require.NotNil(t, resp.GetResult_().GetStartJobUpdateResult_())
require.NotNil(t, resp.GetResult_().GetStartJobUpdateResult_().GetKey())
key := *resp.GetResult_().GetStartJobUpdateResult_().GetKey()
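// The update should pause itself after every variable batch: verify the
// paused step index, then resume until the final batch completes.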
for i := range updateGroups {
curStep, mErr := monitor.AutoPausedUpdateMonitor(key, time.Second*5, time.Second*240)
if mErr != nil {
// Update may already be in a terminal state so don't check for error
_, err := r.AbortJobUpdate(key, "Monitor timed out.")
assert.NoError(t, err)
}
assert.Equal(t, i, curStep)
if i != len(updateGroups)-1 {
_, err = r.ResumeJobUpdate(&key, "auto resuming test")
require.NoError(t, err)
}
}
_, err = r.KillJob(job.JobKey())
assert.NoError(t, err)
}