// gorealis/examples/client.go
/**
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package main

import (
"flag"
"fmt"
"io/ioutil"
"log"
"strings"
"time"
realis "github.com/paypal/gorealis"
"github.com/paypal/gorealis/gen-go/apache/aurora"
"github.com/paypal/gorealis/response"
)
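
// Command-line configuration shared by every sub-command below.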
var cmd, executor, url, clustersConfig, clusterName, updateId, username, password, zkUrl, hostList, role string
var caCertsPath string
var clientKey, clientCert string
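// ConnectionTimeout is the client connection/read timeout in milliseconds,
// consumed by realis.TimeoutMS when the client is constructed in main.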
var ConnectionTimeout = 20000
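
// init parses the command-line flags and, when a clusters.json file is supplied,
// resolves the scheduler URL by asking ZooKeeper for the current leader.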
func init() {
flag.StringVar(&cmd, "cmd", "", "Job request type to send to Aurora Scheduler")
flag.StringVar(&executor, "executor", "thermos", "Executor to use")
flag.StringVar(&url, "url", "", "URL of the Aurora Scheduler in the form [host]:[port]")
flag.StringVar(&clustersConfig, "clusters", "", "Location of the clusters.json file used by aurora.")
flag.StringVar(&clusterName, "cluster", "devcluster", "Name of cluster to run job on (only necessary if clusters is set)")
flag.StringVar(&updateId, "updateId", "", "Update ID to operate on")
flag.StringVar(&username, "username", "aurora", "Username to use for authorization")
flag.StringVar(&password, "password", "secret", "Password to use for authorization")
flag.StringVar(&zkUrl, "zkurl", "", "zookeeper url")
flag.StringVar(&hostList, "hostList", "", "Comma separated list of hosts to operate on")
flag.StringVar(&role, "role", "", "owner role to use")
flag.StringVar(&caCertsPath, "caCertsPath", "", "Path to CA certs on local machine.")
flag.StringVar(&clientCert, "clientCert", "", "Client certificate to use to connect to Aurora.")
flag.StringVar(&clientKey, "clientKey", "", "Client private key to use to connect to Aurora.")
flag.Parse()
// Attempt to load the leading scheduler from ZooKeeper using the
// clusters.json file (the same file used by the default Aurora client),
// if one was provided. This overrides any url passed in the arguments.
if clustersConfig != "" {
clusters, err := realis.LoadClusters(clustersConfig)
if err != nil {
log.Fatalln(err)
}
cluster, ok := clusters[clusterName]
if !ok {
log.Fatalf("Cluster %s doesn't exist in the file provided\n", clusterName)
}
url, err = realis.LeaderFromZK(cluster)
if err != nil {
log.Fatalln(err)
}
}
}
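
// main assembles the client options from the flags, connects to the scheduler,
// builds a sample job for the chosen executor, and then runs the sub-command
// selected via -cmd.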
func main() {
var job realis.Job
var err error
var monitor *realis.Monitor
var r realis.Realis
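// Base client options: credentials, Thrift JSON serialization, connection
// timeout, retry/backoff policy, and debug logging.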
clientOptions := []realis.ClientOption{
realis.BasicAuth(username, password),
realis.ThriftJSON(),
realis.TimeoutMS(ConnectionTimeout),
realis.BackOff(realis.Backoff{
Steps: 2,
Duration: 10 * time.Second,
Factor: 2.0,
Jitter: 0.1,
}),
realis.Debug(),
}
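// Prefer ZooKeeper discovery of the leading scheduler when a zkUrl is given;
// otherwise connect to the scheduler URL directly.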
if zkUrl != "" {
fmt.Println("zkUrl: ", zkUrl)
clientOptions = append(clientOptions, realis.ZKUrl(zkUrl))
} else {
clientOptions = append(clientOptions, realis.SchedulerUrl(url))
}
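// Optional TLS settings: a CA certificate bundle and, when both are supplied,
// a client certificate/key pair for mutual TLS.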
if caCertsPath != "" {
clientOptions = append(clientOptions, realis.Certspath(caCertsPath))
}
if clientKey != "" && clientCert != "" {
clientOptions = append(clientOptions, realis.ClientCerts(clientKey, clientCert))
}
r, err = realis.NewRealisClient(clientOptions...)
if err != nil {
log.Fatalln(err)
}
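// The monitor wraps the client and polls the scheduler until jobs, updates,
// or host maintenance states reach the desired condition.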
monitor = &realis.Monitor{r}
defer r.Close()
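// Build an example job definition for the selected executor.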
switch executor {
case "thermos":
payload, err := ioutil.ReadFile("examples/thermos_payload.json")
if err != nil {
log.Fatalln("Error reading json config file: ", err)
}
job = realis.NewJob().
Environment("prod").
Role("vagrant").
Name("hello_world_from_gorealis").
ExecutorName(aurora.AURORA_EXECUTOR_NAME).
ExecutorData(string(payload)).
CPU(1).
RAM(64).
Disk(100).
IsService(true).
InstanceCount(1).
AddPorts(1)
case "compose":
job = realis.NewJob().
Environment("prod").
Role("vagrant").
Name("docker-compose-test").
ExecutorName("docker-compose-executor").
ExecutorData("{}").
CPU(0.25).
RAM(512).
Disk(100).
IsService(true).
InstanceCount(1).
AddPorts(4).
AddLabel("fileName", "sample-app/docker-compose.yml").
AddURIs(true, true, "https://github.com/mesos/docker-compose-executor/releases/download/0.1.0/sample-app.tar.gz")
case "none":
job = realis.NewJob().
Environment("prod").
Role("vagrant").
Name("docker_as_task").
CPU(1).
RAM(64).
Disk(100).
IsService(true).
InstanceCount(1).
AddPorts(1)
default:
log.Fatalln("Only thermos, compose, and none are supported for now")
}
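// Dispatch on the requested command. Each branch uses the client, monitor,
// and example job constructed above.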
switch cmd {
case "create":
fmt.Println("Creating job")
resp, err := r.CreateJob(job)
if err != nil {
log.Fatalln(err)
}
fmt.Println(resp.String())
if ok, mErr := monitor.Instances(job.JobKey(), job.GetInstanceCount(), 5, 50); !ok || mErr != nil {
_, err := r.KillJob(job.JobKey())
if err != nil {
log.Fatalln(err)
}
log.Fatalf("ok: %v\n err: %v", ok, mErr)
}
case "createService":
// Create a service with three instances using the update API instead of the createJob API
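// Note: CreateService (like StartJobUpdate) stops retrying once the client-side
// timeout has been reached, so a timeout error here does not guarantee the update
// was never submitted; inspect the returned error before blindly retrying.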
fmt.Println("Creating service")
settings := realis.NewUpdateSettings()
job.InstanceCount(3)
resp, result, err := r.CreateService(job, settings)
if err != nil {
log.Println("error: ", err)
log.Fatal("response: ", resp.String())
}
fmt.Println(result.String())
if ok, mErr := monitor.JobUpdate(*result.GetKey(), 5, 180); !ok || mErr != nil {
if _, err := r.AbortJobUpdate(*result.GetKey(), "Monitor timed out"); err != nil {
log.Println("unable to abort job update: ", err)
}
if _, err := r.KillJob(job.JobKey()); err != nil {
log.Fatal(err)
}
log.Fatalf("ok: %v\n err: %v", ok, mErr)
}
case "createDocker":
fmt.Println("Creating a docker based job")
container := realis.NewDockerContainer().Image("python:2.7").AddParameter("network", "host")
job.Container(container)
resp, err := r.CreateJob(job)
if err != nil {
log.Fatal(err)
}
fmt.Println(resp.String())
if ok, err := monitor.Instances(job.JobKey(), job.GetInstanceCount(), 10, 300); !ok || err != nil {
_, err := r.KillJob(job.JobKey())
if err != nil {
log.Fatal(err)
}
}
case "createMesosContainer":
fmt.Println("Creating a docker based job")
container := realis.NewMesosContainer().DockerImage("python", "2.7")
job.Container(container)
resp, err := r.CreateJob(job)
if err != nil {
log.Fatal(err)
}
fmt.Println(resp.String())
if ok, err := monitor.Instances(job.JobKey(), job.GetInstanceCount(), 10, 300); !ok || err != nil {
_, err := r.KillJob(job.JobKey())
if err != nil {
log.Fatal(err)
}
}
case "scheduleCron":
fmt.Println("Scheduling a Cron job")
// Cron config
job.CronSchedule("* * * * *")
job.IsService(false)
resp, err := r.ScheduleCronJob(job)
if err != nil {
log.Fatal(err)
}
fmt.Println(resp.String())
case "startCron":
fmt.Println("Starting a Cron job")
resp, err := r.StartCronJob(job.JobKey())
if err != nil {
log.Fatal(err)
}
fmt.Println(resp.String())
case "descheduleCron":
fmt.Println("Descheduling a Cron job")
resp, err := r.DescheduleCronJob(job.JobKey())
if err != nil {
log.Fatal(err)
}
fmt.Println(resp.String())
case "kill":
fmt.Println("Killing job")
resp, err := r.KillJob(job.JobKey())
if err != nil {
log.Fatal(err)
}
if ok, err := monitor.Instances(job.JobKey(), 0, 5, 50); !ok || err != nil {
log.Fatal("Unable to kill all instances of job")
}
fmt.Println(resp.String())
case "restart":
fmt.Println("Restarting job")
resp, err := r.RestartJob(job.JobKey())
if err != nil {
log.Fatal(err)
}
fmt.Println(resp.String())
case "liveCount":
fmt.Println("Getting instance count")
live, err := r.GetInstanceIds(job.JobKey(), aurora.LIVE_STATES)
if err != nil {
log.Fatal(err)
}
fmt.Printf("Live instances: %+v\n", live)
case "activeCount":
fmt.Println("Getting instance count")
live, err := r.GetInstanceIds(job.JobKey(), aurora.ACTIVE_STATES)
if err != nil {
log.Fatal(err)
}
fmt.Println("Number of live instances: ", len(live))
case "flexUp":
fmt.Println("Flexing up job")
numOfInstances := int32(4)
live, err := r.GetInstanceIds(job.JobKey(), aurora.ACTIVE_STATES)
if err != nil {
log.Fatal(err)
}
currInstances := int32(len(live))
fmt.Println("Current num of instances: ", currInstances)
resp, err := r.AddInstances(aurora.InstanceKey{
JobKey: job.JobKey(),
InstanceId: live[0],
},
numOfInstances)
if err != nil {
log.Fatal(err)
}
if ok, err := monitor.Instances(job.JobKey(), currInstances+numOfInstances, 5, 50); !ok || err != nil {
fmt.Println("Flexing up failed")
}
fmt.Println(resp.String())
case "flexDown":
fmt.Println("Flexing down job")
numOfInstances := int32(2)
live, err := r.GetInstanceIds(job.JobKey(), aurora.ACTIVE_STATES)
if err != nil {
log.Fatal(err)
}
currInstances := int32(len(live))
fmt.Println("Current num of instances: ", currInstances)
resp, err := r.RemoveInstances(job.JobKey(), numOfInstances)
if err != nil {
log.Fatal(err)
}
if ok, err := monitor.Instances(job.JobKey(), currInstances-numOfInstances, 5, 100); !ok || err != nil {
fmt.Println("flexDown failed")
}
fmt.Println(resp.String())
case "update":
fmt.Println("Updating a job with with more RAM and to 5 instances")
live, err := r.GetInstanceIds(job.JobKey(), aurora.ACTIVE_STATES)
if err != nil {
log.Fatal(err)
}
taskConfig, err := r.FetchTaskConfig(aurora.InstanceKey{
JobKey: job.JobKey(),
InstanceId: live[0],
})
if err != nil {
log.Fatal(err)
}
updateJob := realis.NewDefaultUpdateJob(taskConfig)
updateJob.InstanceCount(5).RAM(128)
resp, err := r.StartJobUpdate(updateJob, "")
if err != nil {
log.Fatal(err)
}
jobUpdateKey := response.JobUpdateKey(resp)
if ok, mErr := monitor.JobUpdate(*jobUpdateKey, 5, 500); !ok || mErr != nil {
log.Fatalf("ok: %v\n err: %v", ok, mErr)
}
case "pauseJobUpdate":
resp, err := r.PauseJobUpdate(&aurora.JobUpdateKey{
Job: job.JobKey(),
ID: updateId,
}, "")
if err != nil {
log.Fatal(err)
}
fmt.Println("PauseJobUpdate response: ", resp.String())
case "resumeJobUpdate":
resp, err := r.ResumeJobUpdate(&aurora.JobUpdateKey{
Job: job.JobKey(),
ID: updateId,
}, "")
if err != nil {
log.Fatal(err)
}
fmt.Println("ResumeJobUpdate response: ", resp.String())
case "pulseJobUpdate":
resp, err := r.PulseJobUpdate(&aurora.JobUpdateKey{
Job: job.JobKey(),
ID: updateId,
})
if err != nil {
log.Fatal(err)
}
fmt.Println("PulseJobUpdate response: ", resp.String())
case "updateDetails":
resp, err := r.JobUpdateDetails(aurora.JobUpdateQuery{
Key: &aurora.JobUpdateKey{
Job: job.JobKey(),
ID: updateId,
},
Limit: 1,
})
if err != nil {
log.Fatal(err)
}
fmt.Println(response.JobUpdateDetails(resp))
case "abortUpdate":
fmt.Println("Abort update")
resp, err := r.AbortJobUpdate(aurora.JobUpdateKey{
Job: job.JobKey(),
ID: updateId,
},
"")
if err != nil {
log.Fatal(err)
}
fmt.Println(resp.String())
case "rollbackUpdate":
fmt.Println("Abort update")
resp, err := r.RollbackJobUpdate(aurora.JobUpdateKey{
Job: job.JobKey(),
ID: updateId,
},
"")
if err != nil {
log.Fatal(err)
}
fmt.Println(resp.String())
case "taskConfig":
fmt.Println("Getting job info")
live, err := r.GetInstanceIds(job.JobKey(), aurora.ACTIVE_STATES)
if err != nil {
log.Fatal(err)
}
config, err := r.FetchTaskConfig(aurora.InstanceKey{
JobKey: job.JobKey(),
InstanceId: live[0],
})
if err != nil {
log.Fatal(err)
}
log.Println(config.String())
case "updatesummary":
fmt.Println("Getting job update summary")
jobquery := &aurora.JobUpdateQuery{
Role: &job.JobKey().Role,
JobKey: job.JobKey(),
}
updatesummary, err := r.GetJobUpdateSummaries(jobquery)
if err != nil {
log.Fatalf("error while getting update summary: %v", err)
}
fmt.Println(updatesummary)
case "taskStatus":
fmt.Println("Getting task status")
taskQ := &aurora.TaskQuery{
Role: &job.JobKey().Role,
Environment: &job.JobKey().Environment,
JobName: &job.JobKey().Name,
}
tasks, err := r.GetTaskStatus(taskQ)
if err != nil {
log.Fatalf("error: %+v\n ", err)
}
fmt.Printf("length: %d\n ", len(tasks))
fmt.Printf("tasks: %+v\n", tasks)
case "tasksWithoutConfig":
fmt.Println("Getting task status")
taskQ := &aurora.TaskQuery{
Role: &job.JobKey().Role,
Environment: &job.JobKey().Environment,
JobName: &job.JobKey().Name,
}
tasks, err := r.GetTasksWithoutConfigs(taskQ)
if err != nil {
log.Fatalf("error: %+v\n ", err)
}
fmt.Printf("length: %d\n ", len(tasks))
fmt.Printf("tasks: %+v\n", tasks)
case "drainHosts":
fmt.Println("Setting hosts to DRAINING")
if hostList == "" {
log.Fatal("No hosts specified to drain")
}
hosts := strings.Split(hostList, ",")
_, result, err := r.DrainHosts(hosts...)
if err != nil {
log.Fatalf("error: %+v\n", err.Error())
}
// Monitor change to DRAINING and DRAINED mode
hostResult, err := monitor.HostMaintenance(
hosts,
[]aurora.MaintenanceMode{aurora.MaintenanceMode_DRAINED, aurora.MaintenanceMode_DRAINING},
5,
10)
if err != nil {
for host, ok := range hostResult {
if !ok {
fmt.Printf("Host %s did not transtion into desired mode(s)\n", host)
}
}
log.Fatalf("error: %+v\n", err.Error())
}
fmt.Print(result.String())
case "SLADrainHosts":
fmt.Println("Setting hosts to DRAINING using SLA aware draining")
if hostList == "" {
log.Fatal("No hosts specified to drain")
}
hosts := strings.Split(hostList, ",")
policy := aurora.SlaPolicy{PercentageSlaPolicy: &aurora.PercentageSlaPolicy{Percentage: 50.0}}
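// SLA-aware drain: keep at least 50% of each job's instances RUNNING while
// draining. The second argument (30) is a timeout forwarded to the scheduler,
// assumed here to be in seconds, after which hosts are drained regardless.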
result, err := r.SLADrainHosts(&policy, 30, hosts...)
if err != nil {
log.Fatalf("error: %+v\n", err.Error())
}
// Monitor change to DRAINING and DRAINED mode
hostResult, err := monitor.HostMaintenance(
hosts,
[]aurora.MaintenanceMode{aurora.MaintenanceMode_DRAINED, aurora.MaintenanceMode_DRAINING},
5,
10)
if err != nil {
for host, ok := range hostResult {
if !ok {
fmt.Printf("Host %s did not transtion into desired mode(s)\n", host)
}
}
log.Fatalf("error: %+v\n", err.Error())
}
fmt.Print(result.String())
case "endMaintenance":
fmt.Println("Setting hosts to ACTIVE")
if hostList == "" {
log.Fatal("No hosts specified to drain")
}
hosts := strings.Split(hostList, ",")
_, result, err := r.EndMaintenance(hosts...)
if err != nil {
log.Fatalf("error: %+v\n", err.Error())
}
// Monitor change back to MaintenanceMode_NONE (hosts returning to ACTIVE)
hostResult, err := monitor.HostMaintenance(
hosts,
[]aurora.MaintenanceMode{aurora.MaintenanceMode_NONE},
5,
10)
if err != nil {
for host, ok := range hostResult {
if !ok {
fmt.Printf("Host %s did not transtion into desired mode(s)\n", host)
}
}
log.Fatalf("error: %+v\n", err.Error())
}
fmt.Print(result.String())
case "getPendingReasons":
fmt.Println("Getting pending reasons")
taskQ := &aurora.TaskQuery{
Role: &job.JobKey().Role,
Environment: &job.JobKey().Environment,
JobName: &job.JobKey().Name,
}
reasons, err := r.GetPendingReason(taskQ)
if err != nil {
log.Fatalf("error: %+v\n ", err)
}
fmt.Printf("length: %d\n ", len(reasons))
fmt.Printf("tasks: %+v\n", reasons)
case "getJobs":
fmt.Println("GetJobs...role: ", role)
_, result, err := r.GetJobs(role)
if err != nil {
log.Fatalf("error: %+v\n", err.Error())
}
fmt.Println("map size: ", len(result.Configs))
fmt.Println(result.String())
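// The remaining commands exercise Aurora's admin RPCs (snapshot, backup,
// explicit/implicit task reconciliation), which typically require admin
// privileges on the scheduler.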
case "snapshot":
fmt.Println("Forcing scheduler to write snapshot to mesos replicated log")
err := r.Snapshot()
if err != nil {
log.Fatalf("error: %+v\n", err.Error())
}
case "performBackup":
fmt.Println("Writing Backup of Snapshot to file system")
err := r.PerformBackup()
if err != nil {
log.Fatalf("error: %+v\n", err.Error())
}
case "forceExplicitRecon":
fmt.Println("Force an explicit recon")
err := r.ForceExplicitTaskReconciliation(nil)
if err != nil {
log.Fatalf("error: %+v\n", err.Error())
}
case "forceImplicitRecon":
fmt.Println("Force an implicit recon")
err := r.ForceImplicitTaskReconciliation()
if err != nil {
log.Fatalf("error: %+v\n", err.Error())
}
default:
log.Fatal("Command not supported")
}
}