australis/cmd/start.go
Renán I. Del Valle 4806936c71
Adding ability to start an update (#12)
* Adding ability to start an update.

* Refactoring Job parsing code to be re-usable.
2020-05-07 15:40:22 -07:00

297 lines
11 KiB
Go

/**
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package cmd
import (
"encoding/json"
"errors"
"io/ioutil"
"os"
"time"
"github.com/aurora-scheduler/australis/internal"
"github.com/aurora-scheduler/gorealis/v2/gen-go/apache/aurora"
"github.com/spf13/cobra"
)
const countFlag = "count"
const percentageFlag = "percentage"
const jsonFlag = "json"
const jsonFileFlag = "json-file"
func init() {
rootCmd.AddCommand(startCmd)
// Sub-commands
startCmd.AddCommand(startDrainCmd.Cmd)
startDrainCmd.Cmd.Run = drain
// Maintenance specific flags
startDrainCmd.Cmd.Flags().DurationVar(&startDrainCmd.MonitorInterval, "interval", time.Second*5, "Interval at which to poll scheduler.")
startDrainCmd.Cmd.Flags().DurationVar(&startDrainCmd.MonitorTimeout, "timeout", time.Minute*10, "Time after which the monitor will stop polling and throw an error.")
startDrainCmd.Cmd.Flags().StringVar(&fromJsonFile, jsonFileFlag, "", "JSON file to read list of agents from.")
startDrainCmd.Cmd.Flags().BoolVar(&fromJson, jsonFlag, false, "Read JSON list of agents from the STDIN.")
/* SLA Aware commands */
startCmd.AddCommand(startSLADrainCmd.Cmd)
startSLADrainCmd.Cmd.Run = slaDrain
// SLA Maintenance specific flags
startSLADrainCmd.Cmd.Flags().Int64Var(&count, countFlag, 5, "Instances count that should be running to meet SLA.")
startSLADrainCmd.Cmd.Flags().Float64Var(&percent, percentageFlag, 80.0, "Percentage of instances that should be running to meet SLA.")
startSLADrainCmd.Cmd.Flags().DurationVar(&duration, "duration", time.Minute*1, "Minimum time duration a task needs to be `RUNNING` to be treated as active.")
startSLADrainCmd.Cmd.Flags().DurationVar(&forceDrainTimeout, "sla-limit", time.Minute*60, "Time limit after which SLA-Aware drain sheds SLA Awareness.")
startSLADrainCmd.Cmd.Flags().DurationVar(&startSLADrainCmd.MonitorInterval, "interval", time.Second*10, "Interval at which to poll scheduler.")
startSLADrainCmd.Cmd.Flags().DurationVar(&startSLADrainCmd.MonitorTimeout, "timeout", time.Minute*20, "Time after which the monitor will stop polling and throw an error.")
startSLADrainCmd.Cmd.Flags().StringVar(&fromJsonFile, jsonFileFlag, "", "JSON file to read list of agents from.")
startSLADrainCmd.Cmd.Flags().BoolVar(&fromJson, jsonFlag, false, "Read JSON list of agents from the STDIN.")
startCmd.AddCommand(startMaintenanceCmd.Cmd)
startMaintenanceCmd.Cmd.Run = maintenance
// SLA Maintenance specific flags
startMaintenanceCmd.Cmd.Flags().DurationVar(&startMaintenanceCmd.MonitorInterval, "interval", time.Second*5, "Interval at which to poll scheduler.")
startMaintenanceCmd.Cmd.Flags().DurationVar(&startMaintenanceCmd.MonitorTimeout, "timeout", time.Minute*10, "Time after which the monitor will stop polling and throw an error.")
startMaintenanceCmd.Cmd.Flags().StringVar(&fromJsonFile, jsonFileFlag, "", "JSON file to read list of agents from.")
startMaintenanceCmd.Cmd.Flags().BoolVar(&fromJson, jsonFlag, false, "Read JSON list of agents from the STDIN.")
// Start update command
startCmd.AddCommand(startUpdateCmd.Cmd)
startUpdateCmd.Cmd.Run = update
startUpdateCmd.Cmd.Flags().DurationVar(&startUpdateCmd.MonitorInterval, "interval", time.Second*5, "Interval at which to poll scheduler.")
startUpdateCmd.Cmd.Flags().DurationVar(&startUpdateCmd.MonitorTimeout, "timeout", time.Minute*10, "Time after which the monitor will stop polling and throw an error.")
}
var startCmd = &cobra.Command{
Use: "start",
Short: "Start a service, maintenance on a host (DRAIN), a snapshot, an update, or a backup.",
}
var startDrainCmd = internal.MonitorCmdConfig{
Cmd: &cobra.Command{
Use: "drain [space separated host list or use JSON flags]",
Short: "Place a list of space separated Mesos Agents into draining mode.",
Long: `Adds a Mesos Agent to Aurora's Drain list. Agents in this list
are not allowed to schedule new tasks and any tasks already running on this Agent
are killed and rescheduled in an Agent that is not in maintenance mode. Command
expects a space separated list of hosts to place into maintenance mode.`,
Args: argsValidateJSONFlags,
},
}
var startSLADrainCmd = internal.MonitorCmdConfig{
Cmd: &cobra.Command{
Use: "sla-drain [space separated host list or use JSON flags]",
Short: "Place a list of space separated Mesos Agents into maintenance mode using SLA aware strategies.",
Long: `Adds a Mesos Agent to Aurora's Drain list. Agents in this list
are not allowed to schedule new tasks and any tasks already running on this Agent
are killed and rescheduled in an Agent that is not in maintenance mode. Command
expects a space separated list of hosts to place into maintenance mode.
If the --count argument is passed, tasks will be drained using the count SLA policy as a fallback
when a Job does not have a defined SLA policy.
If the --percentage argument is passed, tasks will be drained using the percentage SLA policy as a fallback
when a Job does not have a defined SLA policy.`,
Args: argsValidateJSONFlags,
},
}
var startMaintenanceCmd = internal.MonitorCmdConfig{
Cmd: &cobra.Command{
Use: "maintenance [space separated host list or use JSON flags]",
Short: "Place a list of space separated Mesos Agents into maintenance mode.",
Long: `Places Mesos Agent into Maintenance mode. Agents in this list
are de-prioritized for scheduling a task. Command
expects a space separated list of hosts to place into maintenance mode.`,
Args: argsValidateJSONFlags,
},
}
var startUpdateCmd = internal.MonitorCmdConfig{
Cmd: &cobra.Command{
Use: "update [update config]",
Short: "Start an update on an Aurora long running service.",
Long: `Starts the update process on an Aurora long running service. If no such service exists, the update mechanism
will act as a deployment, creating all new instances based on the requirements in the update configuration.`,
Args: cobra.ExactArgs(1),
},
}
func argsValidateJSONFlags(cmd *cobra.Command, args []string) error {
if cmd.Flags().Changed(jsonFlag) && cmd.Flags().Changed(jsonFileFlag) {
return errors.New("only json file or json stdin must be set")
}
// These two flags are mutually exclusive
if cmd.Flags().Changed(jsonFlag) != cmd.Flags().Changed(jsonFileFlag) {
return nil
}
if len(args) < 1 {
return errors.New("at least one host must be specified")
}
return nil
}
func hostList(cmd *cobra.Command, args []string) []string {
var hosts []string
if cmd.Flags().Changed(jsonFlag) {
err := json.NewDecoder(os.Stdin).Decode(&hosts)
if err != nil {
log.Fatal(err)
}
} else if cmd.Flags().Changed(jsonFileFlag) {
data, err := ioutil.ReadFile(fromJsonFile)
if err != nil {
log.Fatal(err)
}
err = json.Unmarshal(data, &hosts)
if err != nil {
log.Fatal(err)
}
} else {
hosts = args
}
return hosts
}
func drain(cmd *cobra.Command, args []string) {
hosts := hostList(cmd, args)
log.Infoln("Setting hosts to DRAINING")
log.Infoln(hosts)
result, err := client.DrainHosts(hosts...)
if err != nil {
log.Fatalf("error: %+v", err)
}
log.Debugln(result)
log.Infof("Monitoring for %v at %v intervals", monitorHostCmd.MonitorTimeout, monitorHostCmd.MonitorInterval)
// Monitor change to DRAINING and DRAINED mode
hostResult, err := client.MonitorHostMaintenance(
hosts,
[]aurora.MaintenanceMode{aurora.MaintenanceMode_DRAINED},
startDrainCmd.MonitorInterval,
startDrainCmd.MonitorTimeout)
internal.MaintenanceMonitorPrint(hostResult, []aurora.MaintenanceMode{aurora.MaintenanceMode_DRAINED}, toJson)
if err != nil {
log.Fatalln(err)
}
}
func slaDrainHosts(policy *aurora.SlaPolicy, interval, timeout time.Duration, hosts ...string) {
result, err := client.SLADrainHosts(policy, int64(forceDrainTimeout.Seconds()), hosts...)
if err != nil {
log.Fatalf("error: %+v", err)
}
log.Debugln(result)
log.Infof("Monitoring for %v at %v intervals", timeout, interval)
// Monitor change to DRAINING and DRAINED mode
hostResult, err := client.MonitorHostMaintenance(
hosts,
[]aurora.MaintenanceMode{aurora.MaintenanceMode_DRAINED},
interval,
timeout)
internal.MaintenanceMonitorPrint(hostResult, []aurora.MaintenanceMode{aurora.MaintenanceMode_DRAINED}, toJson)
if err != nil {
log.Fatalf("error: %+v", err)
}
}
func slaDrain(cmd *cobra.Command, args []string) {
hosts := hostList(cmd, args)
// This check makes sure only a single flag is set.
// If they're both set or both not set, the statement will evaluate to true.
if cmd.Flags().Changed(percentageFlag) == cmd.Flags().Changed(countFlag) {
log.Fatal("Either percentage or count must be set exclusively.")
}
policy := &aurora.SlaPolicy{}
if cmd.Flags().Changed(percentageFlag) {
log.Infoln("Setting hosts to DRAINING with the Percentage SLA policy.")
policy.PercentageSlaPolicy = &aurora.PercentageSlaPolicy{
Percentage: percent,
DurationSecs: int64(duration.Seconds()),
}
}
if cmd.Flags().Changed(countFlag) {
log.Infoln("Setting hosts to DRAINING with the Count SLA policy.")
policy.CountSlaPolicy = &aurora.CountSlaPolicy{Count: count, DurationSecs: int64(duration.Seconds())}
}
log.Infoln("Hosts affected: ", args)
slaDrainHosts(policy, startDrainCmd.MonitorInterval, startDrainCmd.MonitorTimeout, hosts...)
}
func maintenance(cmd *cobra.Command, args []string) {
hosts := hostList(cmd, args)
log.Infoln("Setting hosts to Maintenance mode")
log.Infoln(hosts)
result, err := client.StartMaintenance(hosts...)
if err != nil {
log.Fatalf("error: %+v", err)
}
log.Debugln(result)
log.Infof("Monitoring for %v at %v intervals", monitorHostCmd.MonitorTimeout, monitorHostCmd.MonitorInterval)
// Monitor change to DRAINING and DRAINED mode
hostResult, err := client.MonitorHostMaintenance(
hosts,
[]aurora.MaintenanceMode{aurora.MaintenanceMode_SCHEDULED},
startMaintenanceCmd.MonitorInterval,
startMaintenanceCmd.MonitorTimeout)
internal.MaintenanceMonitorPrint(hostResult, []aurora.MaintenanceMode{aurora.MaintenanceMode_SCHEDULED}, toJson)
if err != nil {
log.Fatalf("error: %+v", err)
}
}
func update(cmd *cobra.Command, args []string) {
updateJob, err := internal.UnmarshalUpdate(args[0])
if err != nil {
log.Fatal(err)
}
update, err := updateJob.ToRealis()
if err != nil {
log.Fatal(err)
}
result, err := client.StartJobUpdate(update, "")
if err != nil {
log.Fatalf("Update failed to start %v", err)
}
if ok, monitorErr := client.MonitorJobUpdate(*result.GetKey(),
startUpdateCmd.MonitorInterval,
startUpdateCmd.MonitorTimeout); !ok || monitorErr != nil {
log.Fatal("update did not ROLL FORWARD before monitor timed out")
}
}