/*
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

namespace java org.apache.aurora.gen
namespace py gen.apache.aurora.api

// Thrift interface definition for the aurora scheduler.

/*
 * TODO(wfarner): It would be nice if we could put some HTML tags here, regex doesn't handle it though.
 * The result of an API operation.  A result may only be specified when this is OK.
 */
enum ResponseCode {
  INVALID_REQUEST = 0,
  OK = 1,
  ERROR = 2,
  WARNING = 3,
  AUTH_FAILED = 4,
  /** Raised when an operation was unable to proceed due to an in-progress job update. */
  JOB_UPDATING_ERROR = 5,
  /** Raised when a scheduler is transiently unavailable and later retry is recommended. */
  ERROR_TRANSIENT = 6
}

// Aurora executor framework name.
const string AURORA_EXECUTOR_NAME = 'AuroraExecutor'

// TODO(maxim): Remove in 0.7.0. (AURORA-749)
struct Identity {
  2: string user
}

/** A single host attribute. */
struct Attribute {
  1: string name
  2: set<string> values
}

enum MaintenanceMode {
  NONE = 1,
  SCHEDULED = 2,
  DRAINING = 3,
  DRAINED = 4
}

/** The attributes assigned to a host. */
struct HostAttributes {
  1: string host
  2: set<Attribute> attributes
  3: optional MaintenanceMode mode
  4: optional string slaveId
}

/**
 * A constraint that specifies an explicit set of values, at least one of which must be present
 * on a host for a task to be scheduled there.
 */
struct ValueConstraint {
  /** If true, treat this as a 'not' - to avoid specific values. */
  1: bool negated
  2: set<string> values
}

/**
 * A constraint that specifies the maximum number of active tasks on a host with a matching
 * attribute that may be scheduled simultaneously.
 */
struct LimitConstraint {
  1: i32 limit
}

/** Types of constraints that may be applied to a task. */
union TaskConstraint {
  1: ValueConstraint value
  2: LimitConstraint limit
}

/** A constraint that defines whether a task may be scheduled on a host. */
struct Constraint {
  /** Mesos slave attribute that the constraint is matched against. */
  1: string name
  2: TaskConstraint constraint
}

struct Package {
  1: string role
  2: string name
  3: i32 version
}

/** Arbitrary key-value metadata to be included into TaskConfig. */
struct Metadata {
  1: string key
  2: string value
}

/** A unique identifier for a Job. */
struct JobKey {
  /** User role (Unix service account), for example "mesos" */
  1: string role
  /** Environment, for example "devel" */
  2: string environment
  /** Name, for example "labrat" */
  3: string name
}

// TODO(jly): Deprecated, remove in 0.21. See AURORA-1959.
/** A unique lock key. */
union LockKey {
  1: JobKey job
}

// TODO(jly): Deprecated, remove in 0.21. See AURORA-1959.
/** A generic lock struct to facilitate context specific resource/operation serialization. */
struct Lock {
  /** ID of the lock - unique per storage */
  1: LockKey key
  /** UUID - facilitating soft lock authorization */
  2: string token
  /** Lock creator */
  3: string user
  /** Lock creation timestamp in milliseconds */
  4: i64 timestampMs
  /** Optional message to record with the lock */
  5: optional string message
}

/** A unique identifier for the active task within a job. */
struct InstanceKey {
  /** Key identifying the job. */
  1: JobKey jobKey
  /** Unique instance ID for the active task in a job. */
  2: i32 instanceId
}

/** URI which mirrors CommandInfo.URI in the Mesos Protobuf */
struct MesosFetcherURI {
  /** Where to get the resource from */
  1: string value
  /** Extract compressed archive after downloading */
  2: optional bool extract
  /** Cache value using Mesos Fetcher caching mechanism */
  3: optional bool cache
}

struct ExecutorConfig {
  /** Name identifying the Executor. */
  1: string name
  /** Executor configuration data. */
  2: string data
}

/** The mode for a volume mount */
enum Mode {
  /** Read Write */
  RW = 1
  /** Read Only */
  RO = 2
}

/** A volume mount point within a container */
struct Volume {
  /** The path inside the container where the mount will be created. */
  1: string containerPath
  /** The path on the host that will serve as the source for the mount. */
  2: string hostPath
  /** The access mode */
  3: Mode mode
}

/** Describes an image for use with the Mesos unified containerizer in the Docker format */
struct DockerImage {
  /** The name of the image to run */
  1: string name
  /** The Docker tag identifying the image */
  2: string tag
}

/** Describes an image for use with the Mesos unified containerizer in the AppC format */
struct AppcImage {
  /** The name of the image to run */
  1: string name
  /** The appc image id identifying the image */
  2: string imageId
}

/** Describes an image to be used with the Mesos unified containerizer */
union Image {
  1: DockerImage docker
  2: AppcImage appc
}

/** Describes a mesos container, this is the default */
struct MesosContainer {
  /** the optional filesystem image to use when launching this task. */
  1: optional Image image
  /** the optional list of volumes to mount into the task. */
  2: optional list<Volume> volumes
}

/** Describes a parameter passed to docker cli */
struct DockerParameter {
  /** a parameter to pass to docker. (e.g. volume) */
  1: string name
  /** the value to pass to a parameter (e.g. /src/webapp:/opt/webapp) */
  2: string value
}

/** Describes a docker container */
struct DockerContainer {
  /** The container image to be run */
  1: string image
  /** The arbitrary parameters to pass to container */
  2: optional list<DockerParameter> parameters
}

/** Describes a container to be used in a task */
union Container {
  1: MesosContainer mesos
  2: DockerContainer docker
}

/** Describes resource value required to run a task. */
union Resource {
  1: double numCpus
  2: i64 ramMb
  3: i64 diskMb
  4: string namedPort
  5: i64 numGpus
}

struct PartitionPolicy {
  1: bool reschedule
  2: optional i64 delaySecs
}

/** SLA requirements expressed as the percentage of instances to be RUNNING every durationSecs */
struct PercentageSlaPolicy {
  /** The percentage of active instances required every `durationSecs`. */
  1: double percentage
  /** Minimum time duration a task needs to be `RUNNING` to be treated as active */
  2: i64 durationSecs
}

/** SLA requirements expressed as the number of instances to be RUNNING every durationSecs */
struct CountSlaPolicy {
  /** The number of active instances required every `durationSecs` */
  1: i64 count
  /** Minimum time duration a task needs to be `RUNNING` to be treated as active */
  2: i64 durationSecs
}

/** SLA requirements to be delegated to an external coordinator */
struct CoordinatorSlaPolicy {
  /** URL for the coordinator service that needs to be contacted for SLA checks */
  1: string coordinatorUrl
  /** Field in the Coordinator response json indicating if the action is allowed or not */
  2: string statusKey
}

/** SLA requirements expressed in one of the many types */
union SlaPolicy {
  1: PercentageSlaPolicy percentageSlaPolicy
  2: CountSlaPolicy countSlaPolicy
  3: CoordinatorSlaPolicy coordinatorSlaPolicy
}

/** Description of the tasks contained within a job. */
struct TaskConfig {
  /** Job task belongs to. */
  28: JobKey job
  // TODO(maxim): Deprecated. See AURORA-749.
  /** contains the role component of JobKey */
  17: Identity owner
  7: bool isService
  11: i32 priority
  13: i32 maxTaskFailures
  // TODO(mnurolahzade): Deprecated. See AURORA-1708.
  /** Whether this is a production task, which can preempt. */
  18: optional bool production
  /** Task tier type. */
  30: optional string tier
  /** All resources required to run a task. */
  32: set<Resource> resources

  20: set<Constraint> constraints
  /** Resources to retrieve with Mesos Fetcher */
  33: optional set<MesosFetcherURI> mesosFetcherUris
  /**
   * Custom links to include when displaying this task on the scheduler dashboard. Keys are anchor
   * text, values are URLs. Wildcards are supported for dynamic link crafting based on host, ports,
   * instance, etc.
   */
  22: optional map<string, string> taskLinks
  23: optional string contactEmail
  /** Executor configuration */
  25: optional ExecutorConfig executorConfig
  /** Used to display additional details in the UI. */
  27: optional set<Metadata> metadata
  /** Policy for how to deal with task partitions */
  34: optional PartitionPolicy partitionPolicy
  /** SLA requirements to be met during maintenance */
  35: optional SlaPolicy slaPolicy

  // This field is deliberately placed at the end to work around a bug in the immutable wrapper
  // code generator.  See AURORA-1185 for details.
  /** the container the task should use to execute */
  29: Container container = { "mesos": {} }
}

struct ResourceAggregate {
  /** Aggregated resource values. */
  4: set<Resource> resources
}

/** Defines the policy for launching a new cron job when one is already running. */
enum CronCollisionPolicy {
  /** Kills the existing job with the colliding name, and runs the new cron job. */
  KILL_EXISTING = 0,
  /** Cancels execution of the new job, leaving the running job intact. */
  CANCEL_NEW = 1,
  /**
   * DEPRECATED. For existing jobs, treated the same as CANCEL_NEW.
   * createJob will reject jobs with this policy.
   */
  RUN_OVERLAP = 2
}

/**
 * Description of an Aurora job. One task will be scheduled for each instance within the job.
 */
struct JobConfiguration {
  /**
   * Key for this job. If not specified name, owner.role, and a reasonable default environment are
   * used to construct it server-side.
   */
  9: JobKey key
  // TODO(maxim): Deprecated. See AURORA-749.
  /** Owner of this job. */
  7: Identity owner
  /**
   * If present, the job will be handled as a cron job with this crontab-syntax schedule.
   */
  4: optional string cronSchedule
  /** Collision policy to use when handling overlapping cron runs. Default is KILL_EXISTING. */
  5: CronCollisionPolicy cronCollisionPolicy
  /** Task configuration for this job. */
  6: TaskConfig taskConfig
  /**
   * The number of instances in the job. Generated instance IDs for tasks will be in the range
   * [0, instances).
   */
  8: i32 instanceCount
}

struct JobStats {
  /** Number of tasks in active state for this job. */
  1: i32 activeTaskCount
  /** Number of tasks in finished state for this job. */
  2: i32 finishedTaskCount
  /** Number of failed tasks for this job. */
  3: i32 failedTaskCount
  /** Number of tasks in pending state for this job. */
  4: i32 pendingTaskCount
}

struct JobSummary {
  1: JobConfiguration job
  2: JobStats stats
  /** Timestamp of next cron run in ms since epoch, for a cron job */
  3: optional i64 nextCronRunMs
}

/** Closed range of integers. */
struct Range {
  1: i32 first
  2: i32 last
}

struct ConfigGroup {
  1: TaskConfig config
  3: set<Range> instances
}

struct ConfigSummary {
  1: JobKey key
  2: set<ConfigGroup> groups
}

struct PopulateJobResult {
  2: TaskConfig taskConfig
}

struct GetQuotaResult {
  /** Total allocated resource quota. */
  1: ResourceAggregate quota
  /** Resources consumed by production jobs from a shared resource pool. */
  2: optional ResourceAggregate prodSharedConsumption
  /** Resources consumed by non-production jobs from a shared resource pool. */
  3: optional ResourceAggregate nonProdSharedConsumption
  /** Resources consumed by production jobs from a dedicated resource pool. */
  4: optional ResourceAggregate prodDedicatedConsumption
  /** Resources consumed by non-production jobs from a dedicated resource pool. */
  5: optional ResourceAggregate nonProdDedicatedConsumption
}

/** States that a task may be in. */
enum ScheduleStatus {
  // TODO(maxim): This state does not add much value. Consider dropping it completely.
  /** Initial state for a task.  A task will remain in this state until it has been persisted. */
  INIT = 11,
  /** The task will be rescheduled, but is being throttled for restarting too frequently. */
  THROTTLED = 16,
  /** Task is awaiting assignment to a slave. */
  PENDING = 0,
  /** Task has been assigned to a slave. */
  ASSIGNED = 9,
  /** Slave has acknowledged receipt of task and is bootstrapping the task. */
  STARTING = 1,
  /** The task is running on the slave. */
  RUNNING = 2,
  /** The task terminated with an exit code of zero. */
  FINISHED = 3,
  /** The task is being preempted by another task. */
  PREEMPTING = 13,
  /** The task is being restarted in response to a user request. */
  RESTARTING = 12,
  /** The task is being restarted in response to a host maintenance request. */
  DRAINING = 17,
  /** The task terminated with a non-zero exit code. */
  FAILED = 4,
  /** Execution of the task was terminated by the system. */
  KILLED = 5,
  /** The task is being forcibly killed. */
  KILLING = 6,
  /**
   * A fault in the task environment has caused the system to believe the task no longer exists.
   * This can happen, for example, when a slave process disappears.
   */
  LOST = 7,
  /**
   * The task is currently partitioned and in an unknown state.
   */
  PARTITIONED = 18
}

// States that a task may be in while still considered active.
const set<ScheduleStatus> ACTIVE_STATES = [ScheduleStatus.ASSIGNED,
                                           ScheduleStatus.DRAINING,
                                           ScheduleStatus.KILLING,
                                           ScheduleStatus.PENDING,
                                           ScheduleStatus.PREEMPTING,
                                           ScheduleStatus.RESTARTING,
                                           ScheduleStatus.RUNNING,
                                           ScheduleStatus.STARTING,
                                           ScheduleStatus.PARTITIONED,
                                           ScheduleStatus.THROTTLED]

// States that a task may be in while associated with a slave machine and non-terminal.
const set<ScheduleStatus> SLAVE_ASSIGNED_STATES = [ScheduleStatus.ASSIGNED,
                                                   ScheduleStatus.DRAINING,
                                                   ScheduleStatus.KILLING,
                                                   ScheduleStatus.PREEMPTING,
                                                   ScheduleStatus.RESTARTING,
                                                   ScheduleStatus.RUNNING,
                                                   ScheduleStatus.PARTITIONED,
                                                   ScheduleStatus.STARTING]

// States that a task may be in while in an active sandbox.
const set<ScheduleStatus> LIVE_STATES = [ScheduleStatus.KILLING,
                                         ScheduleStatus.PREEMPTING,
                                         ScheduleStatus.RESTARTING,
                                         ScheduleStatus.DRAINING,
                                         ScheduleStatus.PARTITIONED,
                                         ScheduleStatus.RUNNING]

// States a completed task may be in.
const set<ScheduleStatus> TERMINAL_STATES = [ScheduleStatus.FAILED,
                                             ScheduleStatus.FINISHED,
                                             ScheduleStatus.KILLED,
                                             ScheduleStatus.LOST]

// Regular expressions for matching valid identifiers for job path components. All expressions
// below should accept and reject the same set of inputs.
const string GOOD_IDENTIFIER_PATTERN = "^[\\w\\-\\.]+$"
// JVM: Use with java.util.regex.Pattern#compile
const string GOOD_IDENTIFIER_PATTERN_JVM = GOOD_IDENTIFIER_PATTERN
// Python: Use with re.compile
const string GOOD_IDENTIFIER_PATTERN_PYTHON = GOOD_IDENTIFIER_PATTERN

/** Event marking a state transition within a task's lifecycle. */
struct TaskEvent {
  /** Epoch timestamp in milliseconds. */
  1: i64 timestamp
  /** New status of the task. */
  2: ScheduleStatus status
  /** Audit message that explains why a transition occurred. */
  3: optional string message
  /** Hostname of the scheduler machine that performed the event. */
  4: optional string scheduler
}

/** A task assignment that is provided to an executor. */
struct AssignedTask {
  /** The mesos task ID for this task.  Guaranteed to be globally unique */
  1: string taskId

  /**
   * The mesos slave ID that this task has been assigned to.
   * This will not be populated for a PENDING task.
   */
  2: string slaveId

  /**
   * The name of the machine that this task has been assigned to.
   * This will not be populated for a PENDING task.
   */
  3: string slaveHost

  /** Information about how to run this task. */
  4: TaskConfig task
  /** Ports reserved on the machine while this task is running. */
  5: map<string, i32> assignedPorts

  /**
   * The instance ID assigned to this task. Instance IDs must be unique and contiguous within a
   * job, and will be in the range [0, N-1] (inclusive) for a job that has N instances.
   */
  6: i32 instanceId
}

/** A task that has been scheduled. */
struct ScheduledTask {
  /** The task that was scheduled. */
  1: AssignedTask assignedTask
  /** The current status of this task. */
  2: ScheduleStatus status
  /**
   * The number of failures that this task has accumulated over the multi-generational history of
   * this task.
   */
  3: i32 failureCount
  /**
   * The number of partitions this task has accumulated over its lifetime.
   */
  6: i32 timesPartitioned
  /** State change history for this task. */
  4: list<TaskEvent> taskEvents
  /**
   * The task ID of the previous generation of this task.  When a task is automatically rescheduled,
   * a copy of the task is created and its ancestor ID is set to the previous task's task ID.
   */
  5: string ancestorId
}

struct ScheduleStatusResult {
  1: list<ScheduledTask> tasks
}

struct GetJobsResult {
  1: set<JobConfiguration> configs
}

/**
 * Contains a set of restrictions on matching tasks where all restrictions must be met
 * (terms are AND'ed together).
 */
struct TaskQuery {
  14: optional string role
  9: optional string environment
  2: optional string jobName
  4: optional set<string> taskIds
  5: optional set<ScheduleStatus> statuses
  7: optional set<i32> instanceIds
  10: optional set<string> slaveHosts
  11: optional set<JobKey> jobKeys
  12: optional i32 offset
  13: optional i32 limit
}

struct HostStatus {
  1: string host
  2: MaintenanceMode mode
}

struct RoleSummary {
  1: string role
  2: i32 jobCount
  3: i32 cronJobCount
}

struct Hosts {
  1: set<string> hostNames
}

struct PendingReason {
  1: string taskId
  2: string reason
}

/** States that a job update may be in. */
enum JobUpdateStatus {
  /** Update is in progress. */
  ROLLING_FORWARD = 0,

  /** Update has failed and is being rolled back. */
  ROLLING_BACK = 1,

  /** Update has been paused while in progress. */
  ROLL_FORWARD_PAUSED = 2,

  /** Update has been paused during rollback. */
  ROLL_BACK_PAUSED = 3,

  /** Update has completed successfully. */
  ROLLED_FORWARD = 4,

  /** Update has failed and rolled back. */
  ROLLED_BACK = 5,

  /** Update was aborted. */
  ABORTED = 6,

  /** Unknown error during update. */
  ERROR = 7,

  /**
   * Update failed to complete.
   * This can happen if failure thresholds are met while rolling forward, but rollback is disabled,
   * or if failure thresholds are met when rolling back.
   */
  FAILED = 8,

  /** Update has been blocked while in progress due to missing/expired pulse. */
  ROLL_FORWARD_AWAITING_PULSE = 9,

  /** Update has been blocked during rollback due to missing/expired pulse. */
  ROLL_BACK_AWAITING_PULSE = 10
}

/** States the job update can be in while still considered active. */
const set<JobUpdateStatus> ACTIVE_JOB_UPDATE_STATES = [JobUpdateStatus.ROLLING_FORWARD,
                                                       JobUpdateStatus.ROLLING_BACK,
                                                       JobUpdateStatus.ROLL_FORWARD_PAUSED,
                                                       JobUpdateStatus.ROLL_BACK_PAUSED,
                                                       JobUpdateStatus.ROLL_FORWARD_AWAITING_PULSE,
                                                       JobUpdateStatus.ROLL_BACK_AWAITING_PULSE]

// The misspelling "AWAITNG" is part of the published constant name and is kept for compatibility.
/** States the job update can be in while waiting for a pulse. */
const set<JobUpdateStatus> AWAITNG_PULSE_JOB_UPDATE_STATES = [JobUpdateStatus.ROLL_FORWARD_AWAITING_PULSE,
                                                              JobUpdateStatus.ROLL_BACK_AWAITING_PULSE]

/** Job update actions that can be applied to job instances. */
enum JobUpdateAction {
  /**
   * An instance was moved to the target state successfully, and declared healthy if the desired
   * state did not involve deleting the instance.
   */
  INSTANCE_UPDATED = 1,

  /**
   * An instance was rolled back because the job update did not succeed.  The instance was reverted
   * to the original state prior to the job update, which means that the instance was removed if
   * the update added instances to the job.
   */
  INSTANCE_ROLLED_BACK = 2,

  /**
   * An instance is being moved from the original state to the desired state.
   */
  INSTANCE_UPDATING = 3,

  /**
   * An instance is being moved from the desired state back to the original state, because the job
   * update failed.
   */
  INSTANCE_ROLLING_BACK = 4,

  /** An instance update was attempted but failed and was not rolled back. */
  INSTANCE_UPDATE_FAILED = 5,

  /** An instance rollback was attempted but failed. */
  INSTANCE_ROLLBACK_FAILED = 6
}

/** Status of the coordinated update. Intended as a response to pulseJobUpdate RPC. */
enum JobUpdatePulseStatus {
  /**
   * Update is active. See ACTIVE_JOB_UPDATE_STATES for statuses considered active.
   */
  OK = 1,

  /**
   * Update has reached terminal state. See TERMINAL_JOB_UPDATE_STATES for statuses
   * considered terminal.
   */
  FINISHED = 2
}

/** Job update key. */
struct JobUpdateKey {
  /** Job being updated */
  1: JobKey job

  /** Update ID. */
  2: string id
}

/** Limits the amount of active changes being made to instances to groupSize. */
struct QueueJobUpdateStrategy {
  1: i32 groupSize
}

/**
 * Similar to Queue strategy but will not start a new group until all instances in an active
 * group have finished updating.
 */
struct BatchJobUpdateStrategy {
  1: i32 groupSize
  /** Update will pause automatically after each batch completes */
  2: bool autopauseAfterBatch
}

/**
 * Same as Batch strategy but each time an active group completes, the size of the next active
 * group may change.
 */
struct VariableBatchJobUpdateStrategy {
  1: list<i32> groupSizes
  /** Update will pause automatically after each batch completes */
  2: bool autopauseAfterBatch
}

union JobUpdateStrategy {
  1: QueueJobUpdateStrategy queueStrategy
  2: BatchJobUpdateStrategy batchStrategy
  3: VariableBatchJobUpdateStrategy varBatchStrategy
}

/** Job update thresholds and limits. */
struct JobUpdateSettings {
  /**
   * Deprecated, please set value inside of desired update strategy instead.
   * Max number of instances being updated at any given moment.
   */
  1: i32 updateGroupSize

  /** Max number of instance failures to tolerate before marking instance as FAILED. */
  2: i32 maxPerInstanceFailures

  /** Max number of FAILED instances to tolerate before terminating the update. */
  3: i32 maxFailedInstances

  /** Min time to watch a RUNNING instance. */
  5: i32 minWaitInInstanceRunningMs

  /** If true, enables failed update rollback. */
  6: bool rollbackOnFailure

  /** Instance IDs to act on. All instances will be affected if this is not set. */
  7: set<Range> updateOnlyTheseInstances

  /**
   * Deprecated, please set updateStrategy to the Batch strategy instead.
   * If true, use updateGroupSize as strict batching boundaries, and avoid proceeding to another
   * batch until the preceding batch finishes updating.
   */
  8: bool waitForBatchCompletion

  /**
   * If set, requires external calls to pulseJobUpdate RPC within the specified rate for the
   * update to make progress. If no pulses received within specified interval the update will
   * block. A blocked update is unable to continue but retains its current status. It may only get
   * unblocked by a fresh pulseJobUpdate call.
   */
  9: optional i32 blockIfNoPulsesAfterMs

  /**
   * If true, updates will obey the SLA requirements of the tasks being updated. If the SLA policy
   * differs between the old and new task configurations, updates will use the newest configuration.
   */
  10: optional bool slaAware

  /** Update strategy to be used for the update. See JobUpdateStrategy for choices. */
  11: optional JobUpdateStrategy updateStrategy
}

/** Event marking a state transition in job update lifecycle. */
struct JobUpdateEvent {
  /** Update status. */
  1: JobUpdateStatus status

  /** Epoch timestamp in milliseconds. */
  2: i64 timestampMs

  /** User who performed this event (if user-initiated). */
  3: optional string user

  /**
   * Message from the user (for user-initiated transitions) or the scheduler about why the state was
   * changed.
   */
  4: optional string message
}

/** Event marking a state transition in job instance update lifecycle. */
struct JobInstanceUpdateEvent {
  /** Job instance ID. */
  1: i32 instanceId

  /** Epoch timestamp in milliseconds. */
  2: i64 timestampMs

  /** Job update action taken on the instance. */
  3: JobUpdateAction action

  /** Optional message explaining the instance update event. */
  4: optional string message
}

/** Associates a TaskConfig with the instance IDs it applies to. */
struct InstanceTaskConfig {
  /** A TaskConfig associated with instances. */
  1: TaskConfig task

  /** Instances associated with the TaskConfig. */
  2: set<Range> instances
}

/** Current job update state including status and created/modified timestamps. */
struct JobUpdateState {
  /** Current status of the update. */
  1: JobUpdateStatus status

  /** Created timestamp in milliseconds. */
  2: i64 createdTimestampMs

  /** Last modified timestamp in milliseconds. */
  3: i64 lastModifiedTimestampMs
}

/** Summary of the job update including job key, user and current state. */
struct JobUpdateSummary {
  /** Unique identifier for the update. */
  5: JobUpdateKey key

  /** User who initiated the update. */
  3: string user

  /** Current job update state. */
  4: JobUpdateState state

  /** Update metadata supplied by the client. */
  6: optional set<Metadata> metadata
}

/** Update configuration and setting details. */
struct JobUpdateInstructions {
  /** Actual InstanceId -> TaskConfig mapping when the update was requested. */
  1: set<InstanceTaskConfig> initialState

  /** Desired configuration when the update completes. */
  2: InstanceTaskConfig desiredState

  /** Update specific settings. */
  3: JobUpdateSettings settings
}

/** Full definition of the job update. */
struct JobUpdate {
  /** Update summary. */
  1: JobUpdateSummary summary

  /** Update configuration. */
  2: JobUpdateInstructions instructions
}

struct JobUpdateDetails {
  /** Update definition. */
  1: JobUpdate update

  /** History for this update. */
  2: list<JobUpdateEvent> updateEvents

  /** History for the individual instances updated. */
  3: list<JobInstanceUpdateEvent> instanceEvents
}

/** A request to update the following instances of an existing job. Used by startJobUpdate. */
struct JobUpdateRequest {
  /** Desired TaskConfig to apply. */
  1: TaskConfig taskConfig

  /** Desired number of instances of the task config. */
  2: i32 instanceCount

  /** Update settings and limits. */
  3: JobUpdateSettings settings

  /** Update metadata supplied by the client issuing the JobUpdateRequest. */
  4: optional set<Metadata> metadata
}

/**
 * Contains a set of restrictions on matching job updates where all restrictions must be met
 * (terms are AND'ed together).
 */
struct JobUpdateQuery {
  /** Job role. */
  2: optional string role

  /** Unique identifier for a job update. */
  8: optional JobUpdateKey key

  /** Job key. */
  3: optional JobKey jobKey

  /** User who created the update. */
  4: optional string user

  /** Set of update statuses. */
  5: optional set<JobUpdateStatus> updateStatuses

  /** Offset to serve data from. Used by pagination. */
  6: i32 offset

  /** Number of records to serve. Used by pagination. */
  7: i32 limit
}

struct HostMaintenanceRequest {
  1: string host
  2: SlaPolicy defaultSlaPolicy
  3: i64 timeoutSecs
  4: i64 createdTimestampMs
}

struct ListBackupsResult {
  1: set<string> backups
}

struct StartMaintenanceResult {
  1: set<HostStatus> statuses
}

struct DrainHostsResult {
  1: set<HostStatus> statuses
}

struct QueryRecoveryResult {
  1: set<ScheduledTask> tasks
}

struct MaintenanceStatusResult {
  1: set<HostStatus> statuses
}

struct EndMaintenanceResult {
  1: set<HostStatus> statuses
}

struct RoleSummaryResult {
  1: set<RoleSummary> summaries
}

struct JobSummaryResult {
  1: set<JobSummary> summaries
}

struct ConfigSummaryResult {
  1: ConfigSummary summary
}

struct GetPendingReasonResult {
  1: set<PendingReason> reasons
}

/** Result of the startJobUpdate call. */
struct StartJobUpdateResult {
  /** Unique identifier for the job update. */
  1: JobUpdateKey key

  /** Summary of the update that is in progress for the given JobKey. */
  2: optional JobUpdateSummary updateSummary
}

/** Result of the getJobUpdateSummaries call. */
struct GetJobUpdateSummariesResult {
  1: list<JobUpdateSummary> updateSummaries
}

/** Result of the getJobUpdateDetails call. */
struct GetJobUpdateDetailsResult {
  // TODO(zmanji): Remove this once we complete AURORA-1765
  1: JobUpdateDetails details
  2: list<JobUpdateDetails> detailsList
}

/** Result of the pulseJobUpdate call. */
struct PulseJobUpdateResult {
  1: JobUpdatePulseStatus status
}

struct GetJobUpdateDiffResult {
  /** Instance addition diff details. */
  1: set<ConfigGroup> add

  /** Instance removal diff details. */
  2: set<ConfigGroup> remove

  /** Instance update diff details. */
  3: set<ConfigGroup> update

  /** Instances unchanged by the update. */
  4: set<ConfigGroup> unchanged
}

/** Tier information. */
struct TierConfig {
  /** Name of tier. */
  1: string name
  /** Tier attributes. */
  2: map<string, string> settings
}

/** Result of the getTierConfigs call. */
struct GetTierConfigResult {
  /** Name of the default tier. */
  1: string defaultTierName
  /** Set of tier configurations. */
  2: set<TierConfig> tiers
}

/** Information about the scheduler. */
struct ServerInfo {
  1: string clusterName
  /** A url prefix for job container stats. */
  3: string statsUrlPrefix
}

union Result {
  1: PopulateJobResult populateJobResult
  3: ScheduleStatusResult scheduleStatusResult
  4: GetJobsResult getJobsResult
  5: GetQuotaResult getQuotaResult
  6: ListBackupsResult listBackupsResult
  7: StartMaintenanceResult startMaintenanceResult
  8: DrainHostsResult drainHostsResult
  9: QueryRecoveryResult queryRecoveryResult
  10: MaintenanceStatusResult maintenanceStatusResult
  11: EndMaintenanceResult endMaintenanceResult
  17: RoleSummaryResult roleSummaryResult
  18: JobSummaryResult jobSummaryResult
  20: ConfigSummaryResult configSummaryResult
  21: GetPendingReasonResult getPendingReasonResult
  22: StartJobUpdateResult startJobUpdateResult
  23: GetJobUpdateSummariesResult getJobUpdateSummariesResult
  24: GetJobUpdateDetailsResult getJobUpdateDetailsResult
  25: PulseJobUpdateResult pulseJobUpdateResult
  26: GetJobUpdateDiffResult getJobUpdateDiffResult
  27: GetTierConfigResult getTierConfigResult
}

struct ResponseDetail {
  1: string message
}

struct Response {
  1: ResponseCode responseCode
  5: ServerInfo serverInfo
  /** Payload from the invoked RPC. */
  3: optional Result result
  /**
   * Messages from the server relevant to the request, such as warnings or use of deprecated
   * features.
   */
  6: list<ResponseDetail> details
}

// A service that provides all the read only calls to the Aurora scheduler.
service ReadOnlyScheduler {
  /** Returns a summary of the jobs grouped by role. */
  Response getRoleSummary()

  /** Returns a summary of jobs, optionally only those owned by a specific role. */
  Response getJobSummary(1: string role)

  /** Fetches the status of tasks. */
  Response getTasksStatus(1: TaskQuery query)

  /**
   * Same as getTasksStatus but without the TaskConfig.ExecutorConfig data set.
   * This is an interim solution until we have a better way to query TaskConfigs (AURORA-541).
   */
  Response getTasksWithoutConfigs(1: TaskQuery query)

  /** Returns user-friendly reasons (if available) for tasks retained in PENDING state. */
  Response getPendingReason(1: TaskQuery query)

  /** Fetches the configuration summary of active tasks for the specified job. */
  Response getConfigSummary(1: JobKey job)

  /**
   * Fetches the status of jobs.
   * ownerRole is optional, in which case all jobs are returned.
   */
  Response getJobs(1: string ownerRole)

  /** Fetches the quota allocated for a user. */
  Response getQuota(1: string ownerRole)

  /**
   * Populates fields in a job configuration as though it were about to be run.
   * This can be used to diff a configuration against running tasks.
   */
  Response populateJobConfig(1: JobConfiguration description)

  /** Gets job update summaries. */
  Response getJobUpdateSummaries(1: JobUpdateQuery jobUpdateQuery)

  /** Gets job update details. */
  Response getJobUpdateDetails(2: JobUpdateQuery query)

  /** Gets the diff between client (desired) and server (current) job states. */
  Response getJobUpdateDiff(1: JobUpdateRequest request)

  /** Gets tier configurations. */
  Response getTierConfigs()
}

service AuroraSchedulerManager extends ReadOnlyScheduler {
  /**
   * Creates a new job.  The request will be denied if a job with the provided name already exists
   * in the cluster.
   */
  Response createJob(1: JobConfiguration description)

  /**
   * Enters a job into the cron schedule, without actually starting the job.
   * If the job is already present in the schedule, this will update the schedule entry with the new
   * configuration.
   */
  Response scheduleCronJob(1: JobConfiguration description)

  /**
   * Removes a job from the cron schedule. The request will be denied if the job was not previously
   * scheduled with scheduleCronJob.
   */
  Response descheduleCronJob(4: JobKey job)

  /**
   * Starts a cron job immediately.  The request will be denied if the specified job does not
   * exist for the role account, or the job is not a cron job.
   */
  Response startCronJob(4: JobKey job)

  /** Restarts a batch of shards. */
  Response restartShards(5: JobKey job, 3: set<i32> shardIds)

  /** Initiates a kill on tasks. */
  Response killTasks(4: JobKey job, 5: set<i32> instances, 6: string message)

  /**
   * Adds new instances with the TaskConfig of the existing instance pointed by the key.
   */
  Response addInstances(3: InstanceKey key, 4: i32 count)

  // TODO(maxim): reevaluate if it's still needed when client updater is gone (AURORA-785).
  /**
   * Replaces the template (configuration) for the existing cron job.
   * The cron job template (configuration) must exist for the call to succeed.
   */
  Response replaceCronTemplate(1: JobConfiguration config)

  /** Starts update of the existing service job. */
  Response startJobUpdate(
      /** A description of how to change the job. */
      1: JobUpdateRequest request,
      /** A user-specified message to include with the induced job update state change. */
      3: string message)

  /**
   * Pauses the specified job update. Can be resumed by resumeJobUpdate call.
   */
  Response pauseJobUpdate(
      /** The update to pause. */
      1: JobUpdateKey key,
      /** A user-specified message to include with the induced job update state change. */
      3: string message)

  /** Resumes progress of a previously paused job update. */
  Response resumeJobUpdate(
      /** The update to resume. */
      1: JobUpdateKey key,
      /** A user-specified message to include with the induced job update state change. */
      3: string message)

  /** Permanently aborts the job update. Does not remove the update history. */
  Response abortJobUpdate(
      /** The update to abort. */
      1: JobUpdateKey key,
      /** A user-specified message to include with the induced job update state change. */
      3: string message)

  /**
   * Rolls back the specified active job update to the initial state.
   */
  Response rollbackJobUpdate(
      /** The update to rollback. */
      1: JobUpdateKey key,
      /** A user-specified message to include with the induced job update state change. */
      2: string message)

  /**
   * Allows progress of the job update in case blockIfNoPulsesAfterMs is specified in
   * JobUpdateSettings. Unblocks progress if the update was previously blocked.
   * Responds with ResponseCode.INVALID_REQUEST in case an unknown update key is specified.
   */
  Response pulseJobUpdate(1: JobUpdateKey key)
}

struct ExplicitReconciliationSettings {
  1: optional i32 batchSize
}

// It would be great to compose these services rather than extend, but that won't be possible until
// https://issues.apache.org/jira/browse/THRIFT-66 is resolved.
service AuroraAdmin extends AuroraSchedulerManager {
  /** Assign quota to a user.  This will overwrite any pre-existing quota for the user. */
  Response setQuota(1: string ownerRole, 2: ResourceAggregate quota)

  /**
   * Forces a task into a specific state.  This does not guarantee the task will enter the given
   * state, as the task must still transition within the bounds of the state machine.  However,
   * it attempts to enter that state via the state machine.
   */
  Response forceTaskState(
      1: string taskId,
      2: ScheduleStatus status)

  /** Immediately writes a storage snapshot to disk. */
  Response performBackup()

  /** Lists backups that are available for recovery. */
  Response listBackups()

  /** Loads a backup to an in-memory storage.  This must precede all other recovery operations. */
  Response stageRecovery(1: string backupId)

  /** Queries for tasks in a staged recovery. */
  Response queryRecovery(1: TaskQuery query)

  /** Deletes tasks from a staged recovery. */
  Response deleteRecoveryTasks(1: TaskQuery query)

  /** Commits a staged recovery, completely replacing the previous storage state. */
  Response commitRecovery()

  /** Unloads (aborts) a staged recovery. */
  Response unloadRecovery()

  /** Put the given hosts into maintenance mode. */
  Response startMaintenance(1: Hosts hosts)

  /** Ask scheduler to begin moving tasks scheduled on given hosts. */
  Response drainHosts(1: Hosts hosts)

  /** Retrieve the current maintenance states for a group of hosts. */
  Response maintenanceStatus(1: Hosts hosts)

  /** Set the given hosts back into serving mode. */
  Response endMaintenance(1: Hosts hosts)

  /**
   * Ask scheduler to put hosts into DRAINING mode and move scheduled tasks off of the hosts
   * such that its SLA requirements are satisfied. Use defaultSlaPolicy if it is not set for a task.
   */
  Response slaDrainHosts(1: Hosts hosts, 2: SlaPolicy defaultSlaPolicy, 3: i64 timeoutSecs)

  /** Start a storage snapshot and block until it completes. */
  Response snapshot()

  /** Tell scheduler to trigger an explicit task reconciliation with the given settings. */
  Response triggerExplicitTaskReconciliation(1: ExplicitReconciliationSettings settings)

  /** Tell scheduler to trigger an implicit task reconciliation. */
  Response triggerImplicitTaskReconciliation()

  /**
   * Force prune any (terminal) tasks that match the query. If no statuses are supplied with the
   * query, it will default to all terminal task states. If statuses are supplied, they must be
   * terminal states.
   */
  Response pruneTasks(1: TaskQuery query)
}

// The name of the header that should be sent to bypass leader redirection in the Scheduler.
const string BYPASS_LEADER_REDIRECT_HEADER_NAME = 'Bypass-Leader-Redirect' // The path under which a task's filesystem should be mounted when using images and the Mesos // unified containerizer. const string TASK_FILESYSTEM_MOUNT_POINT = 'taskfs'