package peloton.private.resmgr

Get desktop application:
View/edit binary Protocol Buffers messages

* ResourceManagerService describes the internal interface of Resource Manager to other Peloton applications such as Job Manager and Placement Engine. This includes the EnqueueGangs and GetPlacements APIs called by Job Manager, and DequeueGangs and SetPlacements APIs called by Placement Engine.

rpc EnqueueGangs (EnqueueGangsRequest, EnqueueGangsResponse)
resmgrsvc.proto:36
* Enqueue a list of Gangs, each of which is a list of one or more tasks, to a given leaf resource pool for scheduling. The Gangs will be in PENDING state first and then transit to READY state when the resource pool has available resources. This method will be called by Job Manager when a new job is created or new Gangs are added. If any Gangs fail to enqueue, Job Manager should retry those failed Gangs.
message EnqueueGangsRequest
resmgrsvc.proto:202
- optional api.v0.peloton.ResourcePoolID resPool = 1
  ResourcePool
- repeated Gang gangs = 2
  The list of gangs to enqueue
- string reason = 3
  The reason for enqueuing the gang, needed for resmgr internal task state debugging. E.g. tasks returned by the placement engine should carry a specific reason explaining why the task could not be placed and was therefore returned.
message EnqueueGangsResponse
resmgrsvc.proto:215
- optional EnqueueGangsResponse.Error error = 1
rpc DequeueGangs (DequeueGangsRequest, DequeueGangsResponse)
resmgrsvc.proto:47
* Dequeue a list of Gangs, each comprised of tasks that are in READY state for placement. The tasks will transit from READY to PLACING state after the return of this method. This method will be called by Placement Engine to retrieve a list of gangs for computing placement. If tasks are in PLACING state for too long in case of Placement Engine failures, the tasks will be timed out and transit back to READY state.
message DequeueGangsRequest
resmgrsvc.proto:232
- uint32 limit = 1
  Max number of ready gangs to dequeue
- uint32 timeout = 2
  Timeout in milliseconds if no gangs are ready
- TaskType type = 3
  Task Type to identify which kind of tasks need to be dequeued
message DequeueGangsResponse
resmgrsvc.proto:243
- optional DequeueGangsResponse.Error error = 1
- repeated Gang gangs = 2
  The list of gangs that have been dequeued
rpc SetPlacements (SetPlacementsRequest, SetPlacementsResponse)
resmgrsvc.proto:59
* Sets the placement information for successfully placed tasks and the reason for unsuccessful tasks. For successfully placed tasks the tasks will transit from PLACING to PLACED state after this call. This method will be called by Placement Engine after it computes the placement decision for those tasks. For unsuccessful tasks the tasks are returned to the resource manager along with the reason for the failure. Placement of these tasks will be retried at a later time.
message SetPlacementsRequest
resmgrsvc.proto:262
- repeated Placement placements = 1
  List of successful task placements to set
- repeated SetPlacementsRequest.FailedPlacement failedPlacements = 2
  List of failed task placements to return
message SetPlacementsResponse
resmgrsvc.proto:278
- optional SetPlacementsResponse.Error error = 1
rpc GetPlacements (GetPlacementsRequest, GetPlacementsResponse)
resmgrsvc.proto:68
* Get the placement information for a list of tasks. The tasks will transit from PLACED to LAUNCHING state after this call. This method is called by Job Manager to launch the tasks on Mesos. If the tasks are in LAUNCHING state for too long without transiting to RUNNING state, the tasks will be timed out and transit back to PLACED state.
message GetPlacementsRequest
resmgrsvc.proto:289
- uint32 limit = 1
  Max number of placements to retrieve
- uint32 timeout = 2
  Timeout in milliseconds if no placements
message GetPlacementsResponse
resmgrsvc.proto:297
- optional GetPlacementsResponse.Error error = 1
- repeated Placement placements = 2
  List of task placements to return
rpc NotifyTaskUpdates (NotifyTaskUpdatesRequest, NotifyTaskUpdatesResponse)
resmgrsvc.proto:74
* Notifies task status updates to resource manager. This will be called by Host manager to notify resource manager on task status updates.
message NotifyTaskUpdatesRequest
resmgrsvc.proto:311
- repeated eventstream.Event events = 1
message NotifyTaskUpdatesResponse
resmgrsvc.proto:315
- optional NotifyTaskUpdatesResponse.Error error = 1
- uint64 purgeOffset = 2
rpc GetTasksByHosts (GetTasksByHostsRequest, GetTasksByHostsResponse)
resmgrsvc.proto:82
* Get the list of tasks running on the list of hosts provided. This information is needed by the placement engines to find out which tasks are running on which hosts so the placement engine can place tasks taking this information into account.
message GetTasksByHostsRequest
resmgrsvc.proto:323
- repeated string hostnames = 1
- TaskType type = 2
  Task Type to identify which kind of tasks need to be returned. If this is left out, all tasks will be returned.
message GetTasksByHostsResponse
resmgrsvc.proto:335
- optional GetTasksByHostsResponse.Error error = 1
- map<string, TaskList> hostTasksMap = 2
  This will return a map from hostname to a list of tasks running on the host.
rpc GetActiveTasks (GetActiveTasksRequest, GetActiveTasksResponse)
resmgrsvc.proto:87
* Get task to state map. This information is helpful for debugging purposes.
message GetActiveTasksRequest
resmgrsvc.proto:344
- string jobID = 1
  optional jobID to filter out tasks
- string respoolID = 2
  optional respoolID to filter out tasks
- repeated string states = 3
  optional states to filter out tasks
message GetActiveTasksResponse
resmgrsvc.proto:355
- optional GetActiveTasksResponse.Error error = 1
- map<string, string> taskStatesMap = 2
  This will return a map from task id to state. DEPRECATED
- map<string, GetActiveTasksResponse.TaskEntries> tasksByState = 4
  This will return a map from state to list of tasks.
rpc GetPendingTasks (GetPendingTasksRequest, GetPendingTasksResponse)
resmgrsvc.proto:97
* Returns the tasks which are waiting on resources in a resource pool in the order in which they were added, up to a max limit number of gangs. E.g. specifying a limit of 10 would return pending tasks from the first 10 gangs in the queue. The tasks are grouped according to their gang membership since one gang can contain multiple tasks and it is the unit of scheduling.
message GetPendingTasksRequest
resmgrsvc.proto:389
Returns the pending tasks in a resource pool in the order in which they will be processed, grouped by the gang in which they belong.
- optional api.v0.peloton.ResourcePoolID respoolID = 1
  respoolID of the pool
- uint32 limit = 2
  limit is the number of gangs to be returned.
message GetPendingTasksResponse
resmgrsvc.proto:404
* Response message for GetPendingTasks method. Return errors: NOT_FOUND: if the resource pool is not found. INVALID_ARGUMENT: if the resource pool is not supplied or is not a leaf node. INTERNAL: if failed to get pending tasks because of internal errors.
- map<string, GetPendingTasksResponse.PendingGangs> pendingGangsByQueue = 2
  This will return a map from queue type to the pending gangs
rpc KillTasks (KillTasksRequest, KillTasksResponse)
resmgrsvc.proto:102
* KillTasks kills/deletes the tasks in Resource Manager.
message KillTasksRequest
resmgrsvc.proto:419
- repeated api.v0.peloton.TaskID tasks = 1
  Peloton Task IDs of the tasks to be killed
message KillTasksResponse
resmgrsvc.proto:434
- repeated KillTasksResponse.Error error = 1
rpc GetPreemptibleTasks (GetPreemptibleTasksRequest, GetPreemptibleTasksResponse)
resmgrsvc.proto:109
* Get the list of tasks to preempt. The tasks will transition from RUNNING to PREEMPTING state after the return of this method. This method will be called by the job manager to kill the tasks and re-enqueue them.
message GetPreemptibleTasksRequest
resmgrsvc.proto:135
- uint32 limit = 1
  Max number of running tasks to dequeue
- uint32 timeout = 2
  Timeout in milliseconds if no tasks are ready
message GetPreemptibleTasksResponse
resmgrsvc.proto:143
- optional GetPreemptibleTasksResponse.Error error = 1
- repeated Task tasks = 2
  DEPRECATED in favor of preemptionCandidates. The list of tasks that have been dequeued.
- repeated PreemptionCandidate preemptionCandidates = 3
  The list of tasks to be preempted
rpc UpdateTasksState (UpdateTasksStateRequest, UpdateTasksStateResponse)
resmgrsvc.proto:115
* UpdateTasksState is used to let the resource manager know that the tasks in the request have been moved to corresponding state.
message UpdateTasksStateRequest
resmgrsvc.proto:444
UpdateTasksStateRequest is the request message for updating task's state to a desired state in resource manager
- repeated UpdateTasksStateRequest.UpdateTaskStateEntry taskStates = 1
  List of UpdateTaskStateEntry
message UpdateTasksStateResponse
resmgrsvc.proto:460
UpdateTasksStateResponse is the response message for UpdateTasksState
(message has no fields)
rpc GetOrphanTasks (GetOrphanTasksRequest, GetOrphanTasksResponse)
resmgrsvc.proto:121
* GetOrphanTasks returns the list of orphan tasks in resource manager. This API is for debugging purposes only.
message GetOrphanTasksRequest
resmgrsvc.proto:463
GetOrphanTasksRequest is the request message for GetOrphanTasks
- string respoolID = 1
  optional respoolID to filter out tasks
message GetOrphanTasksResponse
resmgrsvc.proto:469
GetOrphanTasksResponse is the response message for GetOrphanTasks
- repeated Task orphanTasks = 1
rpc GetHostsByScores (GetHostsByScoresRequest, GetHostsByScoresResponse)
resmgrsvc.proto:128
* GetHostsByScores returns a list of batch hosts by rankings, in which hosts are ranked by suitability for draining including number of running tasks, normalized task priorities, average task runtime, etc.
message GetHostsByScoresRequest
resmgrsvc.proto:474
GetHostsByScoresRequest is the request message for GetHostsByScores
- uint32 limit = 1
  Max number of hosts to retrieve
message GetHostsByScoresResponse
resmgrsvc.proto:480
GetHostsByScoresResponse is the response message for GetHostsByScores
- repeated string hosts = 1
  The list of hosts to be moved to different partition

Used in: DequeueGangsResponse.Error

string message = 1

Used in: DequeueGangsResponse

optional RequestTimedout timedout = 1
optional DequeueGangsFailure failure = 2

EnqueueGangsFailure will be returned as part of the failure when enqueuing gangs

Used in: EnqueueGangsResponse.Error

repeated EnqueueGangsFailure.FailedTask failed = 1
List of tasks in gangs which failed to enqueue/requeue

ErrorCode returns the errorcode for the failure

Used in: FailedTask

ENQUEUE_GANGS_FAILURE_ERROR_CODE_UNKNOWN = 0
Error code UNKNOWN
ENQUEUE_GANGS_FAILURE_ERROR_CODE_INTERNAL = 1
Error code if the task failed to be enqueued/requeued
ENQUEUE_GANGS_FAILURE_ERROR_CODE_ALREADY_EXIST = 2
Error code if same task is already present
ENQUEUE_GANGS_FAILURE_ERROR_CODE_FAILED_DUE_TO_GANG_FAILED = 3
Error code if other tasks in gang failed

Used in: EnqueueGangsFailure

optional Task task = 1
Resmgr task which failed to enqueue/requeue
string message = 2
Error message with failed reason
ErrorCode errorcode = 3
Error code associated with the failure, by which the caller can identify the failure

Used in: EnqueueGangsResponse

optional ResourcePoolNotFound notFound = 1
optional ResourcePoolNoPermission noPermission = 2
optional EnqueueGangsFailure failure = 3

Used in: DequeueGangsResponse, EnqueueGangsRequest, SetPlacementsRequest.FailedPlacement

repeated Task tasks = 1
List of tasks to be scheduled together

Used in: GetActiveTasksResponse

string message = 1

Used in: GetActiveTasksResponse

repeated TaskEntry taskEntry = 1

Used in: TaskEntries

string taskID = 1
Mesos task ID of the task.
string taskState = 2
State of the task.
string reason = 3
Reason for the task being the current state.
string lastUpdateTime = 4
Last time the state was updated
string hostname = 5
Depending on the state of the task, this can either mean the host where the task has been placed OR where the task is running. This field will not be set for tasks in PENDING and PLACING states.

List of pending tasks IDs in a gang

Used in: PendingGangs

repeated string taskIDs = 1

List of pending gangs

Used in: GetPendingTasksResponse

repeated PendingGang pendingGangs = 1

Used in: GetPlacementsResponse.Error

string message = 1

Used in: GetPlacementsResponse

optional GetPlacementsFailure failure = 1

Used in: GetPreemptibleTasksResponse.Error

string message = 1

Used in: GetPreemptibleTasksResponse

optional RequestTimedout timedout = 1
optional GetPreemptibleTasksFailure failure = 2

Used in: GetTasksByHostsResponse

string message = 1

Used in: KillTasksResponse.Error

optional api.v0.peloton.TaskID task = 1
string message = 2

Used in: KillTasksResponse

optional TasksNotFound notFound = 1
optional KillTasksError killError = 2

Used in: NotifyTaskUpdatesResponse.Error

string message = 1

Used in: NotifyTaskUpdatesResponse

optional NotifyTaskUpdatesError error = 1

* Placement describes the mapping of a list of tasks to a host so that Job Manager can launch the tasks on the host.

Used in: GetPlacementsResponse, SetPlacementsFailure.FailedPlacement, SetPlacementsRequest

string hostname = 2
The name of the host where the tasks are placed
optional mesos.v1.AgentID agentId = 3
The Mesos agent ID of the host where the tasks are placed
repeated uint32 ports = 5
The list of allocated ports which should be sufficient for all placed tasks
TaskType type = 6
Type of the tasks in the placement. Note all tasks must belong to same type. By default the type is batch task.
optional api.v0.peloton.HostOfferID hostOfferID = 7
The unique offer id of the offers on the host where the tasks are placed
repeated Placement.Task taskIDs = 8
The list of tasks to be placed

Task to be placed

Used in: Placement

optional api.v0.peloton.TaskID pelotonTaskID = 1
optional mesos.v1.TaskID mesosTaskID = 2

PreemptionCandidate represents a task which has been chosen to be preempted

Used in: GetPreemptibleTasksResponse

optional api.v0.peloton.TaskID id = 1
The unique ID of the task. Deprecated in favor of task_id.
PreemptionReason reason = 2
The reason for choosing the task for preemption
optional mesos.v1.TaskID task_id = 3
The unique ID of the task

The reason for choosing the task for preemption

Used in: PreemptionCandidate

PREEMPTION_REASON_UNKNOWN = 0
Reserved for compatibility
PREEMPTION_REASON_REVOKE_RESOURCES = 1
Resource Preemption
PREEMPTION_REASON_HOST_MAINTENANCE = 2
Host maintenance

Used in: DequeueGangsResponse.Error, GetPreemptibleTasksResponse.Error

string message = 1

Used in: EnqueueGangsResponse.Error

optional api.v0.peloton.ResourcePoolID id = 1
string message = 2

Used in: EnqueueGangsResponse.Error

optional api.v0.peloton.ResourcePoolID id = 1
string message = 2

Used in: SetPlacementsResponse.Error

repeated SetPlacementsFailure.FailedPlacement failed = 1

Used in: SetPlacementsFailure

optional Placement placement = 1
string message = 2

Represents a failed gang which couldn't be placed.

Used in: SetPlacementsRequest

string reason = 1
The reason for the failure.
optional Gang gang = 2
The gang which couldn't be placed.

Used in: SetPlacementsResponse

optional SetPlacementsFailure failure = 1

* Task describes a task instance at Resource Manager layer. Only includes the minimal set of fields required for Resource Manager and Placement Engine, such as resource config, constraint etc.

Used in: hostmgr.hostsvc.CompletedReservation, hostmgr.hostsvc.Reservation, EnqueueGangsFailure.FailedTask, Gang, GetOrphanTasksResponse, GetPreemptibleTasksResponse, TaskList

string name = 1
Name of the task
optional api.v0.peloton.TaskID id = 2
The unique ID of the task
optional api.v0.peloton.JobID jobId = 3
The Job ID of the task for use cases like gang scheduling
optional mesos.v1.TaskID taskId = 4
The mesos task ID of the task
optional api.v0.task.ResourceConfig resource = 5
Resource config of the task
uint32 priority = 6
Priority of a task. Higher value takes priority over lower value when making scheduling decisions as well as preemption decisions
bool preemptible = 7
Whether the task is preemptible. If a task is not preemptible, then it will have to be launched using reserved resources.
optional mesos.v1.Labels labels = 8
List of user-defined labels for the task, these are used to enforce the constraint. These are copied from the TaskConfig.
optional api.v0.task.Constraint constraint = 9
Constraint on the labels of the host or tasks on the host that this task should run on. This is copied from the TaskConfig.
TaskType type = 10
Type of the Task
uint32 numPorts = 11
Number of dynamic ports
uint32 minInstances = 12
Minimum number of running instances. Value > 1 indicates task is in scheduling gang of that size; task instanceID is in [0..minInstances-1]. If value <= 1, task is not in scheduling gang and is scheduled singly.
string hostname = 13
Hostname of the host on which the task is running.
bool controller = 14
Whether this is a controller task. A controller is a special batch task which controls other tasks inside a job. E.g. spark driver tasks in a spark job will be a controller task.
double placementTimeoutSeconds = 15
This is the timeout for the placement. It needs to be set to a different timeout value for each placement iteration. It will also be used for communicating with the placement engine in case of host reservation.
double placementRetryCount = 16
Retry count for how many cycles this task has failed to be placed. This is needed for calculating the next backoff period and deciding when we need to do host reservation.
bool revocable = 17
Whether the task is revocable. If true, then it will be launched with usage slack resources. Revocable tasks will be killed by QoS controller, if resources are required by tasks to which initial allocation was done.
string desiredHost = 18
The name of the host where the instance should be running on upon restart. It is used for best effort in-place update/restart. When this field is set upon enqueuegang, the task would directly move to ready queue.
double placementAttemptCount = 19
Number of attempts this task has been retried for placement in a cycle.
bool readyForHostReservation = 20
Flag to indicate the task is ready for host reservation
api.v0.job.PlacementStrategy placementStrategy = 21
Preference for placing tasks of the job on hosts.

Used in: GetTasksByHostsResponse

repeated Task tasks = 1

* TaskType task type definition such as batch, service and infra agent.

Used in: DequeueGangsRequest, GetTasksByHostsRequest, Placement, Task

UNKNOWN = 0
This is the unknown type. It is also used in DequeueGangsRequest to indicate that we want tasks of any task type back.
BATCH = 1
Normal batch task
STATELESS = 2
STATELESS task which is long running and will be restarted upon failures.
STATEFUL = 3
STATEFUL task which is using persistent volume and is long running
DAEMON = 4
DAEMON task which has one instance running on each host for infra agents like muttley, m3collector etc.

Used in: KillTasksResponse.Error

optional api.v0.peloton.TaskID task = 1
string message = 2

UpdateTaskStateEntry is the entry for an UpdateTasksState request; the request will have a list of UpdateTaskStateEntry.

Used in: UpdateTasksStateRequest

optional api.v0.peloton.TaskID task = 1
Peloton Task ID
optional mesos.v1.TaskID mesosTaskId = 2
Mesos task ID for this instance
api.v0.task.TaskState state = 3
Desired state for the resource manager task

package peloton.private.resmgr

service ResourceManagerService

rpc EnqueueGangs (EnqueueGangsRequest, EnqueueGangsResponse)

message EnqueueGangsRequest

optional api.v0.peloton.ResourcePoolID resPool = 1

repeated Gang gangs = 2

string reason = 3

message EnqueueGangsResponse

optional EnqueueGangsResponse.Error error = 1

rpc DequeueGangs (DequeueGangsRequest, DequeueGangsResponse)

message DequeueGangsRequest

uint32 limit = 1

uint32 timeout = 2

TaskType type = 3

message DequeueGangsResponse

optional DequeueGangsResponse.Error error = 1

repeated Gang gangs = 2

rpc SetPlacements (SetPlacementsRequest, SetPlacementsResponse)

message SetPlacementsRequest

repeated Placement placements = 1

repeated SetPlacementsRequest.FailedPlacement failedPlacements = 2

message SetPlacementsResponse

optional SetPlacementsResponse.Error error = 1

rpc GetPlacements (GetPlacementsRequest, GetPlacementsResponse)

message GetPlacementsRequest

uint32 limit = 1

uint32 timeout = 2

message GetPlacementsResponse

optional GetPlacementsResponse.Error error = 1

repeated Placement placements = 2

rpc NotifyTaskUpdates (NotifyTaskUpdatesRequest, NotifyTaskUpdatesResponse)

message NotifyTaskUpdatesRequest

repeated eventstream.Event events = 1

message NotifyTaskUpdatesResponse

optional NotifyTaskUpdatesResponse.Error error = 1

uint64 purgeOffset = 2

rpc GetTasksByHosts (GetTasksByHostsRequest, GetTasksByHostsResponse)

message GetTasksByHostsRequest

repeated string hostnames = 1

TaskType type = 2

message GetTasksByHostsResponse

optional GetTasksByHostsResponse.Error error = 1

map<string, TaskList> hostTasksMap = 2

rpc GetActiveTasks (GetActiveTasksRequest, GetActiveTasksResponse)

message GetActiveTasksRequest

string jobID = 1

string respoolID = 2

repeated string states = 3

message GetActiveTasksResponse

optional GetActiveTasksResponse.Error error = 1

map<string, string> taskStatesMap = 2

map<string, GetActiveTasksResponse.TaskEntries> tasksByState = 4

rpc GetPendingTasks (GetPendingTasksRequest, GetPendingTasksResponse)

message GetPendingTasksRequest

optional api.v0.peloton.ResourcePoolID respoolID = 1

uint32 limit = 2

message GetPendingTasksResponse

map<string, GetPendingTasksResponse.PendingGangs> pendingGangsByQueue = 2

rpc KillTasks (KillTasksRequest, KillTasksResponse)

message KillTasksRequest

repeated api.v0.peloton.TaskID tasks = 1

message KillTasksResponse

repeated KillTasksResponse.Error error = 1

rpc GetPreemptibleTasks (GetPreemptibleTasksRequest, GetPreemptibleTasksResponse)

message GetPreemptibleTasksRequest

uint32 limit = 1

uint32 timeout = 2

message GetPreemptibleTasksResponse

optional GetPreemptibleTasksResponse.Error error = 1

repeated Task tasks = 2

repeated PreemptionCandidate preemptionCandidates = 3

rpc UpdateTasksState (UpdateTasksStateRequest, UpdateTasksStateResponse)

message UpdateTasksStateRequest

repeated UpdateTasksStateRequest.UpdateTaskStateEntry taskStates = 1

message UpdateTasksStateResponse

rpc GetOrphanTasks (GetOrphanTasksRequest, GetOrphanTasksResponse)

message GetOrphanTasksRequest

string respoolID = 1

message GetOrphanTasksResponse

repeated Task orphanTasks = 1