package inference

Get desktop application:
View/edit binary Protocol Buffers messages

@@ @@.. cpp:var:: service InferenceService @@ @@ Inference Server GRPC endpoints. @@

rpc CudaSharedMemoryRegister (CudaSharedMemoryRegisterRequest, CudaSharedMemoryRegisterResponse)
grpc_service.proto:191
@@ .. cpp:var:: rpc CudaSharedMemoryRegister( @@ CudaSharedMemoryRegisterRequest) @@ returns (CudaSharedMemoryRegisterResponse) @@ @@ Register a CUDA-shared-memory region. @@
message CudaSharedMemoryRegisterRequest
grpc_service.proto:1611
@@ @@.. cpp:var:: message CudaSharedMemoryRegisterRequest @@ @@ Request message for CudaSharedMemoryRegister. @@
- string name = 1
  @@ @@ .. cpp:var:: string name @@ @@ The name of the region to register. @@
- bytes raw_handle = 2
  @@ .. cpp:var:: bytes raw_handle @@ @@ The raw serialized cudaIPC handle. @@
- int64 device_id = 3
  @@ .. cpp:var:: int64 device_id @@ @@ The GPU device ID on which the cudaIPC handle was created. @@
- uint64 byte_size = 4
  @@ .. cpp:var:: uint64 byte_size @@ @@ Size of the shared memory block, in bytes. @@
message CudaSharedMemoryRegisterResponse
grpc_service.proto:1644
@@ @@.. cpp:var:: message CudaSharedMemoryRegisterResponse @@ @@ Response message for CudaSharedMemoryRegister. @@
(message has no fields)
rpc CudaSharedMemoryStatus (CudaSharedMemoryStatusRequest, CudaSharedMemoryStatusResponse)
grpc_service.proto:180
@@ .. cpp:var:: rpc CudaSharedMemoryStatus( @@ CudaSharedMemoryStatusRequest) @@ returns (CudaSharedMemoryStatusResponse) @@ @@ Get the status of all registered CUDA-shared-memory regions. @@
message CudaSharedMemoryStatusRequest
grpc_service.proto:1552
@@ @@.. cpp:var:: message CudaSharedMemoryStatusRequest @@ @@ Request message for CudaSharedMemoryStatus. @@
- string name = 1
  @@ @@ .. cpp:var:: string name @@ @@ The name of the region to get status for. If empty the @@ status is returned for all registered regions. @@
message CudaSharedMemoryStatusResponse
grpc_service.proto:1568
@@ @@.. cpp:var:: message CudaSharedMemoryStatusResponse @@ @@ Response message for CudaSharedMemoryStatus. @@
- map<string, CudaSharedMemoryStatusResponse.RegionStatus> regions = 1
  @@ @@ .. cpp:var:: map<string,RegionStatus> regions @@ @@ Status for each of the registered regions, indexed by @@ region name. @@
rpc CudaSharedMemoryUnregister (CudaSharedMemoryUnregisterRequest, CudaSharedMemoryUnregisterResponse)
grpc_service.proto:202
@@ .. cpp:var:: rpc CudaSharedMemoryUnregister( @@ CudaSharedMemoryUnregisterRequest) @@ returns (CudaSharedMemoryUnregisterResponse) @@ @@ Unregister a CUDA-shared-memory region. @@
message CudaSharedMemoryUnregisterRequest
grpc_service.proto:1651
@@ @@.. cpp:var:: message CudaSharedMemoryUnregisterRequest @@ @@ Request message for CudaSharedMemoryUnregister. @@
- string name = 1
  @@ @@ .. cpp:var:: string name @@ @@ The name of the cuda region to unregister. If empty @@ all cuda shared-memory regions are unregistered. @@
message CudaSharedMemoryUnregisterResponse
grpc_service.proto:1667
@@ @@.. cpp:var:: message CudaSharedMemoryUnregisterResponse @@ @@ Response message for CudaSharedMemoryUnregister. @@
(message has no fields)
rpc LogSettings (LogSettingsRequest, LogSettingsResponse)
grpc_service.proto:219
@@ .. cpp:var:: rpc LogSettings(LogSettingsRequest) @@ returns (LogSettingsResponse) @@ @@ Update and get the log settings of the Triton server. @@
message LogSettingsRequest
grpc_service.proto:1744
@@ @@.. cpp:var:: message LogSettingsRequest @@ @@ Request message for LogSettings. @@
- map<string, LogSettingsRequest.SettingValue> settings = 1
  @@ .. cpp:var:: map<string,SettingValue> settings @@ @@ The current log settings. @@
message LogSettingsResponse
grpc_service.proto:1781
@@ @@.. cpp:var:: message LogSettingsResponse @@ @@ Response message for LogSettings. @@
- map<string, LogSettingsResponse.SettingValue> settings = 1
  @@ .. cpp:var:: map<string,SettingValue> settings @@ @@ The current log settings. @@
rpc ModelConfig (ModelConfigRequest, ModelConfigResponse)
grpc_service.proto:100
@@ .. cpp:var:: rpc ModelConfig(ModelConfigRequest) returns @@ (ModelConfigResponse) @@ @@ Get model configuration. @@
message ModelConfigRequest
grpc_service.proto:845
@@ @@.. cpp:var:: message ModelConfigRequest @@ @@ Request message for ModelConfig. @@
- string name = 1
  @@ @@ .. cpp:var:: string name @@ @@ The name of the model. @@
- string version = 2
  @@ .. cpp:var:: string version @@ @@ The version of the model. If not given the model version @@ is selected automatically based on the version policy. @@
message ModelConfigResponse
grpc_service.proto:867
@@ @@.. cpp:var:: message ModelConfigResponse @@ @@ Response message for ModelConfig. @@
- optional ModelConfig config = 1
  @@ @@ .. cpp:var:: ModelConfig config @@ @@ The model configuration. @@
rpc ModelInfer (ModelInferRequest, ModelInferResponse)
grpc_service.proto:83
@@ .. cpp:var:: rpc ModelInfer(ModelInferRequest) returns @@ (ModelInferResponse) @@ @@ Perform inference using a specific model. @@
rpc ModelMetadata (ModelMetadataRequest, ModelMetadataResponse)
grpc_service.proto:76
@@ .. cpp:var:: rpc ModelMetadata(ModelMetadataRequest) returns @@ (ModelMetadataResponse) @@ @@ Get model metadata. @@
message ModelMetadataRequest
grpc_service.proto:344
@@ @@.. cpp:var:: message ModelMetadataRequest @@ @@ Request message for ModelMetadata. @@
- string name = 1
  @@ @@ .. cpp:var:: string name @@ @@ The name of the model. @@
- string version = 2
  @@ .. cpp:var:: string version @@ @@ The version of the model to get metadata for. If not @@ given the server will choose a version based on the @@ model and internal policy. @@
message ModelMetadataResponse
grpc_service.proto:367
@@ @@.. cpp:var:: message ModelMetadataResponse @@ @@ Response message for ModelMetadata. @@
- string name = 1
  @@ @@ .. cpp:var:: string name @@ @@ The model name. @@
- repeated string versions = 2
  @@ @@ .. cpp:var:: string versions (repeated) @@ @@ The versions of the model. @@
- string platform = 3
  @@ @@ .. cpp:var:: string platform @@ @@ The model's platform. @@
- repeated ModelMetadataResponse.TensorMetadata inputs = 4
  @@ @@ .. cpp:var:: TensorMetadata inputs (repeated) @@ @@ The model's inputs. @@
- repeated ModelMetadataResponse.TensorMetadata outputs = 5
  @@ @@ .. cpp:var:: TensorMetadata outputs (repeated) @@ @@ The model's outputs. @@
rpc ModelReady (ModelReadyRequest, ModelReadyResponse)
grpc_service.proto:62
@@ .. cpp:var:: rpc ModelReady(ModelReadyRequest) returns @@ (ModelReadyResponse) @@ @@ Check readiness of a model in the inference server. @@
message ModelReadyRequest
grpc_service.proto:271
@@ @@.. cpp:var:: message ModelReadyRequest @@ @@ Request message for ModelReady. @@
- string name = 1
  @@ @@ .. cpp:var:: string name @@ @@ The name of the model to check for readiness. @@
- string version = 2
  @@ .. cpp:var:: string version @@ @@ The version of the model to check for readiness. If not given the @@ server will choose a version based on the model and internal policy. @@
message ModelReadyResponse
grpc_service.proto:293
@@ @@.. cpp:var:: message ModelReadyResponse @@ @@ Response message for ModelReady. @@
- bool ready = 1
  @@ @@ .. cpp:var:: bool ready @@ @@ True if the model is ready, false if not ready. @@
rpc ModelStatistics (ModelStatisticsRequest, ModelStatisticsResponse)
grpc_service.proto:108
@@ .. cpp:var:: rpc ModelStatistics( @@ ModelStatisticsRequest) @@ returns (ModelStatisticsResponse) @@ @@ Get the cumulative inference statistics for a model. @@
message ModelStatisticsRequest
grpc_service.proto:882
@@ @@.. cpp:var:: message ModelStatisticsRequest @@ @@ Request message for ModelStatistics. @@
- string name = 1
  @@ .. cpp:var:: string name @@ @@ The name of the model. If not given returns statistics for @@ all models. @@
- string version = 2
  @@ .. cpp:var:: string version @@ @@ The version of the model. If not given returns statistics for @@ all model versions. @@
message ModelStatisticsResponse
grpc_service.proto:1227
@@ @@.. cpp:var:: message ModelStatisticsResponse @@ @@ Response message for ModelStatistics. @@
- repeated ModelStatistics model_stats = 1
  @@ .. cpp:var:: ModelStatistics model_stats (repeated) @@ @@ Statistics for each requested model. @@
rpc ModelStreamInfer (stream ModelInferRequest, stream ModelStreamInferResponse)
grpc_service.proto:90
@@ .. cpp:var:: rpc ModelStreamInfer(stream ModelInferRequest) returns @@ (stream ModelStreamInferResponse) @@ @@ Perform streaming inference. @@
message ModelStreamInferResponse
grpc_service.proto:822
@@ @@.. cpp:var:: message ModelStreamInferResponse @@ @@ Response message for ModelStreamInfer. @@
- string error_message = 1
  @@ @@ .. cpp:var:: string error_message @@ @@ The message describing the error. The empty message @@ indicates the inference was successful without errors. @@
- optional ModelInferResponse infer_response = 2
  @@ @@ .. cpp:var:: ModelInferResponse infer_response @@ @@ Holds the results of the request. @@
rpc RepositoryIndex (RepositoryIndexRequest, RepositoryIndexResponse)
grpc_service.proto:117
@@ .. cpp:var:: rpc RepositoryIndex(RepositoryIndexRequest) returns @@ (RepositoryIndexResponse) @@ @@ Get the index of model repository contents. @@
message RepositoryIndexRequest
grpc_service.proto:1281
@@ @@.. cpp:var:: message RepositoryIndexRequest @@ @@ Request message for RepositoryIndex. @@
- string repository_name = 1
  @@ .. cpp:var:: string repository_name @@ @@ The name of the repository. If empty the index is returned @@ for all repositories. @@
- bool ready = 2
  @@ .. cpp:var:: bool ready @@ @@ If true return only models currently ready for inferencing. @@
message RepositoryIndexResponse
grpc_service.proto:1302
@@ @@.. cpp:var:: message RepositoryIndexResponse @@ @@ Response message for RepositoryIndex. @@
- repeated RepositoryIndexResponse.ModelIndex models = 1
  @@ @@ .. cpp:var:: ModelIndex models (repeated) @@ @@ An index entry for each model. @@
rpc RepositoryModelLoad (RepositoryModelLoadRequest, RepositoryModelLoadResponse)
grpc_service.proto:126
@@ .. cpp:var:: rpc RepositoryModelLoad(RepositoryModelLoadRequest) returns @@ (RepositoryModelLoadResponse) @@ @@ Load or reload a model from a repository. @@
message RepositoryModelLoadRequest
grpc_service.proto:1352
@@ @@.. cpp:var:: message RepositoryModelLoadRequest @@ @@ Request message for RepositoryModelLoad. @@
- string repository_name = 1
  @@ .. cpp:var:: string repository_name @@ @@ The name of the repository to load from. If empty the model @@ is loaded from any repository. @@
- string model_name = 2
  @@ .. cpp:var:: string model_name @@ @@ The name of the model to load, or reload. @@
- map<string, ModelRepositoryParameter> parameters = 3
  @@ .. cpp:var:: map<string,ModelRepositoryParameter> parameters @@ @@ Optional model repository request parameters. @@
message RepositoryModelLoadResponse
grpc_service.proto:1379
@@ @@.. cpp:var:: message RepositoryModelLoadResponse @@ @@ Response message for RepositoryModelLoad. @@
(message has no fields)
rpc RepositoryModelUnload (RepositoryModelUnloadRequest, RepositoryModelUnloadResponse)
grpc_service.proto:136
@@ .. cpp:var:: rpc RepositoryModelUnload(RepositoryModelUnloadRequest) @@ returns (RepositoryModelUnloadResponse) @@ @@ Unload a model. @@
message RepositoryModelUnloadRequest
grpc_service.proto:1386
@@ @@.. cpp:var:: message RepositoryModelUnloadRequest @@ @@ Request message for RepositoryModelUnload. @@
- string repository_name = 1
  @@ .. cpp:var:: string repository_name @@ @@ The name of the repository from which the model was originally @@ loaded. If empty the repository is not considered. @@
- string model_name = 2
  @@ .. cpp:var:: string model_name @@ @@ The name of the model to unload. @@
- map<string, ModelRepositoryParameter> parameters = 3
  @@ .. cpp:var:: map<string,ModelRepositoryParameter> parameters @@ @@ Optional model repository request parameters. @@
message RepositoryModelUnloadResponse
grpc_service.proto:1413
@@ @@.. cpp:var:: message RepositoryModelUnloadResponse @@ @@ Response message for RepositoryModelUnload. @@
(message has no fields)
rpc ServerLive (ServerLiveRequest, ServerLiveResponse)
grpc_service.proto:48
@@ .. cpp:var:: rpc ServerLive(ServerLiveRequest) returns @@ (ServerLiveResponse) @@ @@ Check liveness of the inference server. @@
message ServerLiveRequest
grpc_service.proto:227
@@ @@.. cpp:var:: message ServerLiveRequest @@ @@ Request message for ServerLive. @@
(message has no fields)
message ServerLiveResponse
grpc_service.proto:234
@@ @@.. cpp:var:: message ServerLiveResponse @@ @@ Response message for ServerLive. @@
- bool live = 1
  @@ @@ .. cpp:var:: bool live @@ @@ True if the inference server is live, false if not live. @@
rpc ServerMetadata (ServerMetadataRequest, ServerMetadataResponse)
grpc_service.proto:69
@@ .. cpp:var:: rpc ServerMetadata(ServerMetadataRequest) returns @@ (ServerMetadataResponse) @@ @@ Get server metadata. @@
message ServerMetadataRequest
grpc_service.proto:308
@@ @@.. cpp:var:: message ServerMetadataRequest @@ @@ Request message for ServerMetadata. @@
(message has no fields)
message ServerMetadataResponse
grpc_service.proto:315
@@ @@.. cpp:var:: message ServerMetadataResponse @@ @@ Response message for ServerMetadata. @@
- string name = 1
  @@ @@ .. cpp:var:: string name @@ @@ The server name. @@
- string version = 2
  @@ @@ .. cpp:var:: string version @@ @@ The server version. @@
- repeated string extensions = 3
  @@ @@ .. cpp:var:: string extensions (repeated) @@ @@ The extensions supported by the server. @@
rpc ServerReady (ServerReadyRequest, ServerReadyResponse)
grpc_service.proto:55
@@ .. cpp:var:: rpc ServerReady(ServerReadyRequest) returns @@ (ServerReadyResponse) @@ @@ Check readiness of the inference server. @@
message ServerReadyRequest
grpc_service.proto:249
@@ @@.. cpp:var:: message ServerReadyRequest @@ @@ Request message for ServerReady. @@
(message has no fields)
message ServerReadyResponse
grpc_service.proto:256
@@ @@.. cpp:var:: message ServerReadyResponse @@ @@ Response message for ServerReady. @@
- bool ready = 1
  @@ @@ .. cpp:var:: bool ready @@ @@ True if the inference server is ready, false if not ready. @@
rpc SystemSharedMemoryRegister (SystemSharedMemoryRegisterRequest, SystemSharedMemoryRegisterResponse)
grpc_service.proto:158
@@ .. cpp:var:: rpc SystemSharedMemoryRegister( @@ SystemSharedMemoryRegisterRequest) @@ returns (SystemSharedMemoryRegisterResponse) @@ @@ Register a system-shared-memory region. @@
message SystemSharedMemoryRegisterRequest
grpc_service.proto:1487
@@ @@.. cpp:var:: message SystemSharedMemoryRegisterRequest @@ @@ Request message for SystemSharedMemoryRegister. @@
- string name = 1
  @@ @@ .. cpp:var:: string name @@ @@ The name of the region to register. @@
- string key = 2
  @@ .. cpp:var:: string key @@ @@ The key of the underlying memory object that contains the @@ shared memory region. @@
- uint64 offset = 3
  @@ .. cpp:var:: uint64 offset @@ @@ Offset, in bytes, within the underlying memory object to @@ the start of the shared memory region. @@
- uint64 byte_size = 4
  @@ .. cpp:var:: uint64 byte_size @@ @@ Size of the shared memory region, in bytes. @@
message SystemSharedMemoryRegisterResponse
grpc_service.proto:1522
@@ @@.. cpp:var:: message SystemSharedMemoryRegisterResponse @@ @@ Response message for SystemSharedMemoryRegister. @@
(message has no fields)
rpc SystemSharedMemoryStatus (SystemSharedMemoryStatusRequest, SystemSharedMemoryStatusResponse)
grpc_service.proto:147
@@ .. cpp:var:: rpc SystemSharedMemoryStatus( @@ SystemSharedMemoryStatusRequest) @@ returns (SystemSharedMemoryStatusResponse) @@ @@ Get the status of all registered system-shared-memory regions. @@
message SystemSharedMemoryStatusRequest
grpc_service.proto:1420
@@ @@.. cpp:var:: message SystemSharedMemoryStatusRequest @@ @@ Request message for SystemSharedMemoryStatus. @@
- string name = 1
  @@ @@ .. cpp:var:: string name @@ @@ The name of the region to get status for. If empty the @@ status is returned for all registered regions. @@
message SystemSharedMemoryStatusResponse
grpc_service.proto:1436
@@ @@.. cpp:var:: message SystemSharedMemoryStatusResponse @@ @@ Response message for SystemSharedMemoryStatus. @@
- map<string, SystemSharedMemoryStatusResponse.RegionStatus> regions = 1
  @@ @@ .. cpp:var:: map<string,RegionStatus> regions @@ @@ Status for each of the registered regions, indexed by @@ region name. @@
rpc SystemSharedMemoryUnregister (SystemSharedMemoryUnregisterRequest, SystemSharedMemoryUnregisterResponse)
grpc_service.proto:169
@@ .. cpp:var:: rpc SystemSharedMemoryUnregister( @@ SystemSharedMemoryUnregisterRequest) @@ returns (SystemSharedMemoryUnregisterResponse) @@ @@ Unregister a system-shared-memory region. @@
message SystemSharedMemoryUnregisterRequest
grpc_service.proto:1529
@@ @@.. cpp:var:: message SystemSharedMemoryUnregisterRequest @@ @@ Request message for SystemSharedMemoryUnregister. @@
- string name = 1
  @@ @@ .. cpp:var:: string name @@ @@ The name of the system region to unregister. If empty @@ all system shared-memory regions are unregistered. @@
message SystemSharedMemoryUnregisterResponse
grpc_service.proto:1545
@@ @@.. cpp:var:: message SystemSharedMemoryUnregisterResponse @@ @@ Response message for SystemSharedMemoryUnregister. @@
(message has no fields)
rpc TraceSetting (TraceSettingRequest, TraceSettingResponse)
grpc_service.proto:212
@@ .. cpp:var:: rpc TraceSetting(TraceSettingRequest) @@ returns (TraceSettingResponse) @@ @@ Update and get the trace setting of the Triton server. @@
message TraceSettingRequest
grpc_service.proto:1674
@@ @@.. cpp:var:: message TraceSettingRequest @@ @@ Request message for TraceSetting. @@
- map<string, TraceSettingRequest.SettingValue> settings = 1
  @@ .. cpp:var:: map<string,SettingValue> settings @@ @@ The new setting values to be updated, @@ settings that are not specified will remain unchanged. @@
- string model_name = 2
  @@ @@ .. cpp:var:: string model_name @@ @@ The name of the model to apply the new trace settings. @@ If not given, the new settings will be applied globally. @@
message TraceSettingResponse
grpc_service.proto:1714
@@ @@.. cpp:var:: message TraceSettingResponse @@ @@ Response message for TraceSetting. @@
- map<string, TraceSettingResponse.SettingValue> settings = 1
  @@ .. cpp:var:: map<string,SettingValue> settings @@ @@ The current trace settings, including any changes specified @@ by TraceSettingRequest. @@

@@ .. cpp:var:: message BatchInput @@ @@ A batch input is an additional input that must be added by @@ the backend based on all the requests in a batch. @@

Used in: ModelConfig

BatchInput.Kind kind = 1
@@ .. cpp:var:: Kind kind @@ @@ The kind of this batch input. @@
repeated string target_name = 2
@@ .. cpp:var:: string target_name (repeated) @@ @@ The name of the model inputs that the backend will create @@ for this batch input. @@
DataType data_type = 3
@@ .. cpp:var:: DataType data_type @@ @@ The input's datatype. The data type can be TYPE_INT32 or @@ TYPE_FP32. @@
repeated string source_input = 4
@@ .. cpp:var:: string source_input (repeated) @@ @@ The backend derives the value for each batch input from one or @@ more other inputs. 'source_input' gives the names of those @@ inputs. @@

@@ @@ .. cpp:enum:: Kind @@ @@ The kind of the batch input. @@

Used in: BatchInput

BATCH_ELEMENT_COUNT = 0
@@ .. cpp:enumerator:: Kind::BATCH_ELEMENT_COUNT = 0 @@ @@ The element count of the 'source_input' will be added as @@ input with shape [1]. @@
BATCH_ACCUMULATED_ELEMENT_COUNT = 1
@@ .. cpp:enumerator:: Kind::BATCH_ACCUMULATED_ELEMENT_COUNT = 1 @@ @@ The accumulated element count of the 'source_input' will be @@ added as input with shape [1]. For example, if there is a @@ batch of two requests, each with 2 elements, an input of value @@ 2 will be added to the first request, and an input of value @@ 4 will be added to the second request. @@
BATCH_ACCUMULATED_ELEMENT_COUNT_WITH_ZERO = 2
@@ .. cpp:enumerator:: @@ Kind::BATCH_ACCUMULATED_ELEMENT_COUNT_WITH_ZERO = 2 @@ @@ The accumulated element count of the 'source_input' will be @@ added as input with shape [1], except for the first request @@ in the batch. For the first request in the batch, the input @@ will have shape [2] where the first element is value 0. @@
BATCH_MAX_ELEMENT_COUNT_AS_SHAPE = 3
@@ .. cpp:enumerator:: Kind::BATCH_MAX_ELEMENT_COUNT_AS_SHAPE = 3 @@ @@ Among the requests in the batch, the max element count of the @@ 'source_input' will be added as input with shape @@ [max_element_count] for the first request in the batch. @@ For other requests, such input will be with shape [0]. @@ The data of the tensor will be uninitialized. @@
BATCH_ITEM_SHAPE = 4
@@ .. cpp:enumerator:: Kind::BATCH_ITEM_SHAPE = 4 @@ @@ Among the requests in the batch, the shape of the @@ 'source_input' will be added as input with shape @@ [batch_size, len(input_dim)]. For example, if one @@ batch-2 input with shape [3, 1] and batch-1 input @@ with shape [2, 2] are batched, the batch input will @@ have shape [3, 2] and value [ [3, 1], [3, 1], [2, 2]]. @@
BATCH_ITEM_SHAPE_FLATTEN = 5
@@ .. cpp:enumerator:: Kind::BATCH_ITEM_SHAPE_FLATTEN = 5 @@ @@ Among the requests in the batch, the shape of the @@ 'source_input' will be added as input with single dimensional @@ shape [batch_size * len(input_dim)]. For example, if one @@ batch-2 input with shape [3, 1] and batch-1 input @@ with shape [2, 2] are batched, the batch input will @@ have shape [6] and value [3, 1, 3, 1, 2, 2]. @@

@@.. cpp:var:: message BatchOutput @@ @@ A batch output is an output produced by the model that must be handled @@ differently by the backend based on all the requests in a batch. @@

Used in: ModelConfig

repeated string target_name = 1
@@ .. cpp:var:: string target_name (repeated) @@ @@ The name of the outputs to be produced by this batch output @@ specification. @@
BatchOutput.Kind kind = 2
@@ .. cpp:var:: Kind kind @@ @@ The kind of this batch output. @@
repeated string source_input = 3
@@ .. cpp:var:: string source_input (repeated) @@ @@ The backend derives each batch output from one or more inputs. @@ 'source_input' gives the names of those inputs. @@

@@ @@ .. cpp:enum:: Kind @@ @@ The kind of the batch output. @@

Used in: BatchOutput

BATCH_SCATTER_WITH_INPUT_SHAPE = 0
@@ .. cpp:enumerator:: Kind::BATCH_SCATTER_WITH_INPUT_SHAPE = 0 @@ @@ The output should be scattered according to the shape of @@ 'source_input'. The dynamic dimension of the output will @@ be set to the value of the same dimension in the input. @@

@@ @@ .. cpp:var:: message RegionStatus @@ @@ Status for a shared memory region. @@

Used in: CudaSharedMemoryStatusResponse

string name = 1
@@ @@ .. cpp:var:: string name @@ @@ The name for the shared memory region. @@
uint64 device_id = 2
@@ .. cpp:var:: uint64 device_id @@ @@ The GPU device ID where the cudaIPC handle was created. @@
uint64 byte_size = 3
@@ .. cpp:var:: uint64 byte_size @@ @@ Size of the shared memory region, in bytes. @@

@@ @@.. cpp:enum:: DataType @@ @@ Data types supported for input and output tensors. @@

Used in: BatchInput, ModelInput, ModelOutput, ModelSequenceBatching.Control, ModelSequenceBatching.InitialState, ModelSequenceBatching.State, ModelWarmup.Input

TYPE_INVALID = 0
@@ .. cpp:enumerator:: DataType::TYPE_INVALID = 0
TYPE_BOOL = 1
@@ .. cpp:enumerator:: DataType::TYPE_BOOL = 1
TYPE_UINT8 = 2
@@ .. cpp:enumerator:: DataType::TYPE_UINT8 = 2
TYPE_UINT16 = 3
@@ .. cpp:enumerator:: DataType::TYPE_UINT16 = 3
TYPE_UINT32 = 4
@@ .. cpp:enumerator:: DataType::TYPE_UINT32 = 4
TYPE_UINT64 = 5
@@ .. cpp:enumerator:: DataType::TYPE_UINT64 = 5
TYPE_INT8 = 6
@@ .. cpp:enumerator:: DataType::TYPE_INT8 = 6
TYPE_INT16 = 7
@@ .. cpp:enumerator:: DataType::TYPE_INT16 = 7
TYPE_INT32 = 8
@@ .. cpp:enumerator:: DataType::TYPE_INT32 = 8
TYPE_INT64 = 9
@@ .. cpp:enumerator:: DataType::TYPE_INT64 = 9
TYPE_FP16 = 10
@@ .. cpp:enumerator:: DataType::TYPE_FP16 = 10
TYPE_FP32 = 11
@@ .. cpp:enumerator:: DataType::TYPE_FP32 = 11
TYPE_FP64 = 12
@@ .. cpp:enumerator:: DataType::TYPE_FP64 = 12
TYPE_STRING = 13
@@ .. cpp:enumerator:: DataType::TYPE_STRING = 13
TYPE_BF16 = 14
@@ .. cpp:enumerator:: DataType::TYPE_BF16 = 14

@@ @@.. cpp:var:: message InferBatchStatistics @@ @@ Inference batch statistics. @@

Used in: ModelStatistics

uint64 batch_size = 1
@@ .. cpp:var:: uint64 batch_size @@ @@ The size of the batch. @@
optional StatisticDuration compute_input = 2
@@ .. cpp:var:: StatisticDuration compute_input @@ @@ The count and cumulative duration to prepare input tensor data as @@ required by the model framework / backend with the given batch size. @@ For example, this duration should include the time to copy input @@ tensor data to the GPU. @@
optional StatisticDuration compute_infer = 3
@@ .. cpp:var:: StatisticDuration compute_infer @@ @@ The count and cumulative duration to execute the model with the given @@ batch size. @@
optional StatisticDuration compute_output = 4
@@ .. cpp:var:: StatisticDuration compute_output @@ @@ The count and cumulative duration to extract output tensor data @@ produced by the model framework / backend with the given batch size. @@ For example, this duration should include the time to copy output @@ tensor data from the GPU. @@

@@ @@.. cpp:var:: message InferParameter @@ @@ An inference parameter value. @@

Used in: ModelInferRequest, ModelInferRequest.InferInputTensor, ModelInferRequest.InferRequestedOutputTensor, ModelInferResponse, ModelInferResponse.InferOutputTensor

oneof parameter_choice
@@ .. cpp:var:: oneof parameter_choice @@ @@ The parameter value can be a string, an int64, @@ a uint64, a double, or a boolean. @@ @@ Note: double and uint64 are currently @@ placeholders for future use and @@ are not supported for custom parameters. @@
- bool bool_param = 1
  @@ .. cpp:var:: bool bool_param @@ @@ A boolean parameter value. @@
- int64 int64_param = 2
  @@ .. cpp:var:: int64 int64_param @@ @@ An int64 parameter value. @@
- string string_param = 3
  @@ .. cpp:var:: string string_param @@ @@ A string parameter value. @@
- double double_param = 4
  @@ .. cpp:var:: double double_param @@ @@ A double parameter value. @@
- uint64 uint64_param = 5
  @@ .. cpp:var:: uint64 uint64_param @@ @@ A uint64 parameter value. @@ @@ Not supported for custom parameters @@

@@ @@.. cpp:var:: message InferResponseStatistics @@ @@ Statistics per response. @@

Used in: ModelStatistics

optional StatisticDuration compute_infer = 1
@@ .. cpp:var:: StatisticDuration compute_infer @@ @@ The count and cumulative duration to compute a response. @@
optional StatisticDuration compute_output = 2
@@ .. cpp:var:: StatisticDuration compute_output @@ @@ The count and cumulative duration to extract the output tensors of a @@ response. @@
optional StatisticDuration success = 3
@@ .. cpp:var:: StatisticDuration success @@ @@ The count and cumulative duration for successful responses. @@
optional StatisticDuration fail = 4
@@ .. cpp:var:: StatisticDuration fail @@ @@ The count and cumulative duration for failed responses. @@
optional StatisticDuration empty_response = 5
@@ .. cpp:var:: StatisticDuration empty_response @@ @@ The count and cumulative duration for empty responses. @@
optional StatisticDuration cancel = 6
@@ .. cpp:var:: StatisticDuration cancel @@ @@ The count and cumulative duration, for cleaning up resources held by @@ a cancelled request, for cancelled responses. @@

@@ @@.. cpp:var:: message InferStatistics @@ @@ Inference statistics. @@

Used in: ModelStatistics

optional StatisticDuration success = 1
@@ .. cpp:var:: StatisticDuration success @@ @@ Cumulative count and duration for successful inference @@ requests. The "success" count and cumulative duration includes @@ cache hits. @@
optional StatisticDuration fail = 2
@@ .. cpp:var:: StatisticDuration fail @@ @@ Cumulative count and duration for failed inference @@ requests. @@
optional StatisticDuration queue = 3
@@ .. cpp:var:: StatisticDuration queue @@ @@ The count and cumulative duration that inference requests wait in @@ scheduling or other queues. The "queue" count and cumulative @@ duration includes cache hits. @@
optional StatisticDuration compute_input = 4
@@ .. cpp:var:: StatisticDuration compute_input @@ @@ The count and cumulative duration to prepare input tensor data as @@ required by the model framework / backend. For example, this duration @@ should include the time to copy input tensor data to the GPU. @@ The "compute_input" count and cumulative duration do not account for @@ requests that were a cache hit. See the "cache_hit" field for more @@ info. @@
optional StatisticDuration compute_infer = 5
@@ .. cpp:var:: StatisticDuration compute_infer @@ @@ The count and cumulative duration to execute the model. @@ The "compute_infer" count and cumulative duration do not account for @@ requests that were a cache hit. See the "cache_hit" field for more @@ info. @@
optional StatisticDuration compute_output = 6
@@ .. cpp:var:: StatisticDuration compute_output @@ @@ The count and cumulative duration to extract output tensor data @@ produced by the model framework / backend. For example, this duration @@ should include the time to copy output tensor data from the GPU. @@ The "compute_output" count and cumulative duration do not account for @@ requests that were a cache hit. See the "cache_hit" field for more @@ info. @@
optional StatisticDuration cache_hit = 7
@@ .. cpp:var:: StatisticDuration cache_hit @@ @@ The count of response cache hits and cumulative duration to look up @@ and extract output tensor data from the Response Cache on a cache @@ hit. For example, this duration should include the time to copy @@ output tensor data from the Response Cache to the response object. @@ On cache hits, Triton does not need to go to the model/backend @@ for the output tensor data, so the "compute_input", "compute_infer", @@ and "compute_output" fields are not updated. Assuming the response @@ cache is enabled for a given model, a cache hit occurs for a @@ request to that model when the request metadata (model name, @@ model version, model inputs) hashes to an existing entry in the @@ cache. On a cache miss, the request hash and response output tensor @@ data are added to the cache. See response cache docs for more info: @@ https://github.com/triton-inference-server/server/blob/main/docs/response_cache.md @@
optional StatisticDuration cache_miss = 8
@@ .. cpp:var:: StatisticDuration cache_miss @@ @@ The count of response cache misses and cumulative duration to lookup @@ and insert output tensor data from the computed response to the cache. @@ For example, this duration should include the time to copy @@ output tensor data from the response object to the Response Cache. @@ Assuming the response cache is enabled for a given model, a cache @@ miss occurs for a request to that model when the request metadata @@ does NOT hash to an existing entry in the cache. See the response @@ cache docs for more info: @@ https://github.com/triton-inference-server/server/blob/main/docs/response_cache.md @@

message InferTensorContents

grpc_service.proto:494

@@ @@.. cpp:var:: message InferTensorContents @@ @@ The data contained in a tensor represented by the repeated type @@ that matches the tensor's data type. Protobuf oneof is not used @@ because oneofs cannot contain repeated fields. @@

Used in: ModelInferRequest.InferInputTensor, ModelInferResponse.InferOutputTensor

repeated bool bool_contents = 1
@@ @@ .. cpp:var:: bool bool_contents (repeated) @@ @@ Representation for BOOL data type. The size must match what is @@ expected by the tensor's shape. The contents must be the flattened, @@ one-dimensional, row-major order of the tensor elements. @@
repeated int32 int_contents = 2
@@ @@ .. cpp:var:: int32 int_contents (repeated) @@ @@ Representation for INT8, INT16, and INT32 data types. The size @@ must match what is expected by the tensor's shape. The contents @@ must be the flattened, one-dimensional, row-major order of the @@ tensor elements. @@
repeated int64 int64_contents = 3
@@ @@ .. cpp:var:: int64 int64_contents (repeated) @@ @@ Representation for INT64 data types. The size must match what @@ is expected by the tensor's shape. The contents must be the @@ flattened, one-dimensional, row-major order of the tensor elements. @@
repeated uint32 uint_contents = 4
@@ @@ .. cpp:var:: uint32 uint_contents (repeated) @@ @@ Representation for UINT8, UINT16, and UINT32 data types. The size @@ must match what is expected by the tensor's shape. The contents @@ must be the flattened, one-dimensional, row-major order of the @@ tensor elements. @@
repeated uint64 uint64_contents = 5
@@ @@ .. cpp:var:: uint64 uint64_contents (repeated) @@ @@ Representation for UINT64 data types. The size must match what @@ is expected by the tensor's shape. The contents must be the @@ flattened, one-dimensional, row-major order of the tensor elements. @@
repeated float fp32_contents = 6
@@ @@ .. cpp:var:: float fp32_contents (repeated) @@ @@ Representation for FP32 data type. The size must match what is @@ expected by the tensor's shape. The contents must be the flattened, @@ one-dimensional, row-major order of the tensor elements. @@
repeated double fp64_contents = 7
@@ @@ .. cpp:var:: double fp64_contents (repeated) @@ @@ Representation for FP64 data type. The size must match what is @@ expected by the tensor's shape. The contents must be the flattened, @@ one-dimensional, row-major order of the tensor elements. @@
repeated bytes bytes_contents = 8
@@ @@ .. cpp:var:: bytes bytes_contents (repeated) @@ @@ Representation for BYTES data type. The size must match what is @@ expected by the tensor's shape. The contents must be the flattened, @@ one-dimensional, row-major order of the tensor elements. @@

Used in: LogSettingsRequest

oneof parameter_choice
- bool bool_param = 1
  @@ .. cpp:var:: bool bool_param @@ @@ A boolean parameter value. @@
- uint32 uint32_param = 2
  @@ .. cpp:var:: uint32 uint32_param @@ @@ A uint32 parameter value. @@
- string string_param = 3
  @@ .. cpp:var:: string string_param @@ @@ A string parameter value. @@

Used in: LogSettingsResponse

oneof parameter_choice
- bool bool_param = 1
  @@ .. cpp:var:: bool bool_param @@ @@ A boolean parameter value. @@
- uint32 uint32_param = 2
  @@ .. cpp:var:: uint32 uint32_param @@ @@ A uint32 parameter value. @@
- string string_param = 3
  @@ .. cpp:var:: string string_param @@ @@ A string parameter value. @@

@@ @@.. cpp:var:: message MemoryUsage @@ @@ Memory usage. @@

Used in: ModelStatistics

string type = 1
@@ .. cpp:var:: string type @@ @@ The type of memory, the value can be "CPU", "CPU_PINNED", "GPU". @@
int64 id = 2
@@ .. cpp:var:: int64 id @@ @@ The id of the memory, typically used with "type" to identify @@ a device that hosts the memory. @@
uint64 byte_size = 3
@@ .. cpp:var:: uint64 byte_size @@ @@ The byte size of the memory. @@

@@ @@.. cpp:var:: message ModelConfig @@ @@ A model configuration. @@

Used in: ModelConfigResponse

string name = 1
@@ .. cpp:var:: string name @@ @@ The name of the model. @@
string platform = 2
@@ .. cpp:var:: string platform @@ @@ Additional backend-specific configuration for the model. @@ Please refer to the backend documentation on whether this field @@ should be specified. @@
string backend = 17
@@ .. cpp:var:: string backend @@ @@ The backend used by the model. @@
string runtime = 25
@@ .. cpp:var:: string runtime @@ @@ The name of the backend library file used by the model. @@
optional ModelVersionPolicy version_policy = 3
@@ .. cpp:var:: ModelVersionPolicy version_policy @@ @@ Policy indicating which version(s) of the model will be served. @@
int32 max_batch_size = 4
@@ .. cpp:var:: int32 max_batch_size @@ @@ Maximum batch size allowed for inference. This can only decrease @@ what is allowed by the model itself. A max_batch_size value of 0 @@ indicates that batching is not allowed for the model and the @@ dimension/shape of the input and output tensors must exactly @@ match what is specified in the input and output configuration. A @@ max_batch_size value > 0 indicates that batching is allowed and @@ so the model expects the input tensors to have an additional @@ initial dimension for the batching that is not specified in the @@ input (for example, if the model supports batched inputs of @@ 2-dimensional tensors then the model configuration will specify @@ the input shape as [ X, Y ] but the model will expect the actual @@ input tensors to have shape [ N, X, Y ]). For max_batch_size > 0 @@ returned outputs will also have an additional initial dimension @@ for the batch. @@
repeated ModelInput input = 5
@@ .. cpp:var:: ModelInput input (repeated) @@ @@ The inputs requested by the model. @@
repeated ModelOutput output = 6
@@ .. cpp:var:: ModelOutput output (repeated) @@ @@ The outputs produced by the model. @@
repeated BatchInput batch_input = 20
@@ .. cpp:var:: BatchInput batch_input (repeated) @@ @@ The model input(s) that the server should use to communicate @@ batch related values to the model. @@
repeated BatchOutput batch_output = 21
@@ .. cpp:var:: BatchOutput batch_output (repeated) @@ @@ The outputs produced by the model that require special handling @@ by the model backend. @@
optional ModelOptimizationPolicy optimization = 12
@@ .. cpp:var:: ModelOptimizationPolicy optimization @@ @@ Optimization configuration for the model. If not specified @@ then default optimization policy is used. @@
oneof scheduling_choice
@@ .. cpp:var:: oneof scheduling_choice @@ @@ The scheduling policy for the model. If not specified the @@ default scheduling policy is used for the model. The default @@ policy is to execute each inference request independently. @@
- ModelDynamicBatching dynamic_batching = 11
  @@ .. cpp:var:: ModelDynamicBatching dynamic_batching @@ @@ If specified, enables the dynamic-batching scheduling @@ policy. With dynamic-batching the scheduler may group @@ together independent requests into a single batch to @@ improve inference throughput. @@
- ModelSequenceBatching sequence_batching = 13
  @@ .. cpp:var:: ModelSequenceBatching sequence_batching @@ @@ If specified, enables the sequence-batching scheduling @@ policy. With sequence-batching, inference requests @@ with the same correlation ID are routed to the same @@ model instance. Multiple sequences of inference requests @@ may be batched together into a single batch to @@ improve inference throughput. @@
- ModelEnsembling ensemble_scheduling = 15
  @@ .. cpp:var:: ModelEnsembling ensemble_scheduling @@ @@ If specified, enables the model-ensembling scheduling @@ policy. With model-ensembling, inference requests @@ will be processed according to the specification, such as an @@ execution sequence of models. The input specified in this model @@ config will be the input for the ensemble, and the output @@ specified will be the output of the ensemble. @@
repeated ModelInstanceGroup instance_group = 7
@@ .. cpp:var:: ModelInstanceGroup instance_group (repeated) @@ @@ Instances of this model. If not specified, one instance @@ of the model will be instantiated on each available GPU. @@
string default_model_filename = 8
@@ .. cpp:var:: string default_model_filename @@ @@ Optional filename of the model file to use if a @@ compute-capability specific model is not specified in @@ :cpp:var:`cc_model_filenames`. If not specified the default name @@ is 'model.graphdef', 'model.savedmodel', 'model.plan' or @@ 'model.pt' depending on the model type. @@
map<string, string> cc_model_filenames = 9
@@ .. cpp:var:: map<string,string> cc_model_filenames @@ @@ Optional map from CUDA compute capability to the filename of @@ the model that supports that compute capability. The filename @@ refers to a file within the model version directory. @@
map<string, string> metric_tags = 10
@@ .. cpp:var:: map<string,string> metric_tags @@ @@ Optional metric tags. User-specific key-value pairs for metrics @@ reported for this model. These tags are applied to the metrics @@ reported on the HTTP metrics port. @@
map<string, ModelParameter> parameters = 14
@@ .. cpp:var:: map<string,ModelParameter> parameters @@ @@ Optional model parameters. User-specified parameter values. @@
repeated ModelWarmup model_warmup = 16
@@ .. cpp:var:: ModelWarmup model_warmup (repeated) @@ @@ Warmup setting of this model. If specified, all instances @@ will be run with the request samples in sequence before @@ serving the model. @@ This field can only be specified if the model is not an ensemble @@ model. @@
optional ModelOperations model_operations = 18
@@ .. cpp:var:: ModelOperations model_operations @@ @@ Optional metadata of the libraries providing custom operations for @@ this model. @@
optional ModelTransactionPolicy model_transaction_policy = 19
@@ .. cpp:var:: ModelTransactionPolicy model_transaction_policy @@ @@ Optional specification that describes the nature of transactions @@ to be expected from the model. @@
optional ModelRepositoryAgents model_repository_agents = 23
@@ .. cpp:var:: ModelRepositoryAgents model_repository_agents @@ @@ Optional specification of the agent(s) that should be invoked @@ when repository actions are performed for this model. @@
optional ModelResponseCache response_cache = 24
@@ .. cpp:var:: ModelResponseCache response_cache @@ @@ Optional setting for utilizing the response cache for this @@ model. @@
optional ModelMetrics model_metrics = 26
@@ .. cpp:var:: ModelMetrics model_metrics @@ @@ Optional setting for custom metrics configuration for this model. @@ Application default is applied to metrics that are not specified. @@

@@ @@.. cpp:var:: message ModelDynamicBatching @@ @@ Dynamic batching configuration. These settings control how dynamic @@ batching operates for the model. @@

Used in: ModelConfig

repeated int32 preferred_batch_size = 1
@@ .. cpp:var:: int32 preferred_batch_size (repeated) @@ @@ Preferred batch sizes for dynamic batching. If a batch of one of @@ these sizes can be formed it will be executed immediately. If @@ not specified a preferred batch size will be chosen automatically @@ based on model and GPU characteristics. @@
uint64 max_queue_delay_microseconds = 2
@@ .. cpp:var:: uint64 max_queue_delay_microseconds @@ @@ The maximum time, in microseconds, a request will be delayed in @@ the scheduling queue to wait for additional requests for @@ batching. Default is 0. @@
bool preserve_ordering = 3
@@ .. cpp:var:: bool preserve_ordering @@ @@ Should the dynamic batcher preserve the ordering of responses to @@ match the order of requests received by the scheduler. Default is @@ false. If true, the responses will be returned in the same order as @@ the order of requests sent to the scheduler. If false, the responses @@ may be returned in arbitrary order. This option is specifically @@ needed when a sequence of related inference requests (i.e. inference @@ requests with the same correlation ID) are sent to the dynamic @@ batcher to ensure that the sequence responses are in the correct @@ order. @@
uint64 priority_levels = 4
@@ .. cpp:var:: uint64 priority_levels @@ @@ The number of priority levels to be enabled for the model, @@ the priority level starts from 1 and 1 is the highest priority. @@ Requests are handled in priority order with all priority 1 requests @@ processed before priority 2, all priority 2 requests processed before @@ priority 3, etc. Requests with the same priority level will be @@ handled in the order that they are received. @@
uint64 default_priority_level = 5
@@ .. cpp:var:: uint64 default_priority_level @@ @@ The priority level used for requests that don't specify their @@ priority. The value must be in the range [ 1, 'priority_levels' ]. @@
optional ModelQueuePolicy default_queue_policy = 6
@@ .. cpp:var:: ModelQueuePolicy default_queue_policy @@ @@ The default queue policy used for requests that don't require @@ priority handling and requests that specify priority levels where @@ there is no specific policy given. If not specified, a policy with @@ default field values will be used. @@
map<uint64, ModelQueuePolicy> priority_queue_policy = 7
@@ .. cpp:var:: map<uint64, ModelQueuePolicy> priority_queue_policy @@ @@ Specify the queue policy for the priority level. The default queue @@ policy will be used if a priority level doesn't specify a queue @@ policy. @@

@@ @@.. cpp:var:: message ModelEnsembling @@ @@ Model ensembling configuration. These settings specify the models that @@ compose the ensemble and how data flows between the models. @@

Used in: ModelConfig

repeated ModelEnsembling.Step step = 1
@@ .. cpp:var:: Step step (repeated) @@ @@ The models and the input / output mappings used within the ensemble. @@
uint32 max_inflight_requests = 2
@@ .. cpp:var:: uint32 max_inflight_requests @@ @@ BETA (Subject to change) @@ The maximum number of concurrent in-flight requests allowed at each @@ ensemble step across all ongoing ensemble requests for this model @@ instance. This per-step limit prevents unbounded memory growth when @@ ensemble steps produce responses faster than downstream steps can @@ consume them (for example, in decoupled models). @@ The default value is 0, which indicates that no limit is enforced. @@ @@ Note: Applying this limit may block upstream steps while they wait @@ for downstream capacity. This blocking does not cancel or internally @@ time out intermediate requests, but clients may experience increased @@ end-to-end latency. @@

@@ .. cpp:var:: message Step @@ @@ Each step specifies a model included in the ensemble, @@ maps ensemble tensor names to the model input tensors, @@ and maps model output tensors to ensemble tensor names @@

Used in: ModelEnsembling

string model_name = 1
@@ .. cpp:var:: string model_name @@ @@ The name of the model to execute for this step of the ensemble. @@
int64 model_version = 2
@@ .. cpp:var:: int64 model_version @@ @@ The version of the model to use for inference. If -1 @@ the latest/most-recent version of the model is used. @@
map<string, string> input_map = 3
@@ .. cpp:var:: map<string,string> input_map @@ @@ Map from name of an input tensor on this step's model to ensemble @@ tensor name. The ensemble tensor must have the same data type and @@ shape as the model input. Each model input must be assigned to @@ one ensemble tensor, but the same ensemble tensor can be assigned @@ to multiple model inputs. @@
map<string, string> output_map = 4
@@ .. cpp:var:: map<string,string> output_map @@ @@ Map from name of an output tensor on this step's model to ensemble @@ tensor name. The data type and shape of the ensemble tensor will @@ be inferred from the model output. It is optional to assign all @@ model outputs to ensemble tensors. One ensemble tensor name @@ can appear in an output map only once. @@
string model_namespace = 5
@@ .. cpp:var:: string model_namespace @@ @@ [RESERVED] currently this field is reserved for internal use, users @@ must not set any value to this field to avoid unexpected behavior. @@

@@ @@.. cpp:var:: message ModelInferRequest @@ @@ Request message for ModelInfer. @@

Used as request type in: GRPCInferenceService.ModelInfer, GRPCInferenceService.ModelStreamInfer

string model_name = 1
@@ .. cpp:var:: string model_name @@ @@ The name of the model to use for inferencing. @@
string model_version = 2
@@ .. cpp:var:: string model_version @@ @@ The version of the model to use for inference. If not @@ given the latest/most-recent version of the model is used. @@
string id = 3
@@ .. cpp:var:: string id @@ @@ Optional identifier for the request. If specified will be @@ returned in the response. @@
map<string, InferParameter> parameters = 4
@@ .. cpp:var:: map<string,InferParameter> parameters @@ @@ Optional inference parameters. @@
repeated ModelInferRequest.InferInputTensor inputs = 5
@@ @@ .. cpp:var:: InferInputTensor inputs (repeated) @@ @@ The input tensors for the inference. @@
repeated ModelInferRequest.InferRequestedOutputTensor outputs = 6
@@ @@ .. cpp:var:: InferRequestedOutputTensor outputs (repeated) @@ @@ The requested output tensors for the inference. Optional, if not @@ specified all outputs specified in the model config will be @@ returned. @@
repeated bytes raw_input_contents = 7
@@ @@ .. cpp:var:: bytes raw_input_contents @@ @@ The data contained in an input tensor can be represented in @@ "raw" bytes form or in the repeated type that matches the @@ tensor's data type. Using the "raw" bytes form will @@ typically allow higher performance due to the way protobuf @@ allocation and reuse interacts with GRPC. For example, see @@ https://github.com/grpc/grpc/issues/23231. @@ @@ To use the raw representation 'raw_input_contents' must be @@ initialized with data for each tensor in the same order as @@ 'inputs'. For each tensor, the size of this content must @@ match what is expected by the tensor's shape and data @@ type. The raw data must be the flattened, one-dimensional, @@ row-major order of the tensor elements without any stride @@ or padding between the elements. Note that the FP16 and BF16 data @@ types must be represented as raw content as there is no @@ specific data type for a 16-bit float type. @@ @@ If this field is specified then InferInputTensor::contents @@ must not be specified for any input tensor. @@

@@ @@ .. cpp:var:: message InferInputTensor @@ @@ An input tensor for an inference request. @@

Used in: ModelInferRequest

string name = 1
@@ @@ .. cpp:var:: string name @@ @@ The tensor name. @@
string datatype = 2
@@ @@ .. cpp:var:: string datatype @@ @@ The tensor data type. @@
repeated int64 shape = 3
@@ @@ .. cpp:var:: int64 shape (repeated) @@ @@ The tensor shape. @@
map<string, InferParameter> parameters = 4
@@ .. cpp:var:: map<string,InferParameter> parameters @@ @@ Optional inference input tensor parameters. @@
optional InferTensorContents contents = 5
@@ .. cpp:var:: InferTensorContents contents @@ @@ The tensor contents using a data-type format. This field @@ must not be specified if tensor contents are being specified @@ in ModelInferRequest.raw_input_contents. @@

@@ @@ .. cpp:var:: message InferRequestedOutputTensor @@ @@ An output tensor requested for an inference request. @@

Used in: ModelInferRequest

string name = 1
@@ @@ .. cpp:var:: string name @@ @@ The tensor name. @@
map<string, InferParameter> parameters = 2
@@ .. cpp:var:: map<string,InferParameter> parameters @@ @@ Optional requested output tensor parameters. @@

@@ @@.. cpp:var:: message ModelInferResponse @@ @@ Response message for ModelInfer. @@

Used as response type in: GRPCInferenceService.ModelInfer

Used as field type in: ModelStreamInferResponse

string model_name = 1
@@ .. cpp:var:: string model_name @@ @@ The name of the model used for inference. @@
string model_version = 2
@@ .. cpp:var:: string model_version @@ @@ The version of the model used for inference. @@
string id = 3
@@ .. cpp:var:: string id @@ @@ The id of the inference request if one was specified. @@
map<string, InferParameter> parameters = 4
@@ .. cpp:var:: map<string,InferParameter> parameters @@ @@ Optional inference response parameters. @@
repeated ModelInferResponse.InferOutputTensor outputs = 5
@@ @@ .. cpp:var:: InferOutputTensor outputs (repeated) @@ @@ The output tensors holding inference results. @@
repeated bytes raw_output_contents = 6
@@ @@ .. cpp:var:: bytes raw_output_contents @@ @@ The data contained in an output tensor can be represented in @@ "raw" bytes form or in the repeated type that matches the @@ tensor's data type. Using the "raw" bytes form will @@ typically allow higher performance due to the way protobuf @@ allocation and reuse interacts with GRPC. For example, see @@ https://github.com/grpc/grpc/issues/23231. @@ @@ To use the raw representation 'raw_output_contents' must be @@ initialized with data for each tensor in the same order as @@ 'outputs'. For each tensor, the size of this content must @@ match what is expected by the tensor's shape and data @@ type. The raw data must be the flattened, one-dimensional, @@ row-major order of the tensor elements without any stride @@ or padding between the elements. Note that the FP16 and BF16 data @@ types must be represented as raw content as there is no @@ specific data type for a 16-bit float type. @@ @@ If this field is specified then InferOutputTensor::contents @@ must not be specified for any output tensor. @@

@@ @@ .. cpp:var:: message InferOutputTensor @@ @@ An output tensor returned for an inference request. @@

Used in: ModelInferResponse

string name = 1
@@ @@ .. cpp:var:: string name @@ @@ The tensor name. @@
string datatype = 2
@@ @@ .. cpp:var:: string datatype @@ @@ The tensor data type. @@
repeated int64 shape = 3
@@ @@ .. cpp:var:: int64 shape (repeated) @@ @@ The tensor shape. @@
map<string, InferParameter> parameters = 4
@@ .. cpp:var:: map<string,InferParameter> parameters @@ @@ Optional output tensor parameters. @@
optional InferTensorContents contents = 5
@@ .. cpp:var:: InferTensorContents contents @@ @@ The tensor contents using a data-type format. This field @@ must not be specified if tensor contents are being specified @@ in ModelInferResponse.raw_output_contents. @@

@@ @@.. cpp:var:: message ModelInput @@ @@ An input required by the model. @@

Used in: ModelConfig

string name = 1
@@ .. cpp:var:: string name @@ @@ The name of the input. @@
DataType data_type = 2
@@ .. cpp:var:: DataType data_type @@ @@ The data-type of the input. @@
ModelInput.Format format = 3
@@ .. cpp:var:: Format format @@ @@ The format of the input. Optional. @@
repeated int64 dims = 4
@@ .. cpp:var:: int64 dims (repeated) @@ @@ The dimensions/shape of the input tensor that must be provided @@ when invoking the inference API for this model. @@
optional ModelTensorReshape reshape = 5
@@ .. cpp:var:: ModelTensorReshape reshape @@ @@ The shape expected for this input by the backend. The input will @@ be reshaped to this before being presented to the backend. The @@ reshape must have the same number of elements as the input shape @@ specified by 'dims'. Optional. @@
bool is_shape_tensor = 6
@@ .. cpp:var:: bool is_shape_tensor @@ @@ Whether or not the input is a shape tensor to the model. This field @@ is currently supported only for the TensorRT model. An error will be @@ generated if this specification does not comply with underlying @@ model. @@
bool allow_ragged_batch = 7
@@ .. cpp:var:: bool allow_ragged_batch @@ @@ Whether or not the input is allowed to be "ragged" in a dynamically @@ created batch. Default is false indicating that two requests will @@ only be batched if this tensor has the same shape in both requests. @@ True indicates that two requests can be batched even if this tensor @@ has a different shape in each request. @@
bool optional = 8
@@ .. cpp:var:: bool optional @@ @@ Whether or not the input is optional for the model execution. @@ If true, the input is not required in the inference request. @@ Default value is false. @@
bool is_non_linear_format_io = 9
@@ .. cpp:var:: bool is_non_linear_format_io @@ @@ Indicates whether the input tensor uses a non-linear IO format. This @@ field is currently supported only for TensorRT models. An error will @@ be generated if this specification does not comply with the @@ underlying model. @@

@@ @@ .. cpp:enum:: Format @@ @@ The format for the input. @@

Used in: ModelInput

FORMAT_NONE = 0
@@ .. cpp:enumerator:: Format::FORMAT_NONE = 0 @@ @@ The input has no specific format. This is the default. @@
FORMAT_NHWC = 1
@@ .. cpp:enumerator:: Format::FORMAT_NHWC = 1 @@ @@ HWC image format. Tensors with this format require 3 dimensions @@ if the model does not support batching (max_batch_size = 0) or 4 @@ dimensions if the model does support batching (max_batch_size @@ >= 1). In either case the 'dims' below should only specify the @@ 3 non-batch dimensions (i.e. HWC or CHW). @@
FORMAT_NCHW = 2
@@ .. cpp:enumerator:: Format::FORMAT_NCHW = 2 @@ @@ CHW image format. Tensors with this format require 3 dimensions @@ if the model does not support batching (max_batch_size = 0) or 4 @@ dimensions if the model does support batching (max_batch_size @@ >= 1). In either case the 'dims' below should only specify the @@ 3 non-batch dimensions (i.e. HWC or CHW). @@

@@ @@.. cpp:var:: message ModelInstanceGroup @@ @@ A group of one or more instances of a model and resources made @@ available for those instances. @@

Used in: ModelConfig

string name = 1
@@ .. cpp:var:: string name @@ @@ Optional name of this group of instances. If not specified the @@ name will be formed as <model name>_<group number>. The name of @@ individual instances will be further formed by a unique instance @@ number and GPU index: @@
ModelInstanceGroup.Kind kind = 4
@@ .. cpp:var:: Kind kind @@ @@ The kind of this instance group. Default is KIND_AUTO. If @@ KIND_AUTO or KIND_GPU then both 'count' and 'gpus' are valid and @@ may be specified. If KIND_CPU or KIND_MODEL only 'count' is valid @@ and 'gpus' cannot be specified. @@
int32 count = 2
@@ .. cpp:var:: int32 count @@ @@ For a group assigned to GPU, the number of instances created for @@ each GPU listed in 'gpus'. For a group assigned to CPU the number @@ of instances created. Default is 1. @@
optional ModelRateLimiter rate_limiter = 6
@@ .. cpp:var:: ModelRateLimiter rate_limiter @@ @@ The rate limiter specific settings to be associated with this @@ instance group. Optional, if not specified no rate limiting @@ will be applied to this instance group. @@
repeated int32 gpus = 3
@@ .. cpp:var:: int32 gpus (repeated) @@ @@ GPU(s) where instances should be available. For each GPU listed, @@ 'count' instances of the model will be available. Setting 'gpus' @@ to empty (or not specifying at all) is equivalent to listing all @@ available GPUs. @@
repeated ModelInstanceGroup.SecondaryDevice secondary_devices = 8
@@ .. cpp:var:: SecondaryDevice secondary_devices (repeated) @@ @@ Secondary devices that are required by instances specified by this @@ instance group. Optional. @@
repeated string profile = 5
@@ .. cpp:var:: string profile (repeated) @@ @@ For TensorRT models containing multiple optimization profiles, this @@ parameter specifies a set of optimization profiles available to this @@ instance group. The inference server will choose the optimal profile @@ based on the shapes of the input tensors. This field should lie @@ between 0 and <TotalNumberOfOptimizationProfilesInPlanModel> - 1 @@ and be specified only for TensorRT backend, otherwise an error will @@ be generated. If not specified, the server will select the first @@ optimization profile by default. @@
bool passive = 7
@@ .. cpp:var:: bool passive @@ @@ Whether the instances within this instance group will be accepting @@ inference requests from the scheduler. If true, the instances will @@ not be added to the scheduler. Default value is false. @@
string host_policy = 9
@@ .. cpp:var:: string host_policy @@ @@ The host policy name that the instance is to be associated with. @@ The default value is set to reflect the device kind of the instance, @@ for instance, KIND_CPU is "cpu", KIND_MODEL is "model" and @@ KIND_GPU is "gpu_<gpu_id>". @@

@@ @@ .. cpp:enum:: Kind @@ @@ Kind of this instance group. @@

Used in: ModelInstanceGroup

KIND_AUTO = 0
@@ .. cpp:enumerator:: Kind::KIND_AUTO = 0 @@ @@ This instance group represents instances that can run on either @@ CPU or GPU. If all GPUs listed in 'gpus' are available then @@ instances will be created on GPU(s), otherwise instances will @@ be created on CPU. @@
KIND_GPU = 1
@@ .. cpp:enumerator:: Kind::KIND_GPU = 1 @@ @@ This instance group represents instances that must run on the @@ GPU. @@
KIND_CPU = 2
@@ .. cpp:enumerator:: Kind::KIND_CPU = 2 @@ @@ This instance group represents instances that must run on the @@ CPU. @@
KIND_MODEL = 3
@@ .. cpp:enumerator:: Kind::KIND_MODEL = 3 @@ @@ This instance group represents instances that should run on the @@ CPU and/or GPU(s) as specified by the model or backend itself. @@ The inference server will not override the model/backend @@ settings. @@

@@ @@ .. cpp:var:: message SecondaryDevice @@ @@ A secondary device required for a model instance. @@

Used in: ModelInstanceGroup

SecondaryDevice.SecondaryDeviceKind kind = 1
@@ .. cpp:var:: SecondaryDeviceKind kind @@ @@ The secondary device kind. @@
int64 device_id = 2
@@ .. cpp:var:: int64 device_id @@ @@ Identifier for the secondary device. @@

@@ @@ .. cpp:enum:: SecondaryDeviceKind @@ @@ The kind of the secondary device. @@

Used in: SecondaryDevice

KIND_NVDLA = 0
@@ .. cpp:enumerator:: SecondaryDeviceKind::KIND_NVDLA = 0 @@ @@ An NVDLA core. http://nvdla.org @@ Currently KIND_NVDLA is only supported by the TensorRT backend. @@

@@ @@ .. cpp:var:: message TensorMetadata @@ @@ Metadata for a tensor. @@

Used in: ModelMetadataResponse

string name = 1
@@ @@ .. cpp:var:: string name @@ @@ The tensor name. @@
string datatype = 2
@@ @@ .. cpp:var:: string datatype @@ @@ The tensor data type. @@
repeated int64 shape = 3
@@ @@ .. cpp:var:: int64 shape (repeated) @@ @@ The tensor shape. A variable-size dimension is represented @@ by a -1 value. @@

@@ @@ .. cpp:var:: message ModelMetrics @@ @@ The metrics setting of this model. @@ NOTE: Consider reusing this message body for backend metric custom @@ configuration. @@

Used in: ModelConfig

repeated ModelMetrics.MetricControl metric_control = 1
@@ @@ .. cpp:var:: MetricControl metric_control (repeated) @@ @@ Optional custom configuration for selected metrics. @@

@@ @@ .. cpp:var:: message MetricControl @@ @@ Override metrics settings of this model. @@

Used in: ModelMetrics

optional MetricControl.MetricIdentifier metric_identifier = 1
@@ .. cpp:var:: MetricIdentifier metric_identifier @@ @@ The identifier defining metrics to be overridden with the @@ metric_options. @@
oneof metric_options
@@ .. cpp:var:: oneof metric_options @@ @@ The value to override the metrics defined in metric_identifier. @@
- MetricControl.HistogramOptions histogram_options = 2
  @@ .. cpp:var:: HistogramOptions histogram_options @@ @@ Histogram options. @@

@@ .. cpp:var:: message HistogramOptions @@ @@ Histogram metrics options. @@

Used in: MetricControl

repeated double buckets = 1
@@ .. cpp:var:: double buckets (repeated) @@ @@ Repeated double type in ascending order for histogram bucket @@ boundaries. Each bucket value represents a range less than or @@ equal to itself. The range greater than the largest bucket value @@ is allocated implicitly. @@ For example, [ -5.0, -2, 0, 3.5, 5 ]. @@

@@ @@ .. cpp:var:: message MetricIdentifier @@ @@ Specify metrics to be overridden with metric_options. @@

Used in: MetricControl

string family = 1
@@ .. cpp:var:: string family @@ @@ The name of the metric family to override with the custom value. @@ All core histogram metrics reported by Triton are customizable. @@ https://github.com/triton-inference-server/server/blob/main/docs/user_guide/metrics.md#histograms @@

@@ @@ .. cpp:var:: message ModelOperations @@ @@ The metadata of libraries providing custom operations for this model. @@

Used in: ModelConfig

repeated string op_library_filename = 1
@@ .. cpp:var:: string op_library_filename (repeated) @@ @@ Optional paths of the libraries providing custom operations for @@ this model. Valid only for ONNX models. @@

@@ @@.. cpp:var:: message ModelOptimizationPolicy @@ @@ Optimization settings for a model. These settings control if/how a @@ model is optimized and prioritized by the backend framework when @@ it is loaded. @@

Used in: ModelConfig

optional ModelOptimizationPolicy.Graph graph = 1
@@ .. cpp:var:: Graph graph @@ @@ The graph optimization setting for the model. Optional. @@
ModelOptimizationPolicy.ModelPriority priority = 2
@@ .. cpp:var:: ModelPriority priority @@ @@ The priority setting for the model. Optional. @@
optional ModelOptimizationPolicy.Cuda cuda = 3
@@ .. cpp:var:: Cuda cuda @@ @@ CUDA-specific optimization settings. Optional. @@
optional ModelOptimizationPolicy.ExecutionAccelerators execution_accelerators = 4
@@ .. cpp:var:: ExecutionAccelerators execution_accelerators @@ @@ The accelerators used for the model. Optional. @@
optional ModelOptimizationPolicy.PinnedMemoryBuffer input_pinned_memory = 5
@@ .. cpp:var:: PinnedMemoryBuffer input_pinned_memory @@ @@ Use pinned memory buffer when the data transfer for inputs @@ is between GPU memory and non-pinned system memory. @@ Default is true. @@
optional ModelOptimizationPolicy.PinnedMemoryBuffer output_pinned_memory = 6
@@ .. cpp:var:: PinnedMemoryBuffer output_pinned_memory @@ @@ Use pinned memory buffer when the data transfer for outputs @@ is between GPU memory and non-pinned system memory. @@ Default is true. @@
uint32 gather_kernel_buffer_threshold = 7
@@ .. cpp:var:: uint32 gather_kernel_buffer_threshold @@ @@ The backend may use a gather kernel to gather input data if the @@ device has direct access to the source buffer and the destination @@ buffer. In such case, the gather kernel will be used only if the @@ number of buffers to be gathered is greater than or equal to @@ the specified value. If 0, the gather kernel will be disabled. @@ Default value is 0. @@ Currently only recognized by TensorRT backend. @@
bool eager_batching = 8
@@ .. cpp:var:: bool eager_batching @@ @@ Start preparing the next batch before the model instance is ready @@ for the next inference. This option can be used to overlap the @@ batch preparation with model execution, with the trade-off that @@ the next batch might be smaller than what it could have been. @@ Default value is false. @@ Currently only recognized by TensorRT backend. @@

@@ @@ .. cpp:var:: message Cuda @@ @@ CUDA-specific optimization settings. @@

Used in: ModelOptimizationPolicy

bool graphs = 1
@@ .. cpp:var:: bool graphs @@ @@ Use CUDA graphs API to capture model operations and execute @@ them more efficiently. Default value is false. @@ Currently only recognized by TensorRT backend. @@
bool busy_wait_events = 2
@@ .. cpp:var:: bool busy_wait_events @@ @@ Use busy-waiting to synchronize CUDA events to achieve minimum @@ latency from event complete to host thread to be notified, with @@ the cost of high CPU load. Default value is false. @@ Currently only recognized by TensorRT backend. @@
repeated Cuda.GraphSpec graph_spec = 3
@@ .. cpp:var:: GraphSpec graph_spec (repeated) @@ @@ Specification of the CUDA graph to be captured. If not specified @@ and 'graphs' is true, the default CUDA graphs will be captured @@ based on model settings. @@ Currently only recognized by TensorRT backend. @@
bool output_copy_stream = 4
@@ .. cpp:var:: bool output_copy_stream @@ @@ Uses a CUDA stream separate from the inference stream to copy the @@ output to host. However, be aware that setting this option to @@ true will lead to an increase in the memory consumption of the @@ model as Triton will allocate twice as much GPU memory for its @@ I/O tensor buffers. Default value is false. @@ Currently only recognized by TensorRT backend. @@

@@ .. cpp:var:: message GraphSpec @@ @@ Specification of the CUDA graph to be captured. @@

Used in: Cuda

int32 batch_size = 1
@@ .. cpp:var:: int32 batch_size @@ @@ The batch size of the CUDA graph. If 'max_batch_size' is 0, @@ 'batch_size' must be set to 0. Otherwise, 'batch_size' must @@ be set to value between 1 and 'max_batch_size'. @@
map<string, GraphSpec.Shape> input = 2
@@ .. cpp:var:: map<string, Shape> input @@ @@ The specification of the inputs. 'Shape' is the shape of the @@ input without batching dimension. @@
optional GraphSpec.LowerBound graph_lower_bound = 3
@@ .. cpp:var:: LowerBound graph_lower_bound @@ @@ Specify the lower bound of the CUDA graph. Optional. @@ If specified, the graph can be used for input shapes and @@ batch sizes that are in closed interval between the lower @@ bound specification and graph specification. For dynamic @@ shape model, this allows CUDA graphs to be launched @@ frequently without capturing all possible shape combinations. @@ However, using graph for shape combinations different from @@ the one used for capturing introduces uninitialized data for @@ execution and it may distort the inference result if @@ the model is sensitive to uninitialized data. @@

Used in: GraphSpec

int32 batch_size = 1
@@ .. cpp:var:: int32 batch_size @@ @@ The batch size of the CUDA graph. If 'max_batch_size' is 0, @@ 'batch_size' must be set to 0. Otherwise, 'batch_size' must @@ be set to value between 1 and 'max_batch_size'. @@
map<string, Shape> input = 2
@@ .. cpp:var:: map<string, Shape> input @@ @@ The specification of the inputs. 'Shape' is the shape of @@ the input without batching dimension. @@

@@ .. cpp:var:: message Shape @@ @@ Specification of tensor dimension. @@

Used in: GraphSpec, LowerBound

repeated int64 dim = 1
@@ .. cpp:var:: int64 dim (repeated) @@ @@ The dimension. @@

@@ @@ .. cpp:var:: message ExecutionAccelerators @@ @@ Specify the preferred execution accelerators to be used to execute @@ the model. Currently only recognized by ONNX Runtime backend and @@ TensorFlow backend. @@ @@ For ONNX Runtime backend, it will deploy the model with the execution @@ accelerators by priority, the priority is determined based on the @@ order that they are set, i.e. the provider at the front has highest @@ priority. Overall, the priority will be in the following order: @@ <gpu_execution_accelerator> (if instance is on GPU) @@ CUDA Execution Provider (if instance is on GPU) @@ <cpu_execution_accelerator> @@ Default CPU Execution Provider @@

Used in: ModelOptimizationPolicy

repeated ExecutionAccelerators.Accelerator gpu_execution_accelerator = 1
@@ .. cpp:var:: Accelerator gpu_execution_accelerator (repeated) @@ @@ The preferred execution provider to be used if the model instance @@ is deployed on GPU. @@ @@ For ONNX Runtime backend, possible value is "tensorrt" as name, @@ and no parameters are required. @@ @@ For TensorFlow backend, possible values are "tensorrt", @@ "auto_mixed_precision", "gpu_io". @@ @@ For "tensorrt", the following parameters can be specified: @@ "precision_mode": The precision used for optimization. @@ Allowed values are "FP32" and "FP16". Default value is "FP32". @@ @@ "max_cached_engines": The maximum number of cached TensorRT @@ engines in dynamic TensorRT ops. Default value is 100. @@ @@ "minimum_segment_size": The smallest model subgraph that will @@ be considered for optimization by TensorRT. Default value is 3. @@ @@ "max_workspace_size_bytes": The maximum GPU memory the model @@ can use temporarily during execution. Default value is 1GB. @@ @@ For "auto_mixed_precision", no parameters are required. If set, @@ the model will try to use FP16 for better performance. @@ This optimization can not be set with "tensorrt". @@ @@ For "gpu_io", no parameters are required. If set, the model will @@ be executed using TensorFlow Callable API to set input and output @@ tensors in GPU memory if possible, which can reduce data transfer @@ overhead if the model is used in ensemble. However, the Callable @@ object will be created on model creation and it will request all @@ outputs for every model execution, which may impact the @@ performance if a request does not require all outputs. This @@ optimization will only take effect if the model instance is @@ created with KIND_GPU. @@
repeated ExecutionAccelerators.Accelerator cpu_execution_accelerator = 2
@@ .. cpp:var:: Accelerator cpu_execution_accelerator (repeated) @@ @@ The preferred execution provider to be used if the model instance @@ is deployed on CPU. @@ @@ For ONNX Runtime backend, possible value is "openvino" as name, @@ and no parameters are required. @@

@@ @@ .. cpp:var:: message Accelerator @@ @@ Specify the accelerator to be used to execute the model. @@ Accelerator with the same name may accept different parameters @@ depending on the backends. @@

Used in: ExecutionAccelerators

string name = 1
@@ .. cpp:var:: string name @@ @@ The name of the execution accelerator. @@
map<string, string> parameters = 2
@@ .. cpp:var:: map<string, string> parameters @@ @@ Additional parameters used to configure the accelerator. @@

@@ @@ .. cpp:var:: message Graph @@ @@ Enable generic graph optimization of the model. If not specified @@ the framework's default level of optimization is used. Supports @@ TensorFlow graphdef and savedmodel and Onnx models. For TensorFlow @@ causes XLA to be enabled/disabled for the model. For Onnx defaults @@ to enabling all optimizations, -1 enables only basic optimizations, @@ +1 enables only basic and extended optimizations. @@

Used in: ModelOptimizationPolicy

int32 level = 1
@@ .. cpp:var:: int32 level @@ @@ The optimization level. Defaults to 0 (zero) if not specified. @@ @@ - -1: Disabled @@ - 0: Framework default @@ - 1+: Enable optimization level (greater values indicate @@ higher optimization levels) @@

@@ @@ .. cpp:enum:: ModelPriority @@ @@ Model priorities. A model will be given scheduling and execution @@ preference over models at lower priorities. Current model @@ priorities only work for TensorRT models. @@

Used in: ModelOptimizationPolicy

PRIORITY_DEFAULT = 0
@@ .. cpp:enumerator:: ModelPriority::PRIORITY_DEFAULT = 0 @@ @@ The default model priority. @@
PRIORITY_MAX = 1
@@ .. cpp:enumerator:: ModelPriority::PRIORITY_MAX = 1 @@ @@ The maximum model priority. @@
PRIORITY_MIN = 2
@@ .. cpp:enumerator:: ModelPriority::PRIORITY_MIN = 2 @@ @@ The minimum model priority. @@

@@ @@ .. cpp:var:: message PinnedMemoryBuffer @@ @@ Specify whether to use a pinned memory buffer when transferring data @@ between non-pinned system memory and GPU memory. Using a pinned @@ memory buffer for system from/to GPU transfers will typically provide @@ increased performance. For example, in the common use case where the @@ request provides inputs and delivers outputs via non-pinned system @@ memory, if the model instance accepts GPU IOs, the inputs will be @@ processed by two copies: from non-pinned system memory to pinned @@ memory, and from pinned memory to GPU memory. Similarly, pinned @@ memory will be used for delivering the outputs. @@

Used in: ModelOptimizationPolicy

bool enable = 1
@@ .. cpp:var:: bool enable @@ @@ Use pinned memory buffer. Default is true. @@

@@ @@.. cpp:var:: message ModelOutput @@ @@ An output produced by the model. @@

Used in: ModelConfig

string name = 1
@@ .. cpp:var:: string name @@ @@ The name of the output. @@
DataType data_type = 2
@@ .. cpp:var:: DataType data_type @@ @@ The data-type of the output. @@
repeated int64 dims = 3
@@ .. cpp:var:: int64 dims (repeated) @@ @@ The dimensions/shape of the output tensor. @@
optional ModelTensorReshape reshape = 5
@@ .. cpp:var:: ModelTensorReshape reshape @@ @@ The shape produced for this output by the backend. The output will @@ be reshaped from this to the shape specified in 'dims' before being @@ returned in the inference response. The reshape must have the same @@ number of elements as the output shape specified by 'dims'. Optional. @@
string label_filename = 4
@@ .. cpp:var:: string label_filename @@ @@ The label file associated with this output. Should be specified only @@ for outputs that represent classifications. Optional. @@
bool is_shape_tensor = 6
@@ .. cpp:var:: bool is_shape_tensor @@ @@ Whether or not the output is a shape tensor to the model. This field @@ is currently supported only for the TensorRT model. An error will be @@ generated if this specification does not comply with underlying @@ model. @@
bool is_non_linear_format_io = 7
@@ .. cpp:var:: bool is_non_linear_format_io @@ @@ Indicates whether the output tensor uses a non-linear IO format. This @@ field is currently supported only for TensorRT models. An error will @@ be generated if this specification does not comply with the @@ underlying model. @@

@@ @@.. cpp:var:: message ModelParameter @@ @@ A model parameter. @@

Used in: ModelConfig

string string_value = 1
@@ .. cpp:var:: string string_value @@ @@ The string value of the parameter. @@

@@ @@.. cpp:var:: message ModelQueuePolicy @@ @@ Queue policy for inference requests. @@

Used in: ModelDynamicBatching

ModelQueuePolicy.TimeoutAction timeout_action = 1
@@ @@ .. cpp:var:: TimeoutAction timeout_action @@ @@ The action applied to timed-out request. @@ The default action is REJECT. @@
uint64 default_timeout_microseconds = 2
@@ @@ .. cpp:var:: uint64 default_timeout_microseconds @@ @@ The default timeout for every request, in microseconds. @@ The default value is 0 which indicates that no timeout is set. @@
bool allow_timeout_override = 3
@@ @@ .. cpp:var:: bool allow_timeout_override @@ @@ Whether individual request can override the default timeout value. @@ When true, individual requests can set a timeout that is less than @@ the default timeout value but may not increase the timeout. @@ The default value is false. @@
uint32 max_queue_size = 4
@@ @@ .. cpp:var:: uint32 max_queue_size @@ @@ The maximum queue size for holding requests. A request will be @@ rejected immediately if it can't be enqueued because the queue is @@ full. The default value is 0 which indicates that no maximum @@ queue size is enforced. @@

@@ @@ .. cpp:enum:: TimeoutAction @@ @@ The action applied to timed-out requests. @@

Used in: ModelQueuePolicy

REJECT = 0
@@ .. cpp:enumerator:: TimeoutAction::REJECT = 0 @@ @@ Reject the request and return error message accordingly. @@
DELAY = 1
@@ .. cpp:enumerator:: TimeoutAction::DELAY = 1 @@ @@ Delay the request until all other requests at the same @@ (or higher) priority levels that have not reached their timeouts @@ are processed. A delayed request will eventually be processed, @@ but may be delayed indefinitely due to newly arriving requests. @@

@@ @@ .. cpp:var:: message ModelRateLimiter @@ @@ The specifications required by the rate limiter to properly @@ schedule the inference requests across the different models @@ and their instances. @@

Used in: ModelInstanceGroup

repeated ModelRateLimiter.Resource resources = 1
@@ .. cpp:var:: Resource resources (repeated) @@ @@ The resources required to execute the request on a model instance. @@ Resources are just names with a corresponding count. The execution @@ of the instance will be blocked until the specified resources are @@ available. By default an instance uses no rate-limiter resources. @@
uint32 priority = 2
@@ .. cpp:var:: uint32 priority @@ @@ The optional weighting value to be used for prioritizing across @@ instances. An instance with priority 2 will be given 1/2 the @@ number of scheduling chances as an instance_group with priority @@ 1. The default priority is 1. The priority of value 0 will be @@ treated as priority 1. @@

@@ .. cpp:var:: message Resource @@ @@ The resource property. @@

Used in: ModelRateLimiter

string name = 1
@@ .. cpp:var:: string name @@ @@ The name associated with the resource. @@
bool global = 2
@@ .. cpp:var:: bool global @@ @@ Whether or not the resource is global. If true then the resource @@ is assumed to be shared among the devices otherwise specified @@ count of the resource is assumed for each device associated @@ with the instance. @@
uint32 count = 3
@@ .. cpp:var:: uint32 count @@ @@ The number of resources required for the execution of the model @@ instance. @@

@@ @@.. cpp:var:: message ModelRepositoryAgents @@ @@ The repository agents for the model. @@

Used in: ModelConfig

repeated ModelRepositoryAgents.Agent agents = 1
@@ @@ .. cpp:var:: Agent agents (repeated) @@ @@ The ordered list of agents for the model. These agents will be @@ invoked in order to respond to repository actions occurring for the @@ model. @@

@@ @@ .. cpp:var:: message Agent @@ @@ A repository agent that should be invoked for the specified @@ repository actions for this model. @@

Used in: ModelRepositoryAgents

string name = 1
@@ .. cpp:var:: string name @@ @@ The name of the agent. @@
map<string, string> parameters = 2
@@ .. cpp:var:: map<string, string> parameters @@ @@ The parameters for the agent. @@

@@ @@.. cpp:var:: message ModelRepositoryParameter @@ @@ A model repository parameter value. @@

Used in: RepositoryModelLoadRequest, RepositoryModelUnloadRequest

oneof parameter_choice
@@ .. cpp:var:: oneof parameter_choice @@ @@ The parameter value can be a string, an int64, @@ a boolean or bytes @@
- bool bool_param = 1
  @@ .. cpp:var:: bool bool_param @@ @@ A boolean parameter value. @@
- int64 int64_param = 2
  @@ .. cpp:var:: int64 int64_param @@ @@ An int64 parameter value. @@
- string string_param = 3
  @@ .. cpp:var:: string string_param @@ @@ A string parameter value. @@
- bytes bytes_param = 4
  @@ .. cpp:var:: bytes bytes_param @@ @@ A bytes parameter value. @@

@@ @@.. cpp:var:: message ModelResponseCache @@ @@ The response cache setting for the model. @@

Used in: ModelConfig

bool enable = 1
@@ @@ .. cpp:var:: bool enable @@ @@ Whether or not to use response cache for the model. If True, the @@ responses from the model are cached and when identical request @@ is encountered, instead of going through the model execution, @@ the response from the cache is utilized. By default, response @@ cache is disabled for the models. @@

@@ @@.. cpp:var:: message ModelSequenceBatching @@ @@ Sequence batching configuration. These settings control how sequence @@ batching operates for the model. @@

Used in: ModelConfig

oneof strategy_choice
@@ .. cpp:var:: oneof strategy_choice @@ @@ The strategy used by the sequence batcher. Default strategy @@ is 'direct'. @@
- ModelSequenceBatching.StrategyDirect direct = 3
  @@ .. cpp:var:: StrategyDirect direct @@ @@ StrategyDirect scheduling strategy. @@
- ModelSequenceBatching.StrategyOldest oldest = 4
  @@ .. cpp:var:: StrategyOldest oldest @@ @@ StrategyOldest scheduling strategy. @@
uint64 max_sequence_idle_microseconds = 1
@@ .. cpp:var:: uint64 max_sequence_idle_microseconds @@ @@ The maximum time, in microseconds, that a sequence is allowed to @@ be idle before it is aborted. The inference server considers a @@ sequence idle when it does not have any inference request queued @@ for the sequence. If this limit is exceeded, the inference server @@ will free the sequence slot allocated by the sequence and make it @@ available for another sequence. If not specified (or specified as @@ zero) a default value of 1000000 (1 second) is used. @@
repeated ModelSequenceBatching.ControlInput control_input = 2
@@ .. cpp:var:: ControlInput control_input (repeated) @@ @@ The model input(s) that the server should use to communicate @@ sequence start, stop, ready and similar control values to the @@ model. @@
repeated ModelSequenceBatching.State state = 5
@@ .. cpp:var:: State state (repeated) @@ @@ The optional state that can be stored in Triton for performing @@ inference requests on a sequence. Each sequence holds an implicit @@ state local to itself. The output state tensor provided by the @@ model in 'output_name' field of the current inference request will @@ be transferred as an input tensor named 'input_name' in the next @@ request of the same sequence. The input state of the first request @@ in the sequence contains garbage data. @@
bool iterative_sequence = 6
@@ .. cpp:var:: bool iterative_sequence @@ @@ Requests for iterative sequences are processed over a number @@ of iterations. An iterative sequence is initiated by a single @@ request and is "rescheduled" by the model until completion. @@ Requests for inflight requests will be batched together @@ and can complete independently. Note this feature @@ requires backend support. Default value is false. @@

@@ .. cpp:var:: message Control @@ @@ A control is a signal that the sequence batcher uses to @@ communicate with a backend. @@

Used in: ControlInput

Control.Kind kind = 1
@@ .. cpp:var:: Kind kind @@ @@ The kind of this control. @@
repeated int32 int32_false_true = 2
@@ .. cpp:var:: int32 int32_false_true (repeated) @@ @@ The control's true and false setting is indicated by setting @@ a value in an int32 tensor. The tensor must be a @@ 1-dimensional tensor with size equal to the batch size of @@ the request. 'int32_false_true' must have two entries: the @@ first the false value and the second the true value. @@
repeated float fp32_false_true = 3
@@ .. cpp:var:: float fp32_false_true (repeated) @@ @@ The control's true and false setting is indicated by setting @@ a value in a fp32 tensor. The tensor must be a @@ 1-dimensional tensor with size equal to the batch size of @@ the request. 'fp32_false_true' must have two entries: the @@ first the false value and the second the true value. @@
repeated bool bool_false_true = 5
@@ .. cpp:var:: bool bool_false_true (repeated) @@ @@ The control's true and false setting is indicated by setting @@ a value in a bool tensor. The tensor must be a @@ 1-dimensional tensor with size equal to the batch size of @@ the request. 'bool_false_true' must have two entries: the @@ first the false value and the second the true value. @@
DataType data_type = 4
@@ .. cpp:var:: DataType data_type @@ @@ The control's datatype. @@

@@ @@ .. cpp:enum:: Kind @@ @@ The kind of the control. @@

Used in: Control

CONTROL_SEQUENCE_START = 0
@@ .. cpp:enumerator:: Kind::CONTROL_SEQUENCE_START = 0 @@ @@ A new sequence is/is-not starting. If true a sequence is @@ starting, if false a sequence is continuing. Must @@ specify either int32_false_true, fp32_false_true or @@ bool_false_true for this control. This control is optional. @@
CONTROL_SEQUENCE_READY = 1
@@ .. cpp:enumerator:: Kind::CONTROL_SEQUENCE_READY = 1 @@ @@ A sequence is/is-not ready for inference. If true the @@ input tensor data is valid and should be used. If false @@ the input tensor data is invalid and inferencing should @@ be "skipped". Must specify either int32_false_true, @@ fp32_false_true or bool_false_true for this control. This @@ control is optional. @@
CONTROL_SEQUENCE_END = 2
@@ .. cpp:enumerator:: Kind::CONTROL_SEQUENCE_END = 2 @@ @@ A sequence is/is-not ending. If true a sequence is @@ ending, if false a sequence is continuing. Must specify @@ either int32_false_true, fp32_false_true or bool_false_true @@ for this control. This control is optional. @@
CONTROL_SEQUENCE_CORRID = 3
@@ .. cpp:enumerator:: Kind::CONTROL_SEQUENCE_CORRID = 3 @@ @@ The correlation ID of the sequence. The correlation ID @@ is a uint64_t value that is communicated in whole or @@ in part by the tensor. The tensor's datatype must be @@ specified by data_type and must be TYPE_UINT64, TYPE_INT64, @@ TYPE_UINT32 or TYPE_INT32. If a 32-bit datatype is specified @@ the correlation ID will be truncated to the low-order 32 @@ bits. This control is optional. @@

@@ .. cpp:var:: message ControlInput @@ @@ The sequence control values to communicate by a model input. @@

Used in: ModelSequenceBatching

string name = 1
@@ .. cpp:var:: string name @@ @@ The name of the model input. @@
repeated Control control = 2
@@ .. cpp:var:: Control control (repeated) @@ @@ The control value(s) that should be communicated to the @@ model using this model input. @@

@@ @@ .. cpp:var:: message InitialState @@ @@ Settings used to initialize data for implicit state. @@

Used in: State

DataType data_type = 1
@@ .. cpp:var:: DataType data_type @@ @@ The data-type of the state. @@
repeated int64 dims = 2
@@ .. cpp:var:: int64 dims (repeated) @@ @@ The shape of the state tensor, not including the batch @@ dimension. @@
oneof state_data
@@ .. cpp:var:: oneof state_data @@ @@ Specify how the initial state data is generated. @@
- bool zero_data = 3
  @@ @@ .. cpp:var:: bool zero_data @@ @@ The identifier for using zeros as initial state data. @@ Note that the value of 'zero_data' will not be checked, @@ instead, zero data will be used as long as the field is set. @@
- string data_file = 4
  @@ .. cpp:var:: string data_file @@ @@ The file whose content will be used as the initial data for @@ the state in row-major order. The file must be provided in @@ sub-directory 'initial_state' under the model directory. @@
string name = 5
@@ .. cpp:var:: string name @@ @@ The name of the state initialization. @@

@@ .. cpp:var:: message State @@ @@ An input / output pair of tensors that carry state for the sequence. @@

Used in: ModelSequenceBatching

string input_name = 1
@@ .. cpp:var:: string input_name @@ @@ The name of the model state input. @@
string output_name = 2
@@ .. cpp:var:: string output_name @@ @@ The name of the model state output. @@
DataType data_type = 3
@@ .. cpp:var:: DataType data_type @@ @@ The data-type of the state. @@
repeated int64 dims = 4
@@ .. cpp:var:: int64 dims (repeated) @@ @@ The dimension. @@
repeated InitialState initial_state = 5
@@ .. cpp:var:: InitialState initial_state (repeated) @@ @@ The optional field to specify the initial state for the model. @@
bool use_same_buffer_for_input_output = 6
@@ .. cpp:var:: bool use_same_buffer_for_input_output @@ @@ The optional field to use a single buffer for both input and output @@ state. Without this option, Triton allocates separate buffers @@ for input and output state @@ which can be problematic if the state size is @@ large. This option reduces the memory usage by allocating a single @@ buffer. Enabling this option is recommended whenever @@ the input state is processed before the output state is written. @@ When enabled the state @@ will always be updated independent of whether @@ TRITONBACKEND_StateUpdate is called @@ (however TRITONBACKEND_StateUpdate should still be called for @@ completeness). @@ @@ The default value is false. @@
bool use_growable_memory = 7
@@ .. cpp:var:: bool use_growable_memory @@ @@ The optional field to enable an implicit state buffer to grow @@ without reallocating or copying existing memory. @@ Additional memory will be appended to the end of the buffer and @@ existing data will be preserved. @@ This option is only available for CUDA memory and requires enabling @@ use_same_buffer_for_input_output. When using this option, @@ StateBuffer call will always return CUDA memory even if CPU memory @@ is requested. @@ @@ The default value is false. @@

@@ .. cpp:var:: message StrategyDirect @@ @@ The sequence batcher uses a specific, unique batch @@ slot for each sequence. All inference requests in a @@ sequence are directed to the same batch slot in the same @@ model instance over the lifetime of the sequence. This @@ is the default strategy. @@

Used in: ModelSequenceBatching

uint64 max_queue_delay_microseconds = 1
@@ .. cpp:var:: uint64 max_queue_delay_microseconds @@ @@ The maximum time, in microseconds, a candidate request @@ will be delayed in the sequence batch scheduling queue to @@ wait for additional requests for batching. Default is 0. @@
float minimum_slot_utilization = 2
@@ .. cpp:var:: float minimum_slot_utilization @@ @@ The minimum slot utilization that must be satisfied to @@ execute the batch before 'max_queue_delay_microseconds' expires. @@ For example, a value of 0.5 indicates that the batch should be @@ executed as soon as 50% or more of the slots are ready even if @@ the 'max_queue_delay_microseconds' timeout has not expired. @@ The default is 0.0, indicating that a batch will be executed @@ before 'max_queue_delay_microseconds' timeout expires if at least @@ one batch slot is ready. 'max_queue_delay_microseconds' will be @@ ignored unless minimum_slot_utilization is set to a non-zero @@ value. @@

@@ .. cpp:var:: message StrategyOldest @@ @@ The sequence batcher maintains up to 'max_candidate_sequences' @@ candidate sequences. 'max_candidate_sequences' can be greater @@ than the model's 'max_batch_size'. For inferencing the batcher @@ chooses from the candidate sequences up to 'max_batch_size' @@ inference requests. Requests are chosen in an oldest-first @@ manner across all candidate sequences. A given sequence is @@ not guaranteed to be assigned to the same batch slot for @@ all inference requests of that sequence. @@

Used in: ModelSequenceBatching

int32 max_candidate_sequences = 1
@@ .. cpp:var:: int32 max_candidate_sequences @@ @@ Maximum number of candidate sequences that the batcher @@ maintains. Excess sequences are kept in an ordered backlog @@ and become candidates when existing candidate sequences @@ complete. @@
repeated int32 preferred_batch_size = 2
@@ .. cpp:var:: int32 preferred_batch_size (repeated) @@ @@ Preferred batch sizes for dynamic batching of candidate @@ sequences. If a batch of one of these sizes can be formed @@ it will be executed immediately. If not specified a @@ preferred batch size will be chosen automatically @@ based on model and GPU characteristics. @@
uint64 max_queue_delay_microseconds = 3
@@ .. cpp:var:: uint64 max_queue_delay_microseconds @@ @@ The maximum time, in microseconds, a candidate request @@ will be delayed in the dynamic batch scheduling queue to @@ wait for additional requests for batching. Default is 0. @@
bool preserve_ordering = 4
@@ .. cpp:var:: bool preserve_ordering @@ @@ Should the dynamic batcher preserve the ordering of responses to @@ match the order of requests received by the scheduler. Default is @@ false. If true, the responses will be returned in the same order @@ as the order of requests sent to the scheduler. If false, the @@ responses may be returned in arbitrary order. This option is @@ specifically needed when a sequence of related inference requests @@ (i.e. inference requests with the same correlation ID) are sent @@ to the dynamic batcher to ensure that the sequence responses are @@ in the correct order. @@ @@ When using decoupled models, setting this to true may block the @@ responses from independent sequences from being returned to the @@ client until the previous request completes, hurting overall @@ performance. If using GRPC streaming protocol, the stream @@ ordering guarantee may be sufficient alone to ensure the @@ responses for each sequence are returned in sequence-order @@ without blocking based on independent requests, depending on the @@ use case. @@

@@ @@.. cpp:var:: message ModelStatistics @@ @@ Statistics for a specific model and version. @@

Used in: ModelStatisticsResponse

string name = 1
@@ .. cpp:var:: string name @@ @@ The name of the model. If not given returns statistics for all @@ models. @@
string version = 2
@@ .. cpp:var:: string version @@ @@ The version of the model. @@
uint64 last_inference = 3
@@ .. cpp:var:: uint64 last_inference @@ @@ The timestamp of the last inference request made for this model, @@ as milliseconds since the epoch. @@
uint64 inference_count = 4
@@ .. cpp:var:: uint64 inference_count @@ @@ The cumulative count of successful inference requests made for this @@ model. Each inference in a batched request is counted as an @@ individual inference. For example, if a client sends a single @@ inference request with batch size 64, "inference_count" will be @@ incremented by 64. Similarly, if a client sends 64 individual @@ requests each with batch size 1, "inference_count" will be @@ incremented by 64. The "inference_count" value DOES NOT include @@ cache hits. @@
uint64 execution_count = 5
@@ .. cpp:var:: uint64 execution_count @@ @@ The cumulative count of the number of successful inference executions @@ performed for the model. When dynamic batching is enabled, a single @@ model execution can perform inferencing for more than one inference @@ request. For example, if a client sends 64 individual requests each @@ with batch size 1 and the dynamic batcher batches them into a single @@ large batch for model execution then "execution_count" will be @@ incremented by 1. If, on the other hand, the dynamic batcher is not @@ enabled so that each of the 64 individual requests is executed @@ independently, then "execution_count" will be incremented by 64. @@ The "execution_count" value DOES NOT include cache hits. @@
optional InferStatistics inference_stats = 6
@@ .. cpp:var:: InferStatistics inference_stats @@ @@ The aggregate statistics for the model/version. @@
repeated InferBatchStatistics batch_stats = 7
@@ .. cpp:var:: InferBatchStatistics batch_stats (repeated) @@ @@ The aggregate statistics for each different batch size that is @@ executed in the model. The batch statistics indicate how many actual @@ model executions were performed and show differences due to different @@ batch size (for example, larger batches typically take longer to @@ compute). @@
repeated MemoryUsage memory_usage = 8
@@ .. cpp:var:: MemoryUsage memory_usage (repeated) @@ @@ The memory usage detected during model loading, which may be used to @@ estimate the memory to be released once the model is unloaded. Note @@ that the estimation is inferred by the profiling tools and the @@ framework's memory schema, therefore it is advised to perform @@ experiments to understand the scenario that the reported memory usage @@ can be relied on. As a starting point, the GPU memory usage for @@ models in ONNX Runtime backend and TensorRT backend is usually @@ aligned. @@
map<string, InferResponseStatistics> response_stats = 9
@@ .. cpp:var:: map<string, InferResponseStatistics> response_stats @@ @@ The key and value pairs for all response statistics. The key is a @@ string identifying a set of response statistics aggregated together @@ (i.e. index of the response sent). The value is the aggregated @@ response statistics. @@

@@ @@.. cpp:var:: message ModelTensorReshape @@ @@ Reshape specification for input and output tensors. @@

Used in: ModelInput, ModelOutput

repeated int64 shape = 1
@@ .. cpp:var:: int64 shape (repeated) @@ @@ The shape to use for reshaping. @@

@@ @@ .. cpp:var:: message ModelTransactionPolicy @@ @@ The specification that describes the nature of transactions @@ to be expected from the model. @@

Used in: ModelConfig

bool decoupled = 1
@@ .. cpp:var:: bool decoupled @@ @@ Indicates whether responses generated by the model are decoupled from @@ the requests issued to it, which means the number of responses @@ generated by the model may differ from the number of requests issued, and @@ that the responses may be out of order relative to the order of @@ requests. The default is false, which means the model will generate @@ exactly one response for each request. @@

@@ @@.. cpp:var:: message ModelVersionPolicy @@ @@ Policy indicating which versions of a model should be made @@ available by the inference server. @@

Used in: ModelConfig

oneof policy_choice
@@ .. cpp:var:: oneof policy_choice @@ @@ Each model must implement only a single version policy. The @@ default policy is 'Latest'. @@
- ModelVersionPolicy.Latest latest = 1
  @@ .. cpp:var:: Latest latest @@ @@ Serve only latest version(s) of the model. @@
- ModelVersionPolicy.All all = 2
  @@ .. cpp:var:: All all @@ @@ Serve all versions of the model. @@
- ModelVersionPolicy.Specific specific = 3
  @@ .. cpp:var:: Specific specific @@ @@ Serve only specific version(s) of the model. @@

@@ .. cpp:var:: message All @@ @@ Serve all versions of the model. @@

Used in: ModelVersionPolicy

(message has no fields)

@@ .. cpp:var:: message Latest @@ @@ Serve only the latest version(s) of a model. This is @@ the default policy. @@

Used in: ModelVersionPolicy

uint32 num_versions = 1
@@ .. cpp:var:: uint32 num_versions @@ @@ Serve only the 'num_versions' highest-numbered versions. @@ The default value of 'num_versions' is 1, indicating that by @@ default only the single highest-numbered version of a @@ model will be served. @@

@@ .. cpp:var:: message Specific @@ @@ Serve only specific versions of the model. @@

Used in: ModelVersionPolicy

repeated int64 versions = 1
@@ .. cpp:var:: int64 versions (repeated) @@ @@ The specific versions of the model that will be served. @@

@@ @@.. cpp:var:: message ModelWarmup @@ @@ Settings used to construct the request sample for model warmup. @@

Used in: ModelConfig

string name = 1
@@ .. cpp:var:: string name @@ @@ The name of the request sample. @@
uint32 batch_size = 2
@@ .. cpp:var:: uint32 batch_size @@ @@ The batch size of the inference request. This must be >= 1. For @@ models that don't support batching, batch_size must be 1. If @@ batch_size > 1, the 'inputs' specified below will be duplicated to @@ match the batch size requested. @@
map<string, ModelWarmup.Input> inputs = 3
@@ .. cpp:var:: map<string, Input> inputs @@ @@ The warmup meta data associated with every model input, including @@ control tensors. @@
uint32 count = 4
@@ .. cpp:var:: uint32 count @@ @@ The number of iterations that this warmup sample will be executed. @@ For example, if this field is set to 2, 2 model executions using this @@ sample will be scheduled for warmup. Default value is 0 which @@ indicates that this sample will be used only once. @@ Note that for sequence models, 'count' may not work well @@ because the model often expects a valid sequence of requests which @@ should be represented by a series of warmup samples. 'count > 1' @@ essentially "resends" one of the samples, which may invalidate the @@ sequence and result in unexpected warmup failure. @@

@@ @@ .. cpp:var:: message Input @@ @@ Meta data associated with an input. @@

Used in: ModelWarmup

DataType data_type = 1
@@ .. cpp:var:: DataType data_type @@ @@ The data-type of the input. @@
repeated int64 dims = 2
@@ .. cpp:var:: int64 dims (repeated) @@ @@ The shape of the input tensor, not including the batch dimension. @@
oneof input_data_type
@@ .. cpp:var:: oneof input_data_type @@ @@ Specify how the input data is generated. If the input has STRING @@ data type and 'random_data' is set, the data generation will fall @@ back to 'zero_data'. @@
- bool zero_data = 3
  @@ @@ .. cpp:var:: bool zero_data @@ @@ The identifier for using zeros as input data. Note that the @@ value of 'zero_data' will not be checked, instead, zero data @@ will be used as long as the field is set. @@
- bool random_data = 4
  @@ @@ .. cpp:var:: bool random_data @@ @@ The identifier for using random data as input data. Note that @@ the value of 'random_data' will not be checked, instead, @@ random data will be used as long as the field is set. @@
- string input_data_file = 5
  @@ .. cpp:var:: string input_data_file @@ @@ The file whose content will be used as raw input data in @@ row-major order. The file must be provided in a sub-directory @@ 'warmup' under the model directory. The file contents should be @@ in binary format. For TYPE_STRING data-type, an element is @@ represented by a 4-byte unsigned integer giving the length @@ followed by the actual bytes. @@

@@ @@ .. cpp:var:: message ModelIndex @@ @@ Index entry for a model. @@

Used in: RepositoryIndexResponse

string name = 1
@@ @@ .. cpp:var:: string name @@ @@ The name of the model. @@
string version = 2
@@ .. cpp:var:: string version @@ @@ The version of the model. @@
string state = 3
@@ @@ .. cpp:var:: string state @@ @@ The state of the model. @@
string reason = 4
@@ @@ .. cpp:var:: string reason @@ @@ The reason, if any, that the model is in the given state. @@

@@ @@.. cpp:var:: message StatisticDuration @@ @@ Statistic recording a cumulative duration metric. @@

Used in: InferBatchStatistics, InferResponseStatistics, InferStatistics

uint64 count = 1
@@ .. cpp:var:: uint64 count @@ @@ Cumulative number of times this metric occurred. @@
uint64 ns = 2
@@ .. cpp:var:: uint64 ns @@ @@ Total collected duration of this metric in nanoseconds. @@

@@ @@ .. cpp:var:: message RegionStatus @@ @@ Status for a shared memory region. @@

Used in: SystemSharedMemoryStatusResponse

string name = 1
@@ @@ .. cpp:var:: string name @@ @@ The name for the shared memory region. @@
string key = 2
@@ .. cpp:var:: string key @@ @@ The key of the underlying memory object that contains the @@ shared memory region. @@
uint64 offset = 3
@@ .. cpp:var:: uint64 offset @@ @@ Offset, in bytes, within the underlying memory object to @@ the start of the shared memory region. @@
uint64 byte_size = 4
@@ .. cpp:var:: uint64 byte_size @@ @@ Size of the shared memory region, in bytes. @@

@@ @@ .. cpp:var:: message SettingValue @@ @@ The values to be associated with a trace setting. @@ If no value is provided, the setting will be cleared and @@ the global setting value will be used. @@

Used in: TraceSettingRequest

repeated string value = 1
@@ @@ .. cpp:var:: string value (repeated) @@ @@ The value. @@

@@ @@ .. cpp:var:: message SettingValue @@ @@ The values to be associated with a trace setting. @@

Used in: TraceSettingResponse

repeated string value = 1
@@ @@ .. cpp:var:: string value (repeated) @@ @@ The value. @@

package inference

service GRPCInferenceService

rpc CudaSharedMemoryRegister (CudaSharedMemoryRegisterRequest, CudaSharedMemoryRegisterResponse)

message CudaSharedMemoryRegisterRequest

string name = 1

bytes raw_handle = 2

int64 device_id = 3

uint64 byte_size = 4

message CudaSharedMemoryRegisterResponse

rpc CudaSharedMemoryStatus (CudaSharedMemoryStatusRequest, CudaSharedMemoryStatusResponse)

message CudaSharedMemoryStatusRequest

string name = 1

message CudaSharedMemoryStatusResponse

map<string, CudaSharedMemoryStatusResponse.RegionStatus> regions = 1

rpc CudaSharedMemoryUnregister (CudaSharedMemoryUnregisterRequest, CudaSharedMemoryUnregisterResponse)

message CudaSharedMemoryUnregisterRequest

string name = 1

message CudaSharedMemoryUnregisterResponse

rpc LogSettings (LogSettingsRequest, LogSettingsResponse)

message LogSettingsRequest

map<string, LogSettingsRequest.SettingValue> settings = 1

message LogSettingsResponse

map<string, LogSettingsResponse.SettingValue> settings = 1

rpc ModelConfig (ModelConfigRequest, ModelConfigResponse)

message ModelConfigRequest

string name = 1

string version = 2

message ModelConfigResponse

optional ModelConfig config = 1

rpc ModelInfer (ModelInferRequest, ModelInferResponse)

rpc ModelMetadata (ModelMetadataRequest, ModelMetadataResponse)

message ModelMetadataRequest

string name = 1

string version = 2

message ModelMetadataResponse

string name = 1

repeated string versions = 2

string platform = 3

repeated ModelMetadataResponse.TensorMetadata inputs = 4

repeated ModelMetadataResponse.TensorMetadata outputs = 5

rpc ModelReady (ModelReadyRequest, ModelReadyResponse)

message ModelReadyRequest

string name = 1

string version = 2

message ModelReadyResponse

bool ready = 1

rpc ModelStatistics (ModelStatisticsRequest, ModelStatisticsResponse)

message ModelStatisticsRequest

string name = 1

string version = 2

message ModelStatisticsResponse

repeated ModelStatistics model_stats = 1

rpc ModelStreamInfer (stream ModelInferRequest, stream ModelStreamInferResponse)

message ModelStreamInferResponse

string error_message = 1

optional ModelInferResponse infer_response = 2

rpc RepositoryIndex (RepositoryIndexRequest, RepositoryIndexResponse)

message RepositoryIndexRequest

string repository_name = 1

bool ready = 2

message RepositoryIndexResponse

repeated RepositoryIndexResponse.ModelIndex models = 1

rpc RepositoryModelLoad (RepositoryModelLoadRequest, RepositoryModelLoadResponse)

message RepositoryModelLoadRequest

string repository_name = 1

string model_name = 2

map<string, ModelRepositoryParameter> parameters = 3

message RepositoryModelLoadResponse

rpc RepositoryModelUnload (RepositoryModelUnloadRequest, RepositoryModelUnloadResponse)

message RepositoryModelUnloadRequest

string repository_name = 1

string model_name = 2

map<string, ModelRepositoryParameter> parameters = 3

message RepositoryModelUnloadResponse

rpc ServerLive (ServerLiveRequest, ServerLiveResponse)

message ServerLiveRequest

message ServerLiveResponse

bool live = 1

rpc ServerMetadata (ServerMetadataRequest, ServerMetadataResponse)

message ServerMetadataRequest