package nvidia.inferenceserver

Get desktop application:
View/edit binary Protocol Buffers messages

@@ @@.. cpp:var:: service InferenceService @@ @@ Inference Server GRPC endpoints. @@

rpc CudaSharedMemoryRegister (CudaSharedMemoryRegisterRequest, CudaSharedMemoryRegisterResponse)
grpc_service_v2.proto:192
@@ .. cpp:var:: rpc CudaSharedMemoryRegister( @@ CudaSharedMemoryRegisterRequest) @@ returns (CudaSharedMemoryRegisterResponse) @@ @@ Register a CUDA-shared-memory region. @@
message CudaSharedMemoryRegisterRequest
grpc_service_v2.proto:1204
@@ @@.. cpp:var:: message CudaSharedMemoryRegisterRequest @@ @@ Request message for CudaSharedMemoryRegister. @@
- string name = 1
  @@ @@ .. cpp:var:: string name @@ @@ The name of the region to register. @@
- bytes raw_handle = 2
  @@ .. cpp:var:: bytes raw_handle @@ @@ The raw serialized cudaIPC handle. @@
- int64 device_id = 3
  @@ .. cpp:var:: int64 device_id @@ @@ The GPU device ID on which the cudaIPC handle was created. @@
- uint64 byte_size = 4
  @@ .. cpp:var:: uint64 byte_size @@ @@ Size of the shared memory block, in bytes. @@
message CudaSharedMemoryRegisterResponse
grpc_service_v2.proto:1237
@@ @@.. cpp:var:: message CudaSharedMemoryRegisterResponse @@ @@ Response message for CudaSharedMemoryRegister. @@
(message has no fields)
rpc CudaSharedMemoryStatus (CudaSharedMemoryStatusRequest, CudaSharedMemoryStatusResponse)
grpc_service_v2.proto:181
@@ .. cpp:var:: rpc CudaSharedMemoryStatus( @@ CudaSharedMemoryStatusRequest) @@ returns (CudaSharedMemoryStatusResponse) @@ @@ Get the status of all registered CUDA-shared-memory regions. @@
message CudaSharedMemoryStatusRequest
grpc_service_v2.proto:1145
@@ @@.. cpp:var:: message CudaSharedMemoryStatusRequest @@ @@ Request message for CudaSharedMemoryStatus. @@
- string name = 1
  @@ @@ .. cpp:var:: string name @@ @@ The name of the region to get status for. If empty the @@ status is returned for all registered regions. @@
message CudaSharedMemoryStatusResponse
grpc_service_v2.proto:1161
@@ @@.. cpp:var:: message CudaSharedMemoryStatusResponse @@ @@ Response message for CudaSharedMemoryStatus. @@
- map<string, CudaSharedMemoryStatusResponse.RegionStatus> regions = 1
  @@ @@ .. cpp:var:: map<string,RegionStatus> regions @@ @@ Status for each of the registered regions, indexed by @@ region name. @@
rpc CudaSharedMemoryUnregister (CudaSharedMemoryUnregisterRequest, CudaSharedMemoryUnregisterResponse)
grpc_service_v2.proto:203
@@ .. cpp:var:: rpc CudaSharedMemoryUnregister( @@ CudaSharedMemoryUnregisterRequest) @@ returns (CudaSharedMemoryUnregisterResponse) @@ @@ Unregister a CUDA-shared-memory region. @@
message CudaSharedMemoryUnregisterRequest
grpc_service_v2.proto:1244
@@ @@.. cpp:var:: message CudaSharedMemoryUnregisterRequest @@ @@ Request message for CudaSharedMemoryUnregister. @@
- string name = 1
  @@ @@ .. cpp:var:: string name @@ @@ The name of the cuda region to unregister. If empty @@ all cuda shared-memory regions are unregistered. @@
message CudaSharedMemoryUnregisterResponse
grpc_service_v2.proto:1260
@@ @@.. cpp:var:: message CudaSharedMemoryUnregisterResponse @@ @@ Response message for CudaSharedMemoryUnregister. @@
(message has no fields)
rpc ModelConfig (ModelConfigRequest, ModelConfigResponse)
grpc_service_v2.proto:99
@@ .. cpp:var:: rpc ModelConfig(ModelConfigRequest) returns @@ (ModelConfigResponse) @@ @@ Get model configuration. @@
message ModelConfigRequest
grpc_service_v2.proto:768
@@ @@.. cpp:var:: message ModelConfigRequest @@ @@ Request message for ModelConfig. @@
- string name = 1
  @@ @@ .. cpp:var:: string name @@ @@ The name of the model. @@
- string version = 2
  @@ .. cpp:var:: string version @@ @@ The version of the model. If not given the model version @@ is selected automatically based on the version policy. @@
message ModelConfigResponse
grpc_service_v2.proto:790
@@ @@.. cpp:var:: message ModelConfigResponse @@ @@ Response message for ModelConfig. @@
- optional ModelConfig config = 1
  @@ @@ .. cpp:var:: ModelConfig config @@ @@ The model configuration. @@
rpc ModelInfer (ModelInferRequest, ModelInferResponse)
grpc_service_v2.proto:82
@@ .. cpp:var:: rpc ModelInfer(ModelInferRequest) returns @@ (ModelInferResponse) @@ @@ Perform inference using a specific model. @@
rpc ModelMetadata (ModelMetadataRequest, ModelMetadataResponse)
grpc_service_v2.proto:75
@@ .. cpp:var:: rpc ModelMetadata(ModelMetadataRequest) returns @@ (ModelMetadataResponse) @@ @@ Get model metadata. @@
message ModelMetadataRequest
grpc_service_v2.proto:331
@@ @@.. cpp:var:: message ModelMetadataRequest @@ @@ Request message for ModelMetadata. @@
- string name = 1
  @@ @@ .. cpp:var:: string name @@ @@ The name of the model. @@
- string version = 2
  @@ .. cpp:var:: string version @@ @@ The version of the model to get metadata for. If not @@ given the server will choose a version based on the @@ model and internal policy. @@
message ModelMetadataResponse
grpc_service_v2.proto:354
@@ @@.. cpp:var:: message ModelMetadataResponse @@ @@ Response message for ModelMetadata. @@
- string name = 1
  @@ @@ .. cpp:var:: string name @@ @@ The model name. @@
- repeated string versions = 2
  @@ @@ .. cpp:var:: string versions (repeated) @@ @@ The versions of the model. @@
- string platform = 3
  @@ @@ .. cpp:var:: string platform @@ @@ The model's platform. @@
- repeated ModelMetadataResponse.TensorMetadata inputs = 4
  @@ @@ .. cpp:var:: TensorMetadata inputs (repeated) @@ @@ The model's inputs. @@
- repeated ModelMetadataResponse.TensorMetadata outputs = 5
  @@ @@ .. cpp:var:: TensorMetadata outputs (repeated) @@ @@ The model's outputs. @@
rpc ModelReady (ModelReadyRequest, ModelReadyResponse)
grpc_service_v2.proto:61
@@ .. cpp:var:: rpc ModelReady(ModelReadyRequest) returns @@ (ModelReadyResponse) @@ @@ Check readiness of a model in the inference server. @@
message ModelReadyRequest
grpc_service_v2.proto:258
@@ @@.. cpp:var:: message ModelReadyRequest @@ @@ Request message for ModelReady. @@
- string name = 1
  @@ @@ .. cpp:var:: string name @@ @@ The name of the model to check for readiness. @@
- string version = 2
  @@ .. cpp:var:: string version @@ @@ The version of the model to check for readiness. If not given the @@ server will choose a version based on the model and internal policy. @@
message ModelReadyResponse
grpc_service_v2.proto:280
@@ @@.. cpp:var:: message ModelReadyResponse @@ @@ Response message for ModelReady. @@
- bool ready = 1
  @@ @@ .. cpp:var:: bool ready @@ @@ True if the model is ready, false if not ready. @@
rpc ModelStatistics (ModelStatisticsRequest, ModelStatisticsResponse)
grpc_service_v2.proto:107
@@ .. cpp:var:: rpc ModelStatistics( @@ ModelStatisticsRequest) @@ returns (ModelStatisticsResponse) @@ @@ Get the cumulative inference statistics for a model. @@
message ModelStatisticsRequest
grpc_service_v2.proto:805
@@ @@.. cpp:var:: message ModelStatisticsRequest @@ @@ Request message for ModelStatistics. @@
- string name = 1
  @@ .. cpp:var:: string name @@ @@ The name of the model. @@
- string version = 2
  @@ .. cpp:var:: string version @@ @@ The version of the model. If not given returns statistics for @@ all model versions. @@
message ModelStatisticsResponse
grpc_service_v2.proto:898
@@ @@.. cpp:var:: message ModelStatisticsResponse @@ @@ Response message for ModelStatistics. @@
- map<string, InferStatistics> inference = 1
  @@ .. cpp:var:: map<string, InferStatistics> inference @@ @@ Map from version to inference statistics for that version. @@
rpc ModelStreamInfer (stream ModelInferRequest, stream ModelStreamInferResponse)
grpc_service_v2.proto:89
@@ .. cpp:var:: rpc ModelStreamInfer(stream ModelInferRequest) returns @@ (stream ModelStreamInferResponse) @@ @@ Perform streaming inference. @@
message ModelStreamInferResponse
grpc_service_v2.proto:745
@@ @@.. cpp:var:: message ModelStreamInferResponse @@ @@ Response message for ModelStreamInfer. @@
- string error_message = 1
  @@ @@ .. cpp:var:: string error_message @@ @@ The message describing the error. The empty message @@ indicates the inference was successful without errors. @@
- optional ModelInferResponse infer_response = 2
  @@ @@ .. cpp:var:: ModelInferResponse infer_response @@ @@ Holds the results of the request. @@
rpc RepositoryIndex (RepositoryIndexRequest, RepositoryIndexResponse)
grpc_service_v2.proto:116
@@ .. cpp:var:: rpc RepositoryIndex(RepositoryIndexRequest) returns @@ (RepositoryIndexResponse) @@ @@ Get the index of model repository contents. @@
message RepositoryIndexRequest
grpc_service_v2.proto:912
@@ @@.. cpp:var:: message RepositoryIndexRequest @@ @@ Request message for RepositoryIndex. @@
- string repository_name = 1
  @@ .. cpp:var:: string repository_name @@ @@ The name of the repository. If empty the index is returned @@ for all repositories. @@
message RepositoryIndexResponse
grpc_service_v2.proto:927
@@ @@.. cpp:var:: message RepositoryIndexResponse @@ @@ Response message for RepositoryIndex. @@
- repeated RepositoryIndexResponse.ModelIndex models = 1
  @@ @@ .. cpp:var:: ModelIndex models (repeated) @@ @@ An index entry for each model. @@
rpc RepositoryModelLoad (RepositoryModelLoadRequest, RepositoryModelLoadResponse)
grpc_service_v2.proto:125
@@ .. cpp:var:: rpc RepositoryModelLoad(RepositoryModelLoadRequest) returns @@ (RepositoryModelLoadResponse) @@ @@ Load or reload a model from a repository. @@
message RepositoryModelLoadRequest
grpc_service_v2.proto:957
@@ @@.. cpp:var:: message RepositoryModelLoadRequest @@ @@ Request message for RepositoryModelLoad. @@
- string repository_name = 1
  @@ .. cpp:var:: string repository_name @@ @@ The name of the repository to load from. If empty the model @@ is loaded from any repository. @@
- string model_name = 2
  @@ .. cpp:var:: string model_name @@ @@ The name of the model to load, or reload. @@
message RepositoryModelLoadResponse
grpc_service_v2.proto:978
@@ @@.. cpp:var:: message RepositoryModelLoadResponse @@ @@ Response message for RepositoryModelLoad. @@
(message has no fields)
rpc RepositoryModelUnload (RepositoryModelUnloadRequest, RepositoryModelUnloadResponse)
grpc_service_v2.proto:137
@@ .. cpp:var:: rpc RepositoryModelUnload(RepositoryModelUnloadRequest) returns @@ (RepositoryModelUnloadResponse) @@ @@ Unload a model. @@
message RepositoryModelUnloadRequest
grpc_service_v2.proto:985
@@ @@.. cpp:var:: message RepositoryModelUnloadRequest @@ @@ Request message for RepositoryModelUnload. @@
- string repository_name = 1
  @@ .. cpp:var:: string repository_name @@ @@ The name of the repository from which the model was originally @@ loaded. If empty the repository is not considered. @@
- string model_name = 2
  @@ .. cpp:var:: string model_name @@ @@ The name of the model to unload. @@
message RepositoryModelUnloadResponse
grpc_service_v2.proto:1006
@@ @@.. cpp:var:: message RepositoryModelUnloadResponse @@ @@ Response message for RepositoryModelUnload. @@
(message has no fields)
rpc ServerLive (ServerLiveRequest, ServerLiveResponse)
grpc_service_v2.proto:47
@@ .. cpp:var:: rpc ServerLive(ServerLiveRequest) returns @@ (ServerLiveResponse) @@ @@ Check liveness of the inference server. @@
message ServerLiveRequest
grpc_service_v2.proto:214
@@ @@.. cpp:var:: message ServerLiveRequest @@ @@ Request message for ServerLive. @@
(message has no fields)
message ServerLiveResponse
grpc_service_v2.proto:221
@@ @@.. cpp:var:: message ServerLiveResponse @@ @@ Response message for ServerLive. @@
- bool live = 1
  @@ @@ .. cpp:var:: bool live @@ @@ True if the inference server is live, false if not live. @@
rpc ServerMetadata (ServerMetadataRequest, ServerMetadataResponse)
grpc_service_v2.proto:68
@@ .. cpp:var:: rpc ServerMetadata(ServerMetadataRequest) returns @@ (ServerMetadataResponse) @@ @@ Get server metadata. @@
message ServerMetadataRequest
grpc_service_v2.proto:295
@@ @@.. cpp:var:: message ServerMetadataRequest @@ @@ Request message for ServerMetadata. @@
(message has no fields)
message ServerMetadataResponse
grpc_service_v2.proto:302
@@ @@.. cpp:var:: message ServerMetadataResponse @@ @@ Response message for ServerMetadata. @@
- string name = 1
  @@ @@ .. cpp:var:: string name @@ @@ The server name. @@
- string version = 2
  @@ @@ .. cpp:var:: string version @@ @@ The server version. @@
- repeated string extensions = 3
  @@ @@ .. cpp:var:: string extensions (repeated) @@ @@ The extensions supported by the server. @@
rpc ServerReady (ServerReadyRequest, ServerReadyResponse)
grpc_service_v2.proto:54
@@ .. cpp:var:: rpc ServerReady(ServerReadyRequest) returns @@ (ServerReadyResponse) @@ @@ Check readiness of the inference server. @@
message ServerReadyRequest
grpc_service_v2.proto:236
@@ @@.. cpp:var:: message ServerReadyRequest @@ @@ Request message for ServerReady. @@
(message has no fields)
message ServerReadyResponse
grpc_service_v2.proto:243
@@ @@.. cpp:var:: message ServerReadyResponse @@ @@ Response message for ServerReady. @@
- bool ready = 1
  @@ @@ .. cpp:var:: bool ready @@ @@ True if the inference server is ready, false if not ready. @@
rpc SystemSharedMemoryRegister (SystemSharedMemoryRegisterRequest, SystemSharedMemoryRegisterResponse)
grpc_service_v2.proto:159
@@ .. cpp:var:: rpc SystemSharedMemoryRegister( @@ SystemSharedMemoryRegisterRequest) @@ returns (SystemSharedMemoryRegisterResponse) @@ @@ Register a system-shared-memory region. @@
message SystemSharedMemoryRegisterRequest
grpc_service_v2.proto:1080
@@ @@.. cpp:var:: message SystemSharedMemoryRegisterRequest @@ @@ Request message for SystemSharedMemoryRegister. @@
- string name = 1
  @@ @@ .. cpp:var:: string name @@ @@ The name of the region to register. @@
- string key = 2
  @@ .. cpp:var:: string key @@ @@ The key of the underlying memory object that contains the @@ shared memory region. @@
- uint64 offset = 3
  @@ .. cpp:var:: uint64 offset @@ @@ Offset, in bytes, within the underlying memory object to @@ the start of the shared memory region. @@
- uint64 byte_size = 4
  @@ .. cpp:var:: uint64 byte_size @@ @@ Size of the shared memory region, in bytes. @@
message SystemSharedMemoryRegisterResponse
grpc_service_v2.proto:1115
@@ @@.. cpp:var:: message SystemSharedMemoryRegisterResponse @@ @@ Response message for SystemSharedMemoryRegister. @@
(message has no fields)
rpc SystemSharedMemoryStatus (SystemSharedMemoryStatusRequest, SystemSharedMemoryStatusResponse)
grpc_service_v2.proto:148
@@ .. cpp:var:: rpc SystemSharedMemoryStatus( @@ SystemSharedMemoryStatusRequest) @@ returns (SystemSharedMemoryStatusResponse) @@ @@ Get the status of all registered system-shared-memory regions. @@
message SystemSharedMemoryStatusRequest
grpc_service_v2.proto:1013
@@ @@.. cpp:var:: message SystemSharedMemoryStatusRequest @@ @@ Request message for SystemSharedMemoryStatus. @@
- string name = 1
  @@ @@ .. cpp:var:: string name @@ @@ The name of the region to get status for. If empty the @@ status is returned for all registered regions. @@
message SystemSharedMemoryStatusResponse
grpc_service_v2.proto:1029
@@ @@.. cpp:var:: message SystemSharedMemoryStatusResponse @@ @@ Response message for SystemSharedMemoryStatus. @@
- map<string, SystemSharedMemoryStatusResponse.RegionStatus> regions = 1
  @@ @@ .. cpp:var:: map<string,RegionStatus> regions @@ @@ Status for each of the registered regions, indexed by @@ region name. @@
rpc SystemSharedMemoryUnregister (SystemSharedMemoryUnregisterRequest, SystemSharedMemoryUnregisterResponse)
grpc_service_v2.proto:170
@@ .. cpp:var:: rpc SystemSharedMemoryUnregister( @@ SystemSharedMemoryUnregisterRequest) @@ returns (SystemSharedMemoryUnregisterResponse) @@ @@ Unregister a system-shared-memory region. @@
message SystemSharedMemoryUnregisterRequest
grpc_service_v2.proto:1122
@@ @@.. cpp:var:: message SystemSharedMemoryUnregisterRequest @@ @@ Request message for SystemSharedMemoryUnregister. @@
- string name = 1
  @@ @@ .. cpp:var:: string name @@ @@ The name of the system region to unregister. If empty @@ all system shared-memory regions are unregistered. @@
message SystemSharedMemoryUnregisterResponse
grpc_service_v2.proto:1138
@@ @@.. cpp:var:: message SystemSharedMemoryUnregisterResponse @@ @@ Response message for SystemSharedMemoryUnregister. @@
(message has no fields)

@@ @@.. cpp:var:: service GRPCService @@ @@ Inference Server GRPC endpoints. @@

rpc Health (HealthRequest, HealthResponse)
grpc_service.proto:54
@@ .. cpp:var:: rpc Health(HealthRequest) returns (HealthResponse) @@ @@ Check liveness and readiness of the inference server. @@
message HealthRequest
grpc_service.proto:140
@@ @@.. cpp:var:: message HealthRequest @@ @@ Request message for Health gRPC endpoint. @@
- string mode = 1
  @@ @@ .. cpp:var:: string mode @@ @@ The requested health action: 'live' requests the liveness @@ state of the inference server; 'ready' requests the readiness state @@ of the inference server. @@
message HealthResponse
grpc_service.proto:157
@@ @@.. cpp:var:: message HealthResponse @@ @@ Response message for Health gRPC endpoint. @@
- optional RequestStatus request_status = 1
  @@ @@ .. cpp:var:: RequestStatus request_status @@ @@ The status of the request, indicating success or failure. @@
- bool health = 2
  @@ @@ .. cpp:var:: bool health @@ @@ The result of the request. True indicates the inference server is @@ live/ready, false indicates the inference server is not live/ready. @@
rpc Infer (InferRequest, InferResponse)
grpc_service.proto:62
@@ .. cpp:var:: rpc Infer(InferRequest) returns (InferResponse) @@ @@ Request inference using a specific model. To handle large input @@ tensors you will likely need to set the maximum message size so that they @@ can be transmitted in one pass. @@
rpc ModelControl (ModelControlRequest, ModelControlResponse)
grpc_service.proto:78
@@ .. cpp:var:: rpc ModelControl(ModelControlRequest) returns @@ (ModelControlResponse) @@ @@ Request to load / unload a specified model. @@
message ModelControlRequest
grpc_service.proto:180
@@ @@.. cpp:var:: message ModelControlRequest @@ @@ Request message for ModelControl gRPC endpoint. @@
- string model_name = 1
  @@ @@ .. cpp:var:: string model_name @@ @@ The target model name. @@
- ModelControlRequest.Type type = 2
  @@ @@ .. cpp:var:: Type type @@ @@ The control type that is operated on the specified model. @@
message ModelControlResponse
grpc_service.proto:221
@@ @@.. cpp:var:: message ModelControlResponse @@ @@ Response message for ModelControl gRPC endpoint. @@
- optional RequestStatus request_status = 1
  @@ @@ .. cpp:var:: RequestStatus request_status @@ @@ The status of the request, indicating success or failure. @@
rpc Repository (RepositoryRequest, RepositoryResponse)
grpc_service.proto:94
@@ .. cpp:var:: rpc Repository(RepositoryRequest) returns (RepositoryResponse) @@ @@ Get status associated with the model repository. @@
message RepositoryRequest
grpc_service.proto:486
@@ @@.. cpp:var:: message RepositoryRequest @@ @@ Request message for Repository gRPC endpoint. @@
- oneof request_type
  @@ .. cpp:var:: oneof request_type @@ @@ Types of the repository request @@
  - bool index = 1
    @@ @@ .. cpp:var:: bool index @@ @@ Request for the index of the model repository. @@
message RepositoryResponse
grpc_service.proto:508
@@ @@.. cpp:var:: message RepositoryResponse @@ @@ Response message for Repository gRPC endpoint. @@
- optional RequestStatus request_status = 1
  @@ @@ .. cpp:var:: RequestStatus request_status @@ @@ The status of the request, indicating success or failure. @@
- oneof response_type
  @@ .. cpp:var:: oneof response_type @@ @@ Types of the repository response, which is a one-to-one mapping to @@ the repository request type. @@
  - ModelRepositoryIndex index = 2
    @@ @@ .. cpp:var:: ModelRepositoryIndex index @@ @@ The index of the model repository. @@
rpc SharedMemoryControl (SharedMemoryControlRequest, SharedMemoryControlResponse)
grpc_service.proto:85
@@ .. cpp:var:: rpc SharedMemoryControl(SharedMemoryControlRequest) returns @@ (SharedMemoryControlResponse) @@ @@ Request to register / unregister a specified shared memory region. @@
message SharedMemoryControlRequest
grpc_service.proto:236
@@ @@.. cpp:var:: message SharedMemoryControlRequest @@ @@ Request message for managing registered shared memory regions in TRTIS. @@
- oneof shared_memory_control
  @@ .. cpp:var:: oneof shared_memory_control @@ @@ Types of control operations for shared memory @@
  - SharedMemoryControlRequest.Register register = 1
    @@ .. cpp:var:: Register register @@ @@ To register the specified shared memory region. @@
  - SharedMemoryControlRequest.Unregister unregister = 2
    @@ .. cpp:var:: Unregister unregister @@ @@ To unregister the specified shared memory region. @@
  - SharedMemoryControlRequest.UnregisterAll unregister_all = 3
    @@ .. cpp:var:: UnregisterAll unregister_all @@ @@ To unregister all active shared memory regions. @@
  - SharedMemoryControlRequest.Status status = 4
    @@ .. cpp:var:: Status status @@ @@ Get the status of all active shared memory regions. @@
message SharedMemoryControlResponse
grpc_service.proto:385
@@ @@.. cpp:var:: message SharedMemoryControlResponse @@ @@ Response message for SharedMemoryControl gRPC endpoint. @@
- optional RequestStatus request_status = 1
  @@ @@ .. cpp:var:: RequestStatus request_status @@ @@ The status of the request, indicating success or failure. @@
- oneof shared_memory_control
  - SharedMemoryControlResponse.Status shared_memory_status = 2
    @@ @@ .. cpp:var:: Status shared_memory_status @@ @@ The status of all active shared memory regions. @@
rpc Status (StatusRequest, StatusResponse)
grpc_service.proto:48
@@ .. cpp:var:: rpc Status(StatusRequest) returns (StatusResponse) @@ @@ Get status for entire inference server or for a specified model. @@
message StatusRequest
grpc_service.proto:102
@@ @@.. cpp:var:: message StatusRequest @@ @@ Request message for Status gRPC endpoint. @@
- string model_name = 1
  @@ @@ .. cpp:var:: string model_name @@ @@ The specific model status to be returned. If empty return status @@ for all models. @@
message StatusResponse
grpc_service.proto:118
@@ @@.. cpp:var:: message StatusResponse @@ @@ Response message for Status gRPC endpoint. @@
- optional RequestStatus request_status = 1
  @@ @@ .. cpp:var:: RequestStatus request_status @@ @@ The status of the request, indicating success or failure. @@
- optional ServerStatus server_status = 2
  @@ @@ .. cpp:var:: ServerStatus server_status @@ @@ The server and model status. @@
rpc StreamInfer (stream InferRequest, stream InferResponse)
grpc_service.proto:71
@@ .. cpp:var:: rpc StreamInfer(stream InferRequest) returns (stream @@ InferResponse) @@ @@ Request inferences using a specific model in a streaming manner. @@ Individual inference requests sent through the same stream will be @@ processed in order and be returned on completion @@

@@ @@ .. cpp:var:: message RegionStatus @@ @@ Status for a shared memory region. @@

Used in: CudaSharedMemoryStatusResponse

string name = 1
@@ @@ .. cpp:var:: string name @@ @@ The name for the shared memory region. @@
uint64 device_id = 2
@@ .. cpp:var:: uint64 device_id @@ @@ The GPU device ID where the cudaIPC handle was created. @@
uint64 byte_size = 3
@@ .. cpp:var:: uint64 byte_size @@ @@ Size of the shared memory region, in bytes. @@

@@ @@.. cpp:enum:: DataType @@ @@ Data types supported for input and output tensors. @@

Used in: InferResponseHeader.Output, ModelInput, ModelOutput, ModelSequenceBatching.Control, ModelWarmup.Input

TYPE_INVALID = 0
@@ .. cpp:enumerator:: DataType::INVALID = 0
TYPE_BOOL = 1
@@ .. cpp:enumerator:: DataType::BOOL = 1
TYPE_UINT8 = 2
@@ .. cpp:enumerator:: DataType::UINT8 = 2
TYPE_UINT16 = 3
@@ .. cpp:enumerator:: DataType::UINT16 = 3
TYPE_UINT32 = 4
@@ .. cpp:enumerator:: DataType::UINT32 = 4
TYPE_UINT64 = 5
@@ .. cpp:enumerator:: DataType::UINT64 = 5
TYPE_INT8 = 6
@@ .. cpp:enumerator:: DataType::INT8 = 6
TYPE_INT16 = 7
@@ .. cpp:enumerator:: DataType::INT16 = 7
TYPE_INT32 = 8
@@ .. cpp:enumerator:: DataType::INT32 = 8
TYPE_INT64 = 9
@@ .. cpp:enumerator:: DataType::INT64 = 9
TYPE_FP16 = 10
@@ .. cpp:enumerator:: DataType::FP16 = 10
TYPE_FP32 = 11
@@ .. cpp:enumerator:: DataType::FP32 = 11
TYPE_FP64 = 12
@@ .. cpp:enumerator:: DataType::FP64 = 12
TYPE_STRING = 13
@@ .. cpp:enumerator:: DataType::STRING = 13

@@ @@.. cpp:var:: message HealthRequestStats @@ @@ Statistics collected for Health requests. @@

Used in: ServerStatus

optional StatDuration success = 1
@@ .. cpp:var:: StatDuration success @@ @@ Total time required to handle successful Health requests, not @@ including HTTP or gRPC endpoint termination time. @@

@@ @@.. cpp:var:: message InferParameter @@ @@ An inference parameter value. @@

Used in: ModelInferRequest, ModelInferRequest.InferInputTensor, ModelInferRequest.InferRequestedOutputTensor, ModelInferResponse

oneof parameter_choice
@@ .. cpp:var:: oneof parameter_choice @@ @@ The parameter value can be a string, an int64 or @@ a boolean @@
- bool bool_param = 1
  @@ .. cpp:var:: bool bool_param @@ @@ A boolean parameter value. @@
- int64 int64_param = 2
  @@ .. cpp:var:: int64 int64_param @@ @@ An int64 parameter value. @@
- string string_param = 3
  @@ .. cpp:var:: string string_param @@ @@ A string parameter value. @@

@@ @@.. cpp:var:: message InferRequest @@ @@ Request message for Infer gRPC endpoint. @@

Used as request type in: GRPCService.Infer, GRPCService.StreamInfer

string model_name = 1
@@ .. cpp:var:: string model_name @@ @@ The name of the model to use for inferencing. @@
int64 model_version = 2
@@ .. cpp:var:: int64 model_version @@ @@ The version of the model to use for inference. If -1 @@ the latest/most-recent version of the model is used. @@
optional InferRequestHeader meta_data = 3
@@ .. cpp:var:: InferRequestHeader meta_data @@ @@ Meta-data for the request: input tensors, output @@ tensors, etc. @@
repeated bytes raw_input = 4
@@ .. cpp:var:: bytes raw_input (repeated) @@ @@ The raw input tensor data in the order specified in 'meta_data'. @@

@@ @@.. cpp:var:: message InferRequestHeader @@ @@ Meta-data for an inferencing request. The actual input data is @@ delivered separate from this header, in the HTTP body for an HTTP @@ request, or in the :cpp:var:`InferRequest` message for a gRPC request. @@

Used in: InferRequest

uint64 id = 5
@@ .. cpp:var:: uint64 id @@ @@ The ID of the inference request. The response of the request will @@ have the same ID in InferResponseHeader. The request sender can use @@ the ID to correlate the response to corresponding request if needed. @@
uint32 flags = 6
@@ .. cpp:var:: uint32 flags @@ @@ The flags associated with this request. This field holds a bitwise-or @@ of all flag values. @@
uint64 correlation_id = 4
@@ .. cpp:var:: uint64 correlation_id @@ @@ The correlation ID of the inference request. Default is 0, which @@ indicates that the request has no correlation ID. The correlation ID @@ is used to indicate two or more inference requests are related to @@ each other. How this relationship is handled by the inference @@ server is determined by the model's scheduling policy. @@
uint32 batch_size = 1
@@ .. cpp:var:: uint32 batch_size @@ @@ The batch size of the inference request. This must be >= 1. For @@ models that don't support batching, batch_size must be 1. @@
repeated InferRequestHeader.Input input = 2
@@ .. cpp:var:: Input input (repeated) @@ @@ The input meta-data for the inputs provided with the inference @@ request. @@
repeated InferRequestHeader.Output output = 3
@@ .. cpp:var:: Output output (repeated) @@ @@ The output meta-data for the inputs provided with the inference @@ request. @@
uint32 priority = 7
@@ .. cpp:var:: uint32 priority @@ @@ The priority value of this request. If priority handling is not @@ enabled for the model, then this value is ignored. The default value @@ is 0 which indicates that the request will be assigned the default @@ priority associated with the model. @@
uint64 timeout_microseconds = 8
@@ .. cpp:var:: uint64 timeout_microseconds @@ @@ The timeout for this request. This value overrides the timeout @@ specified by the model, if the model allows timeout override and if @@ the value is less than the default timeout specified by the model. @@ If the request cannot be processed within this timeout, the request @@ will be handled based on the model's timeout policy. @@ Note that requests for ensemble models cannot override the timeout @@ values for the composing models. @@ The default value is 0 which indicates that the request does not @@ override the model's timeout value. @@

@@ .. cpp:enum:: Flag @@ @@ Flags that can be associated with an inference request. @@ All flags are packed bitwise into the 'flags' field and @@ so the value of each must be a power-of-2. @@

FLAG_NONE = 0
@@ .. cpp:enumerator:: Flag::FLAG_NONE = 0 @@ @@ Value indicating no flags are enabled. @@
FLAG_SEQUENCE_START = 1
@@ .. cpp:enumerator:: Flag::FLAG_SEQUENCE_START = 1 << 0 @@ @@ This request is the start of a related sequence of requests. @@
FLAG_SEQUENCE_END = 2
@@ .. cpp:enumerator:: Flag::FLAG_SEQUENCE_END = 1 << 1 @@ @@ This request is the end of a related sequence of requests. @@

@@ .. cpp:var:: message Input @@ @@ Meta-data for an input tensor provided as part of an inferencing @@ request. @@

Used in: InferRequestHeader

string name = 1
@@ .. cpp:var:: string name @@ @@ The name of the input tensor. @@
repeated int64 dims = 2
@@ .. cpp:var:: int64 dims (repeated) @@ @@ The shape of the input tensor, not including the batch dimension. @@ Optional if the model configuration for this input explicitly @@ specifies all dimensions of the shape. Required if the model @@ configuration for this input has any wildcard dimensions (-1). @@
uint64 batch_byte_size = 3
@@ .. cpp:var:: uint64 batch_byte_size @@ @@ The size of the full batch of the input tensor, in bytes. @@ Optional for tensors with fixed-sized datatypes. Required @@ for tensors with a non-fixed-size datatype (like STRING). @@
optional InferSharedMemory shared_memory = 4
@@ .. cpp:var:: InferSharedMemory shared_memory @@ @@ It is the location in shared memory that contains the tensor data @@ for this input. Using shared memory is optional but if this @@ message is used, all fields are required. @@

@@ .. cpp:var:: message Output @@ @@ Meta-data for a requested output tensor as part of an inferencing @@ request. @@

Used in: InferRequestHeader

string name = 1
@@ .. cpp:var:: string name @@ @@ The name of the output tensor. @@
optional Output.Class cls = 3
@@ .. cpp:var:: Class cls @@ @@ Optional. If defined return this output as a classification @@ instead of raw data. The output tensor will be interpreted as @@ probabilities and the classifications associated with the @@ highest probabilities will be returned. @@
optional InferSharedMemory shared_memory = 4
@@ .. cpp:var:: InferSharedMemory shared_memory @@ @@ It is the location in shared memory that the result tensor data @@ for this output will be written. Using shared memory is optional @@ but if this message is used, all fields are required. @@

@@ .. cpp:var:: message Class @@ @@ Options for an output returned as a classification. @@

Used in: Output

uint32 count = 1
@@ .. cpp:var:: uint32 count @@ @@ Indicates how many classification values should be returned @@ for the output. The 'count' highest priority values are @@ returned. @@

@@ @@.. cpp:var:: message InferRequestStats @@ @@ Statistics collected for Infer requests. @@

Used in: ModelVersionStatus

optional StatDuration success = 1
@@ .. cpp:var:: StatDuration success @@ @@ Total time required to handle successful Infer requests, not @@ including HTTP or GRPC endpoint handling time. @@
optional StatDuration failed = 2
@@ .. cpp:var:: StatDuration failed @@ @@ Total time required to handle failed Infer requests, not @@ including HTTP or GRPC endpoint handling time. @@
optional StatDuration compute = 3
@@ .. cpp:var:: StatDuration compute @@ @@ Time required to run inferencing for an inference request; @@ including time copying input tensors to GPU memory, time @@ executing the model, and time copying output tensors from GPU @@ memory. @@
optional StatDuration queue = 4
@@ .. cpp:var:: StatDuration queue @@ @@ Time an inference request waits in scheduling queue for an @@ available model instance. @@
optional StatDuration compute_input = 5
@@ .. cpp:var:: StatDuration compute_input @@ @@ The count and cumulative duration to prepare input tensor data as @@ required by the model framework / backend. For example, this duration @@ should include the time to copy input tensor data to the GPU. @@
optional StatDuration compute_infer = 6
@@ .. cpp:var:: StatDuration compute_infer @@ @@ The count and cumulative duration to execute the model. @@
optional StatDuration compute_output = 7
@@ .. cpp:var:: StatDuration compute_output @@ @@ The count and cumulative duration to extract output tensor data @@ produced by the model framework / backend. For example, this duration @@ should include the time to copy output tensor data from the GPU. @@

@@ @@.. cpp:var:: message InferResponse @@ @@ Response message for Infer gRPC endpoint. @@

Used as response type in: GRPCService.Infer, GRPCService.StreamInfer

optional RequestStatus request_status = 1
@@ @@ .. cpp:var:: RequestStatus request_status @@ @@ The status of the request, indicating success or failure. @@
optional InferResponseHeader meta_data = 2
@@ .. cpp:var:: InferResponseHeader meta_data @@ @@ The response meta-data for the output tensors. @@
repeated bytes raw_output = 3
@@ .. cpp:var:: bytes raw_output (repeated) @@ @@ The raw output tensor data in the order specified in 'meta_data'. @@

@@ @@.. cpp:var:: message InferResponseHeader @@ @@ Meta-data for the response to an inferencing request. The actual output @@ data is delivered separate from this header, in the HTTP body for an HTTP @@ request, or in the :cpp:var:`InferResponse` message for a gRPC request. @@

Used in: InferResponse

uint64 id = 5
@@ .. cpp:var:: uint64 id @@ @@ The ID of the inference response. The response will have the same ID @@ as the ID of its originated request. The request sender can use @@ the ID to correlate the response to corresponding request if needed. @@
string model_name = 1
@@ .. cpp:var:: string model_name @@ @@ The name of the model that produced the outputs. @@
int64 model_version = 2
@@ .. cpp:var:: int64 model_version @@ @@ The version of the model that produced the outputs. @@
uint32 batch_size = 3
@@ .. cpp:var:: uint32 batch_size @@ @@ The batch size of the outputs. This will always be equal to the @@ batch size of the inputs. For models that don't support @@ batching the batch_size will be 1. @@
repeated InferResponseHeader.Output output = 4
@@ .. cpp:var:: Output output (repeated) @@ @@ The outputs, in the same order as they were requested in @@ :cpp:var:`InferRequestHeader`. @@

@@ .. cpp:var:: message Output @@ @@ Meta-data for an output tensor requested as part of an inferencing @@ request. @@

Used in: InferResponseHeader

string name = 1
@@ .. cpp:var:: string name @@ @@ The name of the output tensor. @@
DataType data_type = 4
@@ .. cpp:var:: DataType data_type @@ @@ The datatype of the output tensor. @@
optional Output.Raw raw = 2
@@ .. cpp:var:: Raw raw @@ @@ If specified deliver results for this output as raw tensor data. @@ The actual output data is delivered in the HTTP body for an HTTP @@ request, or in the :cpp:var:`InferResponse` message for a gRPC @@ request. Only one of 'raw' and 'batch_classes' may be specified. @@
repeated Output.Classes batch_classes = 3
@@ .. cpp:var:: Classes batch_classes (repeated) @@ @@ If specified deliver results for this output as classifications. @@ There is one :cpp:var:`Classes` object for each batch entry in @@ the output. Only one of 'raw' and 'batch_classes' may be @@ specified. @@

@@ .. cpp:var:: message Class @@ @@ Information about each classification for this output. @@

Used in: Classes

int32 idx = 1
@@ .. cpp:var:: int32 idx @@ @@ The classification index. @@
float value = 2
@@ .. cpp:var:: float value @@ @@ The classification value as a float (typically a @@ probability). @@
string label = 3
@@ .. cpp:var:: string label @@ @@ The label for the class (optional, only available if provided @@ by the model). @@

@@ .. cpp:var:: message Classes @@ @@ Meta-data for an output tensor being returned as classifications. @@

Used in: Output

repeated Class cls = 1
@@ .. cpp:var:: Class cls (repeated) @@ @@ The topk classes for this output. @@

@@ .. cpp:var:: message Raw @@ @@ Meta-data for an output tensor being returned as raw data. @@

Used in: Output

repeated int64 dims = 1
@@ .. cpp:var:: int64 dims (repeated) @@ @@ The shape of the output tensor, not including the batch @@ dimension. @@
uint64 batch_byte_size = 2
@@ .. cpp:var:: uint64 batch_byte_size @@ @@ The full size of the output tensor, in bytes. For a @@ batch output, this is the size of the entire batch. @@

@@.. cpp:var:: message InferSharedMemory @@ @@ The meta-data for the shared memory from which to read the input @@ data and/or write the output data. @@

Used in: InferRequestHeader.Input, InferRequestHeader.Output

string name = 1
@@ .. cpp:var:: string name @@ @@ The name given during registration of a shared memory region that @@ holds the input data (or where the output data should be written). @@
uint64 offset = 2
@@ .. cpp:var:: uint64 offset @@ @@ The offset from the start of the shared memory region. @@ start = offset, end = offset + size; @@
uint64 byte_size = 3
@@ .. cpp:var:: uint64 byte_size @@ @@ Size of the memory block, in bytes. @@

@@ @@.. cpp:var:: message InferStatistics @@ @@ Inference statistics. @@

Used in: ModelStatisticsResponse

optional StatisticDuration success = 1
@@ .. cpp:var:: StatisticDuration success @@ @@ Cumulative count and duration for successful inference @@ request. @@
optional StatisticDuration fail = 2
@@ .. cpp:var:: StatisticDuration fail @@ @@ Cumulative count and duration for failed inference @@ request. @@
optional StatisticDuration queue = 3
@@ .. cpp:var:: StatisticDuration queue @@ @@ The count and cumulative duration that inference requests wait in @@ scheduling or other queues. @@
optional StatisticDuration compute_input = 4
@@ .. cpp:var:: StatisticDuration compute_input @@ @@ The count and cumulative duration to prepare input tensor data as @@ required by the model framework / backend. For example, this duration @@ should include the time to copy input tensor data to the GPU. @@
optional StatisticDuration compute_infer = 5
@@ .. cpp:var:: StatisticDuration compute_infer @@ @@ The count and cumulative duration to execute the model. @@
optional StatisticDuration compute_output = 6
@@ .. cpp:var:: StatisticDuration compute_output @@ @@ The count and cumulative duration to extract output tensor data @@ produced by the model framework / backend. For example, this duration @@ should include the time to copy output tensor data from the GPU. @@

message InferTensorContents

grpc_service_v2.proto:464

@@ @@.. cpp:var:: message InferTensorContents @@ @@ The data contained in a tensor. For a given data type the @@ tensor contents can be represented in "raw" bytes form or in @@ the repeated type that matches the tensor's data type. Protobuf @@ oneof is not used because oneofs cannot contain repeated fields. @@

Used in: ModelInferRequest.InferInputTensor, ModelInferResponse.InferOutputTensor

bytes raw_contents = 1
@@ @@ .. cpp:var:: bytes raw_contents @@ @@ Raw representation of the tensor contents. The size of this @@ content must match what is expected by the tensor's shape @@ and data type. The raw data must be the flattened, one-dimensional, @@ row-major order of the tensor elements without any stride or padding @@ between the elements. Note that the FP16 data type must be @@ represented as raw content as there is no standard support for a @@ 16-bit float type. @@
repeated bool bool_contents = 2
@@ @@ .. cpp:var:: bool bool_contents (repeated) @@ @@ Representation for BOOL data type. The size must match what is @@ expected by the tensor's shape. The contents must be the flattened, @@ one-dimensional, row-major order of the tensor elements. @@
repeated int32 int_contents = 3
@@ @@ .. cpp:var:: int32 int_contents (repeated) @@ @@ Representation for INT8, INT16, and INT32 data types. The size @@ must match what is expected by the tensor's shape. The contents @@ must be the flattened, one-dimensional, row-major order of the @@ tensor elements. @@
repeated int64 int64_contents = 4
@@ @@ .. cpp:var:: int64 int64_contents (repeated) @@ @@ Representation for INT64 data types. The size must match what @@ is expected by the tensor's shape. The contents must be the @@ flattened, one-dimensional, row-major order of the tensor elements. @@
repeated uint32 uint_contents = 5
@@ @@ .. cpp:var:: uint32 uint_contents (repeated) @@ @@ Representation for UINT8, UINT16, and UINT32 data types. The size @@ must match what is expected by the tensor's shape. The contents @@ must be the flattened, one-dimensional, row-major order of the @@ tensor elements. @@
repeated uint64 uint64_contents = 6
@@ @@ .. cpp:var:: uint64 uint64_contents (repeated) @@ @@ Representation for UINT64 data types. The size must match what @@ is expected by the tensor's shape. The contents must be the @@ flattened, one-dimensional, row-major order of the tensor elements. @@
repeated float fp32_contents = 7
@@ @@ .. cpp:var:: float fp32_contents (repeated) @@ @@ Representation for FP32 data type. The size must match what is @@ expected by the tensor's shape. The contents must be the flattened, @@ one-dimensional, row-major order of the tensor elements. @@
repeated double fp64_contents = 8
@@ @@ .. cpp:var:: double fp64_contents (repeated) @@ @@ Representation for FP64 data type. The size must match what is @@ expected by the tensor's shape. The contents must be the flattened, @@ one-dimensional, row-major order of the tensor elements. @@
repeated bytes byte_contents = 9
@@ @@ .. cpp:var:: bytes byte_contents (repeated) @@ @@ Representation for BYTES data type. The size must match what is @@ expected by the tensor's shape. The contents must be the flattened, @@ one-dimensional, row-major order of the tensor elements. @@

@@ @@.. cpp:var:: message ModelConfig @@ @@ A model configuration. @@

Used in: ModelConfigResponse, ModelStatus

string name = 1
@@ .. cpp:var:: string name @@ @@ The name of the model. @@
string platform = 2
@@ .. cpp:var:: string platform @@ @@ The framework for the model. Possible values are @@ "tensorrt_plan", "tensorflow_graphdef", @@ "tensorflow_savedmodel", "caffe2_netdef", @@ "onnxruntime_onnx", "pytorch_libtorch" and "custom". @@
optional ModelVersionPolicy version_policy = 3
@@ .. cpp:var:: ModelVersionPolicy version_policy @@ @@ Policy indicating which version(s) of the model will be served. @@
int32 max_batch_size = 4
@@ .. cpp:var:: int32 max_batch_size @@ @@ Maximum batch size allowed for inference. This can only decrease @@ what is allowed by the model itself. A max_batch_size value of 0 @@ indicates that batching is not allowed for the model and the @@ dimension/shape of the input and output tensors must exactly @@ match what is specified in the input and output configuration. A @@ max_batch_size value > 0 indicates that batching is allowed and @@ so the model expects the input tensors to have an additional @@ initial dimension for the batching that is not specified in the @@ input (for example, if the model supports batched inputs of @@ 2-dimensional tensors then the model configuration will specify @@ the input shape as [ X, Y ] but the model will expect the actual @@ input tensors to have shape [ N, X, Y ]). For max_batch_size > 0 @@ returned outputs will also have an additional initial dimension @@ for the batch. @@
repeated ModelInput input = 5
@@ .. cpp:var:: ModelInput input (repeated) @@ @@ The inputs requested by the model. @@
repeated ModelOutput output = 6
@@ .. cpp:var:: ModelOutput output (repeated) @@ @@ The outputs produced by the model. @@
optional ModelOptimizationPolicy optimization = 12
@@ .. cpp:var:: ModelOptimizationPolicy optimization @@ @@ Optimization configuration for the model. If not specified @@ then default optimization policy is used. @@
oneof scheduling_choice
@@ .. cpp:var:: oneof scheduling_choice @@ @@ The scheduling policy for the model. If not specified the @@ default scheduling policy is used for the model. The default @@ policy is to execute each inference request independently. @@
- ModelDynamicBatching dynamic_batching = 11
  @@ .. cpp:var:: ModelDynamicBatching dynamic_batching @@ @@ If specified, enables the dynamic-batching scheduling @@ policy. With dynamic-batching the scheduler may group @@ together independent requests into a single batch to @@ improve inference throughput. @@
- ModelSequenceBatching sequence_batching = 13
  @@ .. cpp:var:: ModelSequenceBatching sequence_batching @@ @@ If specified, enables the sequence-batching scheduling @@ policy. With sequence-batching, inference requests @@ with the same correlation ID are routed to the same @@ model instance. Multiple sequences of inference requests @@ may be batched together into a single batch to @@ improve inference throughput. @@
- ModelEnsembling ensemble_scheduling = 15
  @@ .. cpp:var:: ModelEnsembling ensemble_scheduling @@ @@ If specified, enables the model-ensembling scheduling @@ policy. With model-ensembling, inference requests @@ will be processed according to the specification, such as an @@ execution sequence of models. The input specified in this model @@ config will be the input for the ensemble, and the output @@ specified will be the output of the ensemble. @@
repeated ModelInstanceGroup instance_group = 7
@@ .. cpp:var:: ModelInstanceGroup instance_group (repeated) @@ @@ Instances of this model. If not specified, one instance @@ of the model will be instantiated on each available GPU. @@
string default_model_filename = 8
@@ .. cpp:var:: string default_model_filename @@ @@ Optional filename of the model file to use if a @@ compute-capability specific model is not specified in @@ :cpp:var:`cc_model_filenames`. If not specified the default name @@ is 'model.graphdef', 'model.savedmodel', 'model.plan' or @@ 'model.netdef' depending on the model type. @@
map<string, string> cc_model_filenames = 9
@@ .. cpp:var:: map<string,string> cc_model_filenames @@ @@ Optional map from CUDA compute capability to the filename of @@ the model that supports that compute capability. The filename @@ refers to a file within the model version directory. @@
map<string, string> metric_tags = 10
@@ .. cpp:var:: map<string,string> metric_tags @@ @@ Optional metric tags. User-specific key-value pairs for metrics @@ reported for this model. These tags are applied to the metrics @@ reported on the HTTP metrics port. @@
map<string, ModelParameter> parameters = 14
@@ .. cpp:var:: map<string,ModelParameter> parameters @@ @@ Optional model parameters. User-specified parameter values that @@ are made available to custom backends. @@
repeated ModelWarmup model_warmup = 16
@@ .. cpp:var:: ModelWarmup model_warmup (repeated) @@ @@ Warmup setting of this model. If specified, all instances @@ will be run with the request samples in sequence before @@ serving the model. @@ This field can only be specified if the model is not an ensemble @@ model. @@

@@ .. cpp:enum:: Type @@ @@ Types of control operation @@

Used in: ModelControlRequest

UNLOAD = 0
@@ .. cpp:enumerator:: Type::UNLOAD = 0 @@ @@ To unload the specified model. @@
LOAD = 1
@@ .. cpp:enumerator:: Type::LOAD = 1 @@ @@ To load the specified model. If the model has been loaded, @@ it will be reloaded to fetch the latest change. @@

@@ @@.. cpp:var:: message ModelControlRequestStats @@ @@ Statistics collected for ModelControl requests. @@

Used in: ServerStatus

optional StatDuration success = 1
@@ .. cpp:var:: StatDuration success @@ @@ Total time required to handle successful ModelControl requests, not @@ including HTTP or gRPC endpoint termination time. @@

@@ @@.. cpp:var:: message ModelDynamicBatching @@ @@ Dynamic batching configuration. These settings control how dynamic @@ batching operates for the model. @@

Used in: ModelConfig

repeated int32 preferred_batch_size = 1
@@ .. cpp:var:: int32 preferred_batch_size (repeated) @@ @@ Preferred batch sizes for dynamic batching. If a batch of one of @@ these sizes can be formed it will be executed immediately. If @@ not specified a preferred batch size will be chosen automatically @@ based on model and GPU characteristics. @@
uint64 max_queue_delay_microseconds = 2
@@ .. cpp:var:: uint64 max_queue_delay_microseconds @@ @@ The maximum time, in microseconds, a request will be delayed in @@ the scheduling queue to wait for additional requests for @@ batching. Default is 0. @@
bool preserve_ordering = 3
@@ .. cpp:var:: bool preserve_ordering @@ @@ Should the dynamic batcher preserve the ordering of responses to @@ match the order of requests received by the scheduler. Default is @@ false. If true, the responses will be returned in the same order as @@ the order of requests sent to the scheduler. If false, the responses @@ may be returned in arbitrary order. This option is specifically @@ needed when a sequence of related inference requests (i.e. inference @@ requests with the same correlation ID) are sent to the dynamic @@ batcher to ensure that the sequence responses are in the correct @@ order. @@
uint32 priority_levels = 4
@@ .. cpp:var:: uint32 priority_levels @@ @@ The number of priority levels to be enabled for the model, @@ the priority level starts from 1 and 1 is the highest priority. @@ Requests are handled in priority order with all priority 1 requests @@ processed before priority 2, all priority 2 requests processed before @@ priority 3, etc. Requests with the same priority level will be @@ handled in the order that they are received. @@
uint32 default_priority_level = 5
@@ .. cpp:var:: uint32 default_priority_level @@ @@ The priority level used for requests that don't specify their @@ priority. The value must be in the range [ 1, 'priority_levels' ]. @@
optional ModelQueuePolicy default_queue_policy = 6
@@ .. cpp:var:: ModelQueuePolicy default_queue_policy @@ @@ The default queue policy used for requests that don't require @@ priority handling and requests that specify priority levels where @@ there is no specific policy given. If not specified, a policy with @@ default field values will be used. @@
map<uint32, ModelQueuePolicy> priority_queue_policy = 7
@@ .. cpp:var:: map<uint32, ModelQueuePolicy> priority_queue_policy @@ @@ Specify the queue policy for the priority level. The default queue @@ policy will be used if a priority level doesn't specify a queue @@ policy. @@

@@ @@.. cpp:var:: message ModelEnsembling @@ @@ Model ensembling configuration. These settings specify the models that @@ compose the ensemble and how data flows between the models. @@

Used in: ModelConfig

repeated ModelEnsembling.Step step = 1
@@ .. cpp:var:: Step step (repeated) @@ @@ The models and the input / output mappings used within the ensemble. @@

@@ .. cpp:var:: message Step @@ @@ Each step specifies a model included in the ensemble, @@ maps ensemble tensor names to the model input tensors, @@ and maps model output tensors to ensemble tensor names @@

Used in: ModelEnsembling

string model_name = 1
@@ .. cpp:var:: string model_name @@ @@ The name of the model to execute for this step of the ensemble. @@
int64 model_version = 2
@@ .. cpp:var:: int64 model_version @@ @@ The version of the model to use for inference. If -1 @@ the latest/most-recent version of the model is used. @@
map<string, string> input_map = 3
@@ .. cpp:var:: map<string,string> input_map @@ @@ Map from name of an input tensor on this step's model to ensemble @@ tensor name. The ensemble tensor must have the same data type and @@ shape as the model input. Each model input must be assigned to @@ one ensemble tensor, but the same ensemble tensor can be assigned @@ to multiple model inputs. @@
map<string, string> output_map = 4
@@ .. cpp:var:: map<string,string> output_map @@ @@ Map from name of an output tensor on this step's model to ensemble @@ tensor name. The data type and shape of the ensemble tensor will @@ be inferred from the model output. It is optional to assign all @@ model outputs to ensemble tensors. One ensemble tensor name @@ can appear in an output map only once. @@

@@ @@.. cpp:var:: message ModelInferRequest @@ @@ Request message for ModelInfer. @@

Used as request type in: GRPCInferenceService.ModelInfer, GRPCInferenceService.ModelStreamInfer

string model_name = 1
@@ .. cpp:var:: string model_name @@ @@ The name of the model to use for inferencing. @@
string model_version = 2
@@ .. cpp:var:: string model_version @@ @@ The version of the model to use for inference. If not @@ given the latest/most-recent version of the model is used. @@
string id = 3
@@ .. cpp:var:: string id @@ @@ Optional identifier for the request. If specified will be @@ returned in the response. @@
map<string, InferParameter> parameters = 4
@@ .. cpp:var:: map<string,InferParameter> parameters @@ @@ Optional inference parameters. @@
repeated ModelInferRequest.InferInputTensor inputs = 5
@@ @@ .. cpp:var:: InferInputTensor inputs (repeated) @@ @@ The input tensors for the inference. @@
repeated ModelInferRequest.InferRequestedOutputTensor outputs = 6
@@ @@ .. cpp:var:: InferRequestedOutputTensor outputs (repeated) @@ @@ The requested output tensors for the inference. Optional, if not @@ specified all outputs specified in the model config will be @@ returned. @@

@@ @@ .. cpp:var:: message InferInputTensor @@ @@ An input tensor for an inference request. @@

Used in: ModelInferRequest

string name = 1
@@ @@ .. cpp:var:: string name @@ @@ The tensor name. @@
string datatype = 2
@@ @@ .. cpp:var:: string datatype @@ @@ The tensor data type. @@
repeated int64 shape = 3
@@ @@ .. cpp:var:: int64 shape (repeated) @@ @@ The tensor shape. @@
map<string, InferParameter> parameters = 4
@@ .. cpp:var:: map<string,InferParameter> parameters @@ @@ Optional inference input tensor parameters. @@
optional InferTensorContents contents = 5
@@ .. cpp:var:: InferTensorContents contents @@ @@ The input tensor data. @@

@@ @@ .. cpp:var:: message InferRequestedOutputTensor @@ @@ An output tensor requested for an inference request. @@

Used in: ModelInferRequest

string name = 1
@@ @@ .. cpp:var:: string name @@ @@ The tensor name. @@
map<string, InferParameter> parameters = 2
@@ .. cpp:var:: map<string,InferParameter> parameters @@ @@ Optional requested output tensor parameters. @@

@@ @@.. cpp:var:: message ModelInferResponse @@ @@ Response message for ModelInfer. @@

Used as response type in: GRPCInferenceService.ModelInfer

Used as field type in: ModelStreamInferResponse

string model_name = 1
@@ .. cpp:var:: string model_name @@ @@ The name of the model used for inference. @@
string model_version = 2
@@ .. cpp:var:: string model_version @@ @@ The version of the model used for inference. @@
string id = 3
@@ .. cpp:var:: string id @@ @@ The id of the inference request if one was specified. @@
map<string, InferParameter> parameters = 4
@@ .. cpp:var:: map<string,InferParameter> parameters @@ @@ Optional inference response parameters. @@
repeated ModelInferResponse.InferOutputTensor outputs = 5
@@ @@ .. cpp:var:: InferOutputTensor outputs (repeated) @@ @@ The output tensors holding inference results. @@

@@ @@ .. cpp:var:: message InferOutputTensor @@ @@ An output tensor returned for an inference request. @@

Used in: ModelInferResponse

string name = 1
@@ @@ .. cpp:var:: string name @@ @@ The tensor name. @@
string datatype = 2
@@ @@ .. cpp:var:: string datatype @@ @@ The tensor data type. @@
repeated int64 shape = 3
@@ @@ .. cpp:var:: int64 shape (repeated) @@ @@ The tensor shape. @@
optional InferTensorContents contents = 4
@@ .. cpp:var:: InferTensorContents contents @@ @@ The output tensor data. @@

@@ @@.. cpp:var:: message ModelInput @@ @@ An input required by the model. @@

Used in: ModelConfig

string name = 1
@@ .. cpp:var:: string name @@ @@ The name of the input. @@
DataType data_type = 2
@@ .. cpp:var:: DataType data_type @@ @@ The data-type of the input. @@
ModelInput.Format format = 3
@@ .. cpp:var:: Format format @@ @@ The format of the input. Optional. @@
repeated int64 dims = 4
@@ .. cpp:var:: int64 dims (repeated) @@ @@ The dimensions/shape of the input tensor that must be provided @@ when invoking the inference API for this model. @@
optional ModelTensorReshape reshape = 5
@@ .. cpp:var:: ModelTensorReshape reshape @@ @@ The shape expected for this input by the backend. The input will @@ be reshaped to this before being presented to the backend. The @@ reshape must have the same number of elements as the input shape @@ specified by 'dims'. Optional. @@
bool is_shape_tensor = 6
@@ .. cpp:var:: bool is_shape_tensor @@ @@ Whether or not the input is a shape tensor to the model. This field @@ is currently supported only for the TensorRT model. An error will be @@ generated if this specification does not comply with underlying @@ model. @@
bool allow_ragged_batch = 7
@@ .. cpp:var:: bool allow_ragged_batch @@ @@ Whether or not the input is allowed to be "ragged" in a dynamically @@ created batch. Default is false indicating that two requests will @@ only be batched if this tensor has the same shape in both requests. @@ True indicates that two requests can be batched even if this tensor @@ has a different shape in each request. A true value is currently @@ supported only for custom models. @@

@@ @@ .. cpp:enum:: Format @@ @@ The format for the input. @@

Used in: ModelInput

FORMAT_NONE = 0
@@ .. cpp:enumerator:: Format::FORMAT_NONE = 0 @@ @@ The input has no specific format. This is the default. @@
FORMAT_NHWC = 1
@@ .. cpp:enumerator:: Format::FORMAT_NHWC = 1 @@ @@ HWC image format. Tensors with this format require 3 dimensions @@ if the model does not support batching (max_batch_size = 0) or 4 @@ dimensions if the model does support batching (max_batch_size @@ >= 1). In either case the 'dims' below should only specify the @@ 3 non-batch dimensions (i.e. HWC or CHW). @@
FORMAT_NCHW = 2
@@ .. cpp:enumerator:: Format::FORMAT_NCHW = 2 @@ @@ CHW image format. Tensors with this format require 3 dimensions @@ if the model does not support batching (max_batch_size = 0) or 4 @@ dimensions if the model does support batching (max_batch_size @@ >= 1). In either case the 'dims' below should only specify the @@ 3 non-batch dimensions (i.e. HWC or CHW). @@

@@ @@.. cpp:var:: message ModelInstanceGroup @@ @@ A group of one or more instances of a model and resources made @@ available for those instances. @@

Used in: ModelConfig

string name = 1
@@ .. cpp:var:: string name @@ @@ Optional name of this group of instances. If not specified the @@ name will be formed as <model name>_<group number>. The name of @@ individual instances will be further formed by a unique instance @@ number and GPU index: @@
ModelInstanceGroup.Kind kind = 4
@@ .. cpp:var:: Kind kind @@ @@ The kind of this instance group. Default is KIND_AUTO. If @@ KIND_AUTO or KIND_GPU then both 'count' and 'gpu' are valid and @@ may be specified. If KIND_CPU or KIND_MODEL only 'count' is valid @@ and 'gpu' cannot be specified. @@
int32 count = 2
@@ .. cpp:var:: int32 count @@ @@ For a group assigned to GPU, the number of instances created for @@ each GPU listed in 'gpus'. For a group assigned to CPU the number @@ of instances created. Default is 1. @@
repeated int32 gpus = 3
@@ .. cpp:var:: int32 gpus (repeated) @@ @@ GPU(s) where instances should be available. For each GPU listed, @@ 'count' instances of the model will be available. Setting 'gpus' @@ to empty (or not specifying at all) is equivalent to listing all @@ available GPUs. @@
repeated string profile = 5
@@ .. cpp:var:: string profile (repeated) @@ @@ For TensorRT models, using inputs with dynamic shape, this @@ parameter specifies a set of optimization profiles available to this @@ instance group. The inference server will choose the optimal profile @@ based on the shapes of the input tensors. This field should lie @@ between 0 and <TotalNumberOfOptimizationProfilesInPlanModel> - 1 @@ and be specified only for TensorRT backend, otherwise an error will @@ be generated. @@

@@ @@ .. cpp:enum:: Kind @@ @@ Kind of this instance group. @@

Used in: ModelInstanceGroup

KIND_AUTO = 0
@@ .. cpp:enumerator:: Kind::KIND_AUTO = 0 @@ @@ This instance group represents instances that can run on either @@ CPU or GPU. If all GPUs listed in 'gpus' are available then @@ instances will be created on GPU(s), otherwise instances will @@ be created on CPU. @@
KIND_GPU = 1
@@ .. cpp:enumerator:: Kind::KIND_GPU = 1 @@ @@ This instance group represents instances that must run on the @@ GPU. @@
KIND_CPU = 2
@@ .. cpp:enumerator:: Kind::KIND_CPU = 2 @@ @@ This instance group represents instances that must run on the @@ CPU. @@
KIND_MODEL = 3
@@ .. cpp:enumerator:: Kind::KIND_MODEL = 3 @@ @@ This instance group represents instances that should run on the @@ CPU and/or GPU(s) as specified by the model or backend itself. @@ The inference server will not override the model/backend @@ settings. @@ Currently, this option is supported only for Tensorflow models. @@

@@ @@ .. cpp:var:: message TensorMetadata @@ @@ Metadata for a tensor. @@

Used in: ModelMetadataResponse

string name = 1
@@ @@ .. cpp:var:: string name @@ @@ The tensor name. @@
string datatype = 2
@@ @@ .. cpp:var:: string datatype @@ @@ The tensor data type. @@
repeated int64 shape = 3
@@ @@ .. cpp:var:: int64 shape (repeated) @@ @@ The tensor shape. A variable-size dimension is represented @@ by a -1 value. @@

@@ @@.. cpp:var:: message ModelOptimizationPolicy @@ @@ Optimization settings for a model. These settings control if/how a @@ model is optimized and prioritized by the backend framework when @@ it is loaded. @@

Used in: ModelConfig

optional ModelOptimizationPolicy.Graph graph = 1
@@ .. cpp:var:: Graph graph @@ @@ The graph optimization setting for the model. Optional. @@
ModelOptimizationPolicy.ModelPriority priority = 2
@@ .. cpp:var:: ModelPriority priority @@ @@ The priority setting for the model. Optional. @@
optional ModelOptimizationPolicy.Cuda cuda = 3
@@ .. cpp:var:: Cuda cuda @@ @@ CUDA-specific optimization settings. Optional. @@
optional ModelOptimizationPolicy.ExecutionAccelerators execution_accelerators = 4
@@ .. cpp:var:: ExecutionAccelerators execution_accelerators @@ @@ The accelerators used for the model. Optional. @@
optional ModelOptimizationPolicy.PinnedMemoryBuffer input_pinned_memory = 5
@@ .. cpp:var:: PinnedMemoryBuffer input_pinned_memory @@ @@ Use pinned memory buffer when the data transfer for inputs @@ is between GPU memory and non-pinned system memory. @@ Default is true. @@
optional ModelOptimizationPolicy.PinnedMemoryBuffer output_pinned_memory = 6
@@ .. cpp:var:: PinnedMemoryBuffer output_pinned_memory @@ @@ Use pinned memory buffer when the data transfer for outputs @@ is between GPU memory and non-pinned system memory. @@ Default is true. @@

@@ @@ .. cpp:var:: message Cuda @@ @@ CUDA-specific optimization settings. @@

Used in: ModelOptimizationPolicy

bool graphs = 1
@@ .. cpp:var:: bool graphs @@ @@ Use CUDA graphs API to capture model operations and execute @@ them more efficiently. Currently only recognized by TensorRT @@ backend. @@

@@ @@ .. cpp:var:: message ExecutionAccelerators @@ @@ Specify the preferred execution accelerators to be used to execute @@ the model. Currently only recognized by ONNX Runtime backend and @@ TensorFlow backend. @@ @@ For ONNX Runtime backend, it will deploy the model with the execution @@ accelerators by priority, the priority is determined based on the @@ order that they are set, i.e. the provider at the front has highest @@ priority. Overall, the priority will be in the following order: @@ <gpu_execution_accelerator> (if instance is on GPU) @@ CUDA Execution Provider (if instance is on GPU) @@ <cpu_execution_accelerator> @@ Default CPU Execution Provider @@

Used in: ModelOptimizationPolicy

repeated ExecutionAccelerators.Accelerator gpu_execution_accelerator = 1
@@ .. cpp:var:: Accelerator gpu_execution_accelerator (repeated) @@ @@ The preferred execution provider to be used if the model instance @@ is deployed on GPU. @@ @@ For ONNX Runtime backend, possible value is "tensorrt" as name, @@ and no parameters are required. @@ @@ For TensorFlow backend, possible values are "tensorrt", "gpu_io". @@ @@ For "tensorrt", the following parameters can be specified: @@ "precision_mode": The precision used for optimization. @@ Allowed values are "FP32" and "FP16". Default value is "FP32". @@ @@ "max_cached_engines": The maximum number of cached TensorRT @@ engines in dynamic TensorRT ops. Default value is 100. @@ @@ "minimum_segment_size": The smallest model subgraph that will @@ be considered for optimization by TensorRT. Default value is 3. @@ @@ "max_workspace_size_bytes": The maximum GPU memory the model @@ can use temporarily during execution. Default value is 1GB. @@ @@ For "gpu_io", no parameters are required. If set, the model will @@ be executed using TensorFlow Callable API to set input and output @@ tensors in GPU memory if possible, which can reduce data transfer @@ overhead if the model is used in ensemble. However, the Callable @@ object will be created on model creation and it will request all @@ outputs for every model execution, which may impact the @@ performance if a request does not require all outputs. This @@ optimization will only take effect if the model instance is @@ created with KIND_GPU. @@
repeated ExecutionAccelerators.Accelerator cpu_execution_accelerator = 2
@@ .. cpp:var:: Accelerator cpu_execution_accelerator (repeated) @@ @@ The preferred execution provider to be used if the model instance @@ is deployed on CPU. @@ @@ For ONNX Runtime backend, possible value is "openvino" as name, @@ and no parameters are required. @@

@@ @@ .. cpp:var:: message Accelerator @@ @@ Specify the accelerator to be used to execute the model. @@ Accelerator with the same name may accept different parameters @@ depending on the backends. @@

Used in: ExecutionAccelerators

string name = 1
@@ .. cpp:var:: string name @@ @@ The name of the execution accelerator. @@
map<string, string> parameters = 2
@@ .. cpp:var:: map<string, string> parameters @@ @@ Additional parameters used to configure the accelerator. @@

@@ @@ .. cpp:var:: message Graph @@ @@ Enable generic graph optimization of the model. If not specified @@ the framework's default level of optimization is used. Supports @@ TensorFlow graphdef and savedmodel and Onnx models. For TensorFlow @@ causes XLA to be enabled/disabled for the model. For Onnx defaults @@ to enabling all optimizations, -1 enables only basic optimizations, @@ +1 enables only basic and extended optimizations. @@

Used in: ModelOptimizationPolicy

int32 level = 1
@@ .. cpp:var:: int32 level @@ @@ The optimization level. Defaults to 0 (zero) if not specified. @@ @@ - -1: Disabled @@ - 0: Framework default @@ - 1+: Enable optimization level (greater values indicate @@ higher optimization levels) @@

@@ @@ .. cpp:enum:: ModelPriority @@ @@ Model priorities. A model will be given scheduling and execution @@ preference over models at lower priorities. Current model @@ priorities only work for TensorRT models. @@

Used in: ModelOptimizationPolicy

PRIORITY_DEFAULT = 0
@@ .. cpp:enumerator:: ModelPriority::PRIORITY_DEFAULT = 0 @@ @@ The default model priority. @@
PRIORITY_MAX = 1
@@ .. cpp:enumerator:: ModelPriority::PRIORITY_MAX = 1 @@ @@ The maximum model priority. @@
PRIORITY_MIN = 2
@@ .. cpp:enumerator:: ModelPriority::PRIORITY_MIN = 2 @@ @@ The minimum model priority. @@

@@ @@ .. cpp:var:: message PinnedMemoryBuffer @@ @@ Specify whether to use a pinned memory buffer when transferring data @@ between non-pinned system memory and GPU memory. Using a pinned @@ memory buffer for system from/to GPU transfers will typically provide @@ increased performance. For example, in the common use case where the @@ request provides inputs and delivers outputs via non-pinned system @@ memory, if the model instance accepts GPU IOs, the inputs will be @@ processed by two copies: from non-pinned system memory to pinned @@ memory, and from pinned memory to GPU memory. Similarly, pinned @@ memory will be used for delivering the outputs. @@

Used in: ModelOptimizationPolicy

bool enable = 1
@@ .. cpp:var:: bool enable @@ @@ Use pinned memory buffer. Default is true. @@

@@ @@.. cpp:var:: message ModelOutput @@ @@ An output produced by the model. @@

Used in: ModelConfig

string name = 1
@@ .. cpp:var:: string name @@ @@ The name of the output. @@
DataType data_type = 2
@@ .. cpp:var:: DataType data_type @@ @@ The data-type of the output. @@
repeated int64 dims = 3
@@ .. cpp:var:: int64 dims (repeated) @@ @@ The dimensions/shape of the output tensor. @@
optional ModelTensorReshape reshape = 5
@@ .. cpp:var:: ModelTensorReshape reshape @@ @@ The shape produced for this output by the backend. The output will @@ be reshaped from this to the shape specified in 'dims' before being @@ returned in the inference response. The reshape must have the same @@ number of elements as the output shape specified by 'dims'. Optional. @@
string label_filename = 4
@@ .. cpp:var:: string label_filename @@ @@ The label file associated with this output. Should be specified only @@ for outputs that represent classifications. Optional. @@
bool is_shape_tensor = 6
@@ .. cpp:var:: bool is_shape_tensor @@ @@ Whether or not the output is a shape tensor to the model. This field @@ is currently supported only for the TensorRT model. An error will be @@ generated if this specification does not comply with underlying @@ model. @@

@@ @@.. cpp:var:: message ModelParameter @@ @@ A model parameter. @@

Used in: ModelConfig

string string_value = 1
@@ .. cpp:var:: string string_value @@ @@ The string value of the parameter. @@

@@ @@.. cpp:var:: message ModelQueuePolicy @@ @@ Queue policy for inference requests. @@

Used in: ModelDynamicBatching

ModelQueuePolicy.TimeoutAction timeout_action = 1
@@ @@ .. cpp:var:: TimeoutAction timeout_action @@ @@ The action applied to timed-out requests. @@ The default action is REJECT. @@
uint64 default_timeout_microseconds = 2
@@ @@ .. cpp:var:: uint64 default_timeout_microseconds @@ @@ The default timeout for every request, in microseconds. @@ The default value is 0 which indicates that no timeout is set. @@
bool allow_timeout_override = 3
@@ @@ .. cpp:var:: bool allow_timeout_override @@ @@ Whether an individual request can override the default timeout value. @@ When true, individual requests can set a timeout that is less than @@ the default timeout value but may not increase the timeout. @@ The default value is false. @@
uint32 max_queue_size = 4
@@ @@ .. cpp:var:: uint32 max_queue_size @@ @@ The maximum queue size for holding requests. A request will be @@ rejected immediately if it can't be enqueued because the queue is @@ full. The default value is 0 which indicates that no maximum @@ queue size is enforced. @@

@@ @@ .. cpp:enum:: TimeoutAction @@ @@ The action applied to timed-out requests. @@

Used in: ModelQueuePolicy

REJECT = 0
@@ .. cpp:enumerator:: TimeoutAction::REJECT = 0 @@ @@ Reject the request and return error message accordingly. @@
DELAY = 1
@@ .. cpp:enumerator:: TimeoutAction::DELAY = 1 @@ @@ Delay the request until all other requests at the same @@ (or higher) priority levels that have not reached their timeouts @@ are processed. A delayed request will eventually be processed, @@ but may be delayed indefinitely due to newly arriving requests. @@

@@ @@.. cpp:enum:: ModelReadyState @@ @@ Readiness status for models. @@

Used in: ModelVersionStatus

MODEL_UNKNOWN = 0
@@ .. cpp:enumerator:: ModelReadyState::MODEL_UNKNOWN = 0 @@ @@ The model is in an unknown state. The model is not available for @@ inferencing. @@
MODEL_READY = 1
@@ .. cpp:enumerator:: ModelReadyState::MODEL_READY = 1 @@ @@ The model is ready and available for inferencing. @@
MODEL_UNAVAILABLE = 2
@@ .. cpp:enumerator:: ModelReadyState::MODEL_UNAVAILABLE = 2 @@ @@ The model is unavailable, indicating that the model failed to @@ load or has been implicitly or explicitly unloaded. The model is @@ not available for inferencing. @@
MODEL_LOADING = 3
@@ .. cpp:enumerator:: ModelReadyState::MODEL_LOADING = 3 @@ @@ The model is being loaded by the inference server. The model is @@ not available for inferencing. @@
MODEL_UNLOADING = 4
@@ .. cpp:enumerator:: ModelReadyState::MODEL_UNLOADING = 4 @@ @@ The model is being unloaded by the inference server. The model is @@ not available for inferencing. @@

@@ @@.. cpp:var:: message ModelReadyStateReason @@ @@ Detail associated with a model's readiness status. @@

Used in: ModelVersionStatus

string message = 1
@@ .. cpp:var:: string message @@ @@ The message that explains the cause of being in the current readiness @@ state. @@

@@ @@.. cpp:var:: message ModelRepositoryIndex @@ @@ Index of the model repository monitored by the inference server. @@

Used in: RepositoryResponse

repeated ModelRepositoryIndex.ModelEntry models = 1
@@ @@ .. cpp:var:: ModelEntry models (repeated) @@ @@ The list of models in the model repository. @@

message ModelRepositoryIndex.ModelEntry

server_status.proto:534

@@ @@ .. cpp:var:: message ModelEntry @@ @@ The basic information for a model. @@

Used in: ModelRepositoryIndex

string name = 1
@@ .. cpp:var:: string name @@ @@ The model's name. @@

@@ @@.. cpp:var:: message ModelSequenceBatching @@ @@ Sequence batching configuration. These settings control how sequence @@ batching operates for the model. @@

Used in: ModelConfig

oneof strategy_choice
@@ .. cpp:var:: oneof strategy_choice @@ @@ The strategy used by the sequence batcher. Default strategy @@ is 'direct'. @@
- ModelSequenceBatching.StrategyDirect direct = 3
  @@ .. cpp:var:: StrategyDirect direct @@ @@ StrategyDirect scheduling strategy. @@
- ModelSequenceBatching.StrategyOldest oldest = 4
  @@ .. cpp:var:: StrategyOldest oldest @@ @@ StrategyOldest scheduling strategy. @@
uint64 max_sequence_idle_microseconds = 1
@@ .. cpp:var:: uint64 max_sequence_idle_microseconds @@ @@ The maximum time, in microseconds, that a sequence is allowed to @@ be idle before it is aborted. The inference server considers a @@ sequence idle when it does not have any inference request queued @@ for the sequence. If this limit is exceeded, the inference server @@ will free the sequence slot allocated by the sequence and make it @@ available for another sequence. If not specified (or specified as @@ zero) a default value of 1000000 (1 second) is used. @@
repeated ModelSequenceBatching.ControlInput control_input = 2
@@ .. cpp:var:: ControlInput control_input (repeated) @@ @@ The model input(s) that the server should use to communicate @@ sequence start, stop, ready and similar control values to the @@ model. @@

@@ .. cpp:var:: message Control @@ @@ A control is a signal that the sequence batcher uses to @@ communicate with a backend. @@

Used in: ControlInput

Control.Kind kind = 1
@@ .. cpp:var:: Kind kind @@ @@ The kind of this control. @@
repeated int32 int32_false_true = 2
@@ .. cpp:var:: int32 int32_false_true (repeated) @@ @@ The control's true and false setting is indicated by setting @@ a value in an int32 tensor. The tensor must be a @@ 1-dimensional tensor with size equal to the batch size of @@ the request. 'int32_false_true' must have two entries: the @@ first the false value and the second the true value. @@
repeated float fp32_false_true = 3
@@ .. cpp:var:: float fp32_false_true (repeated) @@ @@ The control's true and false setting is indicated by setting @@ a value in a fp32 tensor. The tensor must be a @@ 1-dimensional tensor with size equal to the batch size of @@ the request. 'fp32_false_true' must have two entries: the @@ first the false value and the second the true value. @@
DataType data_type = 4
@@ .. cpp:var:: DataType data_type @@ @@ The control's datatype. @@

@@ @@ .. cpp:enum:: Kind @@ @@ The kind of the control. @@

Used in: Control

CONTROL_SEQUENCE_START = 0
@@ .. cpp:enumerator:: Kind::CONTROL_SEQUENCE_START = 0 @@ @@ A new sequence is/is-not starting. If true a sequence is @@ starting, if false a sequence is continuing. Must @@ specify either int32_false_true or fp32_false_true for @@ this control. This control is optional. @@
CONTROL_SEQUENCE_READY = 1
@@ .. cpp:enumerator:: Kind::CONTROL_SEQUENCE_READY = 1 @@ @@ A sequence is/is-not ready for inference. If true the @@ input tensor data is valid and should be used. If false @@ the input tensor data is invalid and inferencing should @@ be "skipped". Must specify either int32_false_true or @@ fp32_false_true for this control. This control is optional. @@
CONTROL_SEQUENCE_END = 2
@@ .. cpp:enumerator:: Kind::CONTROL_SEQUENCE_END = 2 @@ @@ A sequence is/is-not ending. If true a sequence is @@ ending, if false a sequence is continuing. Must @@ specify either int32_false_true or fp32_false_true for @@ this control. This control is optional. @@
CONTROL_SEQUENCE_CORRID = 3
@@ .. cpp:enumerator:: Kind::CONTROL_SEQUENCE_CORRID = 3 @@ @@ The correlation ID of the sequence. The correlation ID @@ is a uint64_t value that is communicated in whole or @@ in part by the tensor. The tensor's datatype must be @@ specified by data_type and must be TYPE_UINT64, TYPE_INT64, @@ TYPE_UINT32 or TYPE_INT32. If a 32-bit datatype is specified @@ the correlation ID will be truncated to the low-order 32 @@ bits. This control is optional. @@

@@ .. cpp:var:: message ControlInput @@ @@ The sequence control values to communicate by a model input. @@

Used in: ModelSequenceBatching

string name = 1
@@ .. cpp:var:: string name @@ @@ The name of the model input. @@
repeated Control control = 2
@@ .. cpp:var:: Control control (repeated) @@ @@ The control value(s) that should be communicated to the @@ model using this model input. @@

@@ .. cpp:var:: message StrategyDirect @@ @@ The sequence batcher uses a specific, unique batch @@ slot for each sequence. All inference requests in a @@ sequence are directed to the same batch slot in the same @@ model instance over the lifetime of the sequence. This @@ is the default strategy. @@

Used in: ModelSequenceBatching

(message has no fields)

@@ .. cpp:var:: message StrategyOldest @@ @@ The sequence batcher maintains up to 'max_candidate_sequences' @@ candidate sequences. 'max_candidate_sequences' can be greater @@ than the model's 'max_batch_size'. For inferencing the batcher @@ chooses from the candidate sequences up to 'max_batch_size' @@ inference requests. Requests are chosen in an oldest-first @@ manner across all candidate sequences. A given sequence is @@ not guaranteed to be assigned to the same batch slot for @@ all inference requests of that sequence. @@

Used in: ModelSequenceBatching

int32 max_candidate_sequences = 1
@@ .. cpp:var:: int32 max_candidate_sequences @@ @@ Maximum number of candidate sequences that the batcher @@ maintains. Excess sequences are kept in an ordered backlog @@ and become candidates when existing candidate sequences @@ complete. @@
repeated int32 preferred_batch_size = 2
@@ .. cpp:var:: int32 preferred_batch_size (repeated) @@ @@ Preferred batch sizes for dynamic batching of candidate @@ sequences. If a batch of one of these sizes can be formed @@ it will be executed immediately. If not specified a @@ preferred batch size will be chosen automatically @@ based on model and GPU characteristics. @@
uint64 max_queue_delay_microseconds = 3
@@ .. cpp:var:: uint64 max_queue_delay_microseconds @@ @@ The maximum time, in microseconds, a candidate request @@ will be delayed in the dynamic batch scheduling queue to @@ wait for additional requests for batching. Default is 0. @@

@@ @@.. cpp:var:: message ModelStatus @@ @@ Status for a model. @@

Used in: ServerStatus

optional ModelConfig config = 1
@@ .. cpp:var:: ModelConfig config @@ @@ The configuration for the model. @@
map<int64, ModelVersionStatus> version_status = 2
@@ .. cpp:var:: map<int64, ModelVersionStatus> version_status @@ @@ Status for each version of the model, as a map @@ from version to the status. A version will not occur in the map @@ unless there has been at least one inference request of @@ that model version. A version of -1 indicates the status is @@ for requests for which the version could not be determined. @@

@@ @@.. cpp:var:: message ModelTensorReshape @@ @@ Reshape specification for input and output tensors. @@

Used in: ModelInput, ModelOutput

repeated int64 shape = 1
@@ .. cpp:var:: int64 shape (repeated) @@ @@ The shape to use for reshaping. @@

@@ @@.. cpp:var:: message ModelVersionPolicy @@ @@ Policy indicating which versions of a model should be made @@ available by the inference server. @@

Used in: ModelConfig

oneof policy_choice
@@ .. cpp:var:: oneof policy_choice @@ @@ Each model must implement only a single version policy. The @@ default policy is 'Latest'. @@
- ModelVersionPolicy.Latest latest = 1
  @@ .. cpp:var:: Latest latest @@ @@ Serve only latest version(s) of the model. @@
- ModelVersionPolicy.All all = 2
  @@ .. cpp:var:: All all @@ @@ Serve all versions of the model. @@
- ModelVersionPolicy.Specific specific = 3
  @@ .. cpp:var:: Specific specific @@ @@ Serve only specific version(s) of the model. @@

@@ .. cpp:var:: message All @@ @@ Serve all versions of the model. @@

Used in: ModelVersionPolicy

(message has no fields)

@@ .. cpp:var:: message Latest @@ @@ Serve only the latest version(s) of a model. This is @@ the default policy. @@

Used in: ModelVersionPolicy

uint32 num_versions = 1
@@ .. cpp:var:: uint32 num_versions @@ @@ Serve only the 'num_versions' highest-numbered versions. @@ The default value of 'num_versions' is 1, indicating that by @@ default only the single highest-numbered version of a @@ model will be served. @@

@@ .. cpp:var:: message Specific @@ @@ Serve only specific versions of the model. @@

Used in: ModelVersionPolicy

repeated int64 versions = 1
@@ .. cpp:var:: int64 versions (repeated) @@ @@ The specific versions of the model that will be served. @@

@@ @@.. cpp:var:: message ModelVersionStatus @@ @@ Status for a version of a model. @@

Used in: ModelStatus

ModelReadyState ready_state = 1
@@ .. cpp:var:: ModelReadyState ready_state @@ @@ Current readiness state for the model. @@
optional ModelReadyStateReason ready_state_reason = 5
@@ .. cpp:var:: ModelReadyStateReason ready_state_reason @@ @@ Supplemental information regarding the current readiness state. @@
map<uint32, InferRequestStats> infer_stats = 2
@@ .. cpp:var:: map<uint32, InferRequestStats> infer_stats @@ @@ Inference statistics for the model, as a map from batch size @@ to the statistics. A batch size will not occur in the map @@ unless there has been at least one inference request of @@ that batch size. However, for V2 API all InferRequestStats are @@ recorded at a single key which is 1. @@
uint64 model_execution_count = 3
@@ .. cpp:var:: uint64 model_execution_count @@ @@ Cumulative number of model executions performed for the @@ model. A single model execution performs inferencing for @@ the entire request batch and can perform inferencing for multiple @@ requests if dynamic batching is enabled. @@
uint64 model_inference_count = 4
@@ .. cpp:var:: uint64 model_inference_count @@ @@ Cumulative number of model inferences performed for the @@ model. Each inference in a batched request is counted as @@ an individual inference. @@
uint64 last_inference_timestamp_milliseconds = 6
@@ .. cpp:var:: uint64 last_inference_timestamp_milliseconds @@ @@ The timestamp of the last inference request made for this model, @@ given as milliseconds since the epoch. @@

@@ @@.. cpp:var:: message ModelWarmup @@ @@ Settings used to construct the request sample for model warmup. @@

Used in: ModelConfig

string name = 1
@@ .. cpp:var:: string name @@ @@ The name of the request sample. @@
uint32 batch_size = 2
@@ .. cpp:var:: uint32 batch_size @@ @@ The batch size of the inference request. This must be >= 1. For @@ models that don't support batching, batch_size must be 1. If @@ batch_size > 1, the 'inputs' specified below will be duplicated to @@ match the batch size requested. @@
map<string, ModelWarmup.Input> inputs = 3
@@ .. cpp:var:: map<string, Input> inputs @@ @@ The warmup meta data associated with every model input, including @@ control tensors. @@

@@ @@ .. cpp:var:: message Input @@ @@ Meta data associated with an input. @@

Used in: ModelWarmup

DataType data_type = 1
@@ .. cpp:var:: DataType data_type @@ @@ The data-type of the input. @@
repeated int64 dims = 2
@@ .. cpp:var:: int64 dims (repeated) @@ @@ The shape of the input tensor, not including the batch dimension. @@
oneof input_data_type
@@ .. cpp:var:: oneof input_data_type @@ @@ Specify how the input data is generated. If the input has STRING @@ data type and 'random_data' is set, the data generation will fall @@ back to 'zero_data'. @@
- bool zero_data = 3
  @@ @@ .. cpp:var:: bool zero_data @@ @@ The identifier for using zeros as input data. Note that the @@ value of 'zero_data' will not be checked, instead, zero data @@ will be used as long as the field is set. @@
- bool random_data = 4
  @@ @@ .. cpp:var:: bool random_data @@ @@ The identifier for using random data as input data. Note that @@ the value of 'random_data' will not be checked, instead, @@ random data will be used as long as the field is set. @@
- string input_data_file = 5
  @@ .. cpp:var:: string input_data_file @@ @@ The file whose content will be used as raw input data in @@ row-major order. The file must be provided in a sub-directory @@ 'warmup' under the model directory. @@

@@ @@ .. cpp:var:: message ModelIndex @@ @@ Index entry for a model. @@

Used in: RepositoryIndexResponse

string name = 1
@@ @@ .. cpp:var:: string name @@ @@ The name of the model. @@

@@ @@.. cpp:var:: message RepositoryRequestStats @@ @@ Statistics collected for Repository requests. @@

Used in: ServerStatus

optional StatDuration success = 1
@@ .. cpp:var:: StatDuration success @@ @@ Total time required to handle successful Repository requests, not @@ including HTTP or gRPC endpoint termination time. @@

@@ @@.. cpp:var:: message RequestStatus @@ @@ Status returned for all inference server requests. The @@ RequestStatus provides a :cpp:enum:`RequestStatusCode`, an @@ optional status message, and server and request IDs. @@

Used in: HealthResponse, InferResponse, ModelControlResponse, RepositoryResponse, SharedMemoryControlResponse, StatusResponse

RequestStatusCode code = 1
@@ .. cpp:var:: RequestStatusCode code @@ @@ The status code. @@
string msg = 2
@@ .. cpp:var:: string msg @@ @@ The optional status message. @@
string server_id = 3
@@ .. cpp:var:: string server_id @@ @@ The identifying string for the server that is returning @@ this status. @@
uint64 request_id = 4
@@ .. cpp:var:: uint64 request_id @@ @@ Unique identifier for the request assigned by the inference @@ server. Value 0 (zero) indicates the request ID is not known. @@

@@ @@.. cpp:enum:: RequestStatusCode @@ @@ Status codes returned for inference server requests. The @@ :cpp:enumerator:`RequestStatusCode::SUCCESS` status code indicates @@ no error, all other codes indicate an error. @@

Used in: RequestStatus

INVALID = 0
@@ .. cpp:enumerator:: RequestStatusCode::INVALID = 0 @@ @@ Invalid status. Used internally but should not be returned as @@ part of a :cpp:var:`RequestStatus`. @@
SUCCESS = 1
@@ .. cpp:enumerator:: RequestStatusCode::SUCCESS = 1 @@ @@ Error code indicating success. @@
UNKNOWN = 2
@@ .. cpp:enumerator:: RequestStatusCode::UNKNOWN = 2 @@ @@ Error code indicating an unknown failure. @@
INTERNAL = 3
@@ .. cpp:enumerator:: RequestStatusCode::INTERNAL = 3 @@ @@ Error code indicating an internal failure. @@
NOT_FOUND = 4
@@ .. cpp:enumerator:: RequestStatusCode::NOT_FOUND = 4 @@ @@ Error code indicating a resource or request was not found. @@
INVALID_ARG = 5
@@ .. cpp:enumerator:: RequestStatusCode::INVALID_ARG = 5 @@ @@ Error code indicating a failure caused by an unknown argument or @@ value. @@
UNAVAILABLE = 6
@@ .. cpp:enumerator:: RequestStatusCode::UNAVAILABLE = 6 @@ @@ Error code indicating an unavailable resource. @@
UNSUPPORTED = 7
@@ .. cpp:enumerator:: RequestStatusCode::UNSUPPORTED = 7 @@ @@ Error code indicating an unsupported request or operation. @@
ALREADY_EXISTS = 8
@@ .. cpp:enumerator:: RequestStatusCode::ALREADY_EXISTS = 8 @@ @@ Error code indicating an already existing resource. @@

@@ @@.. cpp:enum:: ServerReadyState @@ @@ Readiness status for the inference server. @@

Used in: ServerStatus

SERVER_INVALID = 0
@@ .. cpp:enumerator:: ServerReadyState::SERVER_INVALID = 0 @@ @@ The server is in an invalid state and will likely not @@ respond correctly to any requests. @@
SERVER_INITIALIZING = 1
@@ .. cpp:enumerator:: ServerReadyState::SERVER_INITIALIZING = 1 @@ @@ The server is initializing. @@
SERVER_READY = 2
@@ .. cpp:enumerator:: ServerReadyState::SERVER_READY = 2 @@ @@ The server is ready and accepting requests. @@
SERVER_EXITING = 3
@@ .. cpp:enumerator:: ServerReadyState::SERVER_EXITING = 3 @@ @@ The server is exiting and will not respond to requests. @@
SERVER_FAILED_TO_INITIALIZE = 10
@@ .. cpp:enumerator:: ServerReadyState::SERVER_FAILED_TO_INITIALIZE = 10 @@ @@ The server did not initialize correctly. Most requests will fail. @@

@@ @@.. cpp:var:: message ServerStatus @@ @@ Status for the inference server. @@

Used in: StatusResponse

string id = 1
@@ .. cpp:var:: string id @@ @@ The server's ID. @@
string version = 2
@@ .. cpp:var:: string version @@ @@ The server's version. @@
ServerReadyState ready_state = 7
@@ .. cpp:var:: ServerReadyState ready_state @@ @@ Current readiness state for the server. @@
uint64 uptime_ns = 3
@@ .. cpp:var:: uint64 uptime_ns @@ @@ Server uptime in nanoseconds. @@
map<string, ModelStatus> model_status = 4
@@ .. cpp:var:: map<string, ModelStatus> model_status @@ @@ Status for each model, as a map from model name to the @@ status. @@
optional StatusRequestStats status_stats = 5
@@ .. cpp:var:: StatusRequestStats status_stats @@ @@ Statistics for Status requests. @@
optional HealthRequestStats health_stats = 8
@@ .. cpp:var:: HealthRequestStats health_stats @@ @@ Statistics for Health requests. @@
optional ModelControlRequestStats model_control_stats = 9
@@ .. cpp:var:: ModelControlRequestStats model_control_stats @@ @@ Statistics for ModelControl requests. @@
optional SharedMemoryControlRequestStats shm_control_stats = 10
@@ .. cpp:var:: SharedMemoryControlRequestStats shm_control_stats @@ @@ [DEPRECATED] Statistics for SharedMemoryControl requests. @@
optional RepositoryRequestStats repository_stats = 11
@@ .. cpp:var:: RepositoryRequestStats repository_stats @@ @@ Statistics for Repository requests. @@

@@ .. cpp:var:: message Register @@ @@ Register a shared memory region. @@

Used in: SharedMemoryControlRequest

string name = 1
@@ @@ .. cpp:var:: string name @@ @@ The name for this shared memory region. @@
oneof shared_memory_types
@@ .. cpp:var:: oneof shared_memory_types @@ @@ Types of shared memory identifiers @@
- Register.SystemSharedMemoryIdentifier system_shared_memory = 2
  @@ @@ .. cpp:var:: SystemSharedMemoryIdentifier system_shared_memory @@ @@ The identifier for this system shared memory region. @@
- Register.CUDASharedMemoryIdentifier cuda_shared_memory = 3
  @@ @@ .. cpp:var:: CUDASharedMemoryIdentifier cuda_shared_memory @@ @@ The identifier for this CUDA shared memory region. @@
uint64 byte_size = 4
@@ .. cpp:var:: uint64 byte_size @@ @@ Size of the shared memory block, in bytes. @@

@@ @@ .. cpp:var:: message CUDASharedMemoryIdentifier @@ @@ The identifier for this CUDA shared memory region. @@

Used in: Register

bytes raw_handle = 1
@@ .. cpp:var:: bytes raw_handle @@ @@ The raw serialized cudaIPC handle. @@
int64 device_id = 2
@@ .. cpp:var:: int64 device_id @@ @@ The GPU device ID on which the cudaIPC handle was created. @@

@@ @@ .. cpp:var:: message SystemSharedMemoryIdentifier @@ @@ The identifier for this system shared memory region. @@

Used in: Register

string shared_memory_key = 1
@@ .. cpp:var:: string shared_memory_key @@ @@ The name of the shared memory region that holds the input data @@ (or where the output data should be written). @@
uint64 offset = 2
@@ .. cpp:var:: uint64 offset @@ @@ This is the offset of the shared memory block from the start @@ of the shared memory region. @@ start = offset, end = offset + byte_size; @@

@@ .. cpp:var:: message GetStatus @@ @@ Get the status of all active shared memory regions. @@

Used in: SharedMemoryControlRequest

(message has no fields)

@@ .. cpp:var:: message Unregister @@ @@ Unregister a specified shared memory region. @@

Used in: SharedMemoryControlRequest

string name = 1
@@ @@ .. cpp:var:: string name @@ @@ The name for this shared memory region to unregister. @@

@@ .. cpp:var:: message UnregisterAll @@ @@ Unregister all shared memory regions. @@

Used in: SharedMemoryControlRequest

(message has no fields)

@@ @@.. cpp:var:: message SharedMemoryControlRequestStats @@ @@ Statistics for SharedMemoryControl requests @@ @@ [DEPRECATED] The message has been deprecated and will @@ always report 0. @@

Used in: ServerStatus

optional StatDuration success = 1
@@ .. cpp:var:: StatDuration success @@ @@ Total time required to handle successful SharedMemoryControl @@ requests, not including HTTP or gRPC endpoint termination time. @@

@@ @@.. cpp:var:: message Status @@ @@ Status of all active shared memory regions. @@

Used in: SharedMemoryControlResponse

repeated SharedMemoryRegion shared_memory_region = 1
@@ @@ .. cpp:var:: SharedMemoryRegion shared_memory_region (repeated) @@ @@ The list of active/registered shared memory regions. @@

@@.. cpp:var:: message SharedMemoryRegion @@ @@ The meta-data for the shared memory region registered in the inference @@ server. @@

Used in: SharedMemoryControlResponse.Status, SharedMemoryStatus

string name = 1
@@ @@ .. cpp:var:: string name @@ @@ The name for this shared memory region. @@
oneof shared_memory_types
@@ .. cpp:var:: oneof shared_memory_types @@ @@ Types of shared memory identifiers @@
- SharedMemoryRegion.SystemSharedMemory system_shared_memory = 2
  @@ @@ .. cpp:var:: SystemSharedMemory system_shared_memory @@ @@ The status of this system shared memory region. @@
- SharedMemoryRegion.CudaSharedMemory cuda_shared_memory = 3
  @@ @@ .. cpp:var:: CudaSharedMemory cuda_shared_memory @@ @@ The status of this CUDA shared memory region. @@
uint64 byte_size = 5
@@ .. cpp:var:: uint64 byte_size @@ @@ Size of the shared memory block, in bytes. @@

Used in: SharedMemoryRegion

int64 device_id = 1
@@ .. cpp:var:: int64 device_id @@ @@ The GPU device ID on which the cudaIPC handle was created. @@

Used in: SharedMemoryRegion

string shared_memory_key = 1
@@ .. cpp:var:: string shared_memory_key @@ @@ The name of the shared memory region that holds the input data @@ (or where the output data should be written). @@
uint64 offset = 2
@@ .. cpp:var:: uint64 offset @@ @@ This is the offset of the shared memory block from the start @@ of the shared memory region. @@ start = offset, end = offset + byte_size; @@

@@ @@.. cpp:var:: message SharedMemoryStatus @@ @@ Shared memory status for the inference server. @@

repeated SharedMemoryRegion shared_memory_region = 2
@@ @@ .. cpp:var:: SharedMemoryRegion shared_memory_region (repeated) @@ @@ The list of active/registered shared memory regions. @@

@@ @@.. cpp:var:: message StatDuration @@ @@ Statistic collecting a duration metric. @@

Used in: HealthRequestStats, InferRequestStats, ModelControlRequestStats, RepositoryRequestStats, SharedMemoryControlRequestStats, StatusRequestStats

uint64 count = 1
@@ .. cpp:var:: uint64 count @@ @@ Cumulative number of times this metric occurred. @@
uint64 total_time_ns = 2
@@ .. cpp:var:: uint64 total_time_ns @@ @@ Total collected duration of this metric in nanoseconds. @@

@@ @@.. cpp:var:: message StatisticDuration @@ @@ Statistic recording a cumulative duration metric. @@

Used in: InferStatistics

uint64 count = 1
@@ .. cpp:var:: uint64 count @@ @@ Cumulative number of times this metric occurred. @@
uint64 ns = 2
@@ .. cpp:var:: uint64 ns @@ @@ Total collected duration of this metric in nanoseconds. @@

@@ @@.. cpp:var:: message StatusRequestStats @@ @@ Statistics collected for Status requests. @@

Used in: ServerStatus

optional StatDuration success = 1
@@ .. cpp:var:: StatDuration success @@ @@ Total time required to handle successful Status requests, not @@ including HTTP or gRPC endpoint termination time. @@

@@ @@ .. cpp:var:: message RegionStatus @@ @@ Status for a shared memory region. @@

Used in: SystemSharedMemoryStatusResponse

string name = 1
@@ @@ .. cpp:var:: string name @@ @@ The name for the shared memory region. @@
string key = 2
@@ .. cpp:var:: string key @@ @@ The key of the underlying memory object that contains the @@ shared memory region. @@
uint64 offset = 3
@@ .. cpp:var:: uint64 offset @@ @@ Offset, in bytes, within the underlying memory object to @@ the start of the shared memory region. @@
uint64 byte_size = 4
@@ .. cpp:var:: uint64 byte_size @@ @@ Size of the shared memory region, in bytes. @@

package nvidia.inferenceserver

service GRPCInferenceService

rpc CudaSharedMemoryRegister (CudaSharedMemoryRegisterRequest, CudaSharedMemoryRegisterResponse)

message CudaSharedMemoryRegisterRequest

string name = 1

bytes raw_handle = 2

int64 device_id = 3

uint64 byte_size = 4

message CudaSharedMemoryRegisterResponse

rpc CudaSharedMemoryStatus (CudaSharedMemoryStatusRequest, CudaSharedMemoryStatusResponse)

message CudaSharedMemoryStatusRequest

string name = 1

message CudaSharedMemoryStatusResponse

map<string, CudaSharedMemoryStatusResponse.RegionStatus> regions = 1

rpc CudaSharedMemoryUnregister (CudaSharedMemoryUnregisterRequest, CudaSharedMemoryUnregisterResponse)

message CudaSharedMemoryUnregisterRequest

string name = 1

message CudaSharedMemoryUnregisterResponse

rpc ModelConfig (ModelConfigRequest, ModelConfigResponse)

message ModelConfigRequest

string name = 1

string version = 2

message ModelConfigResponse

optional ModelConfig config = 1

rpc ModelInfer (ModelInferRequest, ModelInferResponse)

rpc ModelMetadata (ModelMetadataRequest, ModelMetadataResponse)

message ModelMetadataRequest

string name = 1

string version = 2

message ModelMetadataResponse

string name = 1

repeated string versions = 2

string platform = 3

repeated ModelMetadataResponse.TensorMetadata inputs = 4

repeated ModelMetadataResponse.TensorMetadata outputs = 5

rpc ModelReady (ModelReadyRequest, ModelReadyResponse)

message ModelReadyRequest

string name = 1

string version = 2

message ModelReadyResponse

bool ready = 1

rpc ModelStatistics (ModelStatisticsRequest, ModelStatisticsResponse)

message ModelStatisticsRequest

string name = 1

string version = 2

message ModelStatisticsResponse

map<string, InferStatistics> inference = 1

rpc ModelStreamInfer (stream ModelInferRequest, stream ModelStreamInferResponse)

message ModelStreamInferResponse

string error_message = 1

optional ModelInferResponse infer_response = 2

rpc RepositoryIndex (RepositoryIndexRequest, RepositoryIndexResponse)

message RepositoryIndexRequest

string repository_name = 1

message RepositoryIndexResponse

repeated RepositoryIndexResponse.ModelIndex models = 1

rpc RepositoryModelLoad (RepositoryModelLoadRequest, RepositoryModelLoadResponse)

message RepositoryModelLoadRequest

string repository_name = 1

string model_name = 2

message RepositoryModelLoadResponse

rpc RepositoryModelUnload (RepositoryModelUnloadRequest, RepositoryModelUnloadResponse)

message RepositoryModelUnloadRequest

string repository_name = 1

string model_name = 2

message RepositoryModelUnloadResponse

rpc ServerLive (ServerLiveRequest, ServerLiveResponse)

message ServerLiveRequest

message ServerLiveResponse

bool live = 1

rpc ServerMetadata (ServerMetadataRequest, ServerMetadataResponse)

message ServerMetadataRequest

message ServerMetadataResponse

string name = 1

string version = 2

repeated string extensions = 3

rpc ServerReady (ServerReadyRequest, ServerReadyResponse)

message ServerReadyRequest

message ServerReadyResponse

bool ready = 1