/// Model Info
/// Empty request
(message has no fields)
/// Service discovery
/// Empty request
(message has no fields)
/// Other shards urls
/// Empties batch cache
/// Optional batch id
/// Empty response
(message has no fields)
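The three exchanges above are small enough to sketch directly. This is a speculative proto3 reconstruction: every message and field name below is an assumption inferred from the comments, and an InfoResponse is omitted because no field comments for it survive.

    message InfoRequest {}              // Empty request

    message ServiceDiscoveryRequest {}  // Empty request

    message ServiceDiscoveryResponse {
      repeated string urls = 1;         // Other shards urls
    }

    message ClearCacheRequest {
      optional uint64 id = 1;           // Optional batch id
    }

    message ClearCacheResponse {}       // Empty response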
/// Remove requests from a cached batch
/// Batch ID
/// Requests to keep
/// Filtered Batch (cached)
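The filter exchange pairs a batch id with the request ids to keep. A sketch under the same caveat (identifiers are guesses; CachedBatch is sketched further below):

    message FilterBatchRequest {
      uint64 batch_id = 1;              // Batch ID
      repeated uint64 request_ids = 2;  // Requests to keep
    }

    message FilterBatchResponse {
      CachedBatch batch = 1;            // Filtered Batch (cached)
    }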
/// Warmup the model and compute max cache size
/// Batch to warmup on
/// Maximum number of tokens supported by the model
/// Maximum input tokens: equal to the request value if one was set,
/// otherwise warmup automatically allocates a value here
/// Maximum total tokens: equal to the request value if one was set,
/// otherwise warmup automatically allocates a value here
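Warmup takes a batch and reports the limits it measured or inherited. The field names here (max_supported_total_tokens and friends) are guesses from the comments:

    message WarmupRequest {
      Batch batch = 1;                        // Batch to warmup on
    }

    message WarmupResponse {
      uint32 max_supported_total_tokens = 1;  // Max tokens supported by the model
      uint32 max_input_tokens = 2;            // Request value if set, else allocated here
      uint32 max_total_tokens = 3;            // Request value if set, else allocated here
    }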
/// Prefill batch and decode first token
/// Batch
/// Optional cached batch
/// Generation
/// Next batch (cached)
/// Forward elapsed time in nanoseconds
/// Decode elapsed time in nanoseconds
/// Total elapsed time in nanoseconds
/// Concatenate elapsed time in nanoseconds
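Prefill consumes a fresh batch, plus an optional cached one, and returns generations with timing counters. All identifiers in this sketch are assumptions:

    message PrefillRequest {
      Batch batch = 1;                        // Batch
      optional CachedBatch cached_batch = 2;  // Optional cached batch
    }

    message PrefillResponse {
      repeated Generation generations = 1;    // Generation
      optional CachedBatch batch = 2;         // Next batch (cached)
      uint64 forward_ns = 3;                  // Forward elapsed time in nanoseconds
      uint64 decode_ns = 4;                   // Decode elapsed time in nanoseconds
      uint64 total_ns = 5;                    // Total elapsed time in nanoseconds
      uint64 concat_ns = 6;                   // Concatenate elapsed time in nanoseconds
    }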
/// Decode token for a list of prefilled batches
/// Cached batches
/// Decodes
/// Next batch (cached)
/// Forward elapsed time in nanoseconds
/// Decode elapsed time in nanoseconds
/// Total elapsed time in nanoseconds
/// Concatenate elapsed time in nanoseconds
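Decode mirrors prefill but advances several already-cached batches at once; this sketch reuses the same assumed names:

    message DecodeRequest {
      repeated CachedBatch batches = 1;     // Cached batches
    }

    message DecodeResponse {
      repeated Generation generations = 1;  // Decodes
      optional CachedBatch batch = 2;       // Next batch (cached)
      uint64 forward_ns = 3;                // Elapsed times in nanoseconds
      uint64 decode_ns = 4;
      uint64 total_ns = 5;
      uint64 concat_ns = 6;
    }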
/// Health check
(message has no fields)
(message has no fields)
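Putting the eight RPCs together, the service plausibly has the following shape; the service name is invented for illustration, and the health pair is empty on both sides:

    message HealthRequest {}
    message HealthResponse {}

    service TextGenerationService {
      rpc Info (InfoRequest) returns (InfoResponse);
      rpc ServiceDiscovery (ServiceDiscoveryRequest) returns (ServiceDiscoveryResponse);
      rpc ClearCache (ClearCacheRequest) returns (ClearCacheResponse);
      rpc FilterBatch (FilterBatchRequest) returns (FilterBatchResponse);
      rpc Warmup (WarmupRequest) returns (WarmupResponse);
      rpc Prefill (PrefillRequest) returns (PrefillResponse);
      rpc Decode (DecodeRequest) returns (DecodeResponse);
      rpc Health (HealthRequest) returns (HealthResponse);
    }

The Prefill/Decode split is the usual continuous-batching pattern: prefill runs the full context once and emits the first token, then decode repeatedly advances every cached batch by one token.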
/// Batch ID
/// Individual requests
/// Batch size (==len(requests))
/// Maximum number of tokens this batch will grow to
/// Maximum number of Paged Attention blocks
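These five comments read as one message. A sketch with assumed names and numbering:

    message Batch {
      uint64 id = 1;                  // Batch ID
      repeated Request requests = 2;  // Individual requests
      uint32 size = 3;                // Batch size (==len(requests))
      uint32 max_tokens = 4;          // Maximum number of tokens this batch will grow to
      uint32 max_blocks = 5;          // Maximum number of Paged Attention blocks
    }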
/// Batch ID
/// Individual requests ids
/// Batch size (==len(requests))
/// Maximum number of tokens this batch will grow to
/// Number of tokens in the next forward
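The cached variant carries request ids instead of full requests, which presumably spares reshipping payloads on every decode step. Identifiers are again assumed:

    message CachedBatch {
      uint64 id = 1;                    // Batch ID
      repeated uint64 request_ids = 2;  // Individual requests ids
      uint32 size = 3;                  // Batch size (==len(requests))
      uint32 max_tokens = 4;            // Maximum number of tokens this batch will grow to
      uint32 current_tokens = 5;        // Number of tokens in the next forward
    }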
/// Output
/// Number of generated tokens
/// Finish reason
/// Seed
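A sketch of the finished-generation record; FinishReason is an assumed enum type whose values are not listed here:

    message GeneratedText {
      string text = 1;                 // Output
      uint32 generated_tokens = 2;     // Number of generated tokens
      FinishReason finish_reason = 3;  // Finish reason
      optional uint64 seed = 4;        // Seed
    }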
/// Request ID
/// Prefill tokens (optional)
/// Complete generated text
/// Top tokens
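The per-request generation result, sketched with one field per comment; every name is an assumption:

    message Generation {
      uint64 request_id = 1;                      // Request ID
      Tokens prefill_tokens = 2;                  // Prefill tokens (optional)
      optional GeneratedText generated_text = 3;  // Complete generated text
      repeated Tokens top_tokens = 4;             // Top tokens
    }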
/// Binary image data.
/// Image MIME type.
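The image message reduces to two fields; names assumed:

    message Image {
      bytes data = 1;       // Binary image data
      string mimetype = 2;  // Image MIME type
    }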
/// Plain text data
/// Image data
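A text/image alternative reads naturally as a oneof; both the oneof and the identifiers are assumptions:

    message InputChunk {
      oneof chunk {
        string text = 1;  // Plain text data
        Image image = 2;  // Image data
      }
    }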
/// exponential scaling of the output probability distribution
/// restricting to the k highest probability elements
/// restricting to the top tokens whose probabilities sum to prob_cut_off (top-p / nucleus sampling)
/// restricting to locally typical tokens whose probabilities sum to prob_cut_off (typical sampling)
/// apply sampling on the logits
/// random seed for sampling
/// repetition penalty
/// frequency penalty
/// token watermarking using "A Watermark for Large Language Models"
/// grammar (applied if not empty)
/// grammar type
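Collected into one message, the sampling parameters might look like this; names, numbering, and the GrammarType enum are all assumptions:

    message NextTokenChooserParameters {
      float temperature = 1;          // exponential scaling of the output distribution
      uint32 top_k = 2;               // keep only the k most probable tokens
      float top_p = 3;                // nucleus (top-p) cut-off
      float typical_p = 4;            // typical sampling cut-off
      bool do_sample = 5;             // sample instead of greedy argmax
      uint64 seed = 6;                // random seed for sampling
      float repetition_penalty = 7;   // repetition penalty
      float frequency_penalty = 8;    // frequency penalty
      bool watermark = 9;             // token watermarking
      string grammar = 10;            // grammar (applied if not empty)
      GrammarType grammar_type = 11;  // grammar type
    }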
/// Request ID
/// The generation context as chunks
/// The generation context, stringified input_chunks
/// Context truncation
/// Next Token Chooser Parameters
/// Stopping Criteria Parameters
/// Return prefill logprobs
/// Return most likely n tokens
/// Paged attention blocks
/// Paged attention slots
/// LoRA adapter index
/// Tokens that can be retrieved from the KV cache.
/// This value is set for the first prefill and never reset
/// Context truncation
/// Chunk of tokens that must be computed for the first prefill.
/// This value is set for the first prefill and never reset
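The request ties everything together. Every identifier below is assumed, and no field is sketched for the second "Context truncation" comment because its meaning can't be inferred from the listing:

    message Request {
      uint64 id = 1;                                       // Request ID
      repeated InputChunk input_chunks = 2;                // Generation context as chunks
      string inputs = 3;                                   // Stringified input_chunks
      uint32 truncate = 4;                                 // Context truncation
      NextTokenChooserParameters parameters = 5;           // Next token chooser parameters
      StoppingCriteriaParameters stopping_parameters = 6;  // Stopping criteria parameters
      bool prefill_logprobs = 7;                           // Return prefill logprobs
      uint32 top_n_tokens = 8;                             // Return most likely n tokens
      repeated uint32 blocks = 9;                          // Paged attention blocks
      repeated uint32 slots = 10;                          // Paged attention slots
      optional string adapter_id = 11;                     // LoRA adapter index
      uint32 cache_len = 12;                               // Tokens retrievable from the KV cache
      uint32 chunk_len = 13;                               // Tokens to compute for the first prefill
    }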
/// Maximum number of generated tokens
/// Optional stopping sequences
/// Ignore end of sequence token (used for benchmarking)
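A compact sketch of the stopping criteria, under the same naming caveat:

    message StoppingCriteriaParameters {
      uint32 max_new_tokens = 1;           // Maximum number of generated tokens
      repeated string stop_sequences = 2;  // Optional stopping sequences
      bool ignore_eos_token = 3;           // Ignore end of sequence token (benchmarking)
    }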
/// Token IDs
/// Logprobs
/// Token texts
/// Special token flags
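The token container is four parallel arrays, one per comment; field names are assumptions:

    message Tokens {
      repeated uint64 ids = 1;       // Token IDs
      repeated float logprobs = 2;   // Logprobs
      repeated string texts = 3;     // Token texts
      repeated bool is_special = 4;  // Special token flags
    }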