Service methods

Model Info
  Request: empty (message has no fields)

Service discovery
  Request: empty (message has no fields)
  Response: the other shards' URLs

Empties the batch cache
  Request: optional batch ID
  Response: empty (message has no fields)

Remove requests from a cached batch
  Request: batch ID; the requests to keep
  Response: the filtered batch (cached)

Warmup the model and compute max cache size
  Request: batch to warm up on
  Response: maximum number of tokens supported by the model

Prefill batch and decode first token
  Request: batch
  Response: generations; next batch (cached)

Decode token for a list of prefilled batches
  Request: cached batches
  Response: decodes; next batch (cached)

Health check
  Request: empty (message has no fields)
  Response: empty (message has no fields)
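Read together, the methods above suggest a gRPC service shaped roughly as follows. This is a sketch only: the service, rpc, and request/response message names are all assumptions reconstructed from the comments; only the comments themselves come from the page.

```proto
syntax = "proto3";

// Sketch of the service implied by the method list above.
// All identifiers here are assumed; only the comments are from the page.
service TextGenerationService {
  // Model Info
  rpc Info (InfoRequest) returns (InfoResponse);
  // Service discovery
  rpc ServiceDiscovery (ServiceDiscoveryRequest) returns (ServiceDiscoveryResponse);
  // Empties batch cache
  rpc ClearCache (ClearCacheRequest) returns (ClearCacheResponse);
  // Remove requests from a cached batch
  rpc FilterBatch (FilterBatchRequest) returns (FilterBatchResponse);
  // Warmup the model and compute max cache size
  rpc Warmup (WarmupRequest) returns (WarmupResponse);
  // Prefill batch and decode first token
  rpc Prefill (PrefillRequest) returns (PrefillResponse);
  // Decode token for a list of prefilled batches
  rpc Decode (DecodeRequest) returns (DecodeResponse);
  // Health check
  rpc Health (HealthRequest) returns (HealthResponse);
}
```

A plausible lifecycle, inferred from the comments alone: warm up once at startup, then alternate prefill for new batches and decode for cached ones, filtering finished requests out of cached batches and clearing the cache as needed.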
Batch
  - Batch ID
  - Individual requests
  - Batch size (== len(requests))
  - Maximum number of tokens this batch will grow to
Cached batch
  - Batch ID
  - Individual request IDs
  - Batch size (== number of requests)
  - Maximum number of tokens this batch will grow to
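The two batch descriptions above could correspond to message shapes like the following. Field names and types are assumptions; the `Request` type is the request message described further down the page.

```proto
// Sketch of the two batch messages above; identifiers are assumed.
message Batch {
  // Batch ID
  uint64 id = 1;
  // Individual requests (the Request message described below)
  repeated Request requests = 2;
  // Batch size (== len(requests))
  uint32 size = 3;
  // Maximum number of tokens this batch will grow to
  uint32 max_tokens = 4;
}

// Lighter variant kept in the server-side cache: it carries only
// request IDs, since the server already holds the full requests.
message CachedBatch {
  // Batch ID
  uint64 id = 1;
  // Individual request IDs
  repeated uint64 request_ids = 2;
  // Batch size
  uint32 size = 3;
  // Maximum number of tokens this batch will grow to
  uint32 max_tokens = 4;
}
```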
Generated text
  - Output
  - Number of generated tokens
  - Finish reason
  - Seed
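A hedged sketch of the generated-text message. The field names are assumptions, and the finish-reason enum values below are purely illustrative guesses (length limit, end-of-sequence token, stop sequence), since the page does not list them.

```proto
// Sketch; identifiers and enum values are assumed, not from the page.
message GeneratedText {
  // Output
  string text = 1;
  // Number of generated tokens
  uint32 generated_tokens = 2;
  // Finish reason
  FinishReason finish_reason = 3;
  // Seed
  optional uint64 seed = 4;
}

// Hypothetical finish reasons for illustration only.
enum FinishReason {
  FINISH_REASON_LENGTH = 0;
  FINISH_REASON_EOS_TOKEN = 1;
  FINISH_REASON_STOP_SEQUENCE = 2;
}
```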
Generation
  - Request ID
  - Prefill tokens (optional)
  - Token ID
  - Logprob
  - Text
  - Is it a special token
  - Complete generated text
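One way to read the per-token fields above is a message like this, emitted once per decoded token. Identifiers are assumptions; `PrefillTokens` and `GeneratedText` are the messages described elsewhere on this page.

```proto
// Sketch of a per-token generation record; identifiers are assumed.
message Generation {
  // Request ID
  uint64 request_id = 1;
  // Prefill tokens (optional; typically only on the first generation)
  PrefillTokens prefill_tokens = 2;
  // Token ID
  uint32 token_id = 3;
  // Logprob
  float token_logprob = 4;
  // Text
  string token_text = 5;
  // Is it a special token
  bool token_is_special = 6;
  // Complete generated text, presumably set once the request finishes
  optional GeneratedText generated_text = 7;
}
```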
Next token chooser parameters
  - Temperature: exponential scaling of the output probability distribution
  - Top-k: restrict sampling to the k highest-probability tokens
  - Top-p: restrict sampling to the top tokens whose probabilities sum to at least the cutoff
  - Typical-p: restrict sampling to locally typical tokens whose probabilities sum to at least the cutoff
  - Whether to apply sampling on the logits
  - Random seed for sampling
  - Repetition penalty
  - Token watermarking using "A Watermark for Large Language Models"
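The sampling knobs above map naturally onto a flat parameters message. All field names and types here are assumptions; the comments are from the page.

```proto
// Sketch of the sampling parameters; identifiers are assumed.
message NextTokenChooserParameters {
  // exponential scaling of the output probability distribution
  float temperature = 1;
  // restrict sampling to the k highest-probability tokens
  uint32 top_k = 2;
  // restrict sampling to top tokens summing to at least prob_cut_off
  float top_p = 3;
  // restrict sampling to locally typical tokens summing to at least prob_cut_off
  float typical_p = 4;
  // apply sampling on the logits (greedy decoding when false)
  bool do_sample = 5;
  // random seed for sampling
  uint64 seed = 6;
  // repetition penalty
  float repetition_penalty = 7;
  // token watermarking using "A Watermark for Large Language Models"
  bool watermark = 8;
}
```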
Prefill tokens
  - Prefill token IDs
  - Prefill logprobs
  - Prefill token texts
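The three parallel lists above suggest a columnar layout, with one entry per prompt token across each field. Field names are assumptions.

```proto
// Sketch; identifiers are assumed. The three repeated fields are
// parallel arrays: ids[i], logprobs[i], and texts[i] describe the
// same prompt token.
message PrefillTokens {
  // Prefill token IDs
  repeated uint32 ids = 1;
  // Prefill logprobs
  repeated float logprobs = 2;
  // Prefill token texts
  repeated string texts = 3;
}
```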
Request
  - Request ID
  - The generation context
  - Context truncation
  - Next token chooser parameters
  - Stopping criteria parameters
  - Return prefill logprobs
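A hedged sketch of the request message, composing the two parameter messages described on this page. Identifiers are assumptions.

```proto
// Sketch of a single generation request; identifiers are assumed.
message Request {
  // Request ID
  uint64 id = 1;
  // The generation context
  string inputs = 2;
  // Context truncation
  uint32 truncate = 3;
  // Next token chooser parameters
  NextTokenChooserParameters parameters = 4;
  // Stopping criteria parameters
  StoppingCriteriaParameters stopping_parameters = 5;
  // Return prefill logprobs
  bool prefill_logprobs = 6;
}
```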
Stopping criteria parameters
  - Maximum number of generated tokens
  - Optional stopping sequences
  - Ignore the end-of-sequence token (used for benchmarking)