/ Model Info
/ Empty request
(message has no fields)
/ Service discovery
/ Empty request
(message has no fields)
/ Other shards' URLs
/ Empties the batch cache
/ Optional batch ID
/ Empty response
(message has no fields)
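Since the viewer strips all message, field, and RPC identifiers, the proto3 fragments added through this listing reconstruct a plausible shape from the comments alone: every name and field number is an assumption, only the comments come from the page, and the fragments are meant to be read as pieces of one sketch .proto file. For the three control calls above (model info, service discovery, cache clearing), a minimal sketch could look like:

// Sketch only: all identifiers are assumed; comments mirror the listing above.
message InfoRequest {}               // Empty request (no fields)
message ServiceDiscoveryRequest {}   // Empty request (no fields)

message ServiceDiscoveryResponse {
  // Other shards' URLs
  repeated string urls = 1;
}

message ClearCacheRequest {
  // Optional batch ID
  optional uint64 id = 1;
}

message ClearCacheResponse {}        // Empty response (no fields)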
/ Remove requests from a cached batch
/ Batch ID
/ Requests to keep
/ Filtered Batch (cached)
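A sketch of the filter exchange, assuming the request carries the batch ID plus the IDs of the requests to keep, and the response returns the filtered cached batch (CachedBatch as sketched further down):

message FilterBatchRequest {
  // Batch ID
  uint64 batch_id = 1;
  // Requests to keep
  repeated uint64 request_ids = 2;
}

message FilterBatchResponse {
  // Filtered batch (cached)
  CachedBatch batch = 1;
}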
/ Warmup the model and compute max cache size
/ Batch to warmup on
/ Empty response
/ Maximum number of tokens supported by the model
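The warmup response is labelled "Empty response" above but also lists a maximum-token field, so the sketch assumes that field exists; names remain assumptions:

message WarmupRequest {
  // Batch to warm up on
  Batch batch = 1;
}

message WarmupResponse {
  // Maximum number of tokens supported by the model
  optional uint32 max_supported_total_tokens = 1;
}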
/ Prefill batch and decode first token
/ Batch
/ Optional cached batch
/ Generation
/ Next batch (cached)
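A sketch of the prefill exchange, again with assumed names; Batch, CachedBatch, and Generation refer to the message sketches later in the listing:

message PrefillRequest {
  // Batch to prefill
  Batch batch = 1;
  // Optional cached batch
  optional CachedBatch cached_batch = 2;
}

message PrefillResponse {
  // One generation per request in the batch
  repeated Generation generations = 1;
  // Next batch (cached)
  optional CachedBatch batch = 2;
}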
/ Embed
/ Batch
/ Embeddings
/ Error message on failure
/ Classify
/ Batch
/ Classifications
/ Error message on failure
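Embed and Classify appear to follow the same pattern: a request wrapping a Batch, and a response carrying per-request results plus an error string. All names are assumptions; the result types point at the per-request messages sketched below:

message EmbedRequest {
  // Batch to embed
  Batch batch = 1;
}

message EmbedResponse {
  // Embeddings, one per request
  repeated Embedding embeddings = 1;
  // Error message on failure
  string error_msg = 2;
}

message ClassifyRequest {
  // Batch to classify
  Batch batch = 1;
}

message ClassifyResponse {
  // Classifications, one list per request
  repeated ClassifyPredictionList classifications = 1;
  // Error message on failure
  string error_msg = 2;
}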
/ Decode token for a list of prefilled batches
/ Cached batches
/ Decodes
/ Next batch (cached)
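A sketch of the decode exchange, which takes the cached batches and returns the next generations plus the merged next batch; names assumed:

message DecodeRequest {
  // Cached batches to decode
  repeated CachedBatch batches = 1;
}

message DecodeResponse {
  // Decodes, one generation per request
  repeated Generation generations = 1;
  // Next batch (cached)
  optional CachedBatch batch = 2;
}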
/ Health check
(message has no fields)
(message has no fields)
/ Download adapter
/ Adapter Parameters
/ Adapter source
/ Token for external API (predibase / HuggingFace)
/ True if download occurred, false if skipped
/ Fraction of the adapter memory limit consumed by the adapter.
/ If no limit is set, will return 0.
/ When the total across all loaded adapters exceeds the
/ adapter_memory_fraction limit, no more adapters will be
/ loaded to GPU and LoRAX will begin swapping.
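A sketch of the adapter download exchange, with assumed names; AdapterParameters and AdapterSource refer to the sketches in the message section below:

message DownloadAdapterRequest {
  // Adapter parameters
  AdapterParameters adapter_parameters = 1;
  // Adapter source
  AdapterSource adapter_source = 2;
  // Token for external API (predibase / HuggingFace)
  optional string api_token = 3;
}

message DownloadAdapterResponse {
  // True if download occurred, false if skipped
  bool downloaded = 1;
  // Fraction of the adapter memory limit consumed by the adapter.
  // If no limit is set, will return 0. When the total across all loaded
  // adapters exceeds the adapter_memory_fraction limit, no more adapters
  // will be loaded to GPU and LoRAX will begin swapping.
  float memory_fraction = 2;
}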
/ Load adapter
/ Adapter Parameters
/ Adapter source
/ Adapter index
/ Token for external API (predibase / HuggingFace)
/ True if load occurred, false if skipped
/ Offload adapter
/ Adapter Parameters
/ Adapter source
/ Adapter index
/ True if offload occurred, false if skipped
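Load and offload appear to mirror the download call, adding an adapter index, and the order of the comments above suggests one service exposing all thirteen RPCs. Every identifier below, including the LoraxService name, is an assumption inferred from the comments; the model-info response fields are not listed on this page, so InfoResponse is referenced but not sketched.

message LoadAdapterRequest {
  // Adapter parameters
  AdapterParameters adapter_parameters = 1;
  // Adapter source
  AdapterSource adapter_source = 2;
  // Adapter index
  uint32 adapter_index = 3;
  // Token for external API (predibase / HuggingFace)
  optional string api_token = 4;
}

message LoadAdapterResponse {
  // True if load occurred, false if skipped
  bool loaded = 1;
}

message OffloadAdapterRequest {
  // Adapter parameters
  AdapterParameters adapter_parameters = 1;
  // Adapter source
  AdapterSource adapter_source = 2;
  // Adapter index
  uint32 adapter_index = 3;
}

message OffloadAdapterResponse {
  // True if offload occurred, false if skipped
  bool offloaded = 1;
}

// Health check uses empty request and response messages.
message HealthRequest {}
message HealthResponse {}

// Assumed service definition tying the RPCs above together.
service LoraxService {
  rpc Info (InfoRequest) returns (InfoResponse);
  rpc ServiceDiscovery (ServiceDiscoveryRequest) returns (ServiceDiscoveryResponse);
  rpc ClearCache (ClearCacheRequest) returns (ClearCacheResponse);
  rpc FilterBatch (FilterBatchRequest) returns (FilterBatchResponse);
  rpc Warmup (WarmupRequest) returns (WarmupResponse);
  rpc Prefill (PrefillRequest) returns (PrefillResponse);
  rpc Embed (EmbedRequest) returns (EmbedResponse);
  rpc Classify (ClassifyRequest) returns (ClassifyResponse);
  rpc Decode (DecodeRequest) returns (DecodeResponse);
  rpc Health (HealthRequest) returns (HealthResponse);
  rpc DownloadAdapter (DownloadAdapterRequest) returns (DownloadAdapterResponse);
  rpc LoadAdapter (LoadAdapterRequest) returns (LoadAdapterResponse);
  rpc OffloadAdapter (OffloadAdapterRequest) returns (OffloadAdapterResponse);
}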
Used in:
/ Adapter IDs
/ Adapter weights for merging
/ Merge strategy (default: linear)
/ [0, 1], 0: full pruning, 1: no pruning
/ Majority sign method (default: total)
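A sketch of the adapter-parameters message, assuming the merge-related fields point at the MergeStrategy and MajoritySignMethod enums sketched below:

message AdapterParameters {
  // Adapter IDs
  repeated string adapter_ids = 1;
  // Adapter weights for merging
  repeated float weights = 2;
  // Merge strategy (default: linear)
  MergeStrategy merge_strategy = 3;
  // [0, 1], 0: full pruning, 1: no pruning
  float density = 4;
  // Majority sign method (default: total)
  MajoritySignMethod majority_sign_method = 5;
}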
Used in:
/ Adapters loaded using the HuggingFace Hub
/ Adapters loaded via remote filesystem path
/ Adapters loaded via local filesystem path
/ Adapters loaded via predibase
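The four comments read like the values of an adapter-source enum; the value names below (HUB, S3, LOCAL, PBASE) are assumptions chosen to match the descriptions:

enum AdapterSource {
  // Adapters loaded using the HuggingFace Hub
  HUB = 0;
  // Adapters loaded via remote filesystem path
  S3 = 1;
  // Adapters loaded via local filesystem path
  LOCAL = 2;
  // Adapters loaded via predibase
  PBASE = 3;
}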
Used in:
/ Alternative Token IDs
/ Alternative Logprobs
/ Alternative tokens
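A sketch of the alternative-tokens message; the third field is assumed to hold the decoded text of the alternatives:

message AlternativeTokens {
  // Alternative token IDs
  repeated uint32 ids = 1;
  // Alternative logprobs
  repeated float logprobs = 2;
  // Alternative tokens (decoded text, assumed)
  repeated string texts = 3;
}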
Used in:
/ Batch ID
/ Individual requests
/ Batch size (==len(requests))
/ Maximum number of tokens this batch will grow to
/ Maximum number of Paged Attention blocks
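A sketch of the batch message; field names assumed:

message Batch {
  // Batch ID
  uint64 id = 1;
  // Individual requests
  repeated Request requests = 2;
  // Batch size (== len(requests))
  uint32 size = 3;
  // Maximum number of tokens this batch will grow to
  uint32 max_tokens = 4;
  // Maximum number of Paged Attention blocks
  uint32 max_blocks = 5;
}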
Used in:
/ Batch ID
/ Individual requests ids
/ Batch size (==len(requests))
/ Maximum number of tokens this batch will grow to
/ Number of tokens in the next forward
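The cached-batch variant keeps only the request IDs; again, names assumed:

message CachedBatch {
  // Batch ID
  uint64 id = 1;
  // Individual request IDs
  repeated uint64 request_ids = 2;
  // Batch size (== len(request_ids))
  uint32 size = 3;
  // Maximum number of tokens this batch will grow to
  uint32 max_tokens = 4;
  // Number of tokens in the next forward pass
  uint32 current_tokens = 5;
}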
Used in:
/ Request ID
/ Classifications
Used in:
/ Request ID
/ Embedding values
Used in:
/ Request ID
/ Entities
/ XXX
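The three small blocks above (classifications, embedding values, and entities, each keyed by a request ID) could be per-request result messages along these lines; the names and element types are guesses, and the field commented only "XXX" on the page is noted but not typed because its meaning is unclear:

message ClassifyPredictionList {
  // Request ID
  uint64 request_id = 1;
  // Classifications (element type is a guess)
  repeated string predictions = 2;
}

message Embedding {
  // Request ID
  uint64 request_id = 1;
  // Embedding values
  repeated float values = 2;
}

message EntityList {
  // Request ID
  uint64 request_id = 1;
  // Entities (element type is a guess; the page does not list entity fields)
  repeated string entities = 2;
  // A third field is commented only "XXX" on the page and is omitted here.
}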
Used in:
/ Output
/ Number of generated tokens
/ Number of skipped tokens due to speculative decoding hits
/ Finish reason
/ Seed
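A sketch of the generated-text message; the finish reason is assumed to be an enum defined elsewhere in the file, since its values do not appear on this page:

message GeneratedText {
  // Output text
  string text = 1;
  // Number of generated tokens
  uint32 generated_tokens = 2;
  // Number of skipped tokens due to speculative decoding hits
  uint32 skipped_tokens = 3;
  // Finish reason (enum values not listed on this page)
  FinishReason finish_reason = 4;
  // Seed
  optional uint64 seed = 5;
}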
Used in:
/ Request ID
/ Prefill tokens (optional)
/ Next tokens
/ Complete generated text
/ Prefill tokens length
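A sketch of the per-request generation message; NextTokens refers to the token-list message sketched further down, and the complete text is assumed to be present only once the request finishes:

message Generation {
  // Request ID
  uint64 request_id = 1;
  // Prefill tokens (optional)
  optional NextTokens prefill_tokens = 2;
  // Next tokens
  NextTokens tokens = 3;
  // Complete generated text (assumed present only when the request is finished)
  optional GeneratedText generated_text = 4;
  // Prefill tokens length
  uint32 prefill_tokens_length = 5;
}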
Used in:
/ Binary image data.
/ Image MIME type.
Used in:
/ Plain text data
/ Image data
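The input-chunk comments (plain text or image) read like a oneof; both the oneof and the names are assumptions:

message Image {
  // Binary image data
  bytes data = 1;
  // Image MIME type
  string mimetype = 2;
}

message InputChunk {
  oneof chunk {
    // Plain text data
    string text = 1;
    // Image data
    Image image = 2;
  }
}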
Used in:
/ Total method
/ Frequency method
Used in:
/ Linear combination of adapters
/ TIES method for combining adapters
/ DARE method for combining adapters
/ DARE + TIES method for combining adapters
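The two blocks above read like enums for the majority-sign method and the merge strategy; the value names are assumptions matched to the descriptions:

enum MajoritySignMethod {
  // Total method
  TOTAL = 0;
  // Frequency method
  FREQUENCY = 1;
}

enum MergeStrategy {
  // Linear combination of adapters
  LINEAR = 0;
  // TIES method for combining adapters
  TIES = 1;
  // DARE method for combining adapters
  DARE_LINEAR = 2;
  // DARE + TIES method for combining adapters
  DARE_TIES = 3;
}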
Used in:
/ exponential scaling of the output probability distribution
/ restricting to the k highest probability elements
/ restricting to top tokens whose probabilities sum to <= prob_cut_off
/ restricting to top tokens whose probabilities sum to <= prob_cut_off
/ apply sampling on the logits
/ random seed for sampling
/ repetition penalty
/ frequency penalty
/ presence penalty
/ token watermarking using "A Watermark for Large Language Models"
/ adapter to use with lora exchange
/ JSON schema used for constrained decoding (Outlines)
/ returning the k highest probability alternatives
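The thirteen comments above map naturally onto a sampling-parameters message; the field names below follow common LLM-serving conventions (temperature, top_k, top_p, and so on) but are still assumptions:

message NextTokenChooserParameters {
  // exponential scaling of the output probability distribution
  float temperature = 1;
  // restricting to the k highest probability elements
  uint32 top_k = 2;
  // restricting to top tokens whose probabilities sum to <= prob_cut_off
  float top_p = 3;
  // restricting to top tokens whose probabilities sum to <= prob_cut_off
  float typical_p = 4;
  // apply sampling on the logits
  bool do_sample = 5;
  // random seed for sampling
  uint64 seed = 6;
  // repetition penalty
  float repetition_penalty = 7;
  // frequency penalty
  float frequency_penalty = 8;
  // presence penalty
  float presence_penalty = 9;
  // token watermarking using "A Watermark for Large Language Models"
  bool watermark = 10;
  // adapter to use with lora exchange
  string adapter_id = 11;
  // JSON schema used for constrained decoding (Outlines)
  string schema = 12;
  // returning the k highest probability alternatives
  uint32 return_k_alternatives = 13;
}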
Used in:
/ Token IDs
/ Logprobs
/ Decoded text for each token
/ Whether each token is a special token
/ Alternative tokens (optional)
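A sketch of the token-list message returned for prefill and decode steps, with one entry per token and an optional list of alternatives; names assumed:

message NextTokens {
  // Token IDs
  repeated uint32 ids = 1;
  // Logprobs
  repeated float logprobs = 2;
  // Decoded text for each token
  repeated string texts = 3;
  // Whether each token is a special token
  repeated bool is_special = 4;
  // Alternative tokens (optional)
  repeated AlternativeTokens alternative_tokens = 5;
}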
Used in:
/ Adapter params
/ Adapter source
/ Adapter index
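The page gives no hint of this message's name, so AdapterDescriptor below is purely a placeholder for a small message that ties adapter parameters, source, and index together:

message AdapterDescriptor {
  // Adapter parameters
  AdapterParameters adapter_parameters = 1;
  // Adapter source
  AdapterSource adapter_source = 2;
  // Adapter index
  uint32 adapter_index = 3;
}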
Used in:
/ Request ID
/ The generation context
/ Tokenized inputs
/ Context truncation
/ Next Token Chooser Parameters
/ Stopping Criteria Parameters
/ Return prefill logprobs
/ Adapter index
/ Paged attention blocks
/ Paged attention slots
/ Tokens that can be retrieved from the KV cache.
/ This value is set for the first prefill and never reset
/ Chunk of tokens that must be computed for the first prefill
/ This value is set for the first prefill and never reset
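A sketch of the request message, the largest one in the listing; field names are assumptions, and the last two fields are assumed to support chunked prefill with a paged KV cache:

message Request {
  // Request ID
  uint64 id = 1;
  // The generation context
  string inputs = 2;
  // Tokenized inputs
  TokenizedInputs tokenized_inputs = 3;
  // Context truncation
  uint32 truncate = 4;
  // Next token chooser parameters
  NextTokenChooserParameters parameters = 5;
  // Stopping criteria parameters
  StoppingCriteriaParameters stopping_parameters = 6;
  // Return prefill logprobs
  bool prefill_logprobs = 7;
  // Adapter index
  uint32 adapter_index = 8;
  // Paged attention blocks
  repeated uint32 blocks = 9;
  // Paged attention slots
  repeated uint32 slots = 10;
  // Tokens that can be retrieved from the KV cache
  // (set for the first prefill and never reset)
  uint32 cache_len = 11;
  // Chunk of tokens that must be computed for the first prefill
  // (set for the first prefill and never reset)
  optional uint32 chunk_len = 12;
}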
Used in:
/ Maximum number of generated tokens
/ Optional stopping sequences
/ Ignore end of sequence token (used for benchmarking)
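A sketch of the stopping-criteria message; names assumed:

message StoppingCriteriaParameters {
  // Maximum number of generated tokens
  uint32 max_new_tokens = 1;
  // Optional stopping sequences
  repeated string stop_sequences = 2;
  // Ignore end of sequence token (used for benchmarking)
  bool ignore_eos_token = 3;
}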
Used in:
/ Token IDs
/ Chunks
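A sketch of the tokenized-inputs message; the chunks are assumed to be the InputChunk message sketched above:

message TokenizedInputs {
  // Token IDs
  repeated uint32 ids = 1;
  // Chunks (text and image segments of the input)
  repeated InputChunk input_chunks = 2;
}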