package grpc.trainer.v2

Get desktop application:
View/edit binary Protocol Buffers messages

rpc CreateTrainingJob (CreateRequest, CreateResponse)
trainer.proto:25
Creates a new deep learning training job for a given model definition.
message CreateRequest
trainer.proto:83
- string user_id = 1
- optional ModelDefinition model_definition = 2
- optional Training training = 3
- repeated Datastore datastores = 4
- optional EMExtractionSpec evaluation_metrics = 5
  EMExtractionSpec allows the caller to specify evaluation metrics extraction.
message CreateResponse
trainer.proto:248
- string training_id = 1
rpc GetTrainingJob (GetRequest, GetResponse)
trainer.proto:29
Reads a training job with a given ID.
message GetResponse
trainer.proto:270
- optional Job job = 1
rpc GetAllTrainingsJobs (GetAllRequest, GetAllResponse)
trainer.proto:33
Returns all training jobs for a given user.
message GetAllRequest
trainer.proto:290
- string user_id = 1
message GetAllResponse
trainer.proto:294
- repeated Job jobs = 1
rpc DeleteTrainingJob (DeleteRequest, DeleteResponse)
trainer.proto:37
Deletes a training job with a given ID.
message DeleteRequest
trainer.proto:320
- string training_id = 1
- string user_id = 2
message DeleteResponse
trainer.proto:325
- string training_id = 1
rpc HaltTrainingJob (HaltRequest, HaltResponse)
trainer.proto:41
Halts the training with a given ID without discarding the result.
message HaltRequest
trainer.proto:298
- string training_id = 1
- string user_id = 2
message HaltResponse
trainer.proto:303
- string training_id = 1
- string user_id = 2
- Status status = 3
rpc GetModelDefinition (ModelDefinitionRequest, stream ZippedDataChunk)
trainer.proto:45
Returns the model definition that was used for training as application/zip.
message ModelDefinitionRequest
trainer.proto:459
- string training_id = 1
- string user_id = 2
rpc GetTrainedModel (TrainedModelRequest, stream ZippedDataChunk)
trainer.proto:49
Returns the trained model as application/zip.
message TrainedModelRequest
trainer.proto:464
- string training_id = 1
- string user_id = 2
rpc GetTrainedModelLogs (TrainedModelLogRequest, stream ByteStreamResponse)
trainer.proto:54
Returns the logs from the ZIP file stored in the object store. Deprecated.
message TrainedModelLogRequest
trainer.proto:469
- string training_id = 1
- string user_id = 2
- bool follow = 5
- bool is_metrics = 3
  These are for internal use only, and will be eventually removed!
- bool is_summary = 4
message ByteStreamResponse
trainer.proto:506
- bytes data = 1
rpc GetTrainingLogs (Query, stream LogLine)
trainer.proto:57
message LogLine
trainer.proto:185
LogLine represents one line of log information, returned by training data endpoints.
- optional MetaInfo meta = 1
  For MetaInfo, at the minimum training_id and user_id must be specified.
- string line = 2
  Raw line from the logs.
rpc GetTrainingEMetrics (Query, stream EMetrics)
trainer.proto:60
Get evaluation metrics records, based on query
message EMetrics
trainer.proto:209
EMetrics specifies an evaluation metrics record from the training data.
- optional MetaInfo meta = 1
  For MetaInfo, at the minimum training_id and user_id must be specified.
- map<string, Any> etimes = 2
  Map of temporal keys, normally values for the x-axis on a graph. Example: {"iteration": 209}
- string grouplabel = 3
  Group label, such as test, train, or validate
- map<string, Any> values = 4
  Map of value keys, normally values for the y-axis on a graph. Example: {"cross_entropy": 0.4430539906024933, "accuracy": 0.8999999761581421}
rpc GetVersions (GetVersionsRequest, Frameworks)
trainer.proto:62
message GetVersionsRequest
trainer.proto:514
(message has no fields)
message Frameworks
trainer.proto:519
Contains a list of all frameworks currently supported along with the versions of that framework and whether a specific framework version can be used by anyone or only for internal usage.
- map<string, FrameworkDetailList> frameworks = 1
rpc GetTrainingStatusID (GetRequest, GetStatusIDResponse)
trainer.proto:68
For internal use only!
message GetStatusIDResponse
trainer.proto:278
- Status status = 1
rpc UpdateTrainingJob (UpdateRequest, UpdateResponse)
trainer.proto:74
For internal use only! Updates an existing training status. TODO: we should not have this, but until we fix the status update handling properly, we have no other choice.
message UpdateRequest
trainer.proto:252
- string training_id = 1
- string user_id = 2
- Status status = 3
- string status_message = 4
- string error_code = 5
- string timestamp = 6
message UpdateResponse
trainer.proto:261
- string training_id = 1
rpc ResumeTrainingJob (ResumeRequest, ResumeResponse)
trainer.proto:78
Not implemented, to be removed (for GA)
message ResumeRequest
trainer.proto:309
- string training_id = 1
- string user_id = 2
message ResumeResponse
trainer.proto:314
- string training_id = 1
- string user_id = 2
- Status status = 3

Any represents a typed value used with the evaluation metrics record.

Used in: EMetrics

Any.DataType type = 1
Datatype of the value.
string value = 2
String representation of a value

Used in: Any

STRING = 0
JSONSTRING = 1
INT = 2
FLOAT = 3

Used in: CreateRequest, Job

string id = 1
string type = 2
map<string, string> fields = 3
additional fields for the given Datastore type
map<string, string> connection = 4
connection information for the given Datastore type

Typed value for EMExtractionSpec. The data type here can't be an enum, due to internal issues with YAML conversion.

Used in: EMGroup

string type = 1
one of: STRING, JSONSTRING, INT, FLOAT
string value = 2
String representation of the value.

EMExtractionSpec represents the specification for extracting structured evaluation metrics from training jobs. It is used across all log collectors, so some fields may not be relevant for all log collectors. Note: Don't use enums with this, as the spec needs to go through an untyped YAML conversion to string and back. Refer to https://github.ibm.com/deep-learning-platform/dlaas-training-metrics-service for complete documentation.

Used in: CreateRequest

string type = 1
Loosely typed string representing what kind of log-collector to use. For Logs-only, specify `type: logger`. For the Regex_extractor log-collector, specify `type: regex_extractor`. For Tensorboard, specify `type: tensorboard`. To invoke the emetrics_file_extractor, you can specify any of the following synonyms: `type: optivist` || `type: emetrics_file` || `type: file`.
string image_tag = 6
Dev only.
string in = 2
The filename of the logfile. (Normally this should be left to default).
int32 line_lookahead = 3
For the regex_extractor, number of lines to keep in the buffer for regex matching.
repeated string eventTypes = 4
(Eventual) Available event types: 'images', 'distributions', 'histograms', 'audio', 'scalars', 'tensors', 'graph', 'meta_graph', 'run_metadata'. For now only scalars are supported.
map<string, EMGroup> groups = 5
For the regex_extractor, the `EMExtractionSpec` should contain a `groups` section, which should contain templates for groups such as `test` and `train`; the group names should be the keys of this map.

EMGroup represents a group, such as `test` or `train`, that acts as a template for structured evaluation metrics, and which allows the specification of a regular expression (regex) that contains named bindings with sub-expressions, which can then be used as references to specify structured time-related (x-axis) and value-related (y-axis) values.

Used in: EMExtractionSpec

string regex = 1
Python regular expressions, which use the named group feature `(?P<name>...)`, to specify a name of a matching expression, which can then be used to specify the value that is used in the template for the `EMetrics` record. To help with verbosity, the regex_extractor allows the following macros: GLOG_STAMP, TIMESTAMP, FLOAT, INT, INT_ANY, and HEX. (See dlaas-training-metrics-service README for more details.)
optional EMMeta meta = 2
Allows the caller to specify a binding for the time field of the meta structure.
map<string, EMAny> values = 3
Map of keys and regex references for value-related (y-axis) values.
map<string, EMAny> etimes = 4
Map of keys and regex references for time-related (x-axis) values.

Allows the user to bind an extracted value to the time field of the evaluation metrics.

Used in: EMGroup

string time = 3
Time that the metric occurred: representing the number of milliseconds since midnight January 1, 1970. (ref, for instance $timestamp). Value will be extracted from timestamps.
string subid = 5
Optional subid

Used in: ModelDefinition

string name = 1
string version = 2
string image_tag = 3
Optional: tag used for learner testing
optional ImageLocation image_location = 4
Optional: non-standard location for learner image

Used in: Frameworks

repeated FrameworkDetails versions = 1

Used in: FrameworkDetailList

string version = 1
bool external = 2
If true, the image can be used by any user. If false, the image is only available for internal usage.

string training_id = 1
string user_id = 2

string training_id = 1
string user_id = 2
optional Metrics metrics = 3

string metrics = 1

Used as request type in: Trainer.GetTrainingJob, Trainer.GetTrainingStatusID

string training_id = 1
string user_id = 2

optional TrainingStatus status = 1

string test = 1

Used in: Framework

string registry = 1
the server name for the docker registry
string namespace = 2
namespace within the registry
string access_token = 3
Token used to access images stored in the registry+namespace
string email = 4
Email address associated with the account

Used in: GetAllResponse, GetResponse

string training_id = 1
string user_id = 2
optional ModelDefinition model_definition = 3
optional Training training = 4
optional TrainingStatus status = 5
repeated Datastore datastores = 6
string job_id = 7
optional Metrics metrics = 8

MetaInfo represents data shared with both log lines and evaluation metrics.

Used in: EMetrics, LogLine, Query

string training_id = 1
Unique id identifying the training job
string user_id = 2
Unique id identifying the user
int64 time = 3
Time that the metric occurred: representing the number of milliseconds since midnight January 1, 1970.
int64 rindex = 4
Sequential index, 1-based
string subid = 5
Optional subid

Used in: GetLatestMetricsResponse, Job, UpdateTrainedModelMetricsRequest

string timestamp = 1
string type = 2
int32 iteration = 3
map<string, string> values = 4

Used in: CreateRequest, Job

string name = 1
string description = 2
bytes content = 3
Optional: application/zip as bytes containing the model definition. If not present field location needs to be set.
string location = 4
Optional: data store location where the model definition (code) is located
optional Framework framework = 5

Query specifies the input query for logs and evaluation metrics.

Used as request type in: Trainer.GetTrainingEMetrics, Trainer.GetTrainingLogs

Query.SearchType searchType = 1
At this time, the SearchType value should normally always be TERM.
optional MetaInfo meta = 3
At the minimum, the training_id and user_id must be specified in the meta substructure.
string since = 4
Timestamp, represented as the number of milliseconds since midnight January 1, 1970. Mutually exclusive with pos.
int32 pagesize = 5
Only get this many records
int64 pos = 6
The starting position. If positive or zero, count from the beginning; if negative, count from the end. Mutually exclusive with since.

Used in: Query

TERM = 0
NESTED = 1
MATCH = 2
ALL = 3

Used to specify resource requirements of a training job

Used in: Training

float cpus = 1
Number of CPU cores
float gpus = 2
Number of GPUs
float memory = 3
RAM
SizeUnit memory_unit = 4
float storage = 5
SizeUnit storage_unit = 6
int32 learners = 7
Number of learners
string schedpolicy = 8
string topology = 9
string architecture = 10
string gpu_type = 11
Optional. If not specified, job will be scheduled ONLY on nvidia-TeslaK80. Constraint strictly enforced: if, e.g., a nvidia-TeslaP100 is requested, the job will NOT start until a nvidia-TeslaP100 is available. Can only be nvidia-TeslaK80, nvidia-TeslaP100 or nvidia-TeslaV100.

Used in: ResourceRequirements

MB = 0
MiB = 1
GB = 2
GiB = 3

Used in: GetStatusIDResponse, HaltResponse, ResumeResponse, TrainingStatus, UpdateRequest

NOT_STARTED = 0
PENDING = 1
HALTED = 5
FAILED = 10
DEPLOY = 20
DOWNLOADING = 30
PROCESSING = 40
STORING = 50
COMPLETED = 60
QUEUED = 70

string training_id = 1
string user_id = 2
bool follow = 3

Used in: CreateRequest, Job

string command = 1
Command to execute during training
optional ResourceRequirements resources = 2
Resource requirements for the training
repeated string input_data = 3
Input and output data as data store references
repeated string output_data = 4
bool profiling = 5
whether we want to enable detailed profiling during the training

Used in: GetStatusResponse, Job

Status status = 1
string submission_timestamp = 3
string completion_timestamp = 4
string download_start_timestamp = 5
string process_start_timestamp = 6
string store_start_timestamp = 7
string status_message = 8
string error_code = 9

string training_id = 1
string user_id = 2
optional Metrics new_metrics = 3

string training_id = 1

Used as response type in: Trainer.GetModelDefinition, Trainer.GetTrainedModel

bytes data = 1

package grpc.trainer.v2

service Trainer

rpc CreateTrainingJob (CreateRequest, CreateResponse)

message CreateRequest

string user_id = 1

optional ModelDefinition model_definition = 2

optional Training training = 3

repeated Datastore datastores = 4

optional EMExtractionSpec evaluation_metrics = 5

message CreateResponse

string training_id = 1

rpc GetTrainingJob (GetRequest, GetResponse)

message GetResponse

optional Job job = 1

rpc GetAllTrainingsJobs (GetAllRequest, GetAllResponse)

message GetAllRequest

string user_id = 1

message GetAllResponse

repeated Job jobs = 1

rpc DeleteTrainingJob (DeleteRequest, DeleteResponse)

message DeleteRequest

string training_id = 1

string user_id = 2

message DeleteResponse

string training_id = 1

rpc HaltTrainingJob (HaltRequest, HaltResponse)

message HaltRequest

string training_id = 1

string user_id = 2

message HaltResponse

string training_id = 1

string user_id = 2

Status status = 3

rpc GetModelDefinition (ModelDefinitionRequest, stream ZippedDataChunk)

message ModelDefinitionRequest

string training_id = 1

string user_id = 2

rpc GetTrainedModel (TrainedModelRequest, stream ZippedDataChunk)

message TrainedModelRequest

string training_id = 1

string user_id = 2

rpc GetTrainedModelLogs (TrainedModelLogRequest, stream ByteStreamResponse)

message TrainedModelLogRequest

string training_id = 1

string user_id = 2

bool follow = 5

bool is_metrics = 3

bool is_summary = 4

message ByteStreamResponse

bytes data = 1

rpc GetTrainingLogs (Query, stream LogLine)

message LogLine

optional MetaInfo meta = 1

string line = 2

rpc GetTrainingEMetrics (Query, stream EMetrics)

message EMetrics

optional MetaInfo meta = 1

map<string, Any> etimes = 2

string grouplabel = 3

map<string, Any> values = 4

rpc GetVersions (GetVersionsRequest, Frameworks)

message GetVersionsRequest

message Frameworks

map<string, FrameworkDetailList> frameworks = 1

rpc GetTrainingStatusID (GetRequest, GetStatusIDResponse)

message GetStatusIDResponse

Status status = 1

rpc UpdateTrainingJob (UpdateRequest, UpdateResponse)

message UpdateRequest

string training_id = 1

string user_id = 2

Status status = 3

string status_message = 4

string error_code = 5

string timestamp = 6

message UpdateResponse

string training_id = 1

rpc ResumeTrainingJob (ResumeRequest, ResumeResponse)

message ResumeRequest

string training_id = 1