A 'device' is a physical entity in the system and comprises several resources.
Used in:
The name of the device.
The id of this device, unique in a single trace.
The resources on this device, keyed by resource_id.
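The Device fields above can be sketched as a plain Python structure; the field names here are assumptions inferred from the descriptions, not the actual proto definition:

```python
# Hypothetical sketch of a Device record; field names are assumptions
# based on the descriptions above, not the actual proto definitions.
device = {
    "name": "TPU:0",
    "device_id": 3,                 # unique within a single trace
    "resources": {                  # keyed by resource_id
        0: {"name": "step", "resource_id": 0},
        1: {"name": "op", "resource_id": 1},
    },
}
```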
Result proto for HloExtraInfoMap.
Used in:
A map from HLO name to HloExtraInfo.
Result proto for HloExtraInfo.
Used in:
Category of the HLO op given by the compiler.
The long name of the HLO that includes the dimensions.
The per-TPU-core batch size inferred from this HLO.
Result proto for host-dependent job information.
Used in:
The ID of the host that the job was run on.
The command line used to run the job.
The start time of the job on this host.
Result proto for host-independent job information.
Used in:
The change-list number of this build.
The time of this build.
The target of this build.
The types of host operations that are tracked.
Invalid host op.
Each host op type has two parts: (1) the stage where the op happens and (2) the op name. stage = Input Data Producer, op = Get Next Batch.
stage = Input Data Producer, op = Session Run.
stage = Input Data Producer, op = Forward Batch.
stage = Infeed Thread, op = Get Next Batch.
stage = Infeed Thread, op = Session Run.
stage = Infeed Thread, op = Forward Batch.
stage = Outfeed Thread, op = Get Next Batch.
stage = Outfeed Thread, op = Session Run.
stage = Outfeed Thread, op = Forward Batch.
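The enumeration above is a cross product of three stages and three op names, plus the invalid value. A small sketch, assuming the numeric values simply follow the listed order:

```python
from itertools import product

# Hypothetical reconstruction of the host-op enumeration described above;
# the numeric values are an assumption based on the listed order.
STAGES = ["Input Data Producer", "Infeed Thread", "Outfeed Thread"]
OPS = ["Get Next Batch", "Session Run", "Forward Batch"]

host_ops = {0: ("Invalid", None)}  # invalid host op
for value, (stage, op) in enumerate(product(STAGES, OPS), start=1):
    host_ops[value] = (stage, op)
```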
Used in:
Map from core id to HostOpsPerTpuStep.
Used in:
Map from hostname to a map from core id to HostOpsPerTpuStep.
Result proto for the host ops per TPU step.
Used in:
Whether the data in this message is valid.
The current TPU step number.
The beginning time of the current TPU step on the device in picoseconds.
The ending time of the current TPU step on the device in picoseconds.
For each possible host operation, maps to the difference between the TPU step number that the host op targets and the current TPU step number. The key is the HostOp; the value is the step difference.
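A minimal illustration of the step-difference map, with made-up op names and step numbers:

```python
# Hypothetical illustration of the step-difference map described above:
# for each host op, the targeted TPU step number minus the current step.
current_step = 12
host_op_target_steps = {        # op names here are made up
    "infeed.get_next_batch": 14,
    "outfeed.session_run": 11,
}

step_differences = {op: target - current_step
                    for op, target in host_op_target_steps.items()}
# a positive difference means the host op runs ahead of the device
```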
Result proto for the host ops for all TPU steps.
Used in:
A sequence of records with one for each TPU step. Each record is a map from hostname to a map from core id to HostOpsPerTpuStep.
Result proto for looping-related metrics.
Used in:
The total iteration time in nanoseconds.
The total number of iterations.
The total computation time in nanoseconds.
The total number of computations.
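From these four totals, per-iteration averages and a computation fraction can be derived; a small sketch with made-up values:

```python
# Illustrative derived looping metrics, assuming the field meanings above.
total_iteration_time_ns = 4_000_000
num_iterations = 200
total_computation_time_ns = 3_000_000
num_computations = 200

avg_iteration_ns = total_iteration_time_ns / num_iterations
compute_fraction = total_computation_time_ns / total_iteration_time_ns
```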
Result proto for OpMetricsDb.
Used in:
A collection of OpMetricsResults.
The total host infeed-enqueue duration in picoseconds.
The total of the difference between the start times of two consecutive infeed-enqueues (per host) in picoseconds.
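The second aggregate can be computed from per-host enqueue start times; a sketch with illustrative data:

```python
# Sketch of the "difference between start times of two consecutive
# infeed-enqueues" aggregate, per host, in picoseconds (data is made up).
enqueue_starts_ps = {"host0": [100, 250, 430], "host1": [90, 300]}

total_gap_ps = sum(
    later - earlier
    for starts in enqueue_starts_ps.values()
    for earlier, later in zip(starts, starts[1:])
)
# host0 gaps: 150 + 180; host1 gap: 210
```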
Result proto for OpMetrics.
Used in:
True if this OP is executed on the device; False if it is executed on the host.
Name of this OP.
Rank of this OP.
The starting time in cycles of the last instance of this OP executed.
The ending time in cycles of the last instance of this OP executed.
If this OP (say A) is an immediate child of another OP (say B), this field stores the sum of the durations in microseconds of A inside B. If A appears more than once in B, the durations of all of A's appearances are added together. This sum is reset after the self-time of B is calculated so that it can be reused for a new parent OP.
Number of times this OP occurred.
Total time in microseconds spent in this OP (accumulated over all of its occurrences).
Total self time in microseconds spent in this OP (accumulated over all of its occurrences).
The total self time as a fraction of sum of all OP's total self time on the host.
Cumulative total self time in fraction on the host.
The total self time as a fraction of sum of all OP's total self time on the device.
Cumulative total self time in fraction on the device.
Total number of FLOPs incurred by this OP.
Total number of bytes accessed by this OP.
Total time in microseconds that special hw unit 1 is occupied by this OP.
Total time in microseconds that special hw unit 2 is occupied by this OP.
Total memory stall time in microseconds.
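How the self-time fractions and cumulative fractions above relate can be sketched as follows (op names and times are made up, and sorting by descending self time is an assumption):

```python
# Sketch of deriving per-OP self-time fractions and cumulative fractions.
ops = [("matmul", 60.0), ("conv", 30.0), ("add", 10.0)]  # (name, self time us)
total = sum(t for _, t in ops)

cumulative = 0.0
rows = []
for name, self_time in sorted(ops, key=lambda x: -x[1]):
    fraction = self_time / total
    cumulative += fraction
    rows.append((name, fraction, cumulative))
# the final cumulative fraction reaches 1.0 (within floating-point error)
```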
A 'resource' generally is a specific computation component on a device. These can range from threads on CPUs to specific arithmetic units on hardware devices.
Used in:
The name of the resource.
The id of the resource. Unique within a device.
Result proto for RunEnvironment (the run environment of a profiling session).
Used in:
Number of hosts used.
The type of TPU used.
The number of TPU cores used.
The per-TPU-core batch size.
Host-independent job information.
Host-dependent job information.
Result proto for a StepDatabase.
Used in:
A map from core_id to StepSequenceResult.
Result proto for StepInfo.
Used in:
The (micro) step number.
The step duration in picoseconds.
The infeed duration in picoseconds. Can turn into a map if we want a variable number of ops.
The start time of this step in picoseconds.
The waiting time within this step in picoseconds.
The time spent on cross-replica-sum in picoseconds.
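Assuming the infeed, waiting, and cross-replica-sum durations are disjoint parts of the step (an assumption, not stated above), the remaining compute time can be derived; a sketch with made-up values:

```python
# Illustrative breakdown of one step; all values in picoseconds (made up).
step_duration_ps = 1_000_000
infeed_duration_ps = 150_000
wait_duration_ps = 50_000
crs_duration_ps = 100_000    # cross-replica-sum

# Assumes the three components are disjoint within the step.
compute_ps = (step_duration_ps - infeed_duration_ps
              - wait_duration_ps - crs_duration_ps)
```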
Result proto for a sequence of steps.
Used in:
A sequence of StepInfoResults.
The TPUEmbeddingConfiguration contains specification of TPU Embedding lookups and gradient updates separate from the TF Graph.
num_hosts is the number of host CPU systems in the training/inference job. Each embedding table must be sharded into num_hosts separate Variables, placed separately on the num_hosts CPU devices in the cluster. Sharding will be performed equivalently to the 'div' sharding_strategy option of embedding_lookup() and embedding_lookup_sparse().
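The 'div' sharding strategy assigns contiguous row ranges to shards, with the first num_rows % num_hosts shards receiving one extra row; this matches the documented behavior of embedding_lookup's 'div' partition strategy. A sketch of that assignment:

```python
def div_shard(num_rows, num_hosts):
    # 'div' sharding: contiguous row ranges; the first (num_rows % num_hosts)
    # shards get one extra row each.
    base, extra = divmod(num_rows, num_hosts)
    shards, start = [], 0
    for i in range(num_hosts):
        size = base + (1 if i < extra else 0)
        shards.append(range(start, start + size))
        start += size
    return shards
```

For example, 13 rows over 5 hosts split as [0-2], [3-5], [6-8], [9-10], [11-12].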
The total number of TensorNodes. This is equal to num_hosts times the number of TensorNodes attached to each host.
The number of training examples per TensorNode.
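Combining these fields, a global batch size can be derived; a small sketch with assumed values (the derivation itself is an inference from the descriptions above):

```python
# Illustrative relationship between hosts, TensorNodes, and batch size.
num_hosts = 4
tensor_nodes_per_host = 2
num_tensor_nodes = num_hosts * tensor_nodes_per_host

batch_size_per_tensor_node = 16
global_batch_size = num_tensor_nodes * batch_size_per_tensor_node
```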
Used in:
Used in:
model_mode specifies whether the model is to be run in training or inference. In inference mode, gradient updates to embedding tables are not performed.
Used in:
Each Embedding
Used in:
Name of the embedding table. This will be used to name Variables in the Tensorflow Graph.
Number of rows of the embedding table. The Variable created to hold the learned embedding table values will have shape (num_rows, width).
Width of the embedding table. The Variable created to hold the learned embedding table values will have shape (num_rows, width).
Number of distinct embedding activation vectors per training example produced by lookups into this table during model evaluation. For each table, the Graph will receive an activations Tensor of shape (batch_size * table.num_features, table.width). For example, num_features = 1 produces equivalent behavior to a single tf.nn.embedding_lookup() call. In the case of 'multivalent' embeddings, (i.e. tf.nn.embedding_lookup_sparse()) which compute weighted averages of embedding table rows, num_features is the number of vectors produced after averaging. In sequence models num_features is typically equal to the sequence length, since each sequence element must be represented separately to the convolutional or recurrent network.
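The activations shape described above, computed for illustrative values:

```python
# The Graph receives an activations Tensor of shape
# (batch_size * num_features, width); values here are made up.
batch_size = 32     # training examples per TensorNode
num_features = 3    # e.g. a sequence length of 3
width = 128

activations_shape = (batch_size * num_features, width)
```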
Result proto for TfStatsHelper.
The result for the TF-metric database.
The result for the HLO-metric database.
The result for the step database.
The result for the looping-related metrics.
The result for the HloExtraInfoMap.
Overall matrix unit utilization in percentage.
The run environment of this profiling session.
The result for the host operations.
A map from core ID to name.
Describes the geometry of a TPU mesh.
The dimensions of the TPU topology, in cores. Typically, this is a 3D topology [x, y, core], where the major dimensions correspond to TPU chips, and the minor dimension describes the number of cores on a multicore chip.
Number of TensorFlow tasks in the cluster.
Number of TPU devices per task.
A flattened rank 3 int32 array with shape [num_tasks, num_tpu_devices_per_task, len(mesh_shape)]. `tasks` is the number of tasks in the TPU cluster, `devices` is the number of TPU devices per task, and the minor dimension corresponds to a position in the TPU mesh topology. Each entry [task, device, axis] gives the `axis`-th coordinate in the topology of a task/device pair.
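Indexing the flattened array can be sketched as follows (the coordinates are made up):

```python
# Sketch of indexing the flattened rank-3 device_coordinates array.
num_tasks, devices_per_task, mesh_rank = 2, 2, 3
# Flattened [task, device, axis] -> coordinate; coordinates are made up.
device_coordinates = [
    0, 0, 0,  0, 0, 1,   # task 0, devices 0 and 1
    1, 0, 0,  1, 0, 1,   # task 1, devices 0 and 1
]

def coordinate(task, device, axis):
    # Row-major flattening of the [task, device, axis] index.
    return device_coordinates[(task * devices_per_task + device) * mesh_rank + axis]
```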
A 'Trace' contains metadata for the individual traces of a system.
The devices that this trace has information about. Maps from device_id to more data about the specific device.
All trace events captured in the profiling period.
Used in:
The id of the device that this event occurred on. The full dataset should have this device present in the Trace object.
The id of the resource that this event occurred on. The full dataset should have this resource present in the Device object of the Trace object. A resource_id is unique on a specific device, but not necessarily within the trace.
The name of this trace event.
The timestamp at which this event occurred (in picoseconds since tracing started).
The duration of the event in picoseconds if applicable. Events without duration are called instant events.
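Hypothetical TraceEvent-like records as Python dicts (field names are assumptions); per the description above, an event without a duration is an instant event:

```python
# Hypothetical TraceEvent-like records; field names are assumptions.
events = [
    {"device_id": 1, "resource_id": 0, "name": "step",
     "timestamp_ps": 100, "duration_ps": 5000},
    {"device_id": 1, "resource_id": 0, "name": "marker",
     "timestamp_ps": 200},  # no duration: an instant event
]

def is_instant(event):
    # Events without a duration are instant events.
    return "duration_ps" not in event
```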