The active memory allocations at the peak memory usage.
Used in:
The index of a snapshot in the time-sorted list, used to fetch the MemoryActivityMetadata at the front end from the memory_profile_snapshots list.
The index of MemoryActivityMetadata in the special_allocations list.
Number of occurrences for identical memory allocations.
Result database for all-reduce ops.
Used in:
Result proto for all-reduce ops.
Used in:
Unique id for all-reduce ops.
The name of the hlo op. This field is no longer set by the profiler.
For all-reduce nodes from different modules, if they have the same all_reduce_id, they will be 'AllReduce'd. If empty, AllReduce will not be applied across modules.
The start time in picoseconds of the op event.
The end time in picoseconds of the op event.
The size of the op in bytes.
Used in:
Name of this OP.
Number of times this OP occurred.
The time in microseconds spent in this OP (averaged across all of its occurrences).
Byte size of data transferred.
Replica groups.
Description (e.g. XLA expression).
Detail of a batch. Next ID: 13
Used in:
Batch id.
Start time of the batch in picosecs.
End time of the batch in picosecs.
The latency between the start time of the first request in this batch and the time this batch is processed.
Request ids related to this batch.
Size of padding.
Size of a batch after padding.
Model ID of this batch. This is the same model_id as any of the requests in this batch. All the requests from the same batch must share the same model_id.
Tensor event details.
Host index for this batch.
Percentile of this batch in all batches in the profile duration.
Total time in picosecs in this batch spent on device.
Batching parameters collected from TFstreamz.
Used in:
Number of batch threads.
How long a request can wait before being processed by a batch.
Maximum size of a batch.
Maximum number of enqueued batches.
Sizes that are allowed to form a batch. A list of integers separated by ",".
Generic hardware bottleneck.
Percentage of step time that is spent on input.
Percentage of step time that is spent on output.
Percentage of step time that is idle for non-I/O-related reasons.
Percentage of step time that is spent on compute.
Indicates if input is a bottleneck. Possible values: "host", "device", "both", or "unknown"
A human-readable description of the input bottleneck.
Indicates if kernel launching is a bottleneck. Possible values: "no", "moderate", "high".
A human-readable description of the kernel launching overhead.
Indicates if all other is a bottleneck. Possible values: "no", "moderate", "high".
A human-readable description of the all other overhead.
Indicates if device collective communication is a bottleneck. Possible values: "no", "moderate", "high".
A human-readable description of the device collective communication overhead.
Used in:
Describes the start / exclusive limit HLO program points for a given buffer lifetime, used for rendering a box on the plot.
Used in:
Next ID: 14 Information about a send and recv channel.
Used in:
Id of the channel.
Core ids of send ops.
Core ids of recv ops.
Byte size of the data transferred.
Duration from the beginning of send to the end of recv-done in microseconds.
Number of occurrences of a channel.
Percentage of the link BW utilized over the peak link BW.
A list of hlo names associated with this channel id.
Duration from the beginning of the recv-done to the beginning of send in microseconds. If the recv-done op starts after the beginning of the send op, the delay is zero.
Description (e.g. XLA expression).
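As a rough illustration of how the delay and link-utilization fields above relate to the raw op timestamps, here is a minimal Python sketch; all names (send_start_us, recv_done_start_us, peak_link_bw_gbps, ...) are illustrative assumptions, not fields of the proto.

    def channel_delay_us(send_start_us, recv_done_start_us):
        # Delay from the beginning of recv-done to the beginning of send;
        # zero if recv-done starts after send has already begun.
        return max(0.0, send_start_us - recv_done_start_us)

    def link_bw_utilization(bytes_transferred, duration_us, peak_link_bw_gbps):
        # Achieved bandwidth in Gbit/s: bits per microsecond divided by 1e3.
        achieved_gbps = bytes_transferred * 8 / (duration_us * 1e3)
        return achieved_gbps / peak_link_bw_gbps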
TfDataStats of all hosts.
Whether it is input bound.
Summary of the analysis.
Bottleneck analysis result.
TfDataStats per host.
Next ID: 8
Used in:
unique within host, TPU core only
unique within chip per core type
unique within host
unique within mesh
unique within mesh, TPU core only
This proto is based on MegaScaleInfoProto and should be consistent with it.
The type of DCN transfer.
Groups of endpoints (in the form of slice id and device id) involved in `ALL_TO_ALL`, `REDUCE_SCATTER`, `ALL_REDUCE` and `ALL_GATHER` transfer.
Groups of endpoints (in the form of slice id and device id) involved in `ONE_TO_ONE` transfer.
Used in:
Used in:
Used in:
Used in:
XLA AllToAll transfer. Needs `endpoint_groups`.
Peer-To-Peer DCN transfer from source to one destination. Needs `one_to_one_groups`.
XLA reduce-scatter transfer. Needs `endpoint_groups`.
XLA AllGather transfer. Needs `endpoint_groups`.
XLA all-reduce transfer. Needs `endpoint_groups`.
XLA ragged all-to-all transfer. Needs `endpoint_groups`.
Used in:
XProf observed send start time.
XProf observed recv_done end time.
Slack is defined as the time the collective has to send and recv data without stalling the TPU. The effects of the network and of other overlapping collectives are removed from the collective of interest.

HOST 1: |--------|SEND1|-------|SEND1.DONE|-------|RECV1|------|RECV1.DONE|-------
HOST 2: |------|SEND2|-------|SEND2.DONE|-------|RECV2|------|RECV2.DONE|-----

Slack is computed as RECV2.DONE.StartTime - SEND2.StartTime - (overlapping communication). In this case, the overlapping communication is the duration of SEND2, SEND2.DONE and RECV2. In cases where other collectives are interspersed between this collective's ops, the overlapping duration includes their durations as well. Host 1 is ignored while computing the slack, as we assume that similar ops are executing on each core. This also prevents clock drift from affecting the analysis.
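In code form, the slack definition above reduces to a one-liner; this hedged Python sketch assumes all three quantities are in the same time unit and that overlapping_comm already sums SEND2, SEND2.DONE, RECV2 and any interspersed collectives.

    def collective_slack(recv_done_start, send_start, overlapping_comm):
        # Slack = RECV2.DONE.StartTime - SEND2.StartTime - overlapping communication.
        return recv_done_start - send_start - overlapping_comm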
Duration the collective stalled the TPU.
Recv op name.
Send op name.
Timestamp for the send/send-done/recv/recv-done ops.
Used in:
Rendezvous name for the collective.
Slack time in microseconds.
Number of occurrences in the sampled duration.
Bytes transmitted over the network.
Duration the collective stalled the TPU.
Observed duration.
Recv op name.
Send op name.
Stall duration based on the op.
A 'device' is a physical entity in the system and comprises several resources.
Used in:
The name of the device.
The id of this device, unique in a single trace.
The resources on this device, keyed by resource_id.
Bytes/s.
Information about memory transfer to/from device memory.
Used in:
Used in:
DmaActivity can be used to add DMA details to a trace event.
Used in:
Temporary field, not saved to .sstable.
Used in:
Indicates if kernel launch is a performance bottleneck. Possible values: "no", "moderate", "high".
A statement that recommends if we need to further investigate kernel-launch performance.
Indicates if all other is a performance bottleneck. Possible values: "no", "moderate", "high".
A statement that recommends if we need to further investigate all-other performance.
A statement that recommends if the user should try using lower precision. Shows this statement to users only if it is not empty.
Indicates if device collectives are a performance bottleneck. Possible values: "no", "moderate", "high".
A statement that recommends if we need to further investigate device-collectives performance.
Breakdown of step-time on generic hardware. Note that these components are mutually exclusive so that adding them together is equal to the step time. If an execution time interval has multiple types of event happening, we need to pick one of the event type to attribute the time interval to.
Map event type to the accumulated duration in picoseconds of that type.
Map of string category to accumulated duration in picoseconds for that category.
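One way to picture the mutual-exclusivity rule above: attribute every instant of the step to exactly one event type, so the per-type durations sum back to the step time. A hypothetical Python sketch (the priority rule is an assumption, not the profiler's actual tie-breaking logic):

    def attribute_step_time(active_types_per_ps, priority):
        # active_types_per_ps: for each picosecond of the step, the set of
        # event types active at that instant.
        durations = {}
        for active in active_types_per_ps:
            # Pick a single winner per instant so types never overlap.
            winner = min(active, key=priority.index) if active else "IDLE"
            durations[winner] = durations.get(winner, 0) + 1
        # Mutually exclusive durations sum exactly to the step time.
        assert sum(durations.values()) == len(active_types_per_ps)
        return durations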
Summary of all unknown time as a part of step in ms.
Summary of all host-wait-input time as a part of step in ms.
Summary of all host-to-device time as a part of step in ms.
Summary of all input time as a part of step in ms.
Summary of all output time as a part of step in ms.
Summary of all device-compute time as a part of step in ms.
Summary of all device-to-device time as a part of step in ms.
Summary of all device-collectives time as a part of step in ms.
Summary of all host-compute time as a part of step in ms.
Summary of all host-prepare time as a part of step in ms.
Summary of all compilation time as a part of step in ms.
Types of hardware profiled.
Used in:
Unknown hardware.
CPU only without any hardware accelerator.
GPU.
TPU.
Describes a heap object that is displayed in a plot in the memory visualization HTML.
Used in:
Result proto for host-dependent job information.
Used in:
The ID of the host that the job was run on.
The command line used to run the job.
The start time of this run (nanoseconds since the Unix epoch).
BNS address specified by client at time of profiling request.
Profiling start walltime (in ns).
Result proto for host-independent job information.
Used in:
The change-list number of this build.
The time of this build (nanoseconds since the Unix epoch).
The target of this build.
Profiling duration (in ms).
Proto consumed by inference analysis.
Map from host-id to the InferenceStats for that host.
Map from model-id to the InferenceStats for that model.
A database of model ids.
A database of tensor patterns.
Used in:
The Op's name.
The number of occurrences.
Time (accumulated over all occurrences) in milliseconds.
Time (accumulated over all occurrences) in percentage of the total input processing time.
Self time (accumulated over all occurrences) in milliseconds.
Self time (accumulated over all occurrences) in percentage of the total input processing time.
Possible categories: "Enqueue", "Advanced file read", "Demanded file read", "Preprocessing", "Unknown".
Used in:
A list of detailed recommendations.
An analysis of different types of bottlenecks. Can be unpacked into a BottleneckAnalysis.
A suggested step to take next.
Used in:
Tag indicating the format of step_details and step_time_breakdown; true for TPU-specific data models.
Hardware type.
Summary of all step duration across all cores.
Summary of all input-related stall as percentage of step duration.
Percentage of step time that is waiting for input.
Percentage of step time that is doing output.
Percentage of step time that is idle for non-I/O-related reasons.
Percentage of step time that is doing compute.
Details of each step. Can be unpacked into a PerGenericStepDetails.
The breakdown of the input processing time.
Details of each input Op executed.
Recommendation for next steps to users.
Breakdown of the step time. Can be unpacked into a GenericStepTimeBreakdown.
Error and warning messages for diagnosing profiling issues.
Metadata for input pipeline.
Used in:
Id of the input pipeline which is set to the id of its root iterator.
The distribution strategy creates one "host" input pipeline which actually runs tf.data user code. Also, it creates a "device" input pipeline per device (e.g., TensorCore) which takes an element from the host input pipeline and transfers it to the device.
Used in:
Stat and metadata for input pipeline.
Used in:
Id of the blocking iterator with the longest self time.
Latency of the bottleneck iterator.
Stats per iterator.
Collection of metadata and stats of input pipeline.
Used in:
Metadata of the input pipeline.
Average latency (i.e., the root iterator's latency) of the input pipeline.
Minimum latency of the input pipeline.
Maximum latency of the input pipeline.
The number of times this input pipeline was slower than 50 us.
Stats per call sorted by the root iterator's duration.
Used in:
Time spent on demanded file read in microseconds.
Time spent on advanced file read in microseconds.
Time spent on data preprocessing in microseconds.
The infeed enqueue time in microseconds.
This entry is for the situation where we can't further break down the non-enqueue input time (because the input pipeline is not instrumented).
Metadata for iterator.
Used in:
Id of the iterator.
Id of the parent iterator.
Name of the iterator.
Long name of the iterator.
Whether it is an async iterator.
Parameters of the iterator (e.g., num_parallel_calls).
Stat for iterator.
Used in:
Id of the iterator.
Start time of the iterator's GetNext in ps.
Duration of the iterator's GetNext in ps.
Self time of the iterator's GetNext in ps. It takes async iterators into account: it is calculated by subtracting the time overlapped with its child iterator's duration from the iterator's duration.
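A simplified Python sketch of that self-time calculation, assuming the child GetNext intervals have already been merged and clipped to the parent's window (names are illustrative):

    def self_time_ps(duration_ps, child_intervals):
        # Subtract time overlapped with child iterators from the duration.
        # child_intervals: non-overlapping (start, end) pairs in ps, clipped
        # to the parent's own GetNext window.
        overlapped = sum(end - start for start, end in child_intervals)
        return duration_ps - overlapped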
Whether it is blocking the root iterator. An async iterator's child iterator may not block its parent iterator if it is executed in advance and does not overlap with the parent iterator.
The number of times this iterator is called. For example, a batch iterator's child iterator may be called multiple times.
Next ID: 15
Used in:
Name of the kernel.
Registers per thread.
Static shared memory in bytes.
Dynamic shared memory in bytes.
Block dimensions.
Grid dimensions.
Total duration of this kernel.
Min duration of kernel in nanoseconds.
Max duration of kernel in nanoseconds.
Kernel utilizes TensorCore instructions.
Operation is eligible to use TensorCores.
TF operation name.
Number of occurrences.
Occupancy pct.
Used in:
A list of kernels aggregated by name.
Data layout of an op.
Used in:
The physical data layout, from most-minor to most-major dimensions.
Physical data layout in each tensor dimension.
Used in:
Size of the data in this dimension.
Data must be padded to a multiple of alignment.
What the dimension represents.
What the dimension represents, e.g. spatial, feature or batch.
Used in:
Used in:
The logical topology of the job.
The slices that are part of the job.
The network address of a specific host.
Used in:
Logical metadata about a specific device.
Used in:
The id that uniquely identifies the device globally.
The id that uniquely identifies the device within its slice.
The id that uniquely identifies the device within its host.
Logical metadata about a specific host.
Used in:
The id that uniquely identifies the host within its slice.
The network addresses of the host.
The devices that are connected to this host.
Logical metadata about a specific slice.
Used in:
The id that uniquely identifies the slice globally.
The hosts that are part of this slice.
Types of memory bandwidth we track in the system.
We use FIRST and LAST enum values to be able to iterate over this enum in TypeScript, since the _MIN and _MAX values are not automatically available as in C++.
Aggregated BW across on-chip and off-chip memory. For GPU, 1/2 is shared memory bandwidth.
On-chip memory read bw.
On-chip memory write bw.
Leave last. Leave this MAX unchanged now to avoid op profile changes. TODO(b/359279074) Revisit the memory breakdown in op profile since we have more memory types now.
A container to serialize this repeated field in "symbolized xplane."
The memory activity that causes change of memory state.
Used in:
Memory allocation in heap.
Memory deallocation in heap.
Memory reservation for stack.
Expansion of existing memory allocation.
The metadata associated with each memory allocation/deallocation. It can also be interpreted as the metadata for the delta of memory state. Next ID: 10
Used in:
The activity associated with the MemoryProfileSnapshot.
The requested memory size in bytes from the caller of memory allocation. Should be a positive number.
The allocated (block/chunk) size for the memory allocation. Should be a positive number.
Starting address of the allocated memory chunk/block.
TensorFlow Op name for the memory activity.
Step Id at which the memory activity occurred.
Tensor memory region type including "output", "temp", "persist", and "dynamic".
From enum DataType defined in tensorflow/core/framework/types.proto.
Tensor shape printed in string, e.g. "[3, 3, 512, 512]".
The aggregated memory stats including heap, stack, free memory and fragmentation at a specific time.
Used in:
Memory usage by stack reservation, in bytes.
Memory usage by heap allocation, in bytes.
Free memory available for allocation or reservation, in bytes.
Fragmentation value within [0, 1].
The peak memory usage over the entire program (lifetime of memory allocator). It increases monotonically, with the memory capacity as its upper limit.
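The proto doesn't define the fragmentation formula here; a common definition (e.g., the one TensorFlow's BFC allocator statistics use) compares the largest free block to the total free memory. A hedged Python sketch under that assumption:

    def fragmentation(largest_free_block_bytes, free_memory_bytes):
        # 0.0 when all free memory is one contiguous block; approaches 1.0
        # as free memory splinters into many small chunks.
        if free_memory_bytes == 0:
            return 0.0
        return 1.0 - largest_free_block_bytes / free_memory_bytes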
Data for memory usage analysis in one host.
A map from memory allocator's id to PerAllocatorMemoryProfile for memory usage analysis on this host.
Number of hosts profiled, used to populate the host selection list at the front end.
Ids for profiled memory allocators, used to populate the memory selection list at the front end.
Version number of MemoryProfile proto.
Profile snapshot of the TensorFlow memory at runtime, including MemoryAggregationStats (memory usage breakdown etc.), and MemoryActivityMetadata (allocation or deallocation, TF Op name etc.).
Used in:
Memory activity timestamp.
The memory aggregation stats at the snapshot time.
The metadata for the memory activity at the snapshot time.
The summary of memory profile within the profiling window duration.
Used in:
The peak memory usage over the entire program (lifetime of memory allocator).
The peak memory usage stats within the profiling window.
The timestamp for peak memory usage within the profiling window.
The memory capacity of the allocator.
TensorFlow generic memory space names. These space names are used in analysis code to get memory bandwidth per core.
Off-chip memory. Assume all backends use 1 for HBM/off-chip memory.
On-chip memory.
Any memory.
Model ID database. An unknown model id will be "" and won't be stored here, so if the model id is not found in the TF-session metadata, ModelIdDatabase will be empty.
Used in:
Array of model ids.
Map from id to index.
Map from id to batching parameters.
Used in:
Metrics for an operation (accumulated over all occurrences). Next ID: 27
Used in:
HLO module id. 0 for Framework ops.
Name of this op.
Long name of this op (e.g., HLO expression).
Category of this op (e.g., HLO op category, framework op type). Could be parsed from provenance if it is a framework op.
Provenance of this op if it is an HLO Op. (e.g. TF Op name, JAX Op name) TODO(b/310434797) Extends this for JAX as now only TF Op is populated.
Whether it is executed eagerly.
Number of executions.
Total time (self + children) in picoseconds.
Minimum time (self + children) among all occurrences.
Total self time in picoseconds.
Total FLOPs, normalized to the device's peak bandwidth.
Total FLOPs for the model. Can be 0, in which case assume it is the same as flops.
Fingerprint of the symbol (cs/xla::HloPrintOptions::Fingerprint); if 0, the fingerprint is not set.
Total bytes accessed.
Total dma stall time in picoseconds.
The data layout for this op. Only set for convolution ops for now.
Deduplicated HLO name for this op. Not set for TF ops.
Children of the op. e.g. fused ops if this op is fusion.
Number of cores on which this op occurs.
Computation primitive size in BITS. This is the size of the type of the hardware computation. In the future this may be extended to include info such as signed/unsigned, int/fp, etc. Currently only the size is needed.
Whether the op is autotuned.
Breakdown of memory accessed by operation type and memory space.
Used in:
Device-specific id of memory space.
Used in:
A database for OpMetrics. Next ID: 16
Used in:
A bunch of OpMetrics.
The total host infeed-enqueue duration in picoseconds.
The total of the difference between the start times of two consecutive infeed-enqueues (per host) in picoseconds.
The total time in picoseconds.
The total time incurred by OPs in picoseconds.
Precision-related stats.
The two stats below will be different from the total time ps and total op time ps because they are unioned over all cores (and not summed). For duty cycle, a device is idle if all the cores are idle.
For duty cycle, a device is busy if any of the cores is busy.
Next ID: 14 Operator Statistics.
The database for the op metrics collected from the host over the entire profiling session including incomplete steps.
The database for the op metrics collected from the device over the entire profiling session including incomplete steps.
The result for the HLO-metric database over the complete steps only.
Performance environment of the op metrics collected.
The database of step sequences.
The run environment of this profiling session.
Kernel stats results from all GPUs.
Statistics for all tf-functions.
A map from core ID to details.
Error and warning messages for diagnosing profiling issues.
A map from program ID to program name.
Performance counters.
Overview result for the inference query latency stats.
Used in:
The percentile numbers that the inference query latency distribution should follow. E.g., 50.0 means 50%ile. Default is [50.0, 75.0, 90.0, 99.0, 99.9].
Total and breakdown of a certain percentile latency. Each element corresponds to element with the same index in percentile_numbers.
Max latency in microseconds.
Min latency in microseconds.
Inference sessions per second aggregated over all hosts. There can be multiple queries batched in one session.
Total and breakdown latency for inference query(s). Breakdown into host/device/communication.
Used in:
The run environment of the profiled session.
The step-time result.
The other analysis result.
The recommendation made to the user.
Error and warning messages for diagnosing profiling issues.
The inference query latency stats.
Overview result for general analysis.
Used in:
MXU utilization in percentage.
Percentage of the device time that is idle.
Percentage of the host time that is idle.
Top TF Ops executed on the device.
Remark text in the performance summary section.
Color of the remark text.
FLOP rate utilization relative to the roofline in percentage.
Memory bandwidth utilization relative to the hw limit in percentage.
Percentage of device computation that is 16-bit.
Percentage of device computation that is 32-bit.
Percentage of TF ops executed on the host.
Percentage of TF ops executed on the device.
Host trace level.
Percentage of TF-op execution time on the host (excluding the idle time) that are in eager mode.
Percentage of TF-op execution time on the device (excluding the idle time) that are in eager mode.
Percentage of TF-op execution time on the device (excluding the idle time) that are for outside compilation.
Percentage of the device time that is in use.
Program Goodput metric in percentage.
Average SparseCore step time in ms.
Average SparseCore infeed time in ms.
Average SparseCore outfeed time in ms.
Average SparseCore idle time in ms.
Max FW VDD Core PL1 power metrics in watts.
Max FW VDD Core PL2 power metrics in watts.
Max FW VDD Core PL3 power metrics in watts.
Max FW VDD Core PL4 power metrics in watts.
Max FW HBM PL1 power metrics in watts.
Max FW HBM PL2 power metrics in watts.
Max FW HBM PL3 power metrics in watts.
Max FW HBM PL4 power metrics in watts.
Result proto for host-dependent job information.
Used in:
The ID of the host that the job was run on.
The command line used to run the job.
The start time of this run (nanoseconds since the Unix epoch).
BNS address specified by client at time of profiling request.
Profiling start walltime (in ns).
Result proto for host-independent job information.
Used in:
The change-list number of this build.
The time of this build (nanoseconds since the Unix epoch).
The target of this build.
Profiling duration (in ms).
Overview result for the recommendation section.
Used in:
Possible performance bottleneck: "host", "device", "both".
A statement for input that recommends the next steps for investigating the bottleneck.
A list of tips for tackling input bottleneck.
A statement for output that recommends the next steps for investigating the bottleneck.
A statement that recommends the next steps for investigating eager-mode related bottlenecks (it is HTML so that it can link to other tools/docs).
A statement that recommends the next steps for investigating outside-compilation related bottlenecks (it is HTML so that it can link to other tools/docs).
A statement that recommends the next steps for investigating tf-function related bottlenecks (it is HTML so that it can link to other tools/docs).
A list of tips for improving host performance.
A list of tips for improving device performance.
A list of links to related useful documents.
The recommendation made to the user. Can be unpacked into a GenericRecommendation.
A list of tips for FAQ.
A list of tips for inference run.
The run environment of a profiling session.
Used in:
Number of hosts used.
Number of tasks used.
Distinct hostnames seen.
The type of device used.
The number of device cores used. In the TPU case, this corresponds to the number of TPU cores. In the GPU case, this corresponds to the number of GPUs (not the number of SMs).
Host-independent information about this job.
Host-dependent information about this job.
The number of replicas; corresponds to input parallelism. If there is no model parallelism, replica_count = device_core_count.
The number of cores used for a single replica, e.g. model parallelism. If there is no model parallelism, then num_cores_per_replica = 1.
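The two fields above are tied to device_core_count by a simple identity; a small worked Python example (the numbers are made up):

    # device_core_count = replica_count * num_cores_per_replica
    device_core_count = 8
    num_cores_per_replica = 2          # e.g. a model sharded across 2 cores
    replica_count = device_core_count // num_cores_per_replica
    assert replica_count == 4          # input parallelism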
Whether it is a training analysis or inference analysis.
Power Metrics for TPU.
Overview result for a performance tip to users.
Used in:
Link to the tip.
Overview result for a TensorFlow Op.
Used in:
Name of the Op.
Category of the Op.
The amount of time that this Op takes by itself as fraction of the total execution time on the device or host.
The cumulative time up to this Op as fraction of the total execution time.
How many GFlops/sec that this Op achieves.
Whether the Op is eligible to use TensorCores.
Whether at least one of the kernels launched in this op is using TensorCore.
Memory profile snapshots per memory allocator.
Used in:
A list of MemoryProfileSnapshots referenced by <active_allocations>.
The summary of memory profile (e.g. the peak memory usage).
The rows in the table of active allocations at peak memory usage within profiling window.
The special allocations (e.g. pre-allocated heap memory, stack reservation) that are not captured in the MemoryActivityMetadata of memory_profile_snapshots. Need to handle separately.
A list of MemoryProfileSnapshots sampled from all the snapshots during the profiling window. It is used to display the memory timeline graph in the frontend. The snapshots are sorted by timestamp.
Aggregated result per batch size.
Used in:
Result proto for information in a step across all cores.
Used in:
The step number.
A map from core_id to StepInfo.
The result for the per-step HLO-metric database.
A map from core ID to program replica id. Replica id map could change during a profile session, but should stay stable within a step.
A map from core_id to all-reduce ops.
Information about device memory transfers, categorized by source and destination. Ordered by the following categories: 1. HostToDevice, 2. DeviceToHost, 3. DeviceToDevice. Cores normally share host interfaces (i.e., PCIe).
Per-step details on generic hardware.
The step number of a step.
The step name.
The step time (in ms).
Breakdown of the step time into different event categories. The unknown time (in ms).
The time (in ms) in which the host is waiting for input data to be ready.
The time (in ms) in which the host is sending input data to the device. Total input time = host_wait_input_ms + host_to_device_ms.
The output time (in ms).
The device-compute time (in ms).
The device-to-device communication time (in ms).
The device time spent on collective communications (in ms).
The host-compute time (in ms).
The host-prepare time (in ms).
The time spent on compiling (in ms).
Per-host data for inference analysis.
Used in:
A list of requests selected for inference analysis on this host. This list is in ascending order of the request duration.
A list of batches selected for inference analysis on this host. This list is in ascending order of the batch duration.
Per-model data for inference analysis.
Used in:
A list of requests selected for inference analysis on this model. This list is in ascending order of the request duration.
Aggregated result from all the <request_details>.
Inference requests per second for this model.
Average latency in microseconds of the requests in this model.
A list of batches selected for inference analysis on this model. This list is in ascending order of the batch duration.
Aggregated result from all the <batch_details>.
Batches per second for this model.
Average latency in microseconds of the batches in this model.
The aggregated result of tensor transfer in this model.
Aggregated result per batch size.
Per-step details on TPU. Next ID: 26
The step number of a step.
The TensorCore compute time in this step.
The maximum TensorCore idle time that is due to host overhead (but not input-related).
The part of a step (in ms) TC spends sending data to the host via outfeed.
The part of a step (in ms) on TC that is waiting for input data from the host.
Average infeed-dequeue time across cores (as percentage of step time).
Minimum infeed-dequeue time across cores (as percentage of step time).
Maximum infeed-dequeue time across cores (as percentage of step time).
The core with the maximum infeed time in this step.
The name of the core with the maximum infeed time in this step.
The part of a step (in ms) that is spent on the all-reduce compute.
The part of a step (in ms) that is spent on the all-reduce synchronization.
The part of a step (in ms) that is spent on SparseCoreV0 compute.
The part of a step (in ms) that is spent on infeed from host to SparseCoreV0.
The part of the step (in ms) that is spent waiting for device-to-host or host-to-device transfers.
The SparseCore compute time in this step.
The maximum SparseCore idle time that is due to host overhead (but not input-related).
The part of a step (in ms) SC spends sending data to the host via outfeed.
The part of a step (in ms) on SC that is waiting for input data from the host.
SparseCore step time in ms.
Performance environment, e.g., the peak performance capabilities of the device.
Used in:
Peak performance of a TPU core or a GPU in TFLOP/s.
Peak memory bandwidth of a TPU core or a GPU in GiBs/s.
Peak off-chip memory bandwidth of a TPU core or a GPU in GiBs/s.
Peak memory bandwidths of a TPU core or a GPU in GiBs/s. Index into array using MemBwType enum. TODO: remove the 2 above fields and bump up the proto version to maintain backwards compatibility.
The ridge point of roofline model in FLOP/Byte. (i.e., minimum operational intensity required to achieve maximum performance).
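The ridge point follows directly from the two peaks above: peak FLOP rate divided by peak memory bandwidth. A Python sketch with made-up numbers:

    def ridge_point(peak_tflops, peak_bw_gibps):
        # Minimum operational intensity (FLOP/Byte) at which the device
        # can reach its peak FLOP rate.
        return (peak_tflops * 1e12) / (peak_bw_gibps * 2**30)

    # e.g. 100 TFLOP/s and 900 GiB/s -> roughly 103 FLOP/Byte.
    print(ridge_point(100.0, 900.0))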
Whether the device has CMEM.
Whether the device has merged VMEM (with CMEM).
Whether megacore is used.
Metrics based on hardware performance counters.
Used in:
Overall matrix unit utilization in percentage.
Predicted computational cost of the instruction associated with the symbol. Estimated by traversing the HLO graph.
The number of floating-point operations computed.
The sum of bytes read and bytes written.
Breakdown of memory accessed by read/write and memory space.
Used in:
Used in:
A database of PodStats records.
All PodStats records, one for each row in the PodStats tool.
Error and warning messages for diagnosing profiling issues.
A map from event type number to event name string for step breakdown.
Result proto for information in a step across all cores.
Used in:
The (micro) step number.
A map from core_id to PodStatsRecord.
A database of channel info.
A map from core ID to program replica id. Replica id map could change during a profile session, but should stay stable within a step.
A database of all reduce ops.
Next ID: 20 There is one PodStatsRecord for each step traced on each compute node.
Used in:
The host name where the trace was collected.
The TPU global chip id where the trace was collected.
The TPU node id where the trace was collected.
The step number.
The step duration in micro-seconds.
Breakdown the durations for each event type in micro-seconds.
Indicates the bottleneck out of the above-mentioned metrics.
A sequence of PodStatsMap for each step.
Used in:
Next ID: 12 A database of pod viewer records.
The type of device used.
Pod level stats for each step.
Top level summary of pod viewer.
Error and warning messages for diagnosing profiling issues.
A map from event type number to event name string for step breakdown.
Info to draw the topology graph.
Used in:
Next ID: 9 Topology graph draws all the cores in the system in a 2-D rectangle or 3-D cube. It is hierarchically grouped by host, chip and core.
Used in:
Number of chips in the x dimension of the rectangle/cube.
Number of chips in the y dimension of the rectangle/cube.
Number of chips in the z dimension of the cube.
Number of chips in the x dimension of each host.
Number of chips in the y dimension of each host.
Number of chips in the z dimension of each host.
Number of cores per chip.
Core locations.
Used in:
Power rail or component name, e.g. HBM, Core.
Maximum watts monitored.
Average watts monitored.
(SPI sampler only) Maximum watts of moving average power over a time window of 100us.
(SPI sampler only) Maximum watts of moving average power over a time window of 1ms.
(SPI sampler only) Maximum watts of moving average power over a time window of 10ms.
(FW only) The timescale in us to compute moving averages.
The number of samples.
(SPI sampler only) Maximum watts of moving average power over a time window of 1s.
Used in:
Statistics about the various precisions used in computation.
Used in:
Amount of time spent on 16-bit computation (in ps).
Amount of time spent on 32-bit computation (in ps).
Groups together all results from the preprocessing C++ step.
Heap sizes at each HLO program point (the HLO sequential order).
Unpadded heap sizes (calculated as the minimal sizes based on the data type and dimensionality) at each HLO program point (the HLO sequential order).
The HloInstruction that was being processed at this HLO program point.
Heap objects at the peak memory usage point ordered by HLO program "birth" time.
Heap objects at the peak memory usage point ordered by size, descending.
Mapping from logical buffer ID to the HLO sequential order span in which it is alive.
Indexes to get back and forth from the by-size and by-program-order sequences.
Peak heap size for the HLO program.
Peak unpadded heap size for the HLO program.
HLO program point number at which the peak heap size occurs.
Size of the entry computation parameters in MiB. This does not reflect whether those MiB are reusable during the computation or not, it is simply a size value.
Total size of indefinite/global and temporary buffer allocations.
Total size of indefinite/global buffer allocations.
RawData contains raw data that can be used to attach further details to a TraceEvent. TraceEvents store this raw data in serialized form so it can be decoded on demand. This can improve performance as TraceEvents are often subject to filtering and only a small subset actually needs to be decoded. NEXT ID: 4
Never used. For the ease of template code.
Describes the replica groups in a cross replica op (e.g., all-reduce and all-to-all).
Used in:
The ids of the replicas that belong to the same group. The ordering of the ids matters in some ops (e.g., all-to-all).
Detail of a user-facing request. Next ID: 22
Used in:
Request id.
An index to the model_id inside InferenceStats below. Storing index instead of string to save space. It will be -1 if the model id is not given.
Start-time of the request in picosecs.
End-time of the request in picosecs.
Total time in picosecs in this request spent on device.
Total time in picosecs in this request spent on writes to device.
Total time in picosecs in this request spent on reads from device.
If this inference request is running in batching mode, record the latency between when the request is scheduled and when it is processed in a batch. Otherwise, it will always be 0.
Batch ids related to this request.
If this inference request is running in batching mode, record the size of the request. Otherwise, it will always be 0.
Detailed breakdown for host-side activities of a request. Total time in picosecs spent on host preprocessing.
Total time in picosecs spent on host batch formation.
Total time in picosecs spent on host runtime.
Total time in picosecs spent on host postprocessing.
Tensor event details. One request can have multiple TensorEventDetails because it might be split into multiple batches for execution.
Host index for this request.
Percentile of this request in all requests in the profile duration.
The time with no event associated. The machine could have been idle or executing events that were not traced.
A 'resource' generally is a specific computation component on a device. These can range from threads on CPUs to specific arithmetic units on hardware devices.
Used in:
The name of the resource.
The id of the resource. Unique within a device.
Number of events added to this resource.
The run environment of a profiling session.
Used in:
Number of hosts used.
Number of tasks used.
Distinct hostnames seen.
The type of device used.
The number of device cores used. In the TPU case, this corresponds to the number of TPU cores. In the GPU case, this corresponds to the number of GPUs (not the number of SMs).
Host-independent information about this job.
Host-dependent information about this job.
The number of replicas; corresponds to input parallelism. If there is no model parallelism, replica_count = device_core_count.
The number of cores used for a single replica, e.g. model parallelism. If there is no model parallelism, then num_cores_per_replica = 1.
Host trace level.
The chip and host interconnection topology.
Whether it is a training analysis or inference analysis.
Power Metrics for TPU.
Hardware type.
Used in:
Map from model index to the Sampled Stats.
Used in:
Used in:
Could be `-1`.
One stack frame per line.
Breakdown of step-time on SparseCore.
SparseCore step time in picoseconds (equal to SparseCore time - sc_idle - sc_wait_time).
Host to sparse core time in picoseconds.
SparseCore to host time in picoseconds.
Idle time but not waiting for input in picoseconds.
SparseCore busy time in picoseconds.
Similar to TpuStepTimeBreakdown, but for SparseCore step time info.
Used in:
Summary of all SparseCore compute op duration as a part of step in ms.
Summary of all SparseCore infeed op duration as a part of step in ms.
Summary of all SparseCore outfeed op duration as a part of step in ms.
Summary of all SparseCore idle (but not input-related) duration as a part of step in ms.
Summary of all SparseCore step time in ms.
Used in:
Result proto for a StepDatabase.
Used in:
A sequence of PerCoreStepInfo.
Whether the step db uses incomplete step information. This flag is set to true when: 1) no step marker or annotation is present, or 2) the profiling duration is too short to cover a full step. If this flag is false, we will group and break down the profile by complete steps only and ignore incomplete steps. If this flag is true, we will simply aggregate and break down over the total profile as a single step.
Number of steps dropped during post processing.
If the step_sequence is empty because there is no step profiled on any host, then empty_intersect is false. If there are steps profiled on some hosts but the intersection of steps over all hosts is empty, then empty_intersect is true.
Next ID: 7 Result proto for StepInfo.
Used in:
The step number.
The step name.
The step duration in picoseconds.
The start time of this step in picoseconds.
Breakdown of the step-time. Can be unpacked into a GenericStepBreakdown.
Total time/bytes/occurrences for collectives (All-Reduce, All-to-All, etc.).
Used for both step duration and Op duration.
Used in:
System topology, which describes the number of chips in a pod and the connectivity style.
The X, Y, and Z dimensions of this topology. 0 means that dimension does not exist.
The number of expected bad chips in this system.
'Task' contains information about a task that the profiler traced.
Used in:
The most recent changelist number from the client that built the binary.
True if the client that built the binary was mint (no local changes).
Build time (in ns relative to the Unix epoch).
Build target for the binary.
The full command line used to invoke the task.
Start time of the task (in ns relative to the Unix epoch).
Task address specified by client at time of profiling request.
Profiling start walltime (in ns).
Profiling duration (in ms).
Host trace level.
Hardware core frequency.
Used in:
The index of the tensor pattern in TensorPatternDatabase.
If batching is enabled, the TensorEventDetails in BatchDetail will have owner = BATCH, and they are counted when calculating statistics such as the number of occurrences of each tensor pattern. The TensorEventDetails in RequestDetail will also have owner = BATCH, which means the tensor events actually happen in the batch, and they are not counted when calculating those statistics. If batching is not enabled, the TensorEventDetail will only appear in RequestDetail and the owner will only be REQUEST.
Total time in picosecs spent on linearize and delinearize tensors.
The owner of this TensorEventDetail.
Used in:
Unknown. This should not happen in production code.
Owned by the request.
Owned by the batch.
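To avoid double counting when batching is enabled, statistics should draw tensor events from BatchDetail only; a hypothetical Python sketch of that filtering (the attribute names are assumptions, not proto fields):

    def count_tensor_patterns(batch_events, request_events, batching_enabled):
        # With batching, the same tensor events appear in both BatchDetail
        # and RequestDetail; count only the batch-side copies.
        events = batch_events if batching_enabled else request_events
        counts = {}
        for e in events:
            counts[e.tensor_pattern_index] = counts.get(e.tensor_pattern_index, 0) + 1
        return counts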
Tensor pattern database for all the tensor patterns that occurred during the profiling window.
Used in:
A tensor pattern is the string concatenation of all the linearize and delinearize events in an inference request. Each event records the tensor shape, data type and the layout on device.
Per-model aggregated result of tensor transfer.
Used in:
Used in:
The index of the tensor pattern in TensorPatternDatabase.
The number of occurrence of this tensor pattern in this model.
The percentiles of the linearize and delinearize time of this tensor pattern in this model.
Used in:
Used in:
Host name.
Input pipeline name.
Maximum latency of the input pipeline.
Name of the bottleneck iterator.
Long name of the bottleneck iterator.
Latency of the bottleneck iterator.
Suggestion to resolve the bottleneck.
Collection of stats of tf.data input pipelines within a host.
Used in:
Metadata per iterator.
Stats per input pipeline.
Statistics for a tf-function.
Used in:
A map from each execution mode to its corresponding metrics.
Total tracing count of this tf-function from the program's beginning (i.e., beyond the profiling period).
Compiler used to compile this function.
Percentage of time spent in the expensive calls to this function in the profiled period.
All possible compilers that can be used to compile a tf-function in the graph mode.
Used in:
Yet to be set.
Any other compiler.
If some instance of the function is compiled with XLA and some is compiled with Non-XLA, use "MIXED_COMPILER".
XLA compiler.
MLIR compiler.
Statistics for all tf-functions.
Used in:
A map from function name to the statistics of that function.
All possible execution modes of a tf-function.
Yet to be set.
Eager execution.
Graph execution with tracing.
Graph execution without tracing.
Concrete function.
Metrics associated with a particular execution mode of a tf-function.
Used in:
Number of invocations to the function in that execution mode.
The sum of "self-execution" time of this function over those invocations.
A database of TfStatsTables.
The table that includes IDLE time.
The table that excludes IDLE time.
The type of device used.
There is one TfStatsRecord for each TF operation profiled.
Used in:
Rank of this TF-op among all TF-ops.
Whether this TF-op is on "Host" or "Device".
TF-op type.
TF-op name.
Number of occurrences of the operation.
Total "accumulated" time in micro-seconds that the operation took. If this operation has any children operations, the "accumulated" time includes the time spent inside children.
Average "accumulated" time in micro-seconds that each occurrence of the operation took.
Total "self" time in micro-seconds that the operation took. If this operation has any children operations, the "self" time doesn't include the time spent inside children.
Average "self" time in micro-seconds that the operation took.
Total "self" time as fraction of the sum of the total self-time of operations run on the device. It is 0 if this op runs on the host.
Cumulative value of device_total_self_time_as_fraction.
Total "self" time as fraction of the sum of the total self-time of operations run on the host. It is 0 if this op runs on the device.
Cumulative value of host_total_self_time_as_fraction.
Total floating-point operations (FLOPs) performed per second normalized to the bf16 peak capacity.
Total floating-point operations for the op per second.
Number of bytes (including both read and write) accessed per second.
Operational intensity, which is defined as FLOPs/bytes-accessed.
Whether this operation is "Compute" or "Memory" bound, according to the Roofline Model.
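Given the operational intensity above and the device's ridge point, the roofline classification is a single comparison; a minimal Python sketch (attributing ops exactly at the ridge point to "Compute" is an assumption):

    def roofline_bound(operational_intensity, ridge_point_flop_per_byte):
        # Below the ridge point, the op cannot reach peak FLOPs and is
        # limited by memory bandwidth instead.
        if operational_intensity >= ridge_point_flop_per_byte:
            return "Compute"
        return "Memory"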
Whether this TF-op is eagerly executed.
Fraction of kernel time that utilizes GPU TensorCore. It is 0.0 if this op does not run on a GPU device.
Number of bytes accessed from HBM (including both read and write) per second.
Number of bytes read from CMEM per second.
Number of bytes written to CMEM per second.
Number of bytes read from VMEM per second.
Number of bytes written to VMEM per second.
Operational intensity based on HBM in FLOP/Byte.
Operational intensity based on CMEM read in FLOP/Byte.
Operational intensity based on CMEM write in FLOP/Byte.
Operational intensity based on VMEM read in FLOP/Byte.
Operational intensity based on VMEM write in FLOP/Byte.
Operational intensity based on the bottleneck resource in FLOP/Byte.
Flops for the record.
Bytes accessed for the record.
A table of TFStatsRecords plus the corresponding pprof keys.
Used in:
All TfStats records, one for each TF operation.
Key to the pprof profile for host TF operations.
Key to the pprof profile for device TF operations.
Topology of the system. Describes the number of chips and hosts and their connectivity.
Used in:
Topology of chips per host.
Topology of hosts.
Chip position within the mesh.
Used in:
Used in:
Percentage of step time that is spent on input.
Indicates if input is a bottleneck. Possible values: "host", "device", "both", or "unknown"
A human-readable description of the input bottleneck.
Indicates if output is a bottleneck. Possible values: "host", "device", "both", or "unknown"
Percentage of step time that is spent on output.
A human-readable description of the output bottleneck.
Percentage of step time where the TC is idle (other than I/O).
Indicates if TensorCore being idle (other than input) is a bottleneck. Possible values: "no", "yes".
A human-readable description of the TC-idle bottleneck.
Indicates if SparseCoreV0 is a bottleneck. Possible values: "no", "moderate", "high".
A human-readable description of the SparseCoreV0 bottleneck.
Indicates if all-reduce is a bottleneck. Possible values: "no", "yes".
A human-readable description of the all-reduce bottleneck.
Percentage of step time that is spent on compute.
Breakdown of step-time on TPU. Next ID: 20
The infeed duration (host to TensorCore) in picoseconds.
The outfeed duration (TensorCore to host) in picoseconds.
The TensorCore time that is waiting for SparseCoreV0 in picoseconds.
The TensorCore time spent transforming activations in SparseCoreV0 layout into XLA layout.
The outfeed duration (TensorCore to SparseCoreV0) in picoseconds.
The time spent on all-reduce (used to be cross-replica-sum) in picoseconds.
The percentage of the SparseCoreV0 time that is spent on infeed from the host (including both data and instructions).
The time spent on send operation.
The time spent on recv operation.
The time spent on host send operation.
The time spent on host recv operation.
Megacore fusion runs different operations on each core, e.g., a convolution on one core and an all-reduce on the other core. This is the time that the core executing the faster operation waits for the core executing the slower operation to reach the synchronization point.
The time waiting for overlay DMAs in picoseconds.
The time spent running high flops ops, such as convolution and output fusion.
The time that the TensorCore is idle but not waiting for input or SparseCoreV0.
The TensorCore time that is busy in picoseconds.
The SparseCoreV0 time that is busy in picoseconds (equal to SparseCoreV0 time - HOST_INSTRUCTION_STALL - HOST_DATA_STALL - TENSOR_CORE_STALL).
SparseCoreV0 step time in picoseconds (equal to SparseCoreV0 time - TENSOR_CORE_STALL).
Next ID: 9
Summary of all TensorCore compute op duration as a part of step in ms.
Summary of all SparseCoreV0 compute op duration as a part of step in ms.
Summary of all TensorCore infeed op duration as a part of step in ms.
Summary of all TensorCore outfeed op duration as a part of step in ms.
Summary of all SparseCoreV0 infeed op duration as a part of step in ms.
Summary of all TensorCore idle (but not input-related) duration as a part of step in ms.
Summary of all Host to Device and Device to Host transfer part of the step in ms.
Summary of all SparseCore step info.
Used in:
A 'Trace' contains metadata for the individual traces of a system.
The devices that this trace has information about. Maps from device_id to more data about the specific device.
The tasks that were traced, keyed by a unique ID for the server on which the task ran.
The time range that this trace covers. Timestamps are picoseconds since tracing started.
Start of first event.
End of last event.
String intern table for event's name or TraceMe argument.
The id of the device that this event occurred on. The full dataset should have this device present in the Trace object.
The id of the resource that this event occurred on. The full dataset should have this resource present in the Device object of the Trace object. A resource_id is unique on a specific device, but not necessarily within the trace. NOTE: counter events do not have this field set as they are per device.
The name of this trace event.
Reference of the name in Trace's name_table (e.g. in SStable format).
The group id which this event belongs to. This allows the trace viewer to show only a particular group of trace events.
The timestamp when this event occurred (picos since tracing started). This timestamp is in the range [min_timestamp, max_timestamp].
The duration of the event in picoseconds, if applicable. Events without duration are called instant events.
Storage for additional details, e.g. the raw data that led to this TraceEvent. These are stored as raw data so that we don't pay the deserialization cost (memory and runtime) if the data isn't used. See RawData in trace_events_raw.proto.
Used to correlate the multiple events of a flow.
For streaming trace viewer frontend deduplication, we need a unique id for each event; at the same time, we want to reduce the entropy overhead this introduces. Therefore we use the tuple <device_id, timestamp_ps, serial> as the unique id; serial is optional and only required when the timestamp is not unique.
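A sketch of that dedup key as a Python tuple; the serial argument is only needed when two events on the same device share a timestamp:

    def trace_event_key(device_id, timestamp_ps, serial=0):
        # tuple<device_id, timestamp_ps, serial> uniquely identifies an
        # event while adding minimal entropy.
        return (device_id, timestamp_ps, serial)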
Used in:
Indicates the order of the event within a flow. Events with the same flow_id will appear in trace_viewer linked by arrows. For an arrow to be shown, at least the FLOW_START and FLOW_END must be present. There can be zero or more FLOW_MID events in the flow. Arrows are drawn from FLOW_START to FLOW_END and through each FLOW_MID event in timestamp order.
Used in:
Generic trace event arguments.
Used in:
Used in:
String type, but stored in metadata.