package tensorflow.profiler

Get desktop application:
View/edit binary Protocol Buffers messages

The active memory allocations at the peak memory usage.

Used in: PerAllocatorMemoryProfile

int64 snapshot_index = 1
The index of a snapshot in the time-sorted list, used to fetch the MemoryActivityMetadata at front end from the memory_profile_snapshots list.
int64 special_index = 2
The index of MemoryActivityMetadata in the special_allocations list.
int64 num_occurrences = 3
Number of occurrences for identical memory allocations.

Result database for all-reduce ops.

Used in: PerCoreStepInfo

repeated AllReduceInfo all_reduce_info = 1

Result proto for all-reduce ops.

Used in: AllReduceDbResult

uint64 id = 1
Unique id for all-reduce ops.
string name = 2
The name of the hlo op. This field is no longer set by the profiler.
uint64 all_reduce_id = 3
For all-reduce nodes from different modules, if they have the same all_reduce_id, they will be all-reduced together. If empty, AllReduce will not be applied across modules.
uint64 start_time_ps = 4
The start time in picoseconds of the op event.
uint64 end_time_ps = 5
The end time in picoseconds of the op event.
uint64 byte_size = 6
The size of the op in bytes.

Used in: PodStatsMap

string name = 1
Name of this OP.
uint32 occurrences = 2
Number of instances that this OP occurred.
double duration_us = 3
The time in microseconds spent in this OP (averaged across all of its occurrences).
uint64 data_size = 4
Byte size of data transferred.
repeated ReplicaGroup replica_groups = 5
Replica groups.
string description = 6
Description (e.g. XLA expression).

Generic hardware bottleneck.

double input_percent = 7
Percentage of step time that is spent on input.
double output_percent = 8
Percentage of step time that is spent on output.
double idle_percent = 9
Percentage of step time that is idle for non-I/O-related reason.
double compute_percent = 10
Percentage of step time that is spent on compute.
string input_classification = 1
Indicates if input is a bottleneck. Possible values: "host", "device", "both", or "unknown"
string input_statement = 2
A human-readable description of the input bottleneck.
string kernel_launch_classification = 3
Indicates if kernel launching is a bottleneck. Possible values: "no", "moderate", "high".
string kernel_launch_statement = 4
A human-readable description of the kernel launching overhead.
string all_other_classification = 5
Indicates if all other is a bottleneck. Possible values: "no", "moderate", "high".
string all_other_statement = 6
A human-readable description of the all other overhead.
string device_collectives_classification = 11
Indicates if device collective communication is a bottleneck. Possible values: "no", "moderate", "high".
string device_collectives_statement = 12
A human-readable description of the device collective communication overhead.

Used in: PreprocessResult

int64 id = 1
double size_mib = 2
repeated string attributes = 3
repeated LogicalBuffer logical_buffers = 4
string common_shape = 5

Describes the start / exclusive limit HLO program points for a given buffer lifetime, used for rendering a box on the plot.

Used in: PreprocessResult

int32 start = 1
int32 limit = 2

Next ID: 14. Information about a send and recv channel.

Used in: PodStatsMap

int64 channel_id = 1
Id of the channel.
repeated uint32 src_core_ids = 11
Core ids of send ops.
repeated uint32 dst_core_ids = 12
Core ids of recv ops.
uint64 data_size = 4
Byte size of the data transferred.
double duration_us = 5
Duration from the beginning of send to the end of recv-done in microseconds.
uint32 occurrences = 6
Number of occurrences of a channel.
double utilization = 7
Percentage of the link BW utilized over the peak link BW.
repeated string hlo_names = 8
A list of hlo names associated with this channel id.
double send_delay_us = 9
Duration from the beginning of the recv-done to the beginning of send in microseconds. If the recv-done op starts after the beginning of the send op, the delay is zero.
string description = 13
Description (e.g. XLA expression).

TfDataStats of all hosts.

bool is_input_bound = 3
Whether it is input bound.
string summary = 4
Summary of the analysis.
repeated TfDataBottleneckAnalysis bottleneck_analysis = 1
Bottleneck analysis result.
map<string, TfDataStats> tf_data_stats = 2
TfDataStats per host.

Next ID: 7

Used in: OpStats

string hostname = 1
uint32 device_ordinal = 2
unique within host, TPU core only
uint32 core_num = 3
unique within chip per core type
uint32 local_chip_id = 4
unique within host
uint32 global_chip_id = 5
unique within mesh
uint32 global_core_id = 6
unique within mesh, TPU core only

A 'device' is a physical entity in the system and is comprised of several resources.

Used in: Trace

string name = 1
The name of the device.
uint32 device_id = 2
The id of this device, unique in a single trace.
map<uint32, Resource> resources = 3
The resources on this device, keyed by resource_id.

double clock_rate_in_ghz = 1
uint32 num_cores = 2
uint64 memory_size_in_bytes = 3
uint64 memory_bandwidth = 4
Bytes/s.
optional GPUComputeCapability compute_capability = 5
string device_vendor = 6

Information about memory transfer to/from device memory.

Used in: PerCoreStepInfo

uint64 occurrence = 1
double time_us = 2
uint64 bytes_transferred = 3

Used in: InputPipelineAnalysisResult, OpStats, OverviewPage, PodStatsDatabase, PodViewerDatabase

repeated string info = 1
repeated string warnings = 2
repeated string errors = 3

Used in: DeviceCapabilities

uint32 major = 1
uint32 minor = 2

string kernel_launch_bottleneck = 1
Indicates if kernel launch is a performance bottleneck. Possible values: "no", "moderate", "high".
string kernel_launch_statement = 2
A statement that recommends if we need to further investigate kernel-launch performance.
string all_other_bottleneck = 3
Indicates if all other is a performance bottleneck. Possible values: "no", "moderate", "high".
string all_other_statement = 4
A statement that recommends if we need to further investigate all-other performance.
string precision_statement = 5
A statement that recommends if the user should try using lower precision. Shows this statement to users only if it is not empty.
string device_collectives_bottleneck = 6
Indicates if device collectives are a performance bottleneck. Possible values: "no", "moderate", "high".
string device_collectives_statement = 7
A statement that recommends if we need to further investigate device-collectives performance.

Breakdown of step-time on generic hardware. Note that these components are mutually exclusive so that adding them together is equal to the step time. If an execution time interval has multiple types of events happening, we need to pick one of the event types to attribute the time interval to.

map<int32, uint64> type_ps = 1
Map event type to the accumulated duration in picoseconds of that type.

optional StepSummary unknown_time_ms_summary = 1
Summary of all unknown time as a part of step in ms.
optional StepSummary host_wait_input_ms_summary = 9
Summary of all host-wait-input time as a part of step in ms.
optional StepSummary host_to_device_ms_summary = 10
Summary of all host-to-device time as a part of step in ms.
optional StepSummary input_ms_summary = 11
Summary of all input time as a part of step in ms.
optional StepSummary output_ms_summary = 3
Summary of all output time as a part of step in ms.
optional StepSummary device_compute_ms_summary = 4
Summary of all device-compute time as a part of step in ms.
optional StepSummary device_to_device_ms_summary = 5
Summary of all device-to-device time as a part of step in ms.
optional StepSummary device_collectives_ms_summary = 12
Summary of all device-collectives time as a part of step in ms.
optional StepSummary host_compute_ms_summary = 6
Summary of all host-compute time as a part of step in ms.
optional StepSummary host_prepare_ms_summary = 7
Summary of all host-prepare time as a part of step in ms.
optional StepSummary host_compile_ms_summary = 8
Summary of all compilation time as a part of step in ms.

Types of hardware profiled.

UNKNOWN_HARDWARE = 0
Unknown hardware.
CPU_ONLY = 1
CPU only without any hardware accelerator.
GPU = 2
GPU.
TPU = 3
TPU.

Describes a heap object that is displayed in a plot in the memory visualization HTML.

Used in: PreprocessResult

oneof color
- int32 numbered = 1
- string named = 2
string label = 3
int32 logical_buffer_id = 4
double logical_buffer_size_mib = 5
double unpadded_shape_mib = 6
string instruction_name = 7
string shape_string = 8
string tf_op_name = 9
string group_name = 10
string op_code = 11

Result proto for host-dependent job information.

Used in: RunEnvironment

string host_id = 1
The ID of the host on which the job was run.
string command_line = 2
The command line used to run the job.
int64 start_time = 3
The start time of this run (nanoseconds since the Unix epoch).
string bns_address = 4
BNS address specified by client at time of profiling request.
uint64 profile_time_ns = 5
Profiling start walltime (in ns).

Result proto for host-independent job information.

Used in: RunEnvironment

int64 change_list = 1
The change-list number of this build.
int64 build_time = 2
The time of this build (nanoseconds since the Unix epoch).
string build_target = 3
The target of this build.
uint32 profile_duration_ms = 4
Profiling duration (in ms).

Used in: InputPipelineAnalysisResult

string op_name = 1
The Op's name.
uint64 count = 2
The number of occurrences.
double time_in_ms = 3
Time (accumulated over all occurrences) in milliseconds.
double time_in_percent = 4
Time (accumulated over all occurrences) in percentage of the total input processing time.
double self_time_in_ms = 5
Self time (accumulated over all occurrences) in milliseconds.
double self_time_in_percent = 6
Self time (accumulated over all occurrences) in percentage of the total input processing time.
string category = 7
Possible categories: "Enqueue", "Advanced file read", "Demanded file read", "Preprocessing", "Unknown".

Used in: InputPipelineAnalysisResult

repeated string details = 1
A list of detailed recommendations.
optional google.protobuf.Any bottleneck_analysis = 2
An analysis of different types of bottlenecks. Can be unpacked into a BottleneckAnalysis.
string summary_next_step = 3
A suggested step to take next.

Used in: OverviewPage

string hardware_type = 9
Hardware type.
optional StepSummary step_time_summary = 2
Summary of all step duration across all cores.
optional StepSummary input_percent_summary = 3
Summary of all input-related stall as percentage of step duration.
double input_percent = 11
Percentage of step time that is waiting for input.
double output_percent = 13
Percentage of step time that is doing output.
double idle_percent = 14
Percentage of step time that is idle for non-I/O-related reason.
double compute_percent = 15
Percentage of step time that is doing compute.
repeated google.protobuf.Any step_details = 4
Details of each step. Can be unpacked into a PerGenericStepDetails.
optional InputTimeBreakdown input_time_breakdown = 5
The breakdown of the input processing time.
repeated InputOpDetails input_op_details = 6
Details of each input Op executed.
optional InputPipelineAnalysisRecommendation recommendation = 7
Recommendation for next steps to users.
optional google.protobuf.Any step_time_breakdown = 8
Breakdown of the step time. Can be unpacked into a GenericStepTimeBreakdown.
optional Diagnostics diagnostics = 12
Error and warning messages for diagnosing profiling issues.

Metadata for input pipeline.

Used in: InputPipelineStats

int64 id = 1
Id of the input pipeline which is set to the id of its root iterator.
InputPipelineMetadata.InputPipelineType type = 2
string name = 4

The distribution strategy creates one "host" input pipeline which actually runs tf.data user code. Also, it creates a "device" input pipeline per device (e.g., TensorCore) which takes an element from the host input pipeline and transfers it to the device.

Used in: InputPipelineMetadata

HOST = 0
DEVICE = 1

Stat and metadata for input pipeline.

Used in: InputPipelineStats

int64 bottleneck_iterator_id = 2
Id of the blocking iterator with the longest self time.
int64 bottleneck_iterator_latency_ps = 3
Latency of the bottleneck iterator.
map<int64, IteratorStat> iterator_stats = 1
Stats per iterator.

Collection of metadata and stats of input pipeline.

Used in: TfDataStats

optional InputPipelineMetadata metadata = 1
Metadata of the input pipeline.
int64 avg_latency_ps = 3
Average latency (i.e., the root iterator's latency) of the input pipeline.
int64 min_latency_ps = 4
Minimum latency of the input pipeline.
int64 max_latency_ps = 5
Maximum latency of the input pipeline.
int64 num_slow_calls = 6
The number of times this input pipeline was slower than 50 us.
repeated InputPipelineStat stats = 2
Stats per call sorted by the root iterator's duration.

Used in: InputPipelineAnalysisResult

double demanded_file_read_us = 1
Time spent on demanded file read in microseconds.
double advanced_file_read_us = 2
Time spent on advanced file read in microseconds.
double preprocessing_us = 3
Time spent on data preprocessing in microseconds.
double enqueue_us = 4
The infeed enqueue time in microseconds.
double unclassified_non_enqueue_us = 5
This entry is for the situation where we can't further break down the non-enqueue input time (because the input pipeline is not instrumented).

Metadata for iterator.

Used in: TfDataStats

int64 id = 1
Id of the iterator.
int64 parent_id = 2
Id of the parent iterator.
string name = 3
Name of the iterator.
string long_name = 6
Long name of the iterator.
bool is_async = 4
Whether it is an async iterator.
map<string, string> params = 5
Parameters of the iterator (e.g., num_parallel_calls).

Stat for iterator.

Used in: InputPipelineStat

int64 id = 1
Id of the iterator.
int64 start_time_ps = 2
Start time of the iterator's GetNext in ps.
int64 duration_ps = 3
Duration of the iterator's GetNext in ps.
int64 self_time_ps = 4
Self time of the iterator's GetNext in ps. It takes async iterators into account. It is calculated by subtracting the time overlapped with its child iterator's duration from the iterator's duration.
bool is_blocking = 5
Whether it is blocking the root iterator. An async iterator's child iterator may not block its parent iterator if it is executed in advance and does not overlap with the parent iterator.
int64 num_calls = 6
The number of times this iterator is called. For example, a batch iterator's child iterator may be called multiple times.

Next ID: 15

Used in: KernelStatsDb

string name = 1
Name of the kernel.
uint32 registers_per_thread = 2
Registers per thread.
uint32 static_shmem_bytes = 3
Static shared memory in bytes.
uint32 dynamic_shmem_bytes = 4
Dynamic shared memory in bytes.
repeated uint32 block_dim = 5
Block dimensions.
repeated uint32 grid_dim = 6
Grid dimensions.
uint64 total_duration_ns = 7
Total duration of this kernel.
uint64 min_duration_ns = 8
Min duration of kernel in nanoseconds.
uint64 max_duration_ns = 9
Max duration of kernel in nanoseconds.
bool is_kernel_using_tensor_core = 10
Kernel utilizes TensorCore instructions.
bool is_op_tensor_core_eligible = 11
Operation is eligible to use TensorCores.
string op_name = 12
TF operation name.
uint32 occurrences = 13
Number of occurrences.
float occupancy_pct = 14
Occupancy percentage.

Used in: OpStats

repeated KernelReport reports = 1
A list of kernels aggregated by name.

Data layout of an op.

Used in: OpMetrics

repeated LayoutAnalysis.Dimension dimensions = 1
The physical data layout, from most-minor to most-major dimensions.

Physical data layout in each tensor dimension.

Used in: LayoutAnalysis

int32 size = 1
Size of the data in this dimension.
int32 alignment = 2
Data must be padded to a multiple of alignment.
LayoutDimensionSemantics semantics = 3
What the dimension represents.

What the dimension represents, e.g. spatial, feature or batch.

Used in: LayoutAnalysis.Dimension

UNKNOWN_SEMANTICS = 0
FEATURE = 1
BATCH = 2
SPATIAL = 3

Used in: BufferAllocation

int64 id = 1
string shape = 2
double size_mib = 3
string hlo_name = 4
repeated int64 shape_index = 5

The memory activity that causes change of memory state.

Used in: MemoryActivityMetadata

UNKNOWN_ACTIVITY = 0
ALLOCATION = 1
Memory allocation in heap.
DEALLOCATION = 2
Memory deallocation in heap.
RESERVATION = 3
Memory reservation for stack.
EXPANSION = 4
Expansion of existing memory allocation.

The metadata associated with each memory allocation/deallocation. It can also be interpreted as the metadata for the delta of memory state. Next ID: 10

Used in: MemoryProfileSnapshot, PerAllocatorMemoryProfile

MemoryActivity memory_activity = 1
The activity associated with the MemoryProfileSnapshot.
int64 requested_bytes = 2
The requested memory size in bytes from the caller of memory allocation. Should be a positive number.
int64 allocation_bytes = 3
The allocated (block/chunk) size for the memory allocation. Should be a positive number.
uint64 address = 4
Starting address of the allocated memory chunk/block.
string tf_op_name = 5
TensorFlow Op name for the memory activity.
int64 step_id = 6
Step Id at which the memory activity occurred.
string region_type = 7
Tensor memory region type including "output", "temp", "persist", and "dynamic".
string data_type = 8
From enum DataType defined in tensorflow/core/framework/types.proto.
string tensor_shape = 9
Tensor shape printed in string, e.g. "[3, 3, 512, 512]".

The aggregated memory stats including heap, stack, free memory and fragmentation at a specific time.

Used in: MemoryProfileSnapshot, MemoryProfileSummary

int64 stack_reserved_bytes = 1
Memory usage by stack reservation, in bytes.
int64 heap_allocated_bytes = 2
Memory usage by heap allocation, in bytes.
int64 free_memory_bytes = 3
Free memory available for allocation or reservation, in bytes.
double fragmentation = 4
Fragmentation value within [0, 1].
int64 peak_bytes_in_use = 5
The peak memory usage over the entire program (lifetime of memory allocator). It monotonically increases with upper limit as memory capacity.

Data for memory usage analysis in one host.

map<string, PerAllocatorMemoryProfile> memory_profile_per_allocator = 1
A map from memory allocator's id to PerAllocatorMemoryProfile for memory usage analysis on this host.
int32 num_hosts = 2
Number of hosts profiled, used to populate host selection list at front end.
repeated string memory_ids = 3
Ids for profiled memory allocators, used to populate memory selection list at front end.
int32 version = 5
Version number of MemoryProfile proto.

Profile snapshot of the TensorFlow memory at runtime, including MemoryAggregationStats (memory usage breakdown etc.), and MemoryActivityMetadata (allocation or deallocation, TF Op name etc.).

Used in: PerAllocatorMemoryProfile

int64 time_offset_ps = 1
Memory activity timestamp.
optional MemoryAggregationStats aggregation_stats = 2
The memory aggregation stats at the snapshot time.
optional MemoryActivityMetadata activity_metadata = 3
The metadata for the memory activity at the snapshot time.

message MemoryProfileSummary

memory_profile.proto:75

The summary of memory profile within the profiling window duration.

Used in: PerAllocatorMemoryProfile

int64 peak_bytes_usage_lifetime = 1
The peak memory usage over the entire program (lifetime of memory allocator).
optional MemoryAggregationStats peak_stats = 2
The peak memory usage stats within the profiling window.
int64 peak_stats_time_ps = 3
The timestamp for peak memory usage within the profiling window.
int64 memory_capacity = 4
The memory capacity of the allocator.

Metrics for an operation (accumulated over all occurrences). Next ID: 24

Used in: OpMetricsDb

uint64 hlo_module_id = 13
HLO module id. 0 for TF ops.
string name = 6
Name of this op.
string long_name = 20
Long name of this op (e.g., HLO expression).
string category = 11
Category of this op.
string provenance = 12
Provenance of this op (e.g., if HLO op, original TF op).
bool is_eager = 18
Whether it is executed eagerly.
uint32 occurrences = 3
Number of executions.
uint64 time_ps = 7
Total time (self + children) in picoseconds.
uint64 min_time_ps = 17
Minimum time (self + children) among all occurrences.
uint64 self_time_ps = 1
Total self time in picoseconds.
uint64 flops = 2
Total FLOPs.
uint64 bytes_accessed = 5
Total bytes accessed.
repeated OpMetrics.MemoryAccessed memory_accessed_breakdown = 19
uint64 dma_stall_ps = 10
Total dma stall time in picoseconds.
optional LayoutAnalysis layout = 14
The data layout for this op. Only set for convolution ops for now.
string deduplicated_name = 15
Deduplicated HLO name for this op. Not set for TF ops.
optional OpMetricsDb children = 16
Children of the op. e.g. fused ops if this op is fusion.
uint32 num_cores = 21
Number of cores on which this op occurs.
uint32 computation_primitive_size = 22
Computation primitive size in BITS. This is the size of the type of the hardware computation. In the future this may be extended to include info such as signed/unsigned, int/fp, etc. Currently only the size is needed.
bool autotuned = 23
Whether the op is autotuned.

Breakdown of memory accessed by operation type and memory space.

Used in: OpMetrics

MemoryAccessed.OperationType operation_type = 1
uint64 memory_space = 2
Device-specific id of memory space.
uint64 bytes_accessed = 3

Used in: MemoryAccessed

UNKNOWN = 0
READ = 1
WRITE = 2

A database for OpMetrics. Next ID: 14

Used in: OpMetrics, OpStats, PerCoreStepInfo

repeated OpMetrics metrics_db = 10
A bunch of OpMetrics.
uint64 total_host_infeed_enq_duration_ps = 2
The total host infeed-enqueue duration in picoseconds.
uint64 total_host_infeed_enq_start_timestamp_ps_diff = 3
The total of the difference between the start times of two consecutive infeed-enqueues (per host) in picoseconds.
uint64 total_time_ps = 11
The total time in picoseconds.
uint64 total_op_time_ps = 12
The total time incurred by OPs in picoseconds.
optional PrecisionStats precision_stats = 13
Precision-related stats.

Next ID: 14. Operator Statistics.

optional OpMetricsDb host_op_metrics_db = 1
The database for the op metrics collected from the host over the entire profiling session including incomplete steps.
optional OpMetricsDb device_op_metrics_db = 2
The database for the op metrics collected from the device over the entire profiling session including incomplete steps.
optional OpMetricsDb hlo_metrics_db_complete_steps_only = 10
The result for the HLO-metric database over the complete steps only.
optional PerfEnv perf_env = 3
Performance environment of the op metrics collected.
optional StepDatabaseResult step_db = 4
The database of step sequences.
optional RunEnvironment run_environment = 5
The run environment of this profiling session.
optional KernelStatsDb kernel_stats_db = 6
Kernel stats results from all GPUs.
optional TfFunctionDb tf_function_db = 8
Statistics for all tf-functions.
map<uint32, CoreDetails> core_id_to_details = 11
A map from core ID to details.
optional Diagnostics diagnostics = 9
Error and warning messages for diagnosing profiling issues.
map<uint64, string> program_id_to_name_map = 12
A map from program ID to program name.
optional PerformanceCounterResult performance_counter_result = 13
Performance counters.

optional OverviewPageRunEnvironment run_environment = 6
The run environment of the profiled session.
optional InputPipelineAnalysisResult input_analysis = 2
The step-time result.
optional OverviewPageAnalysis analysis = 3
The other analysis result.
optional OverviewPageRecommendation recommendation = 4
The recommendation made to the user.
optional Diagnostics diagnostics = 8
Error and warning messages for diagnosing profiling issues.

Overview result for general analysis.

Used in: OverviewPage

double mxu_utilization_percent = 1
MXU utilization in percentage.
double device_idle_time_percent = 2
Percentage of the device time that is idle.
double host_idle_time_percent = 3
Percentage of the host time that is idle.
repeated OverviewTfOp top_device_ops = 4
Top TF Ops executed on the device.
string remark_text = 5
Remark text in the performance summary section.
string remark_color = 6
Color of the remark text.
double flop_rate_utilization_relative_to_roofline_percent = 7
FLOP rate utilization relative to the roofline in percentage.
double memory_bw_utilization_relative_to_hw_limit_percent = 8
Memory bandwidth utilization relative to the hw limit in percentage.
double device_compute_16bit_percent = 9
Percentage of device computation that is 16-bit.
double device_compute_32bit_percent = 10
Percentage of device computation that is 32-bit.
double host_tf_op_percent = 11
Percentage of TF ops executed on the host.
double device_tf_op_percent = 12
Percentage of TF ops executed on the device.
uint32 host_trace_level = 13
Host trace level.
double host_op_time_eager_percent = 14
Percentage of TF-op execution time on the host (excluding the idle time) that are in eager mode.
double device_op_time_eager_percent = 15
Percentage of TF-op execution time on the device (excluding the idle time) that are in eager mode.
double device_op_time_outside_compilation_percent = 16
Percentage of TF-op execution time on the device (excluding the idle time) that are for outside compilation.
double device_duty_cycle_percent = 17
Percentage of the device time that is in use.

Result proto for host-dependent job information.

Used in: OverviewPageRunEnvironment

string host_id = 1
The ID of the host on which the job was run.
string command_line = 2
The command line used to run the job.
int64 start_time = 3
The start time of this run (nanoseconds since the Unix epoch).
string bns_address = 4
BNS address specified by client at time of profiling request.
uint64 profile_time_ns = 5
Profiling start walltime (in ns).

Result proto for host-independent job information.

Used in: OverviewPageRunEnvironment

int64 change_list = 1
The change-list number of this build.
int64 build_time = 2
The time of this build (nanoseconds since the Unix epoch).
string build_target = 3
The target of this build.
uint32 profile_duration_ms = 4
Profiling duration (in ms).

Overview result for the recommendation section.

Used in: OverviewPage

string bottleneck = 1
Possible performance bottleneck: "host", "device", "both".
string statement = 2
A statement for input that recommends the next steps for investigating the bottleneck.
repeated OverviewPageTip input_tips = 11
A list of tips for tackling input bottleneck.
string output_statement = 9
A statement for output that recommends the next steps for investigating the bottleneck.
string eager_statement_html = 12
A statement that recommends the next steps for investigating eager-mode related bottleneck (it is an html so that it can link to other tools/docs.)
string outside_compilation_statement_html = 13
A statement that recommends the next steps for investigating outside-compilation related bottleneck (it is an html so that it can link to other tools/docs.)
string tf_function_statement_html = 10
A statement that recommends the next steps for investigating tf-function related bottleneck (it is an html so that it can link to other tools/docs.)
repeated OverviewPageTip host_tips = 3
A list of tips for improving host performance.
repeated OverviewPageTip device_tips = 4
A list of tips for improving device performance.
repeated OverviewPageTip documentation_tips = 5
A list of links to related useful documents.
optional google.protobuf.Any recommendation = 6
The recommendation made to the user. Can be unpacked into a GenericRecommendation.
repeated OverviewPageTip faq_tips = 7
A list of tips for FAQ.
repeated OverviewPageTip inference_tips = 8
A list of tips for inference run.

The run environment of a profiling session.

Used in: OverviewPage

int32 host_count = 1
Number of hosts used.
int32 task_count = 2
Number of tasks used.
map<string, bool> hostnames = 3
Distinct hostnames seen.
string device_type = 4
The type of device used.
int32 device_core_count = 5
The number of device cores used. In TPU case, this corresponds to the number of TPU cores. In GPU case, this corresponds to the number of GPUs (not the number of SMs).
optional OverviewPageHostIndependentJobInfo host_independent_job_info = 7
Host-independent information about this job.
repeated OverviewPageHostDependentJobInfo host_dependent_job_info = 8
Host-dependent information about this job.
int32 replica_count = 9
The number of replicas, corresponds to input parallelism. If there is no model parallelism, replica_count = device_core_count
int32 num_cores_per_replica = 10
The number of cores used for a single replica, e.g. model parallelism. If there is no model parallelism, then num_cores_per_replica = 1

Overview result for a performance tip to users.

Used in: OverviewPageRecommendation

string link = 1
Link to the tip.

Overview result for a TensorFlow Op.

Used in: OverviewPageAnalysis

string name = 1
Name of the Op.
string category = 2
Category of the Op.
double self_time_fraction = 3
The amount of time that this Op takes by itself as fraction of the total execution time on the device or host.
double cumulative_time_fraction = 4
The cumulative time up to this Op as a fraction of the total execution time.
double flop_rate = 5
How many GFlops/sec that this Op achieves.
bool is_op_tensorcore_eligible = 6
Whether the Op is eligible to use TensorCores.
bool is_op_using_tensorcore = 7
Whether at least one of the kernels launched in this op is using TensorCore.

Memory profile snapshots per memory allocator.

Used in: MemoryProfile

repeated MemoryProfileSnapshot memory_profile_snapshots = 1
A list of MemoryProfileSnapshots referenced by <active_allocations>.
optional MemoryProfileSummary profile_summary = 2
The summary of memory profile (e.g. the peak memory usage).
repeated ActiveAllocation active_allocations = 3
The rows in the table of active allocations at peak memory usage within profiling window.
repeated MemoryActivityMetadata special_allocations = 4
The special allocations (e.g. pre-allocated heap memory, stack reservation) that are not captured in the MemoryActivityMetadata of memory_profile_snapshots. Need to handle separately.
repeated MemoryProfileSnapshot sampled_timeline_snapshots = 5
A list of MemoryProfileSnapshots sampled from all the snapshots during the profiling window. It is used to display the memory timeline graph in the frontend. The snapshots are sorted by timestamp.

Result proto for information in a step across all cores.

Used in: StepDatabaseResult

uint32 step_num = 1
The step number.
map<uint32, StepInfoResult> step_info_per_core = 2
A map from core_id to StepInfo.
optional OpMetricsDb hlo_metrics_db = 3
The result for the per-step HLO-metric database.
map<uint32, uint32> core_id_to_replica_id_map = 5
A map from core ID to program replica id. Replica id map could change during a profile session, but should stay stable within a step.
map<uint32, AllReduceDbResult> all_reduce_db_per_core = 6
A map from core_id to all-reduce ops.
repeated DeviceMemoryTransfer device_memory_transfers = 7
Information about device memory transfers, categorized by source and destination. Ordered by the following categories: 1. HostToDevice 2. DeviceToHost 3. DeviceToDevice

Per-step details on generic hardware.

int32 step_number = 1
The step number of a step.
string step_name = 14
The step name.
double step_time_ms = 2
The step time (in ms).
double unknown_time_ms = 3
Breakdown of the step time in different event categories. The unknown time (in ms).
double host_wait_input_ms = 11
The time (in ms) in which the host is waiting for input data to be ready.
double host_to_device_ms = 12
The time (in ms) in which the host is sending input data to the device. Total input time = host_wait_input_ms + host_to_device_ms.
double output_ms = 5
The output time (in ms).
double device_compute_ms = 6
The device-compute time (in ms).
double device_to_device_ms = 7
The device-to-device communication time (in ms).
double device_collectives_ms = 13
The device time spent on collective communications (in ms).
double host_compute_ms = 8
The host-compute time (in ms).
double host_prepare_ms = 9
The host-prepare time (in ms).
double host_compile_ms = 10
The time spent on compiling (in ms).

Performance environment, e.g., the peak performance capabilities of the device.

Used in: OpStats

double peak_tera_flops_per_second = 1
Peak performance of a TPU core or a GPU in TFLOP/s.
double peak_hbm_bw_giga_bytes_per_second = 2
Peak memory bandwidth of a TPU core or a GPU in GiB/s.
double ridge_point = 3
The ridge point of roofline model in FLOP/Byte. (i.e., minimum operational intensity required to achieve maximum performance).

Metrics based on hardware performance counters.

Used in: OpStats

double matrix_unit_utilization_percent = 1
Overall matrix unit utilization in percentage.

A database of PodStats records.

repeated PodStatsRecord pod_stats_record = 1
All PodStats records, one for each row in the PodStats tool.
optional Diagnostics diagnostics = 3
Error and warning messages for diagnosing profiling issues.
repeated StepBreakdownEvents step_breakdown_events = 4
A map from event type number to event name string for step breakdown.

Result proto for information in a step across all cores.

Used in: PodStatsSequence

uint32 step_num = 1
The (micro) step number.
map<uint32, PodStatsRecord> pod_stats_per_core = 2
A map from core_id to PodStatsRecord.
repeated ChannelInfo channel_db = 3
A database of channel info.
map<uint32, uint32> core_id_to_replica_id_map = 4
A map from core ID to program replica id. Replica id map could change during a profile session, but should stay stable within a step.
repeated AllReduceOpInfo all_reduce_op_db = 5
A database of all reduce ops.

Next ID: 20. There is one PodStatsRecord for each step traced on each compute node.

Used in: PodStatsDatabase, PodStatsMap

string host_name = 1
The host name where the trace was collected.
int32 chip_id = 2
The TPU global chip id where the trace was collected.
int32 node_id = 3
The TPU node id where the trace was collected.
uint32 step_num = 4
The step number.
double total_duration_us = 5
The step duration in micro-seconds.
map<int32, double> step_breakdown_us = 19
Breakdown the durations for each event type in micro-seconds.
string bottleneck = 14
Indicates the bottleneck out of the above mentioned metrics.

A sequence of PodStatsMap for each step.

Used in: PodViewerDatabase

repeated PodStatsMap pod_stats_map = 1

Next ID: 12. A database of pod viewer records.

string device_type = 10
The type of device used.
optional PodStatsSequence pod_stats_sequence = 3
Pod level stats for each step.
optional PodViewerSummary summary = 7
Top level summary of pod viewer.
optional Diagnostics diagnostics = 8
Error and warning messages for diagnosing profiling issues.
repeated StepBreakdownEvents step_breakdown_events = 9
A map from event type number to event name string for step breakdown.
optional PodViewerTopology topology = 11
Info to draw the topology graph.

message PodViewerSummary

pod_viewer.proto:84

Used in: PodViewerDatabase

repeated string warnings = 1

Next ID: 8. Topology graph draws all the cores in the system in a 2-D rectangle or 3-D cube. It is hierarchically grouped by host, chip and core.

Used in: PodViewerDatabase

int32 x_dimension = 1
Number of cores in the x dimension of the rectangle/cube.
int32 y_dimension = 2
Number of cores in the y dimension of the rectangle/cube.
int32 z_dimension = 3
Number of cores in the z dimension of the cube.
int32 host_x_stride = 4
Number of cores in the x dimension of each host.
int32 host_y_stride = 5
Number of cores in the y dimension of each host.
int32 host_z_stride = 6
Number of cores in the z dimension of each host.
int32 num_cores_per_chip = 7
Number of cores per chip.

Statistics about the various precision used in computation.

Used in: OpMetricsDb

uint64 compute_16bit_ps = 1
Amount of time spent on 16-bit computation (in ps).
uint64 compute_32bit_ps = 2
Amount of time spent on 32-bit computation (in ps).

Groups together all results from the preprocessing C++ step.

repeated double heap_sizes = 1
Heap sizes at each HLO program point (the HLO sequential order).
repeated double unpadded_heap_sizes = 2
Unpadded heap sizes (calculated as the minimal sizes based on the data type and dimensionality) at each HLO program point (the HLO sequential order).
repeated HeapObject max_heap = 3
Heap objects at the peak memory usage point ordered by HLO program "birth" time.
repeated HeapObject max_heap_by_size = 4
Heap objects at the peak memory usage point ordered by size, descending.
map<int32, BufferSpan> logical_buffer_spans = 5
Mapping from logical buffer ID to the HLO sequential order span in which it is alive.
repeated int32 max_heap_to_by_size = 6
Indexes to get back and forth from the by-size and by-program-order sequences.
repeated int32 by_size_to_max_heap = 7
string module_name = 8
string entry_computation_name = 9
double peak_heap_mib = 10
Peak heap size for the HLO program.
double peak_unpadded_heap_mib = 11
Peak unpadded heap size for the HLO program.
int32 peak_heap_size_position = 12
HLO program point number at which the peak heap size occurs.
double entry_computation_parameters_mib = 13
Size of the entry computation parameters in MiB. This does not reflect whether those MiB are reusable during the computation or not, it is simply a size value.
double non_reusable_mib = 14
double maybe_live_out_mib = 15
repeated BufferAllocation indefinite_lifetimes = 16

Describes the replica groups in a cross replica op (e.g., all-reduce and all-to-all).

Used in: AllReduceOpInfo

repeated int64 replica_ids = 1
The ids of the replicas that belong to the same group. The ordering of the ids matters in some ops (e.g., all-to-all).

A 'resource' generally is a specific computation component on a device. These can range from threads on CPUs to specific arithmetic units on hardware devices.

Used in: Device

string name = 1
The name of the resource.
uint32 resource_id = 2
The id of the resource. Unique within a device.
uint32 sort_index = 3
The sort index of the resource. Resources within a device are ordered by this value. If absent, use resource id as sort index.

The run environment of a profiling session.

Used in: OpStats

int32 host_count = 1
Number of hosts used.
int32 task_count = 2
Number of tasks used.
map<string, bool> hostnames = 3
Distinct hostnames seen.
string device_type = 4
The type of device used.
int32 device_core_count = 5
The number of device cores used. In the TPU case, this corresponds to the number of TPU cores. In the GPU case, this corresponds to the number of GPUs (not the number of SMs).
optional HostIndependentJobInfoResult host_independent_job_info = 7
Host-independent information about this job.
repeated HostDependentJobInfoResult host_dependent_job_info = 8
Host-dependent information about this job.
int32 replica_count = 9
The number of replicas, corresponds to input parallelism. If there is no model parallelism, replica_count = device_core_count.
int32 num_cores_per_replica = 10
The number of cores used for a single replica, e.g. model parallelism. If there is no model parallelism, then num_cores_per_replica = 1.
optional SystemTopology topology = 11
The chip interconnection topology.
uint32 host_trace_level = 12
Host trace level.

Used in: PodStatsDatabase, PodViewerDatabase

int32 id = 1
string name = 2

Result proto for a StepDatabase.

Used in: OpStats

repeated PerCoreStepInfo step_sequence = 1
A sequence of PerCoreStepInfo.
bool use_incomplete_step = 2
Whether the step db uses incomplete step information. This flag is set to true when: 1) no step marker or annotation present. 2) profiling duration is too short to cover a full step. If this flag is false, we will group and breakdown the profile by complete steps only and ignore incomplete steps. If this flag is true, we will simply aggregate and breakdown over the total profile as a single step.
uint32 num_steps_dropped = 3
Number of steps dropped during post processing.
bool empty_intersect = 4
If the step_sequence is empty because: * there is no step profiled on any host, then empty_intersect is false. * there are steps profiled on some host, but the intersection of steps over all hosts is empty, then empty_intersect is true.

Next ID: 6. Result proto for StepInfo.

Used in: PerCoreStepInfo

uint32 step_num = 1
The step number.
string step_name = 5
The step name.
uint64 duration_ps = 2
The step duration in picoseconds.
uint64 begin_ps = 3
The start time of this step in picoseconds.
optional google.protobuf.Any step_breakdown = 4
Breakdown of the step-time. Can be unpacked into a GenericStepBreakdown.

message StepSummary

input_pipeline.proto:42

Used for both step duration and Op duration.

Used in: GenericStepTimeBreakdown, InputPipelineAnalysisResult

double average = 1
double standard_deviation = 2
double minimum = 3
double maximum = 4

System topology, which describes the number of chips in a pod and the connectivity style.

Used in: RunEnvironment

int64 x_dimension = 1
The X, Y, and Z dimensions of this topology. 0 means that dimension does not exist.
int64 y_dimension = 2
int64 z_dimension = 3
int64 num_expected_reduced_chips = 4
The number of expected bad chips in this system.

Used in: CombinedTfDataStats

string host = 1
Host name.
string input_pipeline = 2
Input pipeline name.
int64 max_latency_ps = 3
Maximum latency of the input pipeline.
string iterator_name = 4
Name of the bottleneck iterator.
string iterator_long_name = 5
Long name of the bottleneck iterator.
int64 iterator_latency_ps = 7
Latency of the bottleneck iterator.
string suggestion = 6
Suggestion to resolve the bottleneck.

Collection of stats of tf.data input pipelines within a host.

Used in: CombinedTfDataStats

map<int64, IteratorMetadata> iterator_metadata = 2
Metadata per iterator.
map<int64, InputPipelineStats> input_pipelines = 1
Stats per input pipeline.

Statistics for a tf-function.

Used in: TfFunctionDb

map<int32, TfFunctionMetrics> metrics = 1
A map from each execution mode to its corresponding metrics.
int64 total_tracing_count = 2
Total tracing count from the program's beginning (i.e. beyond the profiling period) of this tf-function.
TfFunctionCompiler compiler = 3
Compiler used to compile this function.
double expensive_call_percent = 4
Percentage of time spent in the expensive calls to this function in the profiled period.

All possible compilers that can be used to compile a tf-function in the graph mode.

Used in: TfFunction

INVALID_COMPILER = 0
Yet to be set.
OTHER_COMPILER = 1
Any other compiler.
MIXED_COMPILER = 2
If some instance of the function is compiled with XLA and some is compiled with Non-XLA, use "MIXED_COMPILER".
XLA_COMPILER = 3
XLA compiler.
MLIR_COMPILER = 4
MLIR compiler.

Statistics for all tf-functions.

Used in: OpStats

map<string, TfFunction> tf_functions = 1
A map from function name to the statistics of that function.

All possible execution modes of a tf-function.

INVALID_MODE = 0
Yet to be set.
EAGER_MODE = 1
Eager execution.
TRACED_MODE = 2
Graph execution with tracing.
NOT_TRACED_MODE = 3
Graph execution without tracing.
CONCRETE_MODE = 4
Concrete function.

Metrics associated with a particular execution mode of a tf-function.

Used in: TfFunction

uint64 count = 1
Number of invocations to the function in that execution mode.
uint64 self_time_ps = 2
The sum of "self-execution" time of this function over those invocations.

A database of TfStatsTables.

optional TfStatsTable with_idle = 4
The table that includes IDLE time.
optional TfStatsTable without_idle = 5
The table that excludes IDLE time.
string device_type = 6
The type of device used.

There is one TfStatsRecord for each TF operation profiled.

Used in: TfStatsTable

uint64 rank = 1
Rank of this TF-op among all TF-ops.
string host_or_device = 2
Whether this TF-op is on "Host" or "Device".
string op_type = 3
TF-op type.
string op_name = 4
TF-op name.
int64 occurrences = 5
Number of occurrences of the operation.
double total_time_in_us = 6
Total "accumulated" time in micro-seconds that the operation took. If this operation has any children operations, the "accumulated" time includes the time spent inside children.
double avg_time_in_us = 7
Average "accumulated" time in micro-seconds that each occurrence of the operation took.
double total_self_time_in_us = 8
Total "self" time in micro-seconds that the operation took. If this operation has any children operations, the "self" time doesn't include the time spent inside children.
double avg_self_time_in_us = 9
Average "self" time in micro-seconds that the operation took.
double device_total_self_time_as_fraction = 10
Total "self" time as fraction of the sum of the total self-time of operations run on the device. It is 0 if this op runs on the host.
double device_cumulative_total_self_time_as_fraction = 11
Cumulative value of device_total_self_time_as_fraction.
double host_total_self_time_as_fraction = 12
Total "self" time as fraction of the sum of the total self-time of operations run on the host. It is 0 if this op runs on the device.
double host_cumulative_total_self_time_as_fraction = 13
Cumulative value of host_total_self_time_as_fraction.
double measured_flop_rate = 14
Number of floating-point operations (FLOPs) performed per second.
double measured_memory_bw = 15
Number of bytes (including both read and write) accessed per second.
double operational_intensity = 16
Operational intensity, which is defined as FLOPs/bytes-accessed.
string bound_by = 17
Whether this operation is "Compute" or "Memory" bound, according to the Roofline Model.
bool is_eager = 18
Whether this TF-op is eagerly executed.
double gpu_tensorcore_utilization = 19
Fraction of kernel time that utilizes GPU TensorCore. It is 0.0 if this op does not run on a GPU device.

A table of TFStatsRecords plus the corresponding pprof keys.

Used in: TfStatsDatabase

repeated TfStatsRecord tf_stats_record = 1
All TfStats records, one for each TF operation.
string host_tf_pprof_key = 2
Key to the pprof profile for host TF operations.
string device_tf_pprof_key = 3
Key to the pprof profile for device TF operations.

A 'Trace' contains metadata for the individual traces of a system.

map<uint32, Device> devices = 1
The devices that this trace has information about. Maps from device_id to more data about the specific device.
repeated TraceEvent trace_events = 4
All trace events captured in the profiling period.

Used in: Trace

uint32 device_id = 1
The id of the device that this event occurred on. The full dataset should have this device present in the Trace object.
uint32 resource_id = 2
The id of the resource that this event occurred on. The full dataset should have this resource present in the Device object of the Trace object. A resource_id is unique on a specific device, but not necessarily within the trace.
string name = 3
The name of this trace event.
uint64 timestamp_ps = 9
The timestamp that this event occurred at (in picoseconds since tracing started).
uint64 duration_ps = 10
The duration of the event in picoseconds if applicable. Events without duration are called instant events.
map<string, string> args = 11
Extra arguments that will be displayed in trace view.

An XEvent is a trace event, optionally annotated with XStats. Next ID: 6

Used in: XLine

int64 metadata_id = 1
XEventMetadata.id of corresponding metadata.
oneof data
- int64 offset_ps = 2
  Start time of the event in picoseconds, as offset from XLine.timestamp_ns().
- int64 num_occurrences = 5
  Number of occurrences of the event, if aggregated.
int64 duration_ps = 3
Duration of the event in picoseconds. Can be zero for an instant event.
repeated XStat stats = 4
XStats associated with the event. Each of these XStats should have a different metadata_id.

Metadata for an XEvent, corresponds to an event type and is shared by all XEvents with the same metadata_id. Next ID: 7

Used in: XPlane

int64 id = 1
XPlane.event_metadata map key.
string name = 2
Name of the event.
string display_name = 4
Name of the event shown in trace viewer.
bytes metadata = 3
Additional metadata in serialized format.
repeated XStat stats = 5
XStats that are constant for all XEvents with the same metadata_id. Each of these XStats should have a different metadata_id.
repeated int64 child_id = 6
XPlane.event_metadata map key for children events.

An XLine is a timeline of trace events (XEvents). Next ID: 12

Used in: XPlane

int64 id = 1
Id of this line, can be repeated within an XPlane. All XLines with the same id are effectively the same timeline.
int64 display_id = 10
Display id of this line. Multiple lines with the same display_id are grouped together in the same trace viewer row.
string name = 2
Name of this XLine.
string display_name = 11
Name of this XLine to display in trace viewer.
int64 timestamp_ns = 3
Start time of this line in nanoseconds since the UNIX epoch. XEvent.offset_ps is relative to this timestamp.
int64 duration_ps = 9
Profiling duration for this line in picoseconds.
repeated XEvent events = 4
XEvents within the same XLine should not overlap in time, but they can be nested.

An XPlane is a container of parallel timelines (XLines), generated by a profiling source or by post-processing one or more XPlanes. Next ID: 7

Used in: XSpace

int64 id = 1
string name = 2
Name of this plane.
repeated XLine lines = 3
Parallel timelines grouped in this plane. XLines with the same id are effectively the same timeline.
map<int64, XEventMetadata> event_metadata = 4
XEventMetadata map, each entry uses the XEventMetadata.id as key. This map should be used for events that share the same ID over the whole XPlane.
map<int64, XStatMetadata> stat_metadata = 5
XStatMetadata map, each entry uses the XStatMetadata.id as key. This map should be used for stats that share the same ID over the whole XPlane.
repeated XStat stats = 6
XStats associated with this plane, e.g. device capabilities. Each of these XStats should have a different metadata_id.

A container of parallel XPlanes, generated by one or more profiling sources. Next ID: 5

repeated XPlane planes = 1
repeated string errors = 2
Errors (if any) in the generation of planes.
repeated string warnings = 3
Warnings (if any) in the generation of planes.
repeated string hostnames = 4
List of hostnames that XPlanes are generated from.

An XStat is a named value associated with an XEvent, e.g., a performance counter value, a metric computed by a formula applied over nested XEvents and XStats. Next ID: 8

Used in: XEvent, XEventMetadata, XPlane

int64 metadata_id = 1
XStatMetadata.id of corresponding metadata.
oneof value
Value of this stat.
- double double_value = 2
- uint64 uint64_value = 3
- int64 int64_value = 4
- string str_value = 5
- bytes bytes_value = 6
- uint64 ref_value = 7
  A string value that is stored in XStatMetadata::name.

Metadata for an XStat, corresponds to a stat type and is shared by all XStats with the same metadata_id. Next ID: 4

Used in: XPlane

int64 id = 1
XPlane.stat_metadata map key.
string name = 2
Name of the stat (should be short). Two XStatMetadata with different id should have different names.
string description = 3
Description of the stat (might be long).

package tensorflow.profiler

message ActiveAllocation

int64 snapshot_index = 1

int64 special_index = 2

int64 num_occurrences = 3

message AllReduceDbResult

repeated AllReduceInfo all_reduce_info = 1

message AllReduceInfo

uint64 id = 1

string name = 2

uint64 all_reduce_id = 3

uint64 start_time_ps = 4

uint64 end_time_ps = 5

uint64 byte_size = 6

message AllReduceOpInfo

string name = 1

uint32 occurrences = 2

double duration_us = 3

uint64 data_size = 4

repeated ReplicaGroup replica_groups = 5

string description = 6

message BottleneckAnalysis

double input_percent = 7

double output_percent = 8

double idle_percent = 9

double compute_percent = 10

string input_classification = 1

string input_statement = 2

string kernel_launch_classification = 3

string kernel_launch_statement = 4

string all_other_classification = 5

string all_other_statement = 6

string device_collectives_classification = 11

string device_collectives_statement = 12

message BufferAllocation

int64 id = 1

double size_mib = 2

repeated string attributes = 3

repeated LogicalBuffer logical_buffers = 4

string common_shape = 5

message BufferSpan

int32 start = 1

int32 limit = 2

message ChannelInfo

int64 channel_id = 1

repeated uint32 src_core_ids = 11

repeated uint32 dst_core_ids = 12

uint64 data_size = 4

double duration_us = 5

uint32 occurrences = 6

double utilization = 7

repeated string hlo_names = 8

double send_delay_us = 9

string description = 13

message CombinedTfDataStats

bool is_input_bound = 3

string summary = 4

repeated TfDataBottleneckAnalysis bottleneck_analysis = 1

map<string, TfDataStats> tf_data_stats = 2

message CoreDetails

string hostname = 1

uint32 device_ordinal = 2

uint32 core_num = 3

uint32 local_chip_id = 4

uint32 global_chip_id = 5

uint32 global_core_id = 6

message Device

string name = 1

uint32 device_id = 2

map<uint32, Resource> resources = 3

message DeviceCapabilities

double clock_rate_in_ghz = 1

uint32 num_cores = 2

uint64 memory_size_in_bytes = 3

uint64 memory_bandwidth = 4

optional GPUComputeCapability compute_capability = 5

string device_vendor = 6

message DeviceMemoryTransfer

uint64 occurrence = 1

double time_us = 2