package tensorflow.profiler

Get desktop application:
View/edit binary Protocol Buffers messages

Result database for all-reduce ops.

Used in: PerCoreStepInfo

repeated AllReduceInfo all_reduce_info = 1

Result proto for all-reduce ops.

Used in: AllReduceDbResult

uint64 id = 1
Unique id for all-reduce ops.
string name = 2
The name of the hlo op.
uint64 all_reduce_id = 3
For all-reduce nodes from different modules, if they have the same all_reduce_id, they will be 'Allreduce'd'. If empty, AllReduce will not be applied across modules.
uint64 start_time_ps = 4
The start time in picoseconds of the op event.
uint64 end_time_ps = 5
The end time in picoseconds of the op event.
uint64 byte_size = 6
The size of the op in bytes.

Used in: DeviceCapabilities

uint32 major = 1
uint32 minor = 2

A 'device' is a physical entity in the system and is comprised of several resources.

Used in: Trace

string name = 1
The name of the device.
uint32 device_id = 2
The id of this device, unique in a single trace.
map<uint32, Resource> resources = 3
The resources on this device, keyed by resource_id.

double clock_rate_in_ghz = 1
uint32 num_cores = 2
uint64 memory_size_in_bytes = 3
uint64 memory_bandwidth = 4
optional CudaComputeCapability compute_capability = 5

Result database for core to core flow events.

Used in: PerCoreStepInfo

repeated FlowEventInfo flow_info = 1

Result proto for metrics on flow events.

Used in: FlowDbResult

uint64 flow_id = 1
Unique id for each send and recv pair.
int64 channel_id = 2
Channel id generated by the XLA compiler, it is statically unique within an HloModule.
string name = 3
The name of the hlo op.
string category = 4
Category of the hlo op.
uint64 start_time_ps = 5
The start time in picoseconds of the op event.
uint64 end_time_ps = 6
The end time in picoseconds of the op event.
uint64 byte_size = 7
The size of the op in bytes.
uint32 replica_id = 8
The replica id of the program running the flow event.

string kernel_launch_bottleneck = 1
Indicates if kernel launch is a performance bottleneck. Possible values: "no", "moderate", "high".
string kernel_launch_statement = 2
A statement that recommends if we need to further investigate kernel-launch performance.
string all_other_bottleneck = 3
Indicates if all other is a performance bottleneck. Possible values: "no", "moderate", "high".
string all_other_statement = 4
A statement that recommends if we need to further investigate all-other performance.

Breakdown of step-time on generic hardware. Note that these components are mutually exclusive so that adding them together is equal to the step time. If an execution time interval has multiple types of event happening, we need to pick one of the event types to attribute the time interval to.

map<int32, uint64> type_ps = 1
Map event type to the accumulated duration in picoseconds of that type.

optional StepSummary unknown_time_ms_summary = 1
Summary of all unknown time as a part of step in ms.
optional StepSummary host_wait_input_ms_summary = 9
Summary of all host-wait-input time as a part of step in ms.
optional StepSummary host_to_device_ms_summary = 10
Summary of all host-to-device time as a part of step in ms.
optional StepSummary input_ms_summary = 11
Summary of all input time as a part of step in ms.
optional StepSummary output_ms_summary = 3
Summary of all output time as a part of step in ms.
optional StepSummary device_compute_ms_summary = 4
Summary of all device-compute time as a part of step in ms.
optional StepSummary device_to_device_ms_summary = 5
Summary of all device-to-device time as a part of step in ms.
optional StepSummary host_compute_ms_summary = 6
Summary of all host-compute time as a part of step in ms.
optional StepSummary host_prepare_ms_summary = 7
Summary of all host-prepare time as a part of step in ms.
optional StepSummary host_compile_ms_summary = 8
Summary of all compilation time as a part of step in ms.

Types of hardware profiled.

Used in: InputPipelineAnalysisResult

UNKNOWN_HARDWARE = 0
Unknown hardware.
CPU_ONLY = 1
CPU only without any hardware accelerator.
GPU = 2
GPU.
TPU = 3
TPU.

Result proto for host-dependent job information.

Used in: RunEnvironment

string host_id = 1
The ID of the host where the job was run.
string command_line = 2
The command line used to run the job.
int64 start_time = 3
The start time of this run (nanoseconds since the Unix epoch).
string bns_address = 4
BNS address specified by client at time of profiling request.
uint64 profile_time_ns = 5
Profiling start walltime (in ns).

Result proto for host-independent job information.

Used in: RunEnvironment

int64 change_list = 1
The change-list number of this build.
int64 build_time = 2
The time of this build (nanoseconds since the Unix epoch).
string build_target = 3
The target of this build.
uint32 profile_duration_ms = 4
Profiling duration (in ms).

Used in: InputPipelineAnalysisResult

string op_name = 1
The Op's name.
uint64 count = 2
The number of occurrences.
double time_in_ms = 3
Time (accumulated over all occurrences) in milliseconds.
double time_in_percent = 4
Time (accumulated over all occurrences) in percentage of the total input processing time.
double self_time_in_ms = 5
Self time (accumulated over all occurrences) in milliseconds.
double self_time_in_percent = 6
Self time (accumulated over all occurrences) in percentage of the total input processing time.
string category = 7
Possible categories: "Enqueue", "Advanced file read", "Demanded file read", "Preprocessing", "Unknown".

Used in: InputPipelineAnalysisResult

repeated string details = 1
A list of detailed recommendations.

Used in: OverviewPage

HardwareType hardware_type = 1
Hardware type.
optional StepSummary step_time_summary = 2
Summary of all step duration across all cores.
optional StepSummary input_percent_summary = 3
Summary of all input-related stall as percentage of step duration.
repeated google.protobuf.Any step_details = 4
Details of each step. Can be unpacked into a PerGenericStepDetails.
optional InputTimeBreakdown input_time_breakdown = 5
The breakdown of the input processing time.
repeated InputOpDetails input_op_details = 6
Details of each input Op executed.
optional InputPipelineAnalysisRecommendation recommendation = 7
Recommendation for next steps to users.
optional google.protobuf.Any step_time_breakdown = 8
Breakdown of the step time. Can be unpacked into a GenericStepTimeBreakdown.

Used in: InputPipelineAnalysisResult

double demanded_file_read_us = 1
Time spent on demanded file read in microseconds.
double advanced_file_read_us = 2
Time spent on advanced file read in microseconds.
double preprocessing_us = 3
Time spent on data preprocessing in microseconds.
double enqueue_us = 4
The infeed enqueue time in microseconds.
double unclassified_non_enqueue_us = 5
This entry is for the situation where we can't further break down the non-enqueue input time (because the input pipeline is not instrumented).

Data layout of an op.

Used in: OpMetrics

repeated LayoutAnalysis.Dimension dimensions = 1
The physical data layout, from most-minor to most-major dimensions.

Physical data layout in each tensor dimension.

Used in: LayoutAnalysis

int32 size = 1
Size of the data in this dimension.
int32 alignment = 2
Data must be padded to a multiple of alignment.
LayoutDimensionSemantics semantics = 3
What the dimension represents.

What the dimension represents, e.g. spatial, feature or batch.

Used in: LayoutAnalysis.Dimension

UNKNOWN_SEMANTICS = 0
FEATURE = 1
BATCH = 2
SPATIAL = 3

Metrics for an operation (accumulated over all occurrences). Next ID: 18

Used in: OpMetricsDb

uint64 hlo_module_id = 13
HLO module id. 0 for TF ops.
string name = 6
Name of this op.
string category = 11
Category of this op.
string provenance = 12
Provenance of this op (e.g., if HLO op, original TF op).
uint32 occurrences = 3
Number of executions.
uint64 time_ps = 7
Total time (self + children) in picoseconds.
uint64 min_time_ps = 17
Minimum time (self + children) among all occurrences.
uint64 self_time_ps = 1
Total self time in picoseconds.
uint64 flops = 2
Total FLOPs.
uint64 bytes_accessed = 5
Total bytes accessed.
uint64 dma_stall_ps = 10
Total dma stall time in picoseconds.
optional LayoutAnalysis layout = 14
The data layout for this op. Only set for convolution ops for now.
string deduplicated_name = 15
Deduplicated HLO name for this op. Not set for TF ops.
optional OpMetricsDb children = 16
Children of the op. e.g. fused ops if this op is fusion.

A database for OpMetrics. Next ID: 13

Used in: OpMetrics, OpStats, PerCoreStepInfo

repeated OpMetrics metrics_db = 10
A bunch of OpMetrics.
uint64 total_host_infeed_enq_duration_ps = 2
The total host infeed-enqueue duration in picoseconds.
uint64 total_host_infeed_enq_start_timestamp_ps_diff = 3
The total of the difference between the start times of two consecutive infeed-enqueues (per host) in picoseconds.
uint64 total_time_ps = 11
The total time in picoseconds.
uint64 total_op_time_ps = 12
The total time incurred by OPs in picoseconds.

Operator Statistics.

optional OpMetricsDb host_op_metrics_db = 1
The database for the op metrics collected from the host over the entire profiling session including incomplete steps.
optional OpMetricsDb device_op_metrics_db = 2
The database for the op metrics collected from the device over the entire profiling session including incomplete steps.
optional PerfEnv perf_env = 3
Performance environment of the op metrics collected.
optional StepDatabaseResult step_db = 4
The database of step sequences.
optional RunEnvironment run_environment = 5
The run environment of this profiling session.

optional RunEnvironment run_environment = 5
The run environment of the profiled session.
optional InputPipelineAnalysisResult input_analysis = 2
The step-time result.
optional OverviewPageAnalysis analysis = 3
The other analysis result.
optional OverviewPageRecommendation recommendation = 4
The recommendation made to the user.

Overview result for general analysis.

Used in: OverviewPage

double mxu_utilization_percent = 1
MXU utilization in percentage.
double device_idle_time_percent = 2
Percentage of the device time that is idle.
double host_idle_time_percent = 3
Percentage of the host time that is idle.
repeated OverviewTfOp top_device_ops = 4
Top TF Ops executed on the device.
string remark_text = 5
Remark text in the performance summary section.
string remark_color = 6
Color of the remark text.
double flop_rate_utilization_relative_to_roofline_percent = 7
FLOP rate utilization relative to the roofline in percentage.
double memory_bw_utilization_relative_to_hw_limit_percent = 8
Memory bandwidth utilization relative to the hw limit in percentage.

Overview result for the recommendation section.

Used in: OverviewPage

string bottleneck = 1
Possible performance bottleneck: "host", "device", "both".
string statement = 2
A statement that recommends the next steps for investigating the bottleneck.
repeated OverviewPageTip host_tips = 3
A list of tips for improving host performance.
repeated OverviewPageTip device_tips = 4
A list of tips for improving device performance.
repeated OverviewPageTip documentation_tips = 5
A list of links to related useful documents.
optional google.protobuf.Any recommendation = 6
The recommendation made to the user. Can be unpacked into a GenericRecommendation.
repeated OverviewPageTip faq_tips = 7
A list of tips for FAQ.
repeated OverviewPageTip inference_tips = 8
A list of tips for inference run.

Overview result for a performance tip to users.

Used in: OverviewPageRecommendation

string link = 1
Link to the tip.

Overview result for a TensorFlow Op.

Used in: OverviewPageAnalysis

string name = 1
Name of the Op.
string category = 2
Category of the Op.
double self_time_fraction = 3
The amount of time that this Op takes by itself as fraction of the total execution time on the device or host.
double cumulative_time_fraction = 4
The cumulative time up to this Op as fraction of the total execution time.
double flop_rate = 5
How many GFlops/sec that this Op achieves.

Result proto for information in a step across all cores.

Used in: StepDatabaseResult

uint32 step_num = 1
The step number.
map<uint32, StepInfoResult> step_info_per_core = 2
A map from core_id to StepInfo.
optional OpMetricsDb hlo_metrics_db = 3
The result for the per-step HLO-metric database.
map<uint32, FlowDbResult> flow_db_per_core = 4
The result for send and recv flows.
map<uint32, uint32> core_id_to_replica_id_map = 5
A map from core ID to program replica id. Replica id map could change during a profile session, but should stay stable within a step.
map<uint32, AllReduceDbResult> all_reduce_db_per_core = 6
The result for all-reduce ops.

Per-step details on generic hardware.

int32 step_number = 1
The step number of a step.
double step_time_ms = 2
The step time (in ms).
double unknown_time_ms = 3
Breakdown of the step time in different event categories. The unknown time (in ms).
double host_wait_input_ms = 11
The time (in ms) in which the host is waiting for input data to be ready.
double host_to_device_ms = 12
The time (in ms) in which the host is sending input data to the device. Total input time = host_wait_input_ms + host_to_device_ms.
double output_ms = 5
The output time (in ms).
double device_compute_ms = 6
The device-compute time (in ms).
double device_to_device_ms = 7
The device-to-device communication time (in ms).
double host_compute_ms = 8
The host-compute time (in ms).
double host_prepare_ms = 9
The host-prepare time (in ms).
double host_compile_ms = 10
The time spent on compiling (in ms).

Performance environment, e.g., the peak performance capabilities of the device.

Used in: OpStats

double peak_tera_flops_per_second = 1
Peak performance of a TPU core or a GPU in TFLOP/s.
double peak_hbm_bw_giga_bytes_per_second = 2
Peak memory bandwidth of a TPU core or a GPU in GiBs/s.
double ridge_point = 3
The ridge point of roofline model in FLOP/Byte. (i.e., minimum operational intensity required to achieve maximum performance).

A 'resource' generally is a specific computation component on a device. These can range from threads on CPUs to specific arithmetic units on hardware devices.

Used in: Device

string name = 1
The name of the resource.
uint32 resource_id = 2
The id of the resource. Unique within a device.

The run environment of a profiling session.

Used in: OpStats, OverviewPage

int32 host_count = 1
Number of hosts used.
int32 task_count = 2
Number of tasks used.
map<string, bool> hostnames = 3
Distinct hostnames seen.
string device_type = 4
The type of device used.
int32 device_core_count = 5
The number of device cores used. In TPU case, this corresponds to the number of TPU cores. In GPU case, this corresponds to the number of GPUs (not the number of SMs).
int32 per_core_batch_size = 6
The per-device-core batch size.
optional HostIndependentJobInfoResult host_independent_job_info = 7
Host-independent information about this job.
repeated HostDependentJobInfoResult host_dependent_job_info = 8
Host-dependent information about this job.
int32 replica_count = 9
The number of replicas, corresponds to input parallelism. If there is no model parallelism, replica_count = device_core_count
int32 num_cores_per_replica = 10
The number of cores used for a single replica, e.g. model parallelism. If there is no model parallelism, then num_cores_per_replica = 1
optional SystemTopology topology = 11
The chip interconnection topology.

Result proto for a StepDatabase.

Used in: OpStats

repeated PerCoreStepInfo step_sequence = 1
A sequence of PerCoreStepInfo.

Result proto for StepInfo. Next ID: 5

Used in: PerCoreStepInfo

uint32 step_num = 1
The step number.
uint64 duration_ps = 2
The step duration in picoseconds.
uint64 begin_ps = 3
The start time of this step in picoseconds.
optional google.protobuf.Any step_breakdown = 4
Breakdown of the step-time. Can be unpacked into a GenericStepBreakdown.

message StepSummary

input_pipeline.proto:9

Used for both step duration and Op duration.

Used in: GenericStepTimeBreakdown, InputPipelineAnalysisResult

double average = 1
double standard_deviation = 2
double minimum = 3
double maximum = 4

System topology, which describes the number of chips in a pod and the connectivity style.

Used in: RunEnvironment

int64 x_dimension = 1
The X, Y, and Z dimensions of this topology. 0 means that dimension does not exist.
int64 y_dimension = 2
int64 z_dimension = 3
int64 num_expected_reduced_chips = 4
The number of expected bad chips in this system.

A database of TfStatsTables.

optional TfStatsTable with_idle = 4
The table that includes IDLE time.
optional TfStatsTable without_idle = 5
The table that excludes IDLE time.

There is one TfStatsRecord for each TF operation profiled.

Used in: TfStatsTable

uint64 rank = 1
Rank of this TF-op among all TF-ops.
string host_or_device = 2
Whether this TF-op is on "Host" or "Device".
string op_type = 3
TF-op type.
string op_name = 4
TF-op name.
int64 occurrences = 5
Number of occurrences of the operation.
double total_time_in_us = 6
Total "accumulated" time in micro-seconds that the operation took. If this operation has any children operations, the "accumulated" time includes the time spent inside children.
double avg_time_in_us = 7
Average "accumulated" time in micro-seconds that each occurrence of the operation took.
double total_self_time_in_us = 8
Total "self" time in micro-seconds that the operation took. If this operation has any children operations, the "self" time doesn't include the time spent inside children.
double avg_self_time_in_us = 9
Average "self" time in micro-seconds that the operation took.
double device_total_self_time_as_fraction = 10
Total "self" time as fraction of the sum of the total self-time of operations run on the device. It is 0 if this op runs on the host.
double device_cumulative_total_self_time_as_fraction = 11
Cumulative value of device_total_self_time_as_fraction.
double host_total_self_time_as_fraction = 12
Total "self" time as fraction of the sum of the total self-time of operations run on the host. It is 0 if this op runs on the device.
double host_cumulative_total_self_time_as_fraction = 13
Cumulative value of host_total_self_time_as_fraction.
double measured_flop_rate = 14
Number of floating-point operations (FLOPs) performed per second.
double measured_memory_bw = 15
Number of bytes (including both read and write) accessed per second.
double operational_intensity = 16
Operational intensity, which is defined as FLOPs/bytes-accessed.
string bound_by = 17
Whether this operation is "Compute" or "Memory" bound, according to the Roofline Model.

A table of TFStatsRecords plus the corresponding pprof keys.

Used in: TfStatsDatabase

repeated TfStatsRecord tf_stats_record = 1
All TfStats records, one for each TF operation.
string host_tf_pprof_key = 2
Key to the pprof profile for host TF operations.
string device_tf_pprof_key = 3
Key to the pprof profile for device TF operations.

A 'Trace' contains metadata for the individual traces of a system.

map<uint32, Device> devices = 1
The devices that this trace has information about. Maps from device_id to more data about the specific device.
repeated TraceEvent trace_events = 4
All trace events captured in the profiling period.

Used in: Trace

uint32 device_id = 1
The id of the device that this event occurred on. The full dataset should have this device present in the Trace object.
uint32 resource_id = 2
The id of the resource that this event occurred on. The full dataset should have this resource present in the Device object of the Trace object. A resource_id is unique on a specific device, but not necessarily within the trace.
string name = 3
The name of this trace event.
uint64 timestamp_ps = 9
The timestamp that this event occurred at (in picoseconds since tracing started).
uint64 duration_ps = 10
The duration of the event in picoseconds if applicable. Events without duration are called instant events.
map<string, string> args = 11
Extra arguments that will be displayed in trace view.

An XEvent is a trace event, optionally annotated with XStats. Next ID: 6

Used in: XLine

int64 metadata_id = 1
XEventMetadata.id of corresponding metadata.
oneof data
- int64 offset_ps = 2
  Start time of the event in picoseconds, as offset from XLine.timestamp_ns().
- int64 num_occurrences = 5
  Number of occurrences of the event, if aggregated.
int64 duration_ps = 3
Duration of the event in picoseconds. Can be zero for an instant event.
repeated XStat stats = 4
XStats associated with the event.

Metadata for an XEvent, shared by all instances of the same event. Next ID: 5

Used in: XPlane

int64 id = 1
XPlane.event_metadata map key.
string name = 2
Name of the event.
string display_name = 4
Name of the event shown in trace viewer.
bytes metadata = 3
Additional metadata in serialized format.

An XLine is a timeline of trace events (XEvents). Next ID: 12

Used in: XPlane

int64 id = 1
Id of this line, can be repeated within an XPlane. All XLines with the same id are effectively the same timeline.
int64 display_id = 10
Display id of this line. Multiple lines with the same display_id are grouped together in the same trace viewer row.
string name = 2
Name of this XLine.
string display_name = 11
Name of this XLine to display in trace viewer.
int64 timestamp_ns = 3
Start time of this line in nanoseconds since the UNIX epoch. XEvent.offset_ps is relative to this timestamp.
int64 duration_ps = 9
Profiling duration for this line in picoseconds.
repeated XEvent events = 4
XEvents within the same XLine should not overlap in time, but they can be nested.

An XPlane is a container of parallel timelines (XLines), generated by a profiling source or by post-processing one or more XPlanes. Next ID: 7

Used in: XSpace

int64 id = 1
string name = 2
Name of this XPlane.
repeated XLine lines = 3
Parallel timelines grouped in this plane. XLines with the same id are effectively the same timeline.
map<int64, XEventMetadata> event_metadata = 4
XEventMetadata map, each entry uses the XEventMetadata.id as key. This map should be used for events that share the same ID over the whole XPlane.
map<int64, XStatMetadata> stat_metadata = 5
XStatMetadata map, each entry uses the XStatMetadata.id as key. This map should be used for stats that share the same ID over the whole XPlane.
repeated XStat stats = 6
XStats associated with this plane, e.g. device capabilities.

A container of parallel XPlanes, generated by one or more profiling sources. Next ID: 2

repeated XPlane planes = 1

An XStat is a named value associated with an XEvent, e.g., a performance counter value, a metric computed by a formula applied over nested XEvents and XStats. Next ID: 6

Used in: XEvent, XPlane

int64 metadata_id = 1
XStatMetadata.id of corresponding metadata.
oneof value
Value of this stat.
- double double_value = 2
- uint64 uint64_value = 3
- int64 int64_value = 4
- string str_value = 5

Metadata for an XStat, shared by all instances of the same stat. Next ID: 4

Used in: XPlane

int64 id = 1
XPlane.stat_metadata map key.
string name = 2
Name of the stat (should be short).
string description = 3
Description of the stat (might be long).

package tensorflow.profiler

message AllReduceDbResult

repeated AllReduceInfo all_reduce_info = 1

message AllReduceInfo

uint64 id = 1

string name = 2

uint64 all_reduce_id = 3

uint64 start_time_ps = 4

uint64 end_time_ps = 5

uint64 byte_size = 6

message CudaComputeCapability

uint32 major = 1

uint32 minor = 2

message Device

string name = 1

uint32 device_id = 2

map<uint32, Resource> resources = 3

message DeviceCapabilities

double clock_rate_in_ghz = 1

uint32 num_cores = 2

uint64 memory_size_in_bytes = 3

uint64 memory_bandwidth = 4

optional CudaComputeCapability compute_capability = 5

message FlowDbResult

repeated FlowEventInfo flow_info = 1

message FlowEventInfo

uint64 flow_id = 1

int64 channel_id = 2

string name = 3

string category = 4

uint64 start_time_ps = 5

uint64 end_time_ps = 6

uint64 byte_size = 7

uint32 replica_id = 8

message GenericRecommendation

string kernel_launch_bottleneck = 1

string kernel_launch_statement = 2

string all_other_bottleneck = 3

string all_other_statement = 4

message GenericStepBreakdown

map<int32, uint64> type_ps = 1

message GenericStepTimeBreakdown

optional StepSummary unknown_time_ms_summary = 1

optional StepSummary host_wait_input_ms_summary = 9

optional StepSummary host_to_device_ms_summary = 10

optional StepSummary input_ms_summary = 11

optional StepSummary output_ms_summary = 3

optional StepSummary device_compute_ms_summary = 4

optional StepSummary device_to_device_ms_summary = 5

optional StepSummary host_compute_ms_summary = 6

optional StepSummary host_prepare_ms_summary = 7

optional StepSummary host_compile_ms_summary = 8

enum HardwareType

UNKNOWN_HARDWARE = 0

CPU_ONLY = 1

GPU = 2

TPU = 3

message HostDependentJobInfoResult

string host_id = 1

string command_line = 2

int64 start_time = 3

string bns_address = 4

uint64 profile_time_ns = 5

message HostIndependentJobInfoResult

int64 change_list = 1

int64 build_time = 2

string build_target = 3

uint32 profile_duration_ms = 4

message InputOpDetails

string op_name = 1

uint64 count = 2

double time_in_ms = 3

double time_in_percent = 4

double self_time_in_ms = 5

double self_time_in_percent = 6

string category = 7

message InputPipelineAnalysisRecommendation

repeated string details = 1

message InputPipelineAnalysisResult

HardwareType hardware_type = 1