package sax.server.vision

Get desktop application:
View/edit binary Protocol Buffers messages

rpc Classify (ClassifyRequest, ClassifyResponse)
vision.proto:203
Returns classification labels and their scores given an image.
message ClassifyRequest
vision.proto:23
- string model_key = 1
- bytes image_bytes = 2
- optional ExtraInputs extra_inputs = 3
message ClassifyResponse
vision.proto:37
- repeated DecodedText texts = 1
rpc TextToImage (TextToImageRequest, TextToImageResponse)
vision.proto:206
Returns image generation results given the text.
message TextToImageRequest
vision.proto:41
- string model_key = 1
- string text = 2
- optional ExtraInputs extra_inputs = 3
message TextToImageResponse
vision.proto:62
- repeated ImageGenerations images = 2
rpc TextAndImageToImage (TextAndImageToImageRequest, TextAndImageToImageResponse)
vision.proto:209
Returns image generation results given the text and image.
message TextAndImageToImageRequest
vision.proto:47
- string model_key = 1
- string text = 2
- bytes image_bytes = 4
- optional ExtraInputs extra_inputs = 3
message TextAndImageToImageResponse
vision.proto:66
- repeated ImageGenerations images = 2
rpc Embed (EmbedRequest, EmbedResponse)
vision.proto:213
Returns an image embedding given an image.
message EmbedRequest
vision.proto:70
- string model_key = 1
- bytes image_bytes = 2
- optional ExtraInputs extra_inputs = 3
message EmbedResponse
vision.proto:76
- repeated double embedding = 1
rpc Detect (DetectRequest, DetectResponse)
vision.proto:217
Returns the bounding box, label, and score of each object detected in an image.
message DetectRequest
vision.proto:122
- string model_key = 1
- bytes image_bytes = 2
- repeated string text = 4
  For open-set detection models, one can pass specified sets here. Elements in `text` describe concepts in the image that should be detected; it is up to the detection model to interpret these texts. Some detection models may interpret the text to be object names, and the corresponding response.bounding_boxes.text must be one of the given text elements.
- repeated BoundingBox boxes_of_interest = 5
  Optionally accept box inputs to predict region class labels. The boxes are N regions of interest in the image. The .text and .score fields should be unset.
- optional ExtraInputs extra_inputs = 3
message DetectResponse
vision.proto:142
- repeated BoundingBox bounding_boxes = 1
  A list of bounding boxes. The bounding boxes have no explicit order.
rpc ImageToText (ImageToTextRequest, ImageToTextResponse)
vision.proto:220
Returns text generation results given image_bytes.
message ImageToTextRequest
vision.proto:147
- string model_key = 1
- bytes image_bytes = 2
- string text = 3
  Optional prefix text.
- optional ExtraInputs extra_inputs = 4
message ImageToTextResponse
vision.proto:155
- repeated DecodedText texts = 1
rpc ImageToImage (ImageToImageRequest, ImageToImageResponse)
vision.proto:223
Returns image generation results given image_bytes.
message ImageToImageRequest
vision.proto:159
- string model_key = 1
- bytes image_bytes = 2
- optional ExtraInputs extra_inputs = 4
message ImageToImageResponse
vision.proto:165
- repeated ImageGenerations images = 2
rpc VideoToText (VideoToTextRequest, VideoToTextResponse)
vision.proto:226
Returns text generation results given video.
message VideoToTextRequest
vision.proto:169
- string model_key = 1
- repeated bytes image_frames = 2
- string text = 3
  Optional prefix text.
- optional ExtraInputs extra_inputs = 4
message VideoToTextResponse
vision.proto:177
- repeated DecodedText texts = 1
rpc VideoToToken (VideoToTokenRequest, VideoToTokenResponse)
vision.proto:229
Returns video token results given a video (tokenization).
message VideoToTokenRequest
vision.proto:181
- string model_key = 1
- repeated bytes image_frames = 2
  Video composed of multiple image frames.
- optional ExtraInputs extra_inputs = 3
message VideoToTokenResponse
vision.proto:187
- repeated double tokens = 1
  Quantized or soft tokens.
rpc TokenToVideo (TokenToVideoRequest, TokenToVideoResponse)
vision.proto:232
Returns video bytes results given video tokens (de-tokenization).
message TokenToVideoRequest
vision.proto:191
- string model_key = 1
- repeated double tokens = 2
  Quantized or soft tokens.
- optional ExtraInputs extra_inputs = 3
message TokenToVideoResponse
vision.proto:197
- repeated bytes image_frames = 2
  Video composed of multiple image frames.

message BoundingBox
Used in: DetectRequest, DetectResponse

double cx = 1
Coordinates are in pixel space. The upper left corner of the image represents (0.0, 0.0). The bottom right corner of the image represents (image_width, image_height).
double cy = 2
double w = 3
double h = 4
string text = 5
A label for the bounding box object.
double score = 6
A positive number which represents the ranking (the higher the better) of the bounding boxes. The semantic meaning of the score is left for each model to define.
optional DetectionMask mask = 7
When a mask is present, it only contains values inside the bounding box defined by cx,cy,w,h. I.e.: mask[i, j] corresponds to the image pixel[cx - w/2 + i, cy - h/2 + j], where cx,cy,w,h are given in the BoundingBox. Outside the bounding box the mask is always zero. This enables sending smaller mask sizes, since the mask size is only the bounding box size. E.g. MaskRCNN returns masks that are always 28x28 pixels, and they are resized by third_party/cloud_tpu/models/detection/utils/mask_utils.py.

message DecodedText
Used in: ClassifyResponse, ImageToTextResponse, VideoToTextResponse

string text = 1
The label of the classified object.
double score = 2
The score of the classified object.

message DetectionMask
Used in: BoundingBox

bytes mask_values = 1
The mask represents a C-order 2-D uint8 array. The array's dimensions are given by [mask_height, mask_width]. mask[i, j] / 255 represents the probability of the pixel being in the segment (the range [0,1] was scaled to [0, 255] for compression).
int32 mask_height = 2
int32 mask_width = 3

message ImageGenerations
Used in: ImageToImageResponse, TextAndImageToImageResponse, TextToImageResponse

bytes image = 1
The generated image in byte array format. TODO(jianlijianli): decide on an image encoding format; currently PNG.
double score = 2
The score for the generated image.

package sax.server.vision

service VisionService

rpc Classify (ClassifyRequest, ClassifyResponse)

message ClassifyRequest

string model_key = 1

bytes image_bytes = 2

optional ExtraInputs extra_inputs = 3

message ClassifyResponse

repeated DecodedText texts = 1

rpc TextToImage (TextToImageRequest, TextToImageResponse)

message TextToImageRequest

string model_key = 1

string text = 2

optional ExtraInputs extra_inputs = 3

message TextToImageResponse

repeated ImageGenerations images = 2

rpc TextAndImageToImage (TextAndImageToImageRequest, TextAndImageToImageResponse)

message TextAndImageToImageRequest

string model_key = 1

string text = 2

bytes image_bytes = 4

optional ExtraInputs extra_inputs = 3

message TextAndImageToImageResponse

repeated ImageGenerations images = 2

rpc Embed (EmbedRequest, EmbedResponse)

message EmbedRequest

string model_key = 1

bytes image_bytes = 2

optional ExtraInputs extra_inputs = 3

message EmbedResponse

repeated double embedding = 1

rpc Detect (DetectRequest, DetectResponse)

message DetectRequest

string model_key = 1

bytes image_bytes = 2

repeated string text = 4

repeated BoundingBox boxes_of_interest = 5

optional ExtraInputs extra_inputs = 3

message DetectResponse

repeated BoundingBox bounding_boxes = 1

rpc ImageToText (ImageToTextRequest, ImageToTextResponse)

message ImageToTextRequest

string model_key = 1

bytes image_bytes = 2

string text = 3

optional ExtraInputs extra_inputs = 4

message ImageToTextResponse

repeated DecodedText texts = 1

rpc ImageToImage (ImageToImageRequest, ImageToImageResponse)

message ImageToImageRequest

string model_key = 1

bytes image_bytes = 2

optional ExtraInputs extra_inputs = 4

message ImageToImageResponse

repeated ImageGenerations images = 2

rpc VideoToText (VideoToTextRequest, VideoToTextResponse)

message VideoToTextRequest

string model_key = 1

repeated bytes image_frames = 2

string text = 3

optional ExtraInputs extra_inputs = 4

message VideoToTextResponse

repeated DecodedText texts = 1

rpc VideoToToken (VideoToTokenRequest, VideoToTokenResponse)

message VideoToTokenRequest

string model_key = 1

repeated bytes image_frames = 2

optional ExtraInputs extra_inputs = 3

message VideoToTokenResponse

repeated double tokens = 1

rpc TokenToVideo (TokenToVideoRequest, TokenToVideoResponse)

message TokenToVideoRequest

string model_key = 1

repeated double tokens = 2

optional ExtraInputs extra_inputs = 3

message TokenToVideoResponse

repeated bytes image_frames = 2

message BoundingBox

double cx = 1

double cy = 2