package nisaba.translit.fst

Get desktop application:
View/edit binary Protocol Buffers messages

Next available ID: 23

optional string pairlm_file = 1
Filename for FST-based pair LM.
optional bool invert_pairlm = 2
Whether to invert the pairs, i.e., the input symbols are on the output side of the pairs.
optional bool pairlm_is_transducer = 13
Whether pair LM is encoded as a transducer.
optional string translit_cands_file = 3
Filename for pre-computed transliteration pairs. Expects a TSV file with 3 columns: input string, output string and cost (negative log probability).
optional bool translit_cands_override = 14
Whether pre-computed items from translit_cands_file override the pair LM candidates rather than being mixed with them.
optional double pairlm_translit_weight = 4
Mixture weight between pair LM candidates and pre-computed pairs.
optional string lm_file = 5
Filename for language model for full-string transliteration. Expects an FST encoded model.
optional bool apply_closure_to_lm = 12
Whether language model requires closure (e.g., word-level wordpiece model).
optional string oov_symbol = 6
Symbol in language model FST corresponding to OOV symbols.
optional double oov_cost = 7
Additional cost accrued by OOV symbols.
optional string epsilon_symbol = 15
Symbol in the language model FST corresponding to epsilon.
optional int32 max_word_cands = 8
The maximum number of candidate transliterations per word.
optional double word_cand_thresh = 9
Threshold on scores for word candidates, expressed as the difference in log probability from the highest-probability candidate.
optional double min_cand_posterior = 11
A minimum posterior probability for any transliteration candidate at a word-position, as part of final pruning in addition to max_word_cands.
optional string word_piece_internal_prefix = 10
Internal word-piece prefix. If non-empty, will assume language model is constructed of word-pieces.
optional bool add_to_cache = 16
Allows turning off caching to prevent memory leaks when running as a service.
optional int32 max_parallel_tokens = 17
Maximum number of tokens to transliterate in parallel using different cores.
optional bool apply_lm_at_word_level = 18
Whether to apply language model at the word (rather than sentence) level.
optional string word_piece_model = 19
Filename of word-piece model for on-the-fly word-piece segmentation.
optional string word_piece_word_initial_prefix = 20
String prefix indicating word-initial wordpieces. Uses default if not set.
optional bool sample_from_k_best = 21
Whether to sample from k-best rather than returning all of them.
optional int64 random_seed = 22
Random seed for sampling; defaults to std::random_device if not set. This is mainly provided for testing purposes, best to leave unset.

// NOTE(review): no `syntax` declaration is visible in this listing; confirm
// whether the schema is proto2 or proto3 before compiling.
package nisaba.translit.fst;

// Configuration options for pair-LM-based transliteration decoding.
//
// Next available ID: 23
message PairLMDecoderOptions {
  // Filename for FST-based pair LM.
  optional string pairlm_file = 1;

  // Whether to invert the pairs, i.e., the input symbols are on the output
  // side of the pairs.
  optional bool invert_pairlm = 2;

  // Whether pair LM is encoded as a transducer.
  optional bool pairlm_is_transducer = 13;

  // Filename for pre-computed transliteration pairs. Expects a TSV file with
  // 3 columns: input string, output string and cost (negative log
  // probability).
  optional string translit_cands_file = 3;

  // Whether pre-computed items from translit_cands_file override the pair LM
  // candidates rather than being mixed with them.
  optional bool translit_cands_override = 14;

  // Mixture weight between pair LM candidates and pre-computed pairs.
  optional double pairlm_translit_weight = 4;

  // Filename for language model for full-string transliteration. Expects an
  // FST encoded model.
  optional string lm_file = 5;

  // Whether language model requires closure (e.g., word-level wordpiece
  // model).
  optional bool apply_closure_to_lm = 12;

  // Symbol in language model FST corresponding to OOV symbols.
  optional string oov_symbol = 6;

  // Additional cost accrued by OOV symbols.
  optional double oov_cost = 7;

  // Symbol in the language model FST corresponding to epsilon.
  optional string epsilon_symbol = 15;

  // The maximum number of candidate transliterations per word.
  optional int32 max_word_cands = 8;

  // Threshold on scores for word candidates, expressed as the difference in
  // log probability from the highest-probability candidate.
  optional double word_cand_thresh = 9;

  // A minimum posterior probability for any transliteration candidate at a
  // word-position, as part of final pruning in addition to max_word_cands.
  optional double min_cand_posterior = 11;

  // Internal word-piece prefix. If non-empty, will assume language model is
  // constructed of word-pieces.
  optional string word_piece_internal_prefix = 10;

  // Allows turning off caching to prevent memory leaks when running as a
  // service.
  optional bool add_to_cache = 16;

  // Maximum number of tokens to transliterate in parallel using different
  // cores.
  optional int32 max_parallel_tokens = 17;

  // Whether to apply language model at the word (rather than sentence) level.
  optional bool apply_lm_at_word_level = 18;

  // Filename of word-piece model for on-the-fly word-piece segmentation.
  optional string word_piece_model = 19;

  // String prefix indicating word-initial wordpieces. Uses default if not set.
  optional string word_piece_word_initial_prefix = 20;

  // Whether to sample from k-best rather than returning all of them.
  optional bool sample_from_k_best = 21;

  // Random seed for sampling; defaults to std::random_device if not set. This
  // is mainly provided for testing purposes, best to leave unset.
  optional int64 random_seed = 22;
}