@@ -87,6 +87,8 @@ struct whisper_params {
87
87
std::string tdrz_speaker_turn = " [SPEAKER_TURN]" ; // TODO: set from command line
88
88
89
89
std::string openvino_encode_device = " CPU" ;
90
+
91
+ std::string dtw = " " ;
90
92
};
91
93
92
94
void whisper_print_usage (int /* argc*/ , char ** argv, const whisper_params & params, const server_params& sparams) {
@@ -126,6 +128,7 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
126
128
fprintf (stderr, " -m FNAME, --model FNAME [%-7s] model path\n " , params.model .c_str ());
127
129
fprintf (stderr, " -oved D, --ov-e-device DNAME [%-7s] the OpenVINO device used for encode inference\n " , params.openvino_encode_device .c_str ());
128
130
// server params
131
+ fprintf (stderr, " -dtw MODEL --dtw MODEL [%-7s] compute token-level timestamps\n " , params.dtw .c_str ());
129
132
fprintf (stderr, " --host HOST, [%-7s] Hostname/ip-adress for the server\n " , sparams.hostname .c_str ());
130
133
fprintf (stderr, " --port PORT, [%-7d] Port number for the server\n " , sparams.port );
131
134
fprintf (stderr, " --public PATH, [%-7s] Path to the public folder\n " , sparams.public_path .c_str ());
@@ -173,6 +176,7 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params, serve
173
176
else if ( arg == " --prompt" ) { params.prompt = argv[++i]; }
174
177
else if (arg == " -m" || arg == " --model" ) { params.model = argv[++i]; }
175
178
else if (arg == " -oved" || arg == " --ov-e-device" ) { params.openvino_encode_device = argv[++i]; }
179
+ else if (arg == " -dtw" || arg == " --dtw" ) { params.dtw = argv[++i]; }
176
180
else if (arg == " -ng" || arg == " --no-gpu" ) { params.use_gpu = false ; }
177
181
// server params
178
182
else if ( arg == " --port" ) { sparams.port = std::stoi (argv[++i]); }
@@ -499,6 +503,49 @@ int main(int argc, char ** argv) {
499
503
// whisper init
500
504
struct whisper_context_params cparams = whisper_context_default_params ();
501
505
cparams.use_gpu = params.use_gpu ;
506
+ if (!params.dtw .empty ()) {
507
+ cparams.dtw_token_timestamps = true ;
508
+ cparams.dtw_aheads_preset = WHISPER_AHEADS_NONE;
509
+
510
+ if (params.dtw == " tiny" ) {
511
+ cparams.dtw_aheads_preset = WHISPER_AHEADS_TINY;
512
+ }
513
+ if (params.dtw == " tiny.en" ) {
514
+ cparams.dtw_aheads_preset = WHISPER_AHEADS_TINY_EN;
515
+ }
516
+ if (params.dtw == " base" ) {
517
+ cparams.dtw_aheads_preset = WHISPER_AHEADS_BASE;
518
+ }
519
+ if (params.dtw == " base.en" ) {
520
+ cparams.dtw_aheads_preset = WHISPER_AHEADS_BASE_EN;
521
+ }
522
+ if (params.dtw == " small" ) {
523
+ cparams.dtw_aheads_preset = WHISPER_AHEADS_SMALL;
524
+ }
525
+ if (params.dtw == " small.en" ) {
526
+ cparams.dtw_aheads_preset = WHISPER_AHEADS_SMALL_EN;
527
+ }
528
+ if (params.dtw == " medium" ) {
529
+ cparams.dtw_aheads_preset = WHISPER_AHEADS_MEDIUM;
530
+ }
531
+ if (params.dtw == " medium.en" ) {
532
+ cparams.dtw_aheads_preset = WHISPER_AHEADS_MEDIUM_EN;
533
+ }
534
+ if (params.dtw == " large.v1" ) {
535
+ cparams.dtw_aheads_preset = WHISPER_AHEADS_LARGE_V1;
536
+ }
537
+ if (params.dtw == " large.v2" ) {
538
+ cparams.dtw_aheads_preset = WHISPER_AHEADS_LARGE_V2;
539
+ }
540
+ if (params.dtw == " large.v3" ) {
541
+ cparams.dtw_aheads_preset = WHISPER_AHEADS_LARGE_V3;
542
+ }
543
+
544
+ if (cparams.dtw_aheads_preset == WHISPER_AHEADS_NONE) {
545
+ fprintf (stderr, " error: unknown DTW preset '%s'\n " , params.dtw .c_str ());
546
+ return 3 ;
547
+ }
548
+ }
502
549
503
550
struct whisper_context * ctx = whisper_init_from_file_with_params (params.model .c_str (), cparams);
504
551
@@ -865,6 +912,7 @@ int main(int argc, char ** argv) {
865
912
if (!params.no_timestamps ) {
866
913
word[" start" ] = token.t0 * 0.01 ;
867
914
word[" end" ] = token.t1 * 0.01 ;
915
+ word[" t_dtw" ] = token.t_dtw ;
868
916
}
869
917
word[" probability" ] = token.p ;
870
918
total_logprob += token.plog ;
0 commit comments