# inference_deepseek-r1-distill-llama.yml
models:
  - name: trtllm_model
    version: 1.0.0
    device: auto
    inferer_type: customized
    inferer_name: trtllm_inferer
    inferer_path:
    inferer_args: # additional args for the model inferer.
      # LLM style used to build the prompt (chat or function call) and parse the generated response for the
      # OpenAI-compatible interface. See README.md for the supported llm_style values.
      llm_style: deepseek-r1
      # tokenizer config.
      tokenizer_type: huggingface # can be `huggingface` or `sentencepiece`. Must be set.
      tokenizer_path: /tmp/DeepSeek-R1-Distill-Llama-8B/ # path of tokenizer. Must be set.
      tokenizer_parallelism: 16 # number of tokenizer instances used for parallel tokenization. Will be set to 1 if not set.
      end_token_id: 128001 # "<|end▁of▁sentence|>"
      pad_token_id: 128001 # "<|end▁of▁sentence|>"
      skip_special_tokens: # skip special tokens when decoding. Empty if not set.
        - 128001 # "<|end▁of▁sentence|>"
      force_tokens_dict: # forces a fixed token-to-id mapping during encoding and decoding instead of using the tokenizer. Empty if not set.
      prefix_tokens_id: # prefix token ids added to the beginning of the input ids. Empty if not set.
      suffix_tokens_id: # suffix token ids added to the end of the input ids. Empty if not set.
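      # A hedged example (the id below is an assumption, not part of the original config): for this
      # tokenizer, "<|begin▁of▁sentence|>" is commonly id 128000, so forcing a BOS prefix could look like:
      # prefix_tokens_id:
      #   - 128000 # "<|begin▁of▁sentence|>" (verify the id against your tokenizer before enabling)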
      # default sampling config; sampling params in the request override these. See
      # @ref(src/constants.h - SamplingConfig) for the supported sampling params.
      sampling:
        temperature: 0.6
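        # A hedged example: DeepSeek recommends top_p 0.95 alongside temperature 0.6 for R1 models.
        # Whether `top_p` is accepted here depends on @ref(src/constants.h - SamplingConfig) in your
        # build; treat the key name as an assumption.
        # top_p: 0.95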
      # trtllm config.
      gpt_model_type: inflight_fused_batching # must be `V1` (== `v1`) or `inflight_batching` (== `inflight_fused_batching`).
      gpt_model_path: /tmp/DeepSeek-R1-Distill-Llama-8B/trt_engines/ # path of decoder model. Must be set.
      encoder_model_path: # path of encoder model. Null if not set.
      stop_words: # additional stop words. Empty if not set.
        - "<|end▁of▁sentence|>"
      bad_words: # additional bad words. Empty if not set.
      max_tokens_in_paged_kv_cache: # use default if not set.
      max_attention_window_size: # use default (i.e. max_sequence_length) if not set.
      sink_token_length: # use default if not set.
      batch_scheduler_policy: guaranteed_no_evict # must be `max_utilization` or `guaranteed_no_evict`.
      kv_cache_free_gpu_mem_fraction: 0.9 # will be set to 0.9 or `max_tokens_in_paged_kv_cache` if not set.
      kv_cache_host_memory_bytes: # will be set to 0 if not set.
      kv_cache_onboard_blocks: # will be set to true if not set.
      exclude_input_in_output: true # will be set to false if not set.
      cancellation_check_period_ms: # will be set to 100 (ms) if not set.
      stats_check_period_ms: # will be set to 100 (ms) if not set.
      iter_stats_max_iterations: # will be set to 1000 if not set.
      request_stats_max_iterations: # will be set to 0 if not set.
      enable_kv_cache_reuse: true # will be set to false if not set.
      normalize_log_probs: # will be set to true if not set.
      enable_chunked_context: # will be set to false if not set.
      gpu_device_ids: # will be automatically set if not set.
      lora_cache_optimal_adapter_size: # will be set to 8 if not set.
      lora_cache_max_adapter_size: # will be set to 64 if not set.
      lora_cache_gpu_memory_fraction: # will be set to 0.05 if not set.
      lora_cache_host_memory_bytes: # will be set to 1073741824 (1GB) if not set.
      decoding_mode: # must be one of {`top_k`, `top_p`, `top_k_top_p`, `beam_search`}. Defaults to `top_k_top_p` if max_beam_width == 1, `beam_search` otherwise.
      executor_worker_path: # will be set to `/opt/tritonserver/backends/tensorrtllm/trtllmExecutorWorker` if not set.
      medusa_choices: # will be set to `mc_sim_7b_63` if not set.
      gpu_weights_percent: # will be set to 1.0 if not set.
    converter_type: none # only `torch` (torch tensor converter), `tensorflow` (tf tensor converter), `tensorrt` (tensorrt tensor converter), `customized`, and `none` (no converter mode) are supported now.
    converter_name: # converter name registered in src/customized_converter.h. Must not be none when converter_type is `customized`.
    converter_path: # path of converter.
    converter_args: # additional args for the converter.
dag:
  type: sequential # only `sequential` is supported now.
  name: your_dag # dag name.
  nodes: # sequential mode runs the nodes in the order listed.
    - name: node-1
      type: model # only `model` is supported now.
      model: trtllm_model-1.0.0 # model (name-version format) that has been declared in models.
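
# A hedged usage sketch (endpoint and port are assumptions, not part of this config): per-request
# sampling params override the `sampling` defaults above, e.g. via an OpenAI-style chat completions
# call against the server's OpenAI-compatible interface:
#
#   curl http://localhost:8080/v1/chat/completions \
#     -H "Content-Type: application/json" \
#     -d '{"model": "trtllm_model", "messages": [{"role": "user", "content": "Hello"}], "temperature": 0.7}'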