
Commit 842678b

talk-llama : sync llama.cpp
ggml-ci
1 parent 16b5022 commit 842678b

13 files changed: +159 -74 lines

examples/talk-llama/llama-arch.cpp

+20
@@ -19,6 +19,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_REFACT,          "refact"         },
     { LLM_ARCH_BERT,            "bert"           },
     { LLM_ARCH_NOMIC_BERT,      "nomic-bert"     },
+    { LLM_ARCH_NOMIC_BERT_MOE,  "nomic-bert-moe" },
     { LLM_ARCH_JINA_BERT_V2,    "jina-bert-v2"   },
     { LLM_ARCH_BLOOM,           "bloom"          },
     { LLM_ARCH_STABLELM,        "stablelm"       },
@@ -106,6 +107,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_EXPERT_WEIGHTS_SCALE,   "%s.expert_weights_scale"   },
     { LLM_KV_EXPERT_WEIGHTS_NORM,    "%s.expert_weights_norm"    },
     { LLM_KV_EXPERT_GATING_FUNC,     "%s.expert_gating_func"     },
+    { LLM_KV_MOE_EVERY_N_LAYERS,     "%s.moe_every_n_layers"     },
     { LLM_KV_POOLING_TYPE,           "%s.pooling_type"           },
     { LLM_KV_LOGIT_SCALE,            "%s.logit_scale"            },
     { LLM_KV_DECODER_START_TOKEN_ID, "%s.decoder_start_token_id" },
@@ -472,6 +474,24 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_NAMES = {
             { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
         },
     },
+    {
+        LLM_ARCH_NOMIC_BERT_MOE,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
+            { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
+            { LLM_TENSOR_TOKEN_TYPES,     "token_types" },
+            { LLM_TENSOR_ATTN_OUT_NORM,   "blk.%d.attn_output_norm" },
+            { LLM_TENSOR_ATTN_QKV,        "blk.%d.attn_qkv" },
+            { LLM_TENSOR_ATTN_OUT,        "blk.%d.attn_output" },
+            { LLM_TENSOR_LAYER_OUT_NORM,  "blk.%d.layer_output_norm" },
+            { LLM_TENSOR_FFN_GATE,        "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN,        "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
+            { LLM_TENSOR_FFN_GATE_INP,    "blk.%d.ffn_gate_inp" },
+            { LLM_TENSOR_FFN_DOWN_EXPS,   "blk.%d.ffn_down_exps" },
+            { LLM_TENSOR_FFN_UP_EXPS,     "blk.%d.ffn_up_exps" },
+        },
+    },
     {
         LLM_ARCH_JINA_BERT_V2,
         {

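Note: the tensor names registered above are printf-style patterns; per-layer entries such as "blk.%d.ffn_up_exps" are expanded with the layer index before tensors are looked up in the GGUF file. A minimal sketch of that expansion (the tensor_name helper below is hypothetical, not an llama.cpp function):

#include <cstdio>
#include <string>

// Hypothetical helper: expand a per-layer pattern such as "blk.%d.ffn_up_exps"
// into a concrete GGUF tensor name for layer `il`.
static std::string tensor_name(const char * pattern, int il) {
    char buf[256];
    std::snprintf(buf, sizeof(buf), pattern, il);
    return buf;
}

int main() {
    // For the new LLM_ARCH_NOMIC_BERT_MOE entries, layer 0 would resolve to:
    std::printf("%s\n", tensor_name("blk.%d.ffn_gate_inp",  0).c_str()); // blk.0.ffn_gate_inp
    std::printf("%s\n", tensor_name("blk.%d.ffn_up_exps",   0).c_str()); // blk.0.ffn_up_exps
    std::printf("%s\n", tensor_name("blk.%d.ffn_down_exps", 0).c_str()); // blk.0.ffn_down_exps
    return 0;
}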
examples/talk-llama/llama-arch.h

+2
@@ -23,6 +23,7 @@ enum llm_arch {
     LLM_ARCH_REFACT,
     LLM_ARCH_BERT,
     LLM_ARCH_NOMIC_BERT,
+    LLM_ARCH_NOMIC_BERT_MOE,
     LLM_ARCH_JINA_BERT_V2,
     LLM_ARCH_BLOOM,
     LLM_ARCH_STABLELM,
@@ -110,6 +111,7 @@ enum llm_kv {
     LLM_KV_EXPERT_WEIGHTS_SCALE,
     LLM_KV_EXPERT_WEIGHTS_NORM,
     LLM_KV_EXPERT_GATING_FUNC,
+    LLM_KV_MOE_EVERY_N_LAYERS,
     LLM_KV_POOLING_TYPE,
     LLM_KV_LOGIT_SCALE,
     LLM_KV_DECODER_START_TOKEN_ID,

examples/talk-llama/llama-chat.cpp

+7-15
@@ -50,8 +50,8 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
     { "deepseek3",  LLM_CHAT_TEMPLATE_DEEPSEEK_3 },
     { "command-r",  LLM_CHAT_TEMPLATE_COMMAND_R  },
     { "llama3",     LLM_CHAT_TEMPLATE_LLAMA_3    },
-    { "chatglm3",   LLM_CHAT_TEMPLATE_CHATGML_3  },
-    { "chatglm4",   LLM_CHAT_TEMPLATE_CHATGML_4  },
+    { "chatglm3",   LLM_CHAT_TEMPLATE_CHATGLM_3  },
+    { "chatglm4",   LLM_CHAT_TEMPLATE_CHATGLM_4  },
     { "glmedge",    LLM_CHAT_TEMPLATE_GLMEDGE    },
     { "minicpm",    LLM_CHAT_TEMPLATE_MINICPM    },
     { "exaone3",    LLM_CHAT_TEMPLATE_EXAONE_3   },
@@ -122,6 +122,8 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
         }
     } else if (tmpl_contains("<|assistant|>") && tmpl_contains("<|end|>")) {
         return LLM_CHAT_TEMPLATE_PHI_3;
+    } else if (tmpl_contains("[gMASK]<sop>")) {
+        return LLM_CHAT_TEMPLATE_CHATGLM_4;
     } else if (tmpl_contains("<|assistant|>") && tmpl_contains("<|user|>")) {
         return tmpl_contains("</s>") ? LLM_CHAT_TEMPLATE_FALCON_3 : LLM_CHAT_TEMPLATE_GLMEDGE;
     } else if (tmpl_contains("<|{{ item['role'] }}|>") && tmpl_contains("<|begin_of_image|>")) {
@@ -154,9 +156,7 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
         return LLM_CHAT_TEMPLATE_LLAMA_3;
     } else if (tmpl_contains("[gMASK]sop")) {
         // chatglm3-6b
-        return LLM_CHAT_TEMPLATE_CHATGML_3;
-    } else if (tmpl_contains("[gMASK]<sop>")) {
-        return LLM_CHAT_TEMPLATE_CHATGML_4;
+        return LLM_CHAT_TEMPLATE_CHATGLM_3;
     } else if (tmpl_contains(LU8("<用户>"))) {
         // MiniCPM-3B-OpenHermes-2.5-v2-GGUF
         return LLM_CHAT_TEMPLATE_MINICPM;
@@ -437,7 +437,7 @@ int32_t llm_chat_apply_template(
         if (add_ass) {
             ss << "<|start_header_id|>assistant<|end_header_id|>\n\n";
         }
-    } else if (tmpl == LLM_CHAT_TEMPLATE_CHATGML_3) {
+    } else if (tmpl == LLM_CHAT_TEMPLATE_CHATGLM_3) {
         // chatglm3-6b
         ss << "[gMASK]" << "sop";
         for (auto message : chat) {
@@ -447,7 +447,7 @@
         if (add_ass) {
             ss << "<|assistant|>";
         }
-    } else if (tmpl == LLM_CHAT_TEMPLATE_CHATGML_4) {
+    } else if (tmpl == LLM_CHAT_TEMPLATE_CHATGLM_4 || tmpl == LLM_CHAT_TEMPLATE_GLMEDGE) {
         ss << "[gMASK]" << "<sop>";
         for (auto message : chat) {
             std::string role(message->role);
@@ -456,14 +456,6 @@
         if (add_ass) {
             ss << "<|assistant|>";
         }
-    } else if (tmpl == LLM_CHAT_TEMPLATE_GLMEDGE) {
-        for (auto message : chat) {
-            std::string role(message->role);
-            ss << "<|" << role << "|>" << "\n" << message->content;
-        }
-        if (add_ass) {
-            ss << "<|assistant|>";
-        }
     } else if (tmpl == LLM_CHAT_TEMPLATE_MINICPM) {
         // MiniCPM-3B-OpenHermes-2.5-v2-GGUF
         for (auto message : chat) {

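Net effect of the template changes above: glmedge now shares the chatglm4 formatting branch, so both prompts start with "[gMASK]<sop>". A standalone sketch that mirrors the merged branch; format_chatglm4 and msg are made up for illustration and are not the llama.cpp API:

#include <iostream>
#include <sstream>
#include <string>
#include <vector>

struct msg { std::string role, content; };

// Mirrors the merged CHATGLM_4 / GLMEDGE branch shown in the diff above.
static std::string format_chatglm4(const std::vector<msg> & chat, bool add_ass) {
    std::ostringstream ss;
    ss << "[gMASK]" << "<sop>";
    for (const auto & m : chat) {
        ss << "<|" << m.role << "|>" << "\n" << m.content;
    }
    if (add_ass) {
        ss << "<|assistant|>";
    }
    return ss.str();
}

int main() {
    const std::vector<msg> chat = {
        { "system", "You are a helpful assistant." },
        { "user",   "Hello"                        },
    };
    // Prints: [gMASK]<sop><|system|>\nYou are a helpful assistant.<|user|>\nHello<|assistant|>
    std::cout << format_chatglm4(chat, /*add_ass=*/true) << "\n";
    return 0;
}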
examples/talk-llama/llama-chat.h

+2-2
@@ -29,8 +29,8 @@ enum llm_chat_template {
     LLM_CHAT_TEMPLATE_DEEPSEEK_3,
     LLM_CHAT_TEMPLATE_COMMAND_R,
     LLM_CHAT_TEMPLATE_LLAMA_3,
-    LLM_CHAT_TEMPLATE_CHATGML_3,
-    LLM_CHAT_TEMPLATE_CHATGML_4,
+    LLM_CHAT_TEMPLATE_CHATGLM_3,
+    LLM_CHAT_TEMPLATE_CHATGLM_4,
     LLM_CHAT_TEMPLATE_GLMEDGE,
     LLM_CHAT_TEMPLATE_MINICPM,
     LLM_CHAT_TEMPLATE_EXAONE_3,

examples/talk-llama/llama-context.cpp

+4-17
@@ -114,7 +114,7 @@ llama_context::llama_context(
     }

     if (n_ctx_per_seq > hparams.n_ctx_train) {
-        LLAMA_LOG_WARN("%s: n_ctx_pre_seq (%u) > n_ctx_train (%u) -- possible training context overflow\n",
+        LLAMA_LOG_WARN("%s: n_ctx_per_seq (%u) > n_ctx_train (%u) -- possible training context overflow\n",
                 __func__, n_ctx_per_seq, hparams.n_ctx_train);
     }

@@ -469,8 +469,7 @@ ggml_tensor * llama_context::build_rope_shift(
         ggml_tensor * shift,
         ggml_tensor * factors,
               float   freq_base,
-              float   freq_scale,
-        ggml_backend_buffer * bbuf) const {
+              float   freq_scale) const {
     const auto & n_ctx_orig = cparams.n_ctx_orig_yarn;

     const auto & yarn_ext_factor = cparams.yarn_ext_factor;
@@ -492,17 +491,7 @@ ggml_tensor * llama_context::build_rope_shift(
         // dequantize to f32 -> RoPE -> quantize back
         tmp = ggml_cast(ctx0, cur, GGML_TYPE_F32);

-        if (bbuf) {
-            for (const auto & backend : backends) {
-                // Figure out which backend KV cache belongs to
-                if (ggml_backend_supports_buft(backend.get(), ggml_backend_buffer_get_type(bbuf))) {
-                    ggml_backend_sched_set_tensor_backend(sched.get(), tmp, backend.get());
-                    break;
-                }
-            }
-        }
-
-        tmp = ggml_rope_ext_inplace(ctx0, tmp,
+        tmp = ggml_rope_ext(ctx0, tmp,
                 shift, factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
                 yarn_ext_factor, yarn_attn_factor, yarn_beta_fast, yarn_beta_slow);

@@ -582,7 +571,7 @@ llm_graph_result_ptr llama_context::build_kv_self_shift(
                 ggml_row_size(kv_self->k_l[il]->type, n_embd_k_gqa),
                 0);

-        ggml_tensor * cur = build_rope_shift(ctx0, k, inp->k_shift, rope_factors, freq_base_l, freq_scale_l, kv_self->k_l[il]->buffer);
+        ggml_tensor * cur = build_rope_shift(ctx0, k, inp->k_shift, rope_factors, freq_base_l, freq_scale_l);

         ggml_build_forward_expand(gf, cur);
     }
@@ -1547,8 +1536,6 @@ int32_t llama_context::output_reserve(int32_t n_outputs) {
     // set all ids as invalid (negative)
     std::fill(output_ids.begin(), output_ids.end(), -1);

-    ggml_backend_buffer_clear(buf_output.get(), 0);
-
     this->n_outputs     = 0;
     this->n_outputs_max = n_outputs_max;

examples/talk-llama/llama-context.h

+1-2
@@ -170,8 +170,7 @@ struct llama_context {
             ggml_tensor * shift,
             ggml_tensor * factors,
                   float   freq_base,
-                  float   freq_scale,
-            ggml_backend_buffer * bbuf) const;
+                  float   freq_scale) const;

     llm_graph_result_ptr build_kv_self_shift(
             ggml_context * ctx0,

examples/talk-llama/llama-graph.cpp

+42-16
@@ -55,7 +55,21 @@ void llm_graph_input_pos::set_input(const llama_ubatch * ubatch) {
     if (ubatch->pos && pos) {
         const int64_t n_tokens = ubatch->n_tokens;

-        ggml_backend_tensor_set(pos, ubatch->pos, 0, n_tokens*n_pos_per_token*ggml_element_size(pos));
+        if (ubatch->token && n_pos_per_embd == 4) {
+            // in case we're using M-RoPE with text tokens, convert the 1D positions to 4D
+            // the 3 first dims are the same, and 4th dim is all 0
+            std::vector<llama_pos> pos_data(n_tokens*n_pos_per_embd);
+            // copy the first dimension
+            for (int i = 0; i < n_tokens; ++i) {
+                pos_data[               i] = ubatch->pos[i];
+                pos_data[    n_tokens + i] = ubatch->pos[i];
+                pos_data[2 * n_tokens + i] = ubatch->pos[i];
+                pos_data[3 * n_tokens + i] = 0; // 4th dim is 0
+            }
+            ggml_backend_tensor_set(pos, pos_data.data(), 0, pos_data.size()*ggml_element_size(pos));
+        } else {
+            ggml_backend_tensor_set(pos, ubatch->pos, 0, n_tokens*n_pos_per_embd*ggml_element_size(pos));
+        }
     }
 }

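The 4D position buffer built above is laid out section by section: all n_tokens values for dimension 0 first, then dimension 1, and so on, with the 4th section zeroed. A small sketch with plain std::vector (no ggml) showing the resulting layout for three text tokens:

#include <cstdint>
#include <cstdio>
#include <vector>

using llama_pos = int32_t;

int main() {
    const int n_tokens       = 3;
    const int n_pos_per_embd = 4;                 // M-RoPE: 4 position dims per embedding
    const std::vector<llama_pos> pos = {0, 1, 2}; // 1D positions of the text tokens

    // Same expansion as llm_graph_input_pos::set_input above:
    // the first 3 sections repeat the 1D position, the 4th section is 0.
    std::vector<llama_pos> pos_data(n_tokens*n_pos_per_embd);
    for (int i = 0; i < n_tokens; ++i) {
        pos_data[               i] = pos[i];
        pos_data[    n_tokens + i] = pos[i];
        pos_data[2 * n_tokens + i] = pos[i];
        pos_data[3 * n_tokens + i] = 0;
    }

    // Buffer contents: 0 1 2 | 0 1 2 | 0 1 2 | 0 0 0
    for (size_t j = 0; j < pos_data.size(); ++j) {
        std::printf("%d ", pos_data[j]);
    }
    std::printf("\n");
    return 0;
}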
@@ -71,7 +85,7 @@ void llm_graph_input_attn_temp::set_input(const llama_ubatch * ubatch) {
                 ) * f_attn_temp_scale + 1.0;
         }

-        ggml_backend_tensor_set(attn_scale, attn_scale_data.data(), 0, n_tokens*n_pos_per_token*ggml_element_size(attn_scale));
+        ggml_backend_tensor_set(attn_scale, attn_scale_data.data(), 0, n_tokens*ggml_element_size(attn_scale));
     }
 }

@@ -592,7 +606,7 @@ llm_graph_context::llm_graph_context(const llm_graph_params & params) :
     res (std::make_unique<llm_graph_result>()) {
 }

-int64_t llm_graph_context::n_pos_per_token() const {
+int64_t llm_graph_context::n_pos_per_embd() const {
     return arch == LLM_ARCH_QWEN2VL ? 4 : 1;
 }

@@ -803,6 +817,10 @@ ggml_tensor * llm_graph_context::build_ffn(

     if (down) {
         cur = build_lora_mm(down, cur);
+        if (arch == LLM_ARCH_GLM4) {
+            // GLM4 seems to have numerical issues with half-precision accumulators
+            ggml_mul_mat_set_prec(cur, GGML_PREC_F32);
+        }
     }

     if (down_b) {
@@ -910,28 +928,35 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
     ggml_tensor * up = build_lora_mm_id(up_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
     cb(up, "ffn_moe_up", il);

-    ggml_tensor * gate = build_lora_mm_id(gate_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
-    cb(gate, "ffn_moe_gate", il);
+    ggml_tensor * experts = nullptr;
+    if (gate_exps) {
+        cur = build_lora_mm_id(gate_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
+        cb(cur, "ffn_moe_gate", il);
+    } else {
+        cur = up;
+    }

     switch (type_op) {
         case LLM_FFN_SILU:
             {
-                gate = ggml_silu(ctx0, gate);
-                cb(gate, "ffn_moe_silu", il);
+                cur = ggml_silu(ctx0, cur);
+                cb(cur, "ffn_moe_silu", il);
             } break;
         case LLM_FFN_GELU:
             {
-                gate = ggml_gelu(ctx0, gate);
-                cb(gate, "ffn_moe_gelu", il);
+                cur = ggml_gelu(ctx0, cur);
+                cb(cur, "ffn_moe_gelu", il);
             } break;
         default:
             GGML_ABORT("fatal error");
     }

-    ggml_tensor * par = ggml_mul(ctx0, up, gate); // [n_ff, n_expert_used, n_tokens]
-    cb(par, "ffn_moe_gate_par", il);
+    if (gate_exps) {
+        cur = ggml_mul(ctx0, cur, up); // [n_ff, n_expert_used, n_tokens]
+        cb(cur, "ffn_moe_gate_par", il);
+    }

-    ggml_tensor * experts = build_lora_mm_id(down_exps, par, selected_experts); // [n_embd, n_expert_used, n_tokens]
+    experts = build_lora_mm_id(down_exps, cur, selected_experts); // [n_embd, n_expert_used, n_tokens]
     cb(experts, "ffn_moe_down", il);

     if (!weight_before_ffn) {
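
With this change build_moe_ffn also handles experts without a gate projection (gate_exps == nullptr), which the new nomic-bert-moe graph needs: the activation is applied directly to the up projection and the element-wise multiply is skipped. A scalar sketch of the two per-element paths, for illustration only (the real code works on ggml tensors, and the activation depends on the model):

#include <cmath>
#include <cstdio>

static float silu(float x) { return x / (1.0f + std::exp(-x)); }

// Gated experts:                              down( act(gate(x)) * up(x) )
static float gated_act(float gate_x, float up_x) { return silu(gate_x) * up_x; }

// Gate-less experts (gate_exps == nullptr):   down( act(up(x)) )
static float ungated_act(float up_x)             { return silu(up_x); }

int main() {
    std::printf("gated:   %f\n", gated_act(0.5f, 2.0f));
    std::printf("ungated: %f\n", ungated_act(2.0f));
    return 0;
}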
@@ -1014,11 +1039,11 @@ ggml_tensor * llm_graph_context::build_inp_embd(ggml_tensor * tok_embd) const {
 }

 ggml_tensor * llm_graph_context::build_inp_pos() const {
-    auto inp = std::make_unique<llm_graph_input_pos>(n_pos_per_token());
+    auto inp = std::make_unique<llm_graph_input_pos>(n_pos_per_embd());

     auto & cur = inp->pos;

-    cur = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens*n_pos_per_token());
+    cur = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens*n_pos_per_embd());
     ggml_set_input(cur);

     res->add_input(std::move(inp));
@@ -1027,11 +1052,12 @@ ggml_tensor * llm_graph_context::build_inp_pos() const {
 }

 ggml_tensor * llm_graph_context::build_inp_attn_scale() const {
-    auto inp = std::make_unique<llm_graph_input_attn_temp>(n_pos_per_token(), hparams.n_attn_temp_floor_scale, hparams.f_attn_temp_scale);
+    auto inp = std::make_unique<llm_graph_input_attn_temp>(hparams.n_attn_temp_floor_scale, hparams.f_attn_temp_scale);

     auto & cur = inp->attn_scale;

-    cur = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 1, 1, n_tokens*n_pos_per_token());
+    // this need to be 1x1xN for broadcasting
+    cur = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 1, 1, n_tokens);
     ggml_set_input(cur);

     res->add_input(std::move(inp));

examples/talk-llama/llama-graph.h

+5-7
@@ -90,29 +90,27 @@ class llm_graph_input_embd : public llm_graph_input_i {

 class llm_graph_input_pos : public llm_graph_input_i {
 public:
-    llm_graph_input_pos(int64_t n_pos_per_token) : n_pos_per_token(n_pos_per_token) {}
+    llm_graph_input_pos(int64_t n_pos_per_embd) : n_pos_per_embd(n_pos_per_embd) {}
     virtual ~llm_graph_input_pos() = default;

     void set_input(const llama_ubatch * ubatch) override;

     ggml_tensor * pos = nullptr; // I32 [n_batch]

-    const int64_t n_pos_per_token = 1;
+    const int64_t n_pos_per_embd = 1;
 };

 // temperature tuning, used by llama4
 class llm_graph_input_attn_temp : public llm_graph_input_i {
 public:
-    llm_graph_input_attn_temp(int64_t n_pos_per_token, uint32_t n_attn_temp_floor_scale, float f_attn_temp_scale)
-        : n_pos_per_token(n_pos_per_token), n_attn_temp_floor_scale(n_attn_temp_floor_scale), f_attn_temp_scale(f_attn_temp_scale) {}
+    llm_graph_input_attn_temp(uint32_t n_attn_temp_floor_scale, float f_attn_temp_scale)
+        : n_attn_temp_floor_scale(n_attn_temp_floor_scale), f_attn_temp_scale(f_attn_temp_scale) {}
     virtual ~llm_graph_input_attn_temp() = default;

     void set_input(const llama_ubatch * ubatch) override;

     ggml_tensor * attn_scale = nullptr; // F32 [n_batch]

-    const int64_t n_pos_per_token = 1;
-
     const uint32_t n_attn_temp_floor_scale;
     const float    f_attn_temp_scale;
 };
@@ -419,7 +417,7 @@ struct llm_graph_context {

     llm_graph_context(const llm_graph_params & params);

-    int64_t n_pos_per_token() const;
+    int64_t n_pos_per_embd() const;

     void cb(ggml_tensor * cur, const char * name, int il) const;
425423

examples/talk-llama/llama-hparams.h

+1
@@ -66,6 +66,7 @@ struct llama_hparams {
     float    expert_weights_scale = 0.0;
     bool     expert_weights_norm  = false;
     uint32_t expert_gating_func   = LLAMA_EXPERT_GATING_FUNC_TYPE_NONE;
+    uint32_t moe_every_n_layers   = 0;

     float f_norm_eps;
     float f_norm_rms_eps;

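The new moe_every_n_layers hyperparameter (read from the %s.moe_every_n_layers key registered in llama-arch.cpp above) lets an architecture interleave MoE and dense FFN layers. A hedged sketch of what a per-layer check could look like; the exact condition is decided by the model build code, which is not part of this diff:

#include <cstdint>
#include <cstdio>

int main() {
    const uint32_t moe_every_n_layers = 2; // example value; 0 means "no interleaving"
    const int      n_layer            = 8;

    for (int il = 0; il < n_layer; ++il) {
        // Hypothetical check: treat every n-th layer as an MoE layer.
        // The real parity/offset lives in the loader, not in this commit.
        const bool is_moe = moe_every_n_layers > 0 && (il % moe_every_n_layers) == 1;
        std::printf("layer %d: %s FFN\n", il, is_moe ? "MoE" : "dense");
    }
    return 0;
}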