@@ -55,7 +55,21 @@ void llm_graph_input_pos::set_input(const llama_ubatch * ubatch) {
     if (ubatch->pos && pos) {
         const int64_t n_tokens = ubatch->n_tokens;
 
-        ggml_backend_tensor_set(pos, ubatch->pos, 0, n_tokens*n_pos_per_token*ggml_element_size(pos));
+        if (ubatch->token && n_pos_per_embd == 4) {
+            // in case we're using M-RoPE with text tokens, convert the 1D positions to 4D
+            // the first 3 dims are the same, and the 4th dim is all 0
+            std::vector<llama_pos> pos_data(n_tokens*n_pos_per_embd);
+            // copy the first dimension
+            for (int i = 0; i < n_tokens; ++i) {
+                pos_data[               i] = ubatch->pos[i];
+                pos_data[    n_tokens + i] = ubatch->pos[i];
+                pos_data[2 * n_tokens + i] = ubatch->pos[i];
+                pos_data[3 * n_tokens + i] = 0; // 4th dim is 0
+            }
+            ggml_backend_tensor_set(pos, pos_data.data(), 0, pos_data.size()*ggml_element_size(pos));
+        } else {
+            ggml_backend_tensor_set(pos, ubatch->pos, 0, n_tokens*n_pos_per_embd*ggml_element_size(pos));
+        }
     }
 }
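Note on the layout above: the expanded buffer is section-major, not interleaved per token — all `n_tokens` values of the first position section come first, then the second, and so on. A minimal standalone sketch of the same expansion (the helper name `expand_pos_mrope` is hypothetical, and plain `int32_t` stands in for `llama_pos`):

```cpp
#include <cstdint>
#include <vector>

// Expand 1D text positions to the section-major 4D layout:
// [sec0: n_tokens][sec1: n_tokens][sec2: n_tokens][sec3: all zeros]
static std::vector<int32_t> expand_pos_mrope(const int32_t * pos, int n_tokens) {
    std::vector<int32_t> out(4 * n_tokens, 0);
    for (int i = 0; i < n_tokens; ++i) {
        out[0 * n_tokens + i] = pos[i]; // first section
        out[1 * n_tokens + i] = pos[i]; // second section, same for text tokens
        out[2 * n_tokens + i] = pos[i]; // third section, same for text tokens
        // out[3 * n_tokens + i] is left 0 by the vector constructor
    }
    return out;
}
```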
@@ -71,7 +85,7 @@ void llm_graph_input_attn_temp::set_input(const llama_ubatch * ubatch) {
             ) * f_attn_temp_scale + 1.0;
         }
 
-        ggml_backend_tensor_set(attn_scale, attn_scale_data.data(), 0, n_tokens*n_pos_per_token*ggml_element_size(attn_scale));
+        ggml_backend_tensor_set(attn_scale, attn_scale_data.data(), 0, n_tokens*ggml_element_size(attn_scale));
     }
 }
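The buffer holds exactly one scale per token, so the stale `n_pos_per_token` factor drops out of the size. For reference, a hedged sketch of the per-token value the loop above ends with (only the `) * f_attn_temp_scale + 1.0;` tail is visible in this hunk, so the surrounding expression is an assumption):

```cpp
#include <cmath>
#include <cstdint>
#include <vector>

// Assumed shape of the attention-temperature scale, one value per token:
// scale_i = log(floor((pos_i + 1) / n_attn_temp_floor_scale) + 1) * f_attn_temp_scale + 1
static std::vector<float> attn_temp_scales(const int32_t * pos, int64_t n_tokens,
                                           uint32_t n_attn_temp_floor_scale, float f_attn_temp_scale) {
    std::vector<float> scales(n_tokens);
    for (int64_t i = 0; i < n_tokens; ++i) {
        scales[i] = std::log(std::floor((pos[i] + 1.0f) / n_attn_temp_floor_scale) + 1.0f)
            * f_attn_temp_scale + 1.0f;
    }
    return scales;
}
```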
@@ -592,7 +606,7 @@ llm_graph_context::llm_graph_context(const llm_graph_params & params) :
     res (std::make_unique<llm_graph_result>()) {
 }
 
-int64_t llm_graph_context::n_pos_per_token() const {
+int64_t llm_graph_context::n_pos_per_embd() const {
     return arch == LLM_ARCH_QWEN2VL ? 4 : 1;
 }
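The rename reflects what the value actually counts: position components per embedding, which is 4 only for Qwen2-VL's M-RoPE. A usage sketch (hypothetical caller):

```cpp
// sizing the flat position buffer for a batch:
// Qwen2-VL (M-RoPE): 4 components per embedding -> 4*n_tokens entries
// everything else:   1 scalar position          ->   n_tokens entries
const int64_t n_pos = n_tokens * n_pos_per_embd();
```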
@@ -803,6 +817,10 @@ ggml_tensor * llm_graph_context::build_ffn(
 
     if (down) {
         cur = build_lora_mm(down, cur);
+        if (arch == LLM_ARCH_GLM4) {
+            // GLM4 seems to have numerical issues with half-precision accumulators
+            ggml_mul_mat_set_prec(cur, GGML_PREC_F32);
+        }
     }
 
     if (down_b) {
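`ggml_mul_mat_set_prec` tags the result tensor of a matmul so backends use F32 accumulation for that one operation. A minimal sketch outside `build_ffn`, assuming an existing `ggml_context * ctx` and sizes `n_in`, `n_out`, `n_tokens`:

```cpp
// weights in F16, activations in F32 (ggml's first-dim-contiguous convention)
ggml_tensor * w = ggml_new_tensor_2d(ctx, GGML_TYPE_F16, n_in, n_out);
ggml_tensor * x = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_in, n_tokens);

ggml_tensor * y = ggml_mul_mat(ctx, w, x); // y: [n_out, n_tokens]
ggml_mul_mat_set_prec(y, GGML_PREC_F32);   // request F32 accumulation for this matmul only
```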
@@ -910,28 +928,35 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
     ggml_tensor * up = build_lora_mm_id(up_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
     cb(up, "ffn_moe_up", il);
 
-    ggml_tensor * gate = build_lora_mm_id(gate_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
-    cb(gate, "ffn_moe_gate", il);
+    ggml_tensor * experts = nullptr;
+    if (gate_exps) {
+        cur = build_lora_mm_id(gate_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
+        cb(cur, "ffn_moe_gate", il);
+    } else {
+        cur = up;
+    }
 
     switch (type_op) {
         case LLM_FFN_SILU:
             {
-                gate = ggml_silu(ctx0, gate);
-                cb(gate, "ffn_moe_silu", il);
+                cur = ggml_silu(ctx0, cur);
+                cb(cur, "ffn_moe_silu", il);
             } break;
         case LLM_FFN_GELU:
             {
-                gate = ggml_gelu(ctx0, gate);
-                cb(gate, "ffn_moe_gelu", il);
+                cur = ggml_gelu(ctx0, cur);
+                cb(cur, "ffn_moe_gelu", il);
             } break;
         default:
             GGML_ABORT("fatal error");
     }
 
-    ggml_tensor * par = ggml_mul(ctx0, up, gate); // [n_ff, n_expert_used, n_tokens]
-    cb(par, "ffn_moe_gate_par", il);
+    if (gate_exps) {
+        cur = ggml_mul(ctx0, cur, up); // [n_ff, n_expert_used, n_tokens]
+        cb(cur, "ffn_moe_gate_par", il);
+    }
 
-    ggml_tensor * experts = build_lora_mm_id(down_exps, par, selected_experts); // [n_embd, n_expert_used, n_tokens]
+    experts = build_lora_mm_id(down_exps, cur, selected_experts); // [n_embd, n_expert_used, n_tokens]
     cb(experts, "ffn_moe_down", il);
 
     if (!weight_before_ffn) {
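Net effect of the reshuffle: with `gate_exps` present the expert FFN stays gated, `down(act(gate(x)) * up(x))`; without it, it degrades to `down(act(up(x)))`, with `cur = up` letting the activation apply directly to the up-projection. A toy scalar version of the two paths (hypothetical, SILU only):

```cpp
#include <cmath>

static float silu(float x) { return x / (1.0f + std::exp(-x)); }

// w_gate == nullptr models the gate-less expert (gate_exps == nullptr above)
static float moe_expert_toy(float x, float w_up, float w_down, const float * w_gate) {
    const float up = w_up * x;             // ffn_moe_up
    float cur = w_gate ? *w_gate * x : up; // mirrors `cur = up;` in the gate-less branch
    cur = silu(cur);                       // ffn_moe_silu
    if (w_gate) {
        cur *= up;                         // ffn_moe_gate_par
    }
    return w_down * cur;                   // ffn_moe_down
}
```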
@@ -1014,11 +1039,11 @@ ggml_tensor * llm_graph_context::build_inp_embd(ggml_tensor * tok_embd) const {
 }
 
 ggml_tensor * llm_graph_context::build_inp_pos() const {
-    auto inp = std::make_unique<llm_graph_input_pos>(n_pos_per_token());
+    auto inp = std::make_unique<llm_graph_input_pos>(n_pos_per_embd());
 
     auto & cur = inp->pos;
 
-    cur = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens*n_pos_per_token());
+    cur = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens*n_pos_per_embd());
     ggml_set_input(cur);
 
     res->add_input(std::move(inp));
@@ -1027,11 +1052,12 @@ ggml_tensor * llm_graph_context::build_inp_pos() const {
 }
 
 ggml_tensor * llm_graph_context::build_inp_attn_scale() const {
-    auto inp = std::make_unique<llm_graph_input_attn_temp>(n_pos_per_token(), hparams.n_attn_temp_floor_scale, hparams.f_attn_temp_scale);
+    auto inp = std::make_unique<llm_graph_input_attn_temp>(hparams.n_attn_temp_floor_scale, hparams.f_attn_temp_scale);
 
     auto & cur = inp->attn_scale;
 
-    cur = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 1, 1, n_tokens*n_pos_per_token());
+    // this needs to be 1x1xN for broadcasting
+    cur = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 1, 1, n_tokens);
     ggml_set_input(cur);
 
     res->add_input(std::move(inp));
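The `1x1xN` shape is what lets `ggml_mul` broadcast the per-token scale across the head and embedding dimensions. A minimal sketch, assuming an existing `ggml_context * ctx`:

```cpp
// one scale per token, size-1 leading dims
ggml_tensor * attn_scale = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 1, 1, n_tokens);

// e.g. queries shaped [n_embd_head, n_head, n_tokens]
ggml_tensor * q = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_embd_head, n_head, n_tokens);

// ggml_mul broadcasts the size-1 dims of the second operand across the first
ggml_tensor * q_scaled = ggml_mul(ctx, q, attn_scale);
```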