@@ -1518,6 +1518,12 @@ int create_gpu_instance()
1518
1518
gpu_info.support_fp16_arithmetic = true ;
1519
1519
}
1520
1520
1521
+ if (physicalDeviceProperties.vendorID == 0x5143 && !gpu_info.support_fp16_storage )
1522
+ {
1523
+ // fp16 arithmetic yields wrong results on old adreno drivers, so turn it off when fp16 storage is not supported :(
1524
+ gpu_info.support_fp16_arithmetic = false ;
1525
+ }
1526
+
1521
1527
if (gpu_info.support_cooperative_matrix )
1522
1528
{
1523
1529
// query supported cooperative matrix types and operations
@@ -3354,7 +3360,7 @@ int compile_spirv_module(const char* comp_data, int comp_data_size, const Option
3354
3360
custom_defines.push_back (std::make_pair (" afpmat4" , " mat4" ));
3355
3361
}
3356
3362
3357
- if (opt.use_fp16_arithmetic )
3363
+ if (opt.use_fp16_storage && opt. use_fp16_arithmetic )
3358
3364
{
3359
3365
custom_defines.push_back (std::make_pair (" lfp" , " float16_t" ));
3360
3366
custom_defines.push_back (std::make_pair (" lfpvec4" , " f16vec4" ));
@@ -3384,7 +3390,7 @@ int compile_spirv_module(const char* comp_data, int comp_data_size, const Option
3384
3390
custom_defines.push_back (std::make_pair (" sfp2lfpvec4(v)" , " v" ));
3385
3391
3386
3392
custom_defines.push_back (std::make_pair (" lfp2afp(v)" , " float16_t(v)" ));
3387
- custom_defines.push_back (std::make_pair (" lfp2afpvec4(v)" , " f16vec4(vec4(unpackHalf2x16( v.x),unpackHalf2x16 (v.y) ))" ));
3393
+ custom_defines.push_back (std::make_pair (" lfp2afpvec4(v)" , " f16vec4(unpackFloat2x16( v.x),unpackFloat2x16 (v.y))" ));
3388
3394
}
3389
3395
else if (opt.use_fp16_storage )
3390
3396
{
@@ -3439,20 +3445,20 @@ int compile_spirv_module(const char* comp_data, int comp_data_size, const Option
3439
3445
custom_defines.push_back (std::make_pair (" buffer_ld1(buf,i)" , " float16_t(buf[i])" ));
3440
3446
custom_defines.push_back (std::make_pair (" buffer_st1(buf,i,v)" , " {buf[i]=float(v);}" ));
3441
3447
custom_defines.push_back (std::make_pair (" buffer_cp1(buf,i,sbuf,si)" , " {buf[i]=sbuf[si];}" ));
3442
- custom_defines.push_back (std::make_pair (" buffer_cp1to4(buf,i,sbuf,si4)" , " {buf[i]=uvec2(packHalf2x16(vec2( f16vec2(sbuf[si4.r],sbuf[si4.g]))),packHalf2x16(vec2( f16vec2(sbuf[si4.b],sbuf[si4.a]) )));}" ));
3443
- custom_defines.push_back (std::make_pair (" buffer_cp1to8(buf,i,sbuf,si4,sii4)" , " {buf[i]=uvec4(packHalf2x16(vec2( f16vec2(sbuf[si4.r],sbuf[si4.g]))),packHalf2x16(vec2( f16vec2(sbuf[si4.b],sbuf[si4.a]))),packHalf2x16(vec2( f16vec2(sbuf[sii4.r],sbuf[sii4.g]))),packHalf2x16(vec2( f16vec2(sbuf[sii4.b],sbuf[sii4.a]) )));}" ));
3444
- custom_defines.push_back (std::make_pair (" buffer_ld2(buf,i)" , " f16vec2(unpackHalf2x16( buf[i]) )" ));
3445
- custom_defines.push_back (std::make_pair (" buffer_st2(buf,i,v)" , " {buf[i]=packHalf2x16(vec2(v) )}" ));
3448
+ custom_defines.push_back (std::make_pair (" buffer_cp1to4(buf,i,sbuf,si4)" , " {buf[i]=uvec2(packFloat2x16( f16vec2(sbuf[si4.r],sbuf[si4.g])),packFloat2x16( f16vec2(sbuf[si4.b],sbuf[si4.a])));}" ));
3449
+ custom_defines.push_back (std::make_pair (" buffer_cp1to8(buf,i,sbuf,si4,sii4)" , " {buf[i]=uvec4(packFloat2x16( f16vec2(sbuf[si4.r],sbuf[si4.g])),packFloat2x16( f16vec2(sbuf[si4.b],sbuf[si4.a])),packFloat2x16( f16vec2(sbuf[sii4.r],sbuf[sii4.g])),packFloat2x16( f16vec2(sbuf[sii4.b],sbuf[sii4.a])));}" ));
3450
+ custom_defines.push_back (std::make_pair (" buffer_ld2(buf,i)" , " unpackFloat2x16( buf[i])" ));
3451
+ custom_defines.push_back (std::make_pair (" buffer_st2(buf,i,v)" , " {buf[i]=packFloat2x16(v )}" ));
3446
3452
custom_defines.push_back (std::make_pair (" buffer_cp2(buf,i,sbuf,si)" , " {buf[i]=sbuf[si];}" ));
3447
- custom_defines.push_back (std::make_pair (" buffer_ld4(buf,i)" , " f16vec4(vec4(unpackHalf2x16( buf[i].x),unpackHalf2x16 (buf[i].y) ))" ));
3448
- custom_defines.push_back (std::make_pair (" buffer_st4(buf,i,v)" , " {buf[i]=uvec2(packHalf2x16(vec2( v.rg)),packHalf2x16(vec2( v.ba) ));}" ));
3453
+ custom_defines.push_back (std::make_pair (" buffer_ld4(buf,i)" , " f16vec4(unpackFloat2x16( buf[i].x),unpackFloat2x16 (buf[i].y))" ));
3454
+ custom_defines.push_back (std::make_pair (" buffer_st4(buf,i,v)" , " {buf[i]=uvec2(packFloat2x16( v.rg),packFloat2x16( v.ba));}" ));
3449
3455
custom_defines.push_back (std::make_pair (" buffer_cp4(buf,i,sbuf,si)" , " {buf[i]=sbuf[si];}" ));
3450
- custom_defines.push_back (std::make_pair (" buffer_cp4to1(buf,i4,sbuf,si)" , " {uvec2 _v=sbuf[si]; vec2 _v0=unpackHalf2x16 (_v.x);vec2 _v1=unpackHalf2x16 (_v.y); buf[i4.r]=_v0.r;buf[i4.g]=_v0.g;buf[i4.b]=_v1.r;buf[i4.a]=_v1.g;}" ));
3456
+ custom_defines.push_back (std::make_pair (" buffer_cp4to1(buf,i4,sbuf,si)" , " {uvec2 _v=sbuf[si]; f16vec2 _v0=unpackFloat2x16 (_v.x);f16vec2 _v1=unpackFloat2x16 (_v.y); buf[i4.r]=_v0.r;buf[i4.g]=_v0.g;buf[i4.b]=_v1.r;buf[i4.a]=_v1.g;}" ));
3451
3457
custom_defines.push_back (std::make_pair (" buffer_cp4to8(buf,i,sbuf,si2)" , " {buf[i]=uvec4(sbuf[si2.r],sbuf[si2.g]);}" ));
3452
- custom_defines.push_back (std::make_pair (" buffer_ld8(buf,i)" , " f16mat2x4(f16vec4(vec4(unpackHalf2x16( buf[i].r),unpackHalf2x16 (buf[i].g))) ,f16vec4(vec4(unpackHalf2x16( buf[i].b),unpackHalf2x16 (buf[i].a) )))" ));
3453
- custom_defines.push_back (std::make_pair (" buffer_st8(buf,i,v)" , " {buf[i]=uvec4(uvec2(packHalf2x16(vec2( v[0].rg)),packHalf2x16(vec2( v[0].ba))) ,uvec2(packHalf2x16(vec2( v[1].rg)),packHalf2x16(vec2( v[1].ba) )));}" ));
3458
+ custom_defines.push_back (std::make_pair (" buffer_ld8(buf,i)" , " f16mat2x4(f16vec4(unpackFloat2x16( buf[i].r),unpackFloat2x16 (buf[i].g)),f16vec4(unpackFloat2x16( buf[i].b),unpackFloat2x16 (buf[i].a)))" ));
3459
+ custom_defines.push_back (std::make_pair (" buffer_st8(buf,i,v)" , " {buf[i]=uvec4(uvec2(packFloat2x16( v[0].rg),packFloat2x16( v[0].ba)),uvec2(packFloat2x16( v[1].rg),packFloat2x16( v[1].ba)));}" ));
3454
3460
custom_defines.push_back (std::make_pair (" buffer_cp8(buf,i,sbuf,si)" , " {buf[i]=sbuf[si];}" ));
3455
- custom_defines.push_back (std::make_pair (" buffer_cp8to1(buf,i4,ii4,sbuf,si)" , " {uvec4 _v=sbuf[si]; vec2 _v0=unpackHalf2x16 (_v.r);vec2 _v1=unpackHalf2x16 (_v.g);vec2 _v2=unpackHalf2x16 (_v.b);vec2 _v3=unpackHalf2x16 (_v.a); buf[i4.r]=_v0.r;buf[i4.g]=_v0.g;buf[i4.b]=_v1.r;buf[i4.a]=_v1.g; buf[ii4.r]=_v2.r;buf[ii4.g]=_v2.g;buf[ii4.b]=_v3.r;buf[ii4.a]=_v3.g;}" ));
3461
+ custom_defines.push_back (std::make_pair (" buffer_cp8to1(buf,i4,ii4,sbuf,si)" , " {uvec4 _v=sbuf[si]; f16vec2 _v0=unpackFloat2x16 (_v.r);f16vec2 _v1=unpackFloat2x16 (_v.g);f16vec2 _v2=unpackFloat2x16 (_v.b);f16vec2 _v3=unpackFloat2x16 (_v.a); buf[i4.r]=_v0.r;buf[i4.g]=_v0.g;buf[i4.b]=_v1.r;buf[i4.a]=_v1.g; buf[ii4.r]=_v2.r;buf[ii4.g]=_v2.g;buf[ii4.b]=_v3.r;buf[ii4.a]=_v3.g;}" ));
3456
3462
custom_defines.push_back (std::make_pair (" buffer_cp8to4(buf,i2,sbuf,si)" , " {uvec4 _v=sbuf[si]; buf[i2.r]=_v.rg;buf[i2.g]=_v.ba;}" ));
3457
3463
}
3458
3464
else if (opt.use_fp16_storage )
0 commit comments