diff --git a/src/panfrost/bifrost/ISA.xml b/src/panfrost/bifrost/ISA.xml index 5970f7da675..55fb080ff54 100644 --- a/src/panfrost/bifrost/ISA.xml +++ b/src/panfrost/bifrost/ISA.xml @@ -2429,6 +2429,7 @@ rtz rtna + @@ -3870,6 +3871,7 @@ h0 h1 + @@ -7938,6 +7940,7 @@ rtz rtna + diff --git a/src/panfrost/bifrost/bi_builder.h.py b/src/panfrost/bifrost/bi_builder.h.py index 5ba37818264..a41edb66750 100644 --- a/src/panfrost/bifrost/bi_builder.h.py +++ b/src/panfrost/bifrost/bi_builder.h.py @@ -21,7 +21,7 @@ SKIP = set(["lane", "lane_dest", "lanes", "lanes", "replicate", "swz", "widen", "swap", "neg", "abs", "not", "sign", "extend", "divzero", "clamp", "sem", - "not_result", "skip", "round"]) + "not_result", "skip", "round", "ftz"]) TEMPLATE = """ #ifndef _BI_BUILDER_H_ diff --git a/src/panfrost/bifrost/bi_pack.c b/src/panfrost/bifrost/bi_pack.c index e400adfaf74..5cef648bf5e 100644 --- a/src/panfrost/bifrost/bi_pack.c +++ b/src/panfrost/bifrost/bi_pack.c @@ -57,6 +57,7 @@ bi_pack_header(bi_clause *clause, bi_clause *next_1, bi_clause *next_2) .dependency_slot = clause->scoreboard_id, .message_type = clause->message_type, .next_message_type = next_1 ? next_1->message_type : 0, + .flush_to_zero = clause->ftz ? BIFROST_FTZ_ALWAYS : BIFROST_FTZ_DISABLE }; uint64_t u = 0; diff --git a/src/panfrost/bifrost/bi_schedule.c b/src/panfrost/bifrost/bi_schedule.c index 01bf6299be0..a3e176b0a15 100644 --- a/src/panfrost/bifrost/bi_schedule.c +++ b/src/panfrost/bifrost/bi_schedule.c @@ -107,6 +107,17 @@ struct bi_const_state { unsigned word_idx; }; +enum bi_ftz_state { + /* No flush-to-zero state assigned yet */ + BI_FTZ_STATE_NONE, + + /* Never flush-to-zero */ + BI_FTZ_STATE_DISABLE, + + /* Always flush-to-zero */ + BI_FTZ_STATE_ENABLE, +}; + struct bi_clause_state { /* Has a message-passing instruction already been assigned? 
*/ bool message; @@ -118,6 +129,9 @@ struct bi_clause_state { unsigned tuple_count; struct bi_const_state consts[8]; + + /* Numerical state of the clause */ + enum bi_ftz_state ftz; }; /* Determines messsage type by checking the table and a few special cases. Only @@ -1027,6 +1041,28 @@ bi_write_count(bi_instr *instr, uint64_t live_after_temp) return count; } +/* + * Test if an instruction requires flush-to-zero mode. Currently only supported + * for f16<-->f32 conversions to implement fquantize2f16 + */ +static bool +bi_needs_ftz(bi_instr *I) +{ + return (I->op == BI_OPCODE_F16_TO_F32 || + I->op == BI_OPCODE_V2F32_TO_V2F16) && I->ftz; +} + +/* + * Test if an instruction would be numerically incompatible with the clause. At + * present we only consider flush-to-zero modes. + */ +static bool +bi_numerically_incompatible(struct bi_clause_state *clause, bi_instr *instr) +{ + return (clause->ftz != BI_FTZ_STATE_NONE) && + ((clause->ftz == BI_FTZ_STATE_ENABLE) != bi_needs_ftz(instr)); +} + /* Instruction placement entails two questions: what subset of instructions in * the block can legally be scheduled? and of those which is the best? 
That is, * we seek to maximize a cost function on a subset of the worklist satisfying a @@ -1056,6 +1092,10 @@ bi_instr_schedulable(bi_instr *instr, if (bi_must_not_last(instr) && tuple->last) return false; + /* Numerical properties must be compatible with the clause */ + if (bi_numerically_incompatible(clause, instr)) + return false; + /* Message-passing instructions are not guaranteed write within the * same clause (most likely they will not), so if a later instruction * in the clause accesses the destination, the message-passing @@ -1220,6 +1260,13 @@ bi_pop_instr(struct bi_clause_state *clause, struct bi_tuple_state *tuple, if (bi_tuple_is_new_src(instr, &tuple->reg, s)) tuple->reg.reads[tuple->reg.nr_reads++] = instr->src[s]; } + + /* This could be optimized to allow pairing integer instructions with + * special flush-to-zero instructions, but punting on this until we have + * a workload that cares. + */ + clause->ftz = bi_needs_ftz(instr) ? BI_FTZ_STATE_ENABLE : + BI_FTZ_STATE_DISABLE; } /* Choose the best instruction and pop it off the worklist. 
Returns NULL if no @@ -1865,6 +1912,8 @@ bi_schedule_clause(bi_context *ctx, bi_block *block, struct bi_worklist st, uint clause->next_clause_prefetch = !last || (last->op != BI_OPCODE_JUMP); clause->block = block; + clause->ftz = (clause_state.ftz == BI_FTZ_STATE_ENABLE); + /* We emit in reverse and emitted to the back of the tuples array, so * move it up front for easy indexing */ memmove(clause->tuples, diff --git a/src/panfrost/bifrost/bifrost_compile.c b/src/panfrost/bifrost/bifrost_compile.c index 83507f477a0..871333630f2 100644 --- a/src/panfrost/bifrost/bifrost_compile.c +++ b/src/panfrost/bifrost/bifrost_compile.c @@ -2357,6 +2357,15 @@ bi_emit_alu(bi_builder *b, nir_alu_instr *instr) bi_f16_to_f32_to(b, dst, s0); break; + case nir_op_fquantize2f16: + { + bi_instr *f16 = bi_v2f32_to_v2f16_to(b, bi_temp(b->shader), s0, s0); + bi_instr *f32 = bi_f16_to_f32_to(b, dst, bi_half(f16->dest[0], false)); + + f16->ftz = f32->ftz = true; + break; + } + case nir_op_f2i32: if (src_sz == 32) bi_f32_to_s32_to(b, dst, s0); diff --git a/src/panfrost/bifrost/compiler.h b/src/panfrost/bifrost/compiler.h index 3675965fd15..98ec840c4ca 100644 --- a/src/panfrost/bifrost/compiler.h +++ b/src/panfrost/bifrost/compiler.h @@ -465,6 +465,7 @@ typedef struct { struct { enum bi_special special; /* FADD_RSCALE, FMA_RSCALE */ enum bi_round round; /* FMA, converts, FADD, _RSCALE, etc */ + bool ftz; /* Flush-to-zero for F16_TO_F32, V2F32_TO_V2F16 */ }; struct { @@ -635,6 +636,9 @@ typedef struct { /* Discard helper threads */ bool td; + + /* Should flush-to-zero mode be enabled for this clause? 
*/ + bool ftz; } bi_clause; #define BI_NUM_SLOTS 8 diff --git a/src/panfrost/ci/deqp-panfrost-g52-vk.toml b/src/panfrost/ci/deqp-panfrost-g52-vk.toml index 6858f2460e6..a223c499bc5 100644 --- a/src/panfrost/ci/deqp-panfrost-g52-vk.toml +++ b/src/panfrost/ci/deqp-panfrost-g52-vk.toml @@ -23,5 +23,6 @@ include = [ "dEQP-VK.image.load_store.with_format.*", "dEQP-VK.pipeline.input_assembly.*", "dEQP-VK.pipeline.sampler.view_type.*.format.r*.address_modes.all_mode_clamp_to_border*", + "dEQP-VK.spirv_assembly.instruction.compute.opquantize.*", "dEQP-VK.ssbo.layout.single_basic_type.*", ]