pan/bi: Implement fquantize2f16
Implement as f2f32(f2f16(x)) with the conversions in flush-to-zero mode. Accessing flush-to-zero mode on Bifrost is nontrivial: it is specified per-clause, rather than per-instruction. I've opted to pipe support for ftz clauses through the scheduler. This solution has two nice properties: * It uses the native hardware for flushing subnormals, avoiding extra lowering. * It's "smart" about scheduling around FTZ requirements, meaning we get good code generated even for a shader that e.g. quantizes a vector. With an unrelated scheduler fix, the *V2F32_TO_V2F16/+F16_TO_F32 operation fits in a single tuple, minimizing the overhead of the special FTZ clause. We'll have to do something a bit different for Valhall (FLUSH.f32), but we'll worry about that when we actually have PanVK brought up on Valhall. Fixes dEQP-VK.spirv_assembly.instruction.compute.opquantize.* Signed-off-by: Alyssa Rosenzweig <alyssa@collabora.com> Reviewed-by: Jason Ekstrand <jason.ekstrand@collabora.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/16123>
This commit is contained in:
parent
dfcb2f0699
commit
c9b33fe7dc
7 changed files with 68 additions and 1 deletions
|
|
@ -2429,6 +2429,7 @@
|
|||
<opt>rtz</opt>
|
||||
<opt>rtna</opt>
|
||||
</mod>
|
||||
<mod name="ftz" start="9" size="1" opt="ftz" pseudo="true"/>
|
||||
<derived start="6" size="1">
|
||||
<and>
|
||||
<eq left="abs0" right="#none"/>
|
||||
|
|
@ -3870,6 +3871,7 @@
|
|||
<opt>h0</opt>
|
||||
<opt>h1</opt>
|
||||
</mod>
|
||||
<mod name="ftz" start="9" size="1" opt="ftz" pseudo="true"/>
|
||||
</ins>
|
||||
|
||||
<ins name="+F16_TO_S32">
|
||||
|
|
@ -7938,6 +7940,7 @@
|
|||
<opt>rtz</opt>
|
||||
<opt>rtna</opt>
|
||||
</mod>
|
||||
<mod name="ftz" start="9" size="1" opt="ftz" pseudo="true"/>
|
||||
<derived start="6" size="1">
|
||||
<and>
|
||||
<eq left="abs0" right="#none"/>
|
||||
|
|
|
|||
|
|
@ -21,7 +21,7 @@
|
|||
|
||||
SKIP = set(["lane", "lane_dest", "lanes", "lanes", "replicate", "swz", "widen",
|
||||
"swap", "neg", "abs", "not", "sign", "extend", "divzero", "clamp", "sem",
|
||||
"not_result", "skip", "round"])
|
||||
"not_result", "skip", "round", "ftz"])
|
||||
|
||||
TEMPLATE = """
|
||||
#ifndef _BI_BUILDER_H_
|
||||
|
|
|
|||
|
|
@ -57,6 +57,7 @@ bi_pack_header(bi_clause *clause, bi_clause *next_1, bi_clause *next_2)
|
|||
.dependency_slot = clause->scoreboard_id,
|
||||
.message_type = clause->message_type,
|
||||
.next_message_type = next_1 ? next_1->message_type : 0,
|
||||
.flush_to_zero = clause->ftz ? BIFROST_FTZ_ALWAYS : BIFROST_FTZ_DISABLE
|
||||
};
|
||||
|
||||
uint64_t u = 0;
|
||||
|
|
|
|||
|
|
@ -107,6 +107,17 @@ struct bi_const_state {
|
|||
unsigned word_idx;
|
||||
};
|
||||
|
||||
/*
 * Flush-to-zero state tracked for a clause while it is being scheduled. The
 * enumerator values are spelled out explicitly; they match the implicit
 * numbering (0, 1, 2).
 */
enum bi_ftz_state {
        /* No flush-to-zero state has been assigned to the clause yet */
        BI_FTZ_STATE_NONE = 0,

        /* The clause never flushes to zero */
        BI_FTZ_STATE_DISABLE = 1,

        /* The clause always flushes to zero */
        BI_FTZ_STATE_ENABLE = 2,
};
|
||||
|
||||
struct bi_clause_state {
|
||||
/* Has a message-passing instruction already been assigned? */
|
||||
bool message;
|
||||
|
|
@ -118,6 +129,9 @@ struct bi_clause_state {
|
|||
|
||||
unsigned tuple_count;
|
||||
struct bi_const_state consts[8];
|
||||
|
||||
/* Numerical state of the clause */
|
||||
enum bi_ftz_state ftz;
|
||||
};
|
||||
|
||||
/* Determines message type by checking the table and a few special cases. Only
|
||||
|
|
@ -1027,6 +1041,28 @@ bi_write_count(bi_instr *instr, uint64_t live_after_temp)
|
|||
return count;
|
||||
}
|
||||
|
||||
/*
|
||||
* Test if an instruction required flush-to-zero mode. Currently only supported
|
||||
* for f16<-->f32 conversions to implement fquantize16
|
||||
*/
|
||||
static bool
|
||||
bi_needs_ftz(bi_instr *I)
|
||||
{
|
||||
return (I->op == BI_OPCODE_F16_TO_F32 ||
|
||||
I->op == BI_OPCODE_V2F32_TO_V2F16) && I->ftz;
|
||||
}
|
||||
|
||||
/*
|
||||
* Test if an instruction would be numerically incompatible with the clause. At
|
||||
* present we only consider flush-to-zero modes.
|
||||
*/
|
||||
static bool
|
||||
bi_numerically_incompatible(struct bi_clause_state *clause, bi_instr *instr)
|
||||
{
|
||||
return (clause->ftz != BI_FTZ_STATE_NONE) &&
|
||||
((clause->ftz == BI_FTZ_STATE_ENABLE) != bi_needs_ftz(instr));
|
||||
}
|
||||
|
||||
/* Instruction placement entails two questions: what subset of instructions in
|
||||
* the block can legally be scheduled? and of those which is the best? That is,
|
||||
* we seek to maximize a cost function on a subset of the worklist satisfying a
|
||||
|
|
@ -1056,6 +1092,10 @@ bi_instr_schedulable(bi_instr *instr,
|
|||
if (bi_must_not_last(instr) && tuple->last)
|
||||
return false;
|
||||
|
||||
/* Numerical properties must be compatible with the clause */
|
||||
if (bi_numerically_incompatible(clause, instr))
|
||||
return false;
|
||||
|
||||
/* Message-passing instructions are not guaranteed to write within the
|
||||
* same clause (most likely they will not), so if a later instruction
|
||||
* in the clause accesses the destination, the message-passing
|
||||
|
|
@ -1220,6 +1260,13 @@ bi_pop_instr(struct bi_clause_state *clause, struct bi_tuple_state *tuple,
|
|||
if (bi_tuple_is_new_src(instr, &tuple->reg, s))
|
||||
tuple->reg.reads[tuple->reg.nr_reads++] = instr->src[s];
|
||||
}
|
||||
|
||||
/* This could be optimized to allow pairing integer instructions with
|
||||
* special flush-to-zero instructions, but punting on this until we have
|
||||
* a workload that cares.
|
||||
*/
|
||||
clause->ftz = bi_needs_ftz(instr) ? BI_FTZ_STATE_ENABLE :
|
||||
BI_FTZ_STATE_DISABLE;
|
||||
}
|
||||
|
||||
/* Choose the best instruction and pop it off the worklist. Returns NULL if no
|
||||
|
|
@ -1865,6 +1912,8 @@ bi_schedule_clause(bi_context *ctx, bi_block *block, struct bi_worklist st, uint
|
|||
clause->next_clause_prefetch = !last || (last->op != BI_OPCODE_JUMP);
|
||||
clause->block = block;
|
||||
|
||||
clause->ftz = (clause_state.ftz == BI_FTZ_STATE_ENABLE);
|
||||
|
||||
/* We emit in reverse and emitted to the back of the tuples array, so
|
||||
* move it up front for easy indexing */
|
||||
memmove(clause->tuples,
|
||||
|
|
|
|||
|
|
@ -2357,6 +2357,15 @@ bi_emit_alu(bi_builder *b, nir_alu_instr *instr)
|
|||
bi_f16_to_f32_to(b, dst, s0);
|
||||
break;
|
||||
|
||||
case nir_op_fquantize2f16:
|
||||
{
|
||||
bi_instr *f16 = bi_v2f32_to_v2f16_to(b, bi_temp(b->shader), s0, s0);
|
||||
bi_instr *f32 = bi_f16_to_f32_to(b, dst, bi_half(f16->dest[0], false));
|
||||
|
||||
f16->ftz = f32->ftz = true;
|
||||
break;
|
||||
}
|
||||
|
||||
case nir_op_f2i32:
|
||||
if (src_sz == 32)
|
||||
bi_f32_to_s32_to(b, dst, s0);
|
||||
|
|
|
|||
|
|
@ -465,6 +465,7 @@ typedef struct {
|
|||
struct {
|
||||
enum bi_special special; /* FADD_RSCALE, FMA_RSCALE */
|
||||
enum bi_round round; /* FMA, converts, FADD, _RSCALE, etc */
|
||||
bool ftz; /* Flush-to-zero for F16_TO_F32 */
|
||||
};
|
||||
|
||||
struct {
|
||||
|
|
@ -635,6 +636,9 @@ typedef struct {
|
|||
|
||||
/* Discard helper threads */
|
||||
bool td;
|
||||
|
||||
/* Should flush-to-zero mode be enabled for this clause? */
|
||||
bool ftz;
|
||||
} bi_clause;
|
||||
|
||||
#define BI_NUM_SLOTS 8
|
||||
|
|
|
|||
|
|
@ -23,5 +23,6 @@ include = [
|
|||
"dEQP-VK.image.load_store.with_format.*",
|
||||
"dEQP-VK.pipeline.input_assembly.*",
|
||||
"dEQP-VK.pipeline.sampler.view_type.*.format.r*.address_modes.all_mode_clamp_to_border*",
|
||||
"dEQP-VK.spirv_assembly.instruction.compute.opquantize.*",
|
||||
"dEQP-VK.ssbo.layout.single_basic_type.*",
|
||||
]
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue