diff --git a/src/intel/isl/isl_tiled_memcpy.c b/src/intel/isl/isl_tiled_memcpy.c index fed08511030..267b7a7147f 100644 --- a/src/intel/isl/isl_tiled_memcpy.c +++ b/src/intel/isl/isl_tiled_memcpy.c @@ -405,6 +405,203 @@ linear_to_ytiled(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3, } } +/** + * Copy texture data from linear to Tile-4 layout. + * + * \copydoc tile_copy_fn + */ +static inline void +linear_to_tile4(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3, + uint32_t y0, uint32_t y3, + char *dst, const char *src, + int32_t src_pitch, + uint32_t swizzle_bit, + isl_mem_copy_fn mem_copy, + isl_mem_copy_fn mem_copy_align16) +{ + /* Tile 4 consists of columns that are 'ytile_span' wide and each 64B tile + * block consists of 4 rows of Y-tile ordered data. + * A 4kB tile contains 8 such 512B blocks. + * + * To calculate the tiled offset, we need to identify: + * Block X and Block Y offset at each 512B block boundary in X and Y + * direction. + * + * A Tile4 has the following layout: + * + * |<------------- 128 B------------------>| + * _________________________________________ + * 512B blk(Blk0)^| 0 | 1 | 2 | 3 | 8 | 9 | 10 | 11 | ^ 512B blk(Blk1) + * (cell 0..7)  v| 4 | 5 | 6 | 7 | 12 | 13 | 14 | 15 | v (cell 8..15) + * ----------------------------------------- + * | 16 | 17 | 18 | 19 | 24 | 25 | 26 | 27 | + * | 20 | 21 | 22 | 23 | 28 | 29 | 30 | 31 | + * ----------------------------------------- + * | 32 | 33 | 34 | 35 | 40 | 41 | 42 | 43 | + * | 36 | 37 | 38 | 39 | 44 | 45 | 46 | 47 | + * ----------------------------------------- + * | 48 | 49 | 50 | 51 | 56 | 57 | 58 | 59 | + * | 52 | 53 | 54 | 55 | 60 | 61 | 62 | 63 | + * ----------------------------------------- + * + * The tile is divided in 512B blocks[Blk0..Blk7], themselves made of 2 + * rows of 256B sub-blocks. + * + * Each sub-block is composed of 4 64B elements[cell(0)-cell(3)] (a cell + * in the figure above). 
+ * + * Each 64B cell represents 4 rows of data [cell(0), cell(1), .., cell(63)]. + * + * + * Block X - Adds 256B to offset when we encounter block boundary in + * X direction. (Ex: Blk 0 --> Blk 1 (BlkX_off = 256)) + * Block Y - Adds 512B to offset when we encounter block boundary in + * Y direction. (Ex: Blk 0 --> Blk 2 (BlkY_off = 512)) + * + * (x / ytile_span) * cacheline_size_B // Byte offset in the X dir of + * the containing 64B block + * x % ytile_span // Byte offset in X dir within a 64B block/cacheline + * + * (y % 4) * 16 // Byte offset of the Y dir within a 64B block/cacheline + * (y / 4) * 256 // Byte offset of the Y dir within 512B block after 1 row + * of 64B blocks/cachelines + * + * The copy destination offset for each range copied is the sum of + * Block X offset 'BlkX_off', Block Y offset 'BlkY_off', X offset 'xo' + * and a Y offset 'yo'. + */ + const uint32_t column_width = ytile_span; + const uint32_t tile4_blkh = 4; + + assert(ytile_span * tile4_blkh == 64); + const uint32_t cacheline_size_B = 64; + + /* Find intermediate Y offsets that are aligned to a 64B element + * (4 rows), so that we can do fully 64B memcpys on those. 
+ */ + uint32_t y1 = MIN2(y3, ALIGN_UP(y0, 4)); + uint32_t y2 = MAX2(y1, ALIGN_DOWN(y3, 4)); + + /* xsb0 and xsb1 are the byte offset within a 256B sub block for x0 and x1 */ + uint32_t xsb0 = (x0 % ytile_span) + (x0 / ytile_span) * cacheline_size_B; + uint32_t xsb1 = (x1 % ytile_span) + (x1 / ytile_span) * cacheline_size_B; + + uint32_t Blkxsb0_off = ALIGN_DOWN(xsb0, 256); + uint32_t Blky0_off = (y0 / 8) * 512; + + uint32_t BlkX_off, BlkY_off; + + uint32_t x, yo, Y0, Y2; + + /* Y0 determines the initial byte offset in the Y direction */ + Y0 = (y0 / 4) * 256 + (y0 % 4) * ytile_span; + + /* Y2 determines the byte offset required for reaching y2 if y2 doesn't map + * exactly to 512B block boundary + */ + Y2 = y2 * 4 * column_width; + + src += (ptrdiff_t)y0 * src_pitch; + + /* To maximize memcpy speed, we do the copy in 3 parts : + * - copy the first lines that are not aligned to the 64B cell's height (4 rows) + * - copy the lines that are aligned to 64B cell's height + * - copy the remaining lines not making up for a full 64B cell's height + */ + if (y0 != y1) { + for (yo = Y0; yo < Y0 + (y1 - y0) * column_width; yo += column_width) { + uint32_t xo = xsb1; + + if (x0 != x1) + mem_copy(dst + (Blky0_off + Blkxsb0_off) + (xsb0 + yo), src + x0, x1 - x0); + + for (x = x1; x < x2; x += ytile_span) { + BlkX_off = ALIGN_DOWN(xo, 256); + + mem_copy_align16(dst + (Blky0_off + BlkX_off) + (xo + yo), src + x, ytile_span); + xo += cacheline_size_B; + } + + if (x3 != x2) { + BlkX_off = ALIGN_DOWN(xo, 256); + mem_copy_align16(dst + (Blky0_off + BlkX_off) + (xo + yo), src + x2, x3 - x2); + } + + src += src_pitch; + } + } + + for (yo = y1 * 4 * column_width; yo < y2 * 4 * column_width; yo += 16 * column_width) { + uint32_t xo = xsb1; + BlkY_off = ALIGN_DOWN(yo, 512); + + if (x0 != x1) { + mem_copy(dst + (BlkY_off + Blkxsb0_off) + (xsb0 + yo + 0 * column_width), + src + x0 + 0 * src_pitch, x1 - x0); + mem_copy(dst + (BlkY_off + Blkxsb0_off) + (xsb0 + yo + 1 * column_width), + src + 
x0 + 1 * src_pitch, x1 - x0); + mem_copy(dst + (BlkY_off + Blkxsb0_off) + (xsb0 + yo + 2 * column_width), + src + x0 + 2 * src_pitch, x1 - x0); + mem_copy(dst + (BlkY_off + Blkxsb0_off) + (xsb0 + yo + 3 * column_width), + src + x0 + 3 * src_pitch, x1 - x0); + } + + for (x = x1; x < x2; x += ytile_span) { + BlkX_off = ALIGN_DOWN(xo, 256); + + mem_copy_align16(dst + (BlkY_off + BlkX_off) + (xo + yo+ 0 * column_width), + src + x + 0 * src_pitch, ytile_span); + mem_copy_align16(dst + (BlkY_off + BlkX_off) + (xo + yo + 1 * column_width), + src + x + 1 * src_pitch, ytile_span); + mem_copy_align16(dst + (BlkY_off + BlkX_off) + (xo + yo + 2 * column_width), + src + x + 2 * src_pitch, ytile_span); + mem_copy_align16(dst + (BlkY_off + BlkX_off) + (xo + yo + 3 * column_width), + src + x + 3 * src_pitch, ytile_span); + + xo += cacheline_size_B; + } + + if (x2 != x3) { + BlkX_off = ALIGN_DOWN(xo, 256); + + mem_copy(dst + (BlkY_off + BlkX_off) + (xo + yo + 0 * column_width), + src + x2 + 0 * src_pitch, x3 - x2); + mem_copy(dst + (BlkY_off + BlkX_off) + (xo + yo + 1 * column_width), + src + x2 + 1 * src_pitch, x3 - x2); + mem_copy(dst + (BlkY_off + BlkX_off) + (xo + yo + 2 * column_width), + src + x2 + 2 * src_pitch, x3 - x2); + mem_copy(dst + (BlkY_off + BlkX_off) + (xo + yo + 3 * column_width), + src + x2 + 3 * src_pitch, x3 - x2); + } + + src += 4 * src_pitch; + } + + if (y2 != y3) { + for (yo = Y2; yo < Y2 + (y3 - y2) * column_width; yo += column_width) { + uint32_t xo = xsb1; + BlkY_off = ALIGN_DOWN(yo, 512); + + if (x0 != x1) + mem_copy(dst + (BlkY_off + Blkxsb0_off) + (xsb0 + yo), src + x0, x1 - x0); + + for (x = x1; x < x2; x += ytile_span) { + BlkX_off = ALIGN_DOWN(xo, 256); + + mem_copy_align16(dst + (BlkY_off + BlkX_off) + (xo + yo), src + x, ytile_span); + xo += cacheline_size_B; + } + + if (x3 != x2) { + BlkX_off = ALIGN_DOWN(xo, 256); + mem_copy_align16(dst + (BlkY_off + BlkX_off) + (xo + yo), src + x2, x3 - x2); + } + + src += src_pitch; + } + } +} + /** * Copy 
texture data from X tile layout to linear. * @@ -704,6 +901,49 @@ linear_to_ytiled_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3, dst, src, src_pitch, swizzle_bit, mem_copy, mem_copy); } +/** + * Copy texture data from linear to tile 4 layout, faster. + * + * Same as \ref linear_to_tile4 but faster, because it passes constant + * parameters for common cases, allowing the compiler to inline code + * optimized for those cases. + * + * \copydoc tile_copy_fn + */ +static FLATTEN void +linear_to_tile4_faster(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3, + uint32_t y0, uint32_t y1, + char *dst, const char *src, + int32_t src_pitch, + uint32_t swizzle_bit, + isl_memcpy_type copy_type) +{ + isl_mem_copy_fn mem_copy = choose_copy_function(copy_type); + assert(swizzle_bit == 0); + + if (x0 == 0 && x3 == ytile_width && y0 == 0 && y1 == ytile_height) { + if (mem_copy == memcpy) + return linear_to_tile4(0, 0, ytile_width, ytile_width, 0, ytile_height, + dst, src, src_pitch, swizzle_bit, memcpy, memcpy); + else if (mem_copy == rgba8_copy) + return linear_to_tile4(0, 0, ytile_width, ytile_width, 0, ytile_height, + dst, src, src_pitch, swizzle_bit, + rgba8_copy, rgba8_copy_aligned_dst); + else + unreachable("not reached"); + } else { + if (mem_copy == memcpy) + return linear_to_tile4(x0, x1, x2, x3, y0, y1, + dst, src, src_pitch, swizzle_bit, memcpy, memcpy); + else if (mem_copy == rgba8_copy) + return linear_to_tile4(x0, x1, x2, x3, y0, y1, + dst, src, src_pitch, swizzle_bit, + rgba8_copy, rgba8_copy_aligned_dst); + else + unreachable("not reached"); + } +} + /** * Copy texture data from X tile layout to linear, faster. * @@ -853,6 +1093,11 @@ linear_to_tiled(uint32_t xt1, uint32_t xt2, th = ytile_height; span = ytile_span; tile_copy = linear_to_ytiled_faster; + } else if (tiling == ISL_TILING_4) { + tw = ytile_width; + th = ytile_height; + span = ytile_span; + tile_copy = linear_to_tile4_faster; } else { unreachable("unsupported tiling"); }