From d9f1e5d31b26a0d06ad32fedfd415e9939b092b5 Mon Sep 17 00:00:00 2001 From: Eyck-Alexander Jentzsch Date: Sat, 8 Mar 2025 16:05:45 +0100 Subject: [PATCH] small refactor --- src/vm/vector_functions.hpp | 843 +++++++++++++++--------------------- 1 file changed, 356 insertions(+), 487 deletions(-) diff --git a/src/vm/vector_functions.hpp b/src/vm/vector_functions.hpp index 3180f79..bec7192 100644 --- a/src/vm/vector_functions.hpp +++ b/src/vm/vector_functions.hpp @@ -51,30 +51,44 @@ extern "C" { #error __FILE__ should only be included from vector_functions.h #endif #include + +#ifdef __SIZEOF_INT128__ +template <> struct std::make_signed<__uint128_t> { using type = __int128_t; }; +template <> struct std::make_signed<__int128_t> { using type = __int128_t; }; +#endif + namespace softvector { template struct vreg_view { uint8_t* start; - size_t elem_count; + size_t vlmax; elem_t& operator[](size_t idx) { - assert(idx < elem_count); + assert(idx < vlmax); return *(reinterpret_cast(start) + idx); } }; -template vreg_view get_vreg(uint8_t* V, uint8_t reg_idx, uint16_t elem_count) { - assert(V + elem_count * sizeof(elem_t) <= V + VLEN * RFS / 8); - return {V + VLEN / 8 * reg_idx, elem_count}; +template vreg_view get_vreg(uint8_t* V, uint8_t reg_idx, uint16_t vlmax) { + assert(V + vlmax * sizeof(elem_t) <= V + VLEN * RFS / 8); + return {V + VLEN / 8 * reg_idx, vlmax}; } -template vmask_view read_vmask(uint8_t* V, uint16_t elem_count, uint8_t reg_idx) { +template vmask_view read_vmask(uint8_t* V, uint16_t vlmax, uint8_t reg_idx) { uint8_t* mask_start = V + VLEN / 8 * reg_idx; - assert(mask_start + elem_count / 8 <= V + VLEN * RFS / 8); - return {mask_start, elem_count}; + assert(mask_start + vlmax / 8 <= V + VLEN * RFS / 8); + return {mask_start, vlmax}; } template constexpr elem_t shift_mask() { static_assert(std::numeric_limits::is_integer, "shift_mask only supports integer types"); return std::numeric_limits::digits - 1; } +template constexpr T agnostic_behavior(T val) { +#ifdef AGNOSTIC_ONES + return std::numeric_limits::max(); +#else + return val; +#endif +} + enum FUNCT3 { OPIVV = 0b000, OPFVV = 0b001, @@ -96,6 +110,9 @@ template <> struct twice { using type = __int128_t; }; template <> struct twice { using type = __uint128_t; }; #endif template using twice_t = typename twice::type; // for convenience +template constexpr TO sext(FROM val) { + return static_cast>(static_cast>(val)); +}; template uint64_t vector_load_store(void* core, std::function load_store_fn, uint8_t* V, uint64_t vl, @@ -116,15 +133,14 @@ uint64_t vector_load_store(void* core, std::function(addressed_elem))) return idx; } - } else { - for(size_t s_idx = 0; s_idx < segment_size; s_idx++) { - // vtype.vma(); - } - } - } - for(size_t idx = vl; idx < vlmax; idx++) { - // vtype.vta(); + } else if(vtype.vma()) + for(size_t s_idx = 0; s_idx < segment_size; s_idx++) + vd_view[idx + emul_stride * s_idx] = agnostic_behavior(vd_view[idx + emul_stride * s_idx]); } + if(vtype.vta()) + for(size_t idx = vl; idx < vlmax; idx++) + for(size_t s_idx = 0; s_idx < segment_size; s_idx++) + vd_view[idx + emul_stride * s_idx] = agnostic_behavior(vd_view[idx + emul_stride * s_idx]); return 0; } // eew for index registers, sew for data register @@ -148,15 +164,14 @@ uint64_t vector_load_store_index(void* core, std::function(addressed_elem))) return idx; } - } else { - for(size_t s_idx = 0; s_idx < segment_size; s_idx++) { - // vtype.vma(); - } - } - } - for(size_t idx = vl; idx < vlmax; idx++) { - // vtype.vta(); + } else if(vtype.vma()) + for(size_t 
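/* RVV tail/mask policy note: for masked-off body elements (vma) and for tail
 * elements past vl (vta), the spec allows an agnostic destination element to
 * either keep its old value or be overwritten with all 1s; agnostic_behavior()
 * above models exactly that choice via the AGNOSTIC_ONES compile-time switch. */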
s_idx = 0; s_idx < segment_size; s_idx++) + vd_view[idx + emul_stride * s_idx] = agnostic_behavior(vd_view[idx + emul_stride * s_idx]); } + if(vtype.vta()) + for(size_t idx = vl; idx < vlmax; idx++) + for(size_t s_idx = 0; s_idx < segment_size; s_idx++) + vd_view[idx + emul_stride * s_idx] = agnostic_behavior(vd_view[idx + emul_stride * s_idx]); return 0; } template @@ -170,17 +185,13 @@ std::function get_funct(unsi case 0b000011: // VRSUB return [](dest_elem_t vd, src2_elem_t vs2, src1_elem_t vs1) { return vs1 - vs2; }; case 0b000100: // VMINU - return [](dest_elem_t vd, src2_elem_t vs2, src1_elem_t vs1) { return std::min(vs2, static_cast(vs1)); }; + return [](dest_elem_t vd, src2_elem_t vs2, src1_elem_t vs1) { return std::min(vs2, vs1); }; case 0b000101: // VMIN - return [](dest_elem_t vd, src2_elem_t vs2, src1_elem_t vs1) { - return std::min(static_cast>(vs2), static_cast>(vs1)); - }; + return [](dest_elem_t vd, src2_elem_t vs2, src1_elem_t vs1) { return std::min>(vs2, vs1); }; case 0b000110: // VMAXU - return [](dest_elem_t vd, src2_elem_t vs2, src1_elem_t vs1) { return std::max(vs2, static_cast(vs1)); }; + return [](dest_elem_t vd, src2_elem_t vs2, src1_elem_t vs1) { return std::max(vs2, vs1); }; case 0b000111: // VMAX - return [](dest_elem_t vd, src2_elem_t vs2, src1_elem_t vs1) { - return std::max(static_cast>(vs2), static_cast>(vs1)); - }; + return [](dest_elem_t vd, src2_elem_t vs2, src1_elem_t vs1) { return std::max>(vs2, vs1); }; case 0b001001: // VAND return [](dest_elem_t vd, src2_elem_t vs2, src1_elem_t vs1) { return vs1 & vs2; }; case 0b001010: // VOR @@ -257,15 +268,11 @@ std::function get_funct(unsi }; case 0b100110: // VMULHSU return [](dest_elem_t vd, src2_elem_t vs2, src1_elem_t vs1) { - return (static_cast>>(static_cast>(vs2)) * - static_cast>(vs1)) >> - sizeof(dest_elem_t) * 8; + return (sext>(vs2) * static_cast>(vs1)) >> sizeof(dest_elem_t) * 8; }; case 0b100111: // VMULH return [](dest_elem_t vd, src2_elem_t vs2, src1_elem_t vs1) { - return (static_cast>>(static_cast>(vs2)) * - static_cast>>(static_cast>(vs1))) >> - sizeof(dest_elem_t) * 8; + return (sext>(vs2) * sext>(vs1)) >> sizeof(dest_elem_t) * 8; }; case 0b101001: // VMADD return [](dest_elem_t vd, src2_elem_t vs2, src1_elem_t vs1) { return vs1 * vd + vs2; }; @@ -280,73 +287,46 @@ std::function get_funct(unsi return static_cast(vs2) + static_cast(vs1); }; case 0b110001: // VWADD - return [](dest_elem_t vd, src2_elem_t vs2, src1_elem_t vs1) { - return static_cast>(static_cast>(vs2)) + - static_cast>(static_cast>(vs1)); - }; + return [](dest_elem_t vd, src2_elem_t vs2, src1_elem_t vs1) { return sext(vs2) + sext(vs1); }; case 0b110010: // VWSUBU return [](dest_elem_t vd, src2_elem_t vs2, src1_elem_t vs1) { return static_cast(vs2) - static_cast(vs1); }; case 0b110011: // VWSUB - return [](dest_elem_t vd, src2_elem_t vs2, src1_elem_t vs1) { - return static_cast>(static_cast>(vs2)) - - static_cast>(static_cast>(vs1)); - }; + return [](dest_elem_t vd, src2_elem_t vs2, src1_elem_t vs1) { return sext(vs2) - sext(vs1); }; case 0b110100: // VWADDU.W return [](dest_elem_t vd, src2_elem_t vs2, src1_elem_t vs1) { return static_cast(vs2) + static_cast(vs1); }; case 0b110101: // VWADD.W - return [](dest_elem_t vd, src2_elem_t vs2, src1_elem_t vs1) { - return static_cast>(static_cast>(vs2)) + - static_cast>(static_cast>(vs1)); - }; + return [](dest_elem_t vd, src2_elem_t vs2, src1_elem_t vs1) { return sext(vs2) + sext(vs1); }; case 0b110110: // VWSUBU.W return [](dest_elem_t vd, src2_elem_t vs2, src1_elem_t vs1) { return 
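/* Widening forms (VW*): dest_elem_t is twice the source SEW; the unsigned
 * variants zero-extend their operands with static_cast, while the signed
 * variants sign-extend through the sext<> helper introduced above. */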
static_cast(vs2) - static_cast(vs1); }; case 0b110111: // VWSUB.W - return [](dest_elem_t vd, src2_elem_t vs2, src1_elem_t vs1) { - return static_cast>(static_cast>(vs2)) - - static_cast>(static_cast>(vs1)); - }; + return [](dest_elem_t vd, src2_elem_t vs2, src1_elem_t vs1) { return sext(vs2) - sext(vs1); }; case 0b111000: // VWMULU return [](dest_elem_t vd, src2_elem_t vs2, src1_elem_t vs1) { return (static_cast(vs2) * static_cast(vs1)); }; case 0b111010: // VWMULSU - return [](dest_elem_t vd, src2_elem_t vs2, src1_elem_t vs1) { - return static_cast>( - static_cast>(static_cast>(vs2))) * - static_cast(vs1); - }; + return [](dest_elem_t vd, src2_elem_t vs2, src1_elem_t vs1) { return sext(vs2) * static_cast(vs1); }; case 0b111011: // VWMUL - return [](dest_elem_t vd, src2_elem_t vs2, src1_elem_t vs1) { - return (static_cast>(static_cast>(vs2)) * - static_cast>(static_cast>(vs1))); - }; + return [](dest_elem_t vd, src2_elem_t vs2, src1_elem_t vs1) { return sext(vs2) * sext(vs1); }; case 0b111100: // VWMACCU return [](dest_elem_t vd, src2_elem_t vs2, src1_elem_t vs1) { return static_cast(vs1) * static_cast(vs2) + vd; }; case 0b111101: // VWMACC - return [](dest_elem_t vd, src2_elem_t vs2, src1_elem_t vs1) { - return static_cast>(static_cast>(vs1)) * - static_cast>(static_cast>(vs2)) + - vd; - }; + return [](dest_elem_t vd, src2_elem_t vs2, src1_elem_t vs1) { return sext(vs1) * sext(vs2) + vd; }; case 0b111110: // VWMACCUS return [](dest_elem_t vd, src2_elem_t vs2, src1_elem_t vs1) { - return static_cast(vs1) * - static_cast>(static_cast>(vs2)) + - vd; + return static_cast(vs1) * sext(vs2) + vd; }; case 0b111111: // VWMACCSU return [](dest_elem_t vd, src2_elem_t vs2, src1_elem_t vs1) { - return static_cast>(static_cast>(vs1)) * - static_cast(vs2) + - vd; + return sext(vs1) * static_cast(vs2) + vd; }; default: throw new std::runtime_error("Unknown funct6 in get_funct"); @@ -357,52 +337,40 @@ std::function get_funct(unsi template void vector_vector_op(uint8_t* V, unsigned funct6, unsigned funct3, uint64_t vl, uint64_t vstart, vtype_t vtype, bool vm, unsigned vd, unsigned vs2, unsigned vs1, carry_t carry) { - uint64_t elem_count = VLEN * vtype.lmul() / vtype.sew(); - vmask_view mask_reg = read_vmask(V, elem_count); - auto vs1_view = get_vreg(V, vs1, elem_count); - auto vs2_view = get_vreg(V, vs2, elem_count); - auto vd_view = get_vreg(V, vd, elem_count); + uint64_t vlmax = VLEN * vtype.lmul() / vtype.sew(); + vmask_view mask_reg = read_vmask(V, vlmax); + auto vs1_view = get_vreg(V, vs1, vlmax); + auto vs2_view = get_vreg(V, vs2, vlmax); + auto vd_view = get_vreg(V, vd, vlmax); auto fn = get_funct(funct6, funct3); - // elements w/ index smaller than vstart are in the prestart and get skipped - // body is from vstart to min(elem_count, vl) - if(carry == carry_t::NO_CARRY) { - for(size_t idx = vstart; idx < std::min(elem_count, vl); idx++) { + if(carry == carry_t::NO_CARRY) + for(size_t idx = vstart; idx < vl; idx++) { bool mask_active = vm ? 1 : mask_reg[idx]; - if(mask_active) { + if(mask_active) vd_view[idx] = fn(vd_view[idx], vs2_view[idx], vs1_view[idx]); - } else { - vd_view[idx] = vtype.vma() ? 
vd_view[idx] : vd_view[idx]; - } + else if(vtype.vma()) + vd_view[idx] = agnostic_behavior(vd_view[idx]); } - } else if(carry == carry_t::SUB_CARRY) { - for(size_t idx = vstart; idx < std::min(elem_count, vl); idx++) { + else if(carry == carry_t::SUB_CARRY) + for(size_t idx = vstart; idx < vl; idx++) vd_view[idx] = fn(vd_view[idx], vs2_view[idx], vs1_view[idx]) - mask_reg[idx]; - } - } else { - for(size_t idx = vstart; idx < std::min(elem_count, vl); idx++) { + else + for(size_t idx = vstart; idx < vl; idx++) vd_view[idx] = fn(vd_view[idx], vs2_view[idx], vs1_view[idx]) + mask_reg[idx]; - } - } - // elements w/ index larger than elem_count are in the tail (fractional LMUL) - // elements w/ index larger than vl are in the tail - unsigned maximum_elems = VLEN * vtype.lmul() / (sizeof(dest_elem_t) * 8); - for(size_t idx = std::min(elem_count, vl); idx < maximum_elems; idx++) { - vd_view[idx] = vtype.vta() ? vd_view[idx] : vd_view[idx]; - } - return; + if(vtype.vta()) + for(size_t idx = vl; idx < vlmax; idx++) + vd_view[idx] = agnostic_behavior(vd_view[idx]); } template void vector_imm_op(uint8_t* V, unsigned funct6, unsigned funct3, uint64_t vl, uint64_t vstart, vtype_t vtype, bool vm, unsigned vd, unsigned vs2, typename std::make_signed::type imm, carry_t carry) { - uint64_t elem_count = VLEN * vtype.lmul() / vtype.sew(); - vmask_view mask_reg = read_vmask(V, elem_count); - auto vs2_view = get_vreg(V, vs2, elem_count); - auto vd_view = get_vreg(V, vd, elem_count); + uint64_t vlmax = VLEN * vtype.lmul() / vtype.sew(); + vmask_view mask_reg = read_vmask(V, vlmax); + auto vs2_view = get_vreg(V, vs2, vlmax); + auto vd_view = get_vreg(V, vd, vlmax); auto fn = get_funct(funct6, funct3); - // elements w/ index smaller than vstart are in the prestart and get skipped - // body is from vstart to min(elem_count, vl) - if(carry == carry_t::NO_CARRY) { - for(size_t idx = vstart; idx < std::min(elem_count, vl); idx++) { + if(carry == carry_t::NO_CARRY) + for(size_t idx = vstart; idx < vl; idx++) { bool mask_active = vm ? 1 : mask_reg[idx]; if(mask_active) { vd_view[idx] = fn(vd_view[idx], vs2_view[idx], imm); @@ -410,30 +378,23 @@ void vector_imm_op(uint8_t* V, unsigned funct6, unsigned funct3, uint64_t vl, ui vd_view[idx] = vtype.vma() ? vd_view[idx] : vd_view[idx]; } } - } else if(carry == carry_t::SUB_CARRY) { - for(size_t idx = vstart; idx < std::min(elem_count, vl); idx++) { + else if(carry == carry_t::SUB_CARRY) + for(size_t idx = vstart; idx < vl; idx++) vd_view[idx] = fn(vd_view[idx], vs2_view[idx], imm) - mask_reg[idx]; - } - } else { - for(size_t idx = vstart; idx < std::min(elem_count, vl); idx++) { + else + for(size_t idx = vstart; idx < vl; idx++) vd_view[idx] = fn(vd_view[idx], vs2_view[idx], imm) + mask_reg[idx]; - } - } - // elements w/ index larger than elem_count are in the tail (fractional LMUL) - // elements w/ index larger than vl are in the tail - unsigned maximum_elems = VLEN * vtype.lmul() / (sizeof(dest_elem_t) * 8); - for(size_t idx = std::min(elem_count, vl); idx < maximum_elems; idx++) { - vd_view[idx] = vtype.vta() ? 
vd_view[idx] : vd_view[idx]; - } - return; + if(vtype.vta()) + for(size_t idx = vl; idx < vlmax; idx++) + vd_view[idx] = agnostic_behavior(vd_view[idx]); } template void vector_vector_merge(uint8_t* V, uint64_t vl, uint64_t vstart, vtype_t vtype, bool vm, unsigned vd, unsigned vs2, unsigned vs1) { - uint64_t elem_count = VLEN * vtype.lmul() / vtype.sew(); - vmask_view mask_reg = read_vmask(V, elem_count); - auto vs1_view = get_vreg(V, vs1, elem_count); - auto vs2_view = get_vreg(V, vs2, elem_count); - auto vd_view = get_vreg(V, vd, elem_count); + uint64_t vlmax = VLEN * vtype.lmul() / vtype.sew(); + vmask_view mask_reg = read_vmask(V, vlmax); + auto vs1_view = get_vreg(V, vs1, vlmax); + auto vs2_view = get_vreg(V, vs2, vlmax); + auto vd_view = get_vreg(V, vd, vlmax); for(size_t idx = vstart; idx < vl; idx++) { bool mask_active = vm ? 1 : mask_reg[idx]; if(mask_active) @@ -444,10 +405,10 @@ void vector_vector_merge(uint8_t* V, uint64_t vl, uint64_t vstart, vtype_t vtype } template void vector_imm_merge(uint8_t* V, uint64_t vl, uint64_t vstart, vtype_t vtype, bool vm, unsigned vd, unsigned vs2, uint64_t imm) { - uint64_t elem_count = VLEN * vtype.lmul() / vtype.sew(); - vmask_view mask_reg = read_vmask(V, elem_count); - auto vs2_view = get_vreg(V, vs2, elem_count); - auto vd_view = get_vreg(V, vd, elem_count); + uint64_t vlmax = VLEN * vtype.lmul() / vtype.sew(); + vmask_view mask_reg = read_vmask(V, vlmax); + auto vs2_view = get_vreg(V, vs2, vlmax); + auto vd_view = get_vreg(V, vd, vlmax); for(size_t idx = vstart; idx < vl; idx++) { bool mask_active = vm ? 1 : mask_reg[idx]; if(mask_active) @@ -512,53 +473,41 @@ template std::function get_mask_funct(un template void mask_vector_vector_op(uint8_t* V, unsigned funct6, unsigned funct3, uint64_t vl, uint64_t vstart, vtype_t vtype, bool vm, unsigned vd, unsigned vs2, unsigned vs1) { - uint64_t elem_count = VLEN * vtype.lmul() / vtype.sew(); - vmask_view mask_reg = read_vmask(V, elem_count); - auto vs1_view = get_vreg(V, vs1, elem_count); - auto vs2_view = get_vreg(V, vs2, elem_count); + uint64_t vlmax = VLEN * vtype.lmul() / vtype.sew(); + vmask_view mask_reg = read_vmask(V, vlmax); + auto vs1_view = get_vreg(V, vs1, vlmax); + auto vs2_view = get_vreg(V, vs2, vlmax); vmask_view vd_mask_view = read_vmask(V, VLEN, vd); auto fn = get_mask_funct(funct6, funct3); - // elements w/ index smaller than vstart are in the prestart and get skipped - // body is from vstart to min(elem_count, vl) - for(size_t idx = vstart; idx < std::min(elem_count, vl); idx++) { + for(size_t idx = vstart; idx < vl; idx++) { bool mask_active = vm ? 1 : mask_reg[idx]; - if(mask_active) { + if(mask_active) vd_mask_view[idx] = fn(vs2_view[idx], vs1_view[idx]); - } else { - vd_mask_view[idx] = vtype.vma() ? vd_mask_view[idx] : vd_mask_view[idx]; - } + else if(vtype.vma()) + vd_mask_view[idx] = agnostic_behavior(vd_mask_view[idx]); } - // elements w/ index larger than elem_count are in the tail (fractional LMUL) - // elements w/ index larger than vl are in the tail - for(size_t idx = std::min(elem_count, vl); idx < VLEN; idx++) { - vd_mask_view[idx] = vtype.vta() ? 
vd_mask_view[idx] : vd_mask_view[idx]; - } - return; + if(vtype.vta()) + for(size_t idx = vl; idx < VLEN; idx++) + vd_mask_view[idx] = agnostic_behavior(vd_mask_view[idx]); } template void mask_vector_imm_op(uint8_t* V, unsigned funct6, unsigned funct3, uint64_t vl, uint64_t vstart, vtype_t vtype, bool vm, unsigned vd, unsigned vs2, typename std::make_signed::type imm) { - uint64_t elem_count = VLEN * vtype.lmul() / vtype.sew(); - vmask_view mask_reg = read_vmask(V, elem_count); - auto vs2_view = get_vreg(V, vs2, elem_count); + uint64_t vlmax = VLEN * vtype.lmul() / vtype.sew(); + vmask_view mask_reg = read_vmask(V, vlmax); + auto vs2_view = get_vreg(V, vs2, vlmax); vmask_view vd_mask_view = read_vmask(V, VLEN, vd); auto fn = get_mask_funct(funct6, funct3); - // elements w/ index smaller than vstart are in the prestart and get skipped - // body is from vstart to min(elem_count, vl) - for(size_t idx = vstart; idx < std::min(elem_count, vl); idx++) { + for(size_t idx = vstart; idx < vl; idx++) { bool mask_active = vm ? 1 : mask_reg[idx]; - if(mask_active) { + if(mask_active) vd_mask_view[idx] = fn(vs2_view[idx], imm); - } else { - vd_mask_view[idx] = vtype.vma() ? vd_mask_view[idx] : vd_mask_view[idx]; - } + else if(vtype.vma()) + vd_mask_view[idx] = agnostic_behavior(vd_mask_view[idx]); } - // elements w/ index larger than elem_count are in the tail (fractional LMUL) - // elements w/ index larger than vl are in the tail - for(size_t idx = std::min(elem_count, vl); idx < VLEN; idx++) { - vd_mask_view[idx] = vtype.vta() ? vd_mask_view[idx] : vd_mask_view[idx]; - } - return; + if(vtype.vta()) + for(size_t idx = vl; idx < VLEN; idx++) + vd_mask_view[idx] = agnostic_behavior(vd_mask_view[idx]); } template std::function get_unary_fn(unsigned unary_op) { @@ -577,28 +526,21 @@ std::function get_unary_fn(unsigned unary_op) { } template void vector_unary_op(uint8_t* V, unsigned unary_op, uint64_t vl, uint64_t vstart, vtype_t vtype, bool vm, unsigned vd, unsigned vs2) { - uint64_t elem_count = VLEN * vtype.lmul() / vtype.sew(); - vmask_view mask_reg = read_vmask(V, elem_count); - auto vs2_view = get_vreg(V, vs2, elem_count); - auto vd_view = get_vreg(V, vd, elem_count); + uint64_t vlmax = VLEN * vtype.lmul() / vtype.sew(); + vmask_view mask_reg = read_vmask(V, vlmax); + auto vs2_view = get_vreg(V, vs2, vlmax); + auto vd_view = get_vreg(V, vd, vlmax); auto fn = get_unary_fn(unary_op); - // elements w/ index smaller than vstart are in the prestart and get skipped - // body is from vstart to min(elem_count, vl) - for(size_t idx = vstart; idx < std::min(elem_count, vl); idx++) { + for(size_t idx = vstart; idx < vl; idx++) { bool mask_active = vm ? 1 : mask_reg[idx]; - if(mask_active) { + if(mask_active) vd_view[idx] = fn(vs2_view[idx]); - } else { + else vd_view[idx] = vtype.vma() ? vd_view[idx] : vd_view[idx]; - } } - // elements w/ index larger than elem_count are in the tail (fractional LMUL) - // elements w/ index larger than vl are in the tail - unsigned maximum_elems = VLEN * vtype.lmul() / (sizeof(dest_elem_t) * 8); - for(size_t idx = std::min(elem_count, vl); idx < maximum_elems; idx++) { - vd_view[idx] = vtype.vta() ? 
vd_view[idx] : vd_view[idx]; - } - return; + if(vtype.vta()) + for(size_t idx = vl; idx < vlmax; idx++) + vd_view[idx] = agnostic_behavior(vd_view[idx]); } template std::function get_carry_funct(unsigned funct) { switch(funct) { @@ -617,57 +559,45 @@ template std::function get_carry template void carry_vector_vector_op(uint8_t* V, unsigned funct, uint64_t vl, uint64_t vstart, vtype_t vtype, bool vm, unsigned vd, unsigned vs2, unsigned vs1) { - uint64_t elem_count = VLEN * vtype.lmul() / vtype.sew(); - vmask_view mask_reg = read_vmask(V, elem_count); - auto vs1_view = get_vreg(V, vs1, elem_count); - auto vs2_view = get_vreg(V, vs2, elem_count); - vmask_view vd_mask_view = read_vmask(V, elem_count, vd); + uint64_t vlmax = VLEN * vtype.lmul() / vtype.sew(); + vmask_view mask_reg = read_vmask(V, vlmax); + auto vs1_view = get_vreg(V, vs1, vlmax); + auto vs2_view = get_vreg(V, vs2, vlmax); + vmask_view vd_mask_view = read_vmask(V, vlmax, vd); auto fn = get_carry_funct(funct); - // elements w/ index smaller than vstart are in the prestart and get skipped - // body is from vstart to min(elem_count, vl) - for(size_t idx = vstart; idx < std::min(elem_count, vl); idx++) { + for(size_t idx = vstart; idx < vl; idx++) { elem_t carry = vm ? 0 : mask_reg[idx]; vd_mask_view[idx] = fn(vs2_view[idx], vs1_view[idx], carry); } - // elements w/ index larger than elem_count are in the tail (fractional LMUL) - // elements w/ index larger than vl are in the tail - for(size_t idx = std::min(elem_count, vl); idx < VLEN; idx++) { - // always tail agnostic - } - return; + for(size_t idx = vl; idx < vlmax; idx++) + vd_mask_view[idx] = agnostic_behavior(vd_mask_view[idx]); } template void carry_vector_imm_op(uint8_t* V, unsigned funct, uint64_t vl, uint64_t vstart, vtype_t vtype, bool vm, unsigned vd, unsigned vs2, typename std::make_signed::type imm) { - uint64_t elem_count = VLEN * vtype.lmul() / vtype.sew(); - vmask_view mask_reg = read_vmask(V, elem_count); - auto vs2_view = get_vreg(V, vs2, elem_count); - vmask_view vd_mask_view = read_vmask(V, elem_count, vd); + uint64_t vlmax = VLEN * vtype.lmul() / vtype.sew(); + vmask_view mask_reg = read_vmask(V, vlmax); + auto vs2_view = get_vreg(V, vs2, vlmax); + vmask_view vd_mask_view = read_vmask(V, vlmax, vd); auto fn = get_carry_funct(funct); - // elements w/ index smaller than vstart are in the prestart and get skipped - // body is from vstart to min(elem_count, vl) - for(size_t idx = vstart; idx < std::min(elem_count, vl); idx++) { + for(size_t idx = vstart; idx < vl; idx++) { elem_t carry = vm ? 
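/* Carry-op semantics (VADC/VSBC family): when vm is set no mask operand is
 * encoded, so the carry/borrow-in is 0; otherwise bit idx of the v0 mask
 * register supplies it. */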
0 : mask_reg[idx]; vd_mask_view[idx] = fn(vs2_view[idx], imm, carry); } - // elements w/ index larger than elem_count are in the tail (fractional LMUL) - // elements w/ index larger than vl are in the tail - for(size_t idx = std::min(elem_count, vl); idx < VLEN; idx++) { - // always tail agnostic - } - return; + for(size_t idx = vl; idx < vlmax; idx++) + vd_mask_view[idx] = agnostic_behavior(vd_mask_view[idx]); } template bool get_rounding_increment(T v, uint64_t d, int64_t vxrm) { if(d == 0) return 0; - switch(vxrm & 0b11) { // Mask to ensure only lower 2 bits are used - case 0b00: // rnu: round-to-nearest-up (add +0.5 LSB) + switch(vxrm & 0b11) { + case 0b00: return (v >> (d - 1)) & 1; - case 0b01: // rne: round-to-nearest-even + case 0b01: return ((v >> (d - 1)) & 1) && (((v & ((1 << (d - 1)) - 1)) != 0) || ((v >> d) & 1)); - case 0b10: // rdn: round-down (truncate) + case 0b10: return false; - case 0b11: // rod: round-to-odd (jam) + case 0b11: return (!(v & (static_cast(1) << d)) && ((v & ((static_cast(1) << d) - 1)) != 0)); } return false; @@ -676,13 +606,13 @@ template T roundoff(T v, uint64_t d, int64_t vxrm) { unsigned r = get_rounding_increment(v, d, vxrm); return (v >> d) + r; } -template -std::function get_sat_funct(unsigned funct6, unsigned funct3) { +template +std::function get_sat_funct(unsigned funct6, unsigned funct3) { if(funct3 == OPIVV || funct3 == OPIVX || funct3 == OPIVI) switch(funct6) { case 0b100000: // VSADDU - return [](uint64_t vxrm, vtype_t vtype, dest_elem_t& vd, src2_elem_t vs2, src1_elem_T vs1) { - auto res = static_cast>(vs2) + static_cast>(vs1); + return [](uint64_t vxrm, vtype_t vtype, dest_elem_t& vd, src2_elem_t vs2, src1_elem_t vs1) { + auto res = static_cast>(vs2) + static_cast>(vs1); if(res > std::numeric_limits::max()) { vd = std::numeric_limits::max(); return 1; @@ -692,9 +622,9 @@ std::function g } }; case 0b100001: // VSADD - return [](uint64_t vxrm, vtype_t vtype, dest_elem_t& vd, src2_elem_t vs2, src1_elem_T vs1) { + return [](uint64_t vxrm, vtype_t vtype, dest_elem_t& vd, src2_elem_t vs2, src1_elem_t vs1) { auto res = static_cast>>(static_cast>(vs2)) + - static_cast>>(static_cast>(vs1)); + static_cast>>(static_cast>(vs1)); if(res < std::numeric_limits>::min()) { vd = std::numeric_limits>::min(); return 1; @@ -707,7 +637,7 @@ std::function g } }; case 0b100010: // VSSUBU - return [](uint64_t vxrm, vtype_t vtype, dest_elem_t& vd, src2_elem_t vs2, src1_elem_T vs1) { + return [](uint64_t vxrm, vtype_t vtype, dest_elem_t& vd, src2_elem_t vs2, src1_elem_t vs1) { if(vs2 < vs1) { vd = 0; return 1; @@ -717,9 +647,9 @@ std::function g } }; case 0b100011: // VSSUB - return [](uint64_t vxrm, vtype_t vtype, dest_elem_t& vd, src2_elem_t vs2, src1_elem_T vs1) { + return [](uint64_t vxrm, vtype_t vtype, dest_elem_t& vd, src2_elem_t vs2, src1_elem_t vs1) { auto res = static_cast>>(static_cast>(vs2)) - - static_cast>>(static_cast>(vs1)); + static_cast>>(static_cast>(vs1)); if(res < std::numeric_limits>::min()) { vd = std::numeric_limits>::min(); return 1; @@ -732,9 +662,9 @@ std::function g } }; case 0b100111: // VSMUL - return [](uint64_t vxrm, vtype_t vtype, dest_elem_t& vd, src2_elem_t vs2, src1_elem_T vs1) { + return [](uint64_t vxrm, vtype_t vtype, dest_elem_t& vd, src2_elem_t vs2, src1_elem_t vs1) { auto big_val = static_cast>>(static_cast>(vs2)) * - static_cast>>(static_cast>(vs1)); + static_cast>>(static_cast>(vs1)); auto res = roundoff(big_val, vtype.sew() - 1, vxrm); if(res < std::numeric_limits>::min()) { vd = std::numeric_limits>::min(); @@ -748,17 
+678,17 @@ std::function g } }; case 0b101010: // VSSRL - return [](uint64_t vxrm, vtype_t vtype, dest_elem_t& vd, src2_elem_t vs2, src1_elem_T vs1) { - vd = roundoff(vs2, vs1 & shift_mask(), vxrm); + return [](uint64_t vxrm, vtype_t vtype, dest_elem_t& vd, src2_elem_t vs2, src1_elem_t vs1) { + vd = roundoff(vs2, vs1 & shift_mask(), vxrm); return 0; }; case 0b101011: // VSSRA - return [](uint64_t vxrm, vtype_t vtype, dest_elem_t& vd, src2_elem_t vs2, src1_elem_T vs1) { - vd = roundoff(static_cast>(vs2), vs1 & shift_mask(), vxrm); + return [](uint64_t vxrm, vtype_t vtype, dest_elem_t& vd, src2_elem_t vs2, src1_elem_t vs1) { + vd = roundoff(static_cast>(vs2), vs1 & shift_mask(), vxrm); return 0; }; case 0b101110: // VNCLIPU - return [](uint64_t vxrm, vtype_t vtype, dest_elem_t& vd, src2_elem_t vs2, src1_elem_T vs1) { + return [](uint64_t vxrm, vtype_t vtype, dest_elem_t& vd, src2_elem_t vs2, src1_elem_t vs1) { auto res = roundoff(vs2, vs1 & shift_mask(), vxrm); if(res > std::numeric_limits::max()) { vd = std::numeric_limits::max(); @@ -769,7 +699,7 @@ std::function g } }; case 0b101111: // VNCLIP - return [](uint64_t vxrm, vtype_t vtype, dest_elem_t& vd, src2_elem_t vs2, src1_elem_T vs1) { + return [](uint64_t vxrm, vtype_t vtype, dest_elem_t& vd, src2_elem_t vs2, src1_elem_t vs1) { auto res = roundoff(static_cast>(vs2), vs1 & shift_mask(), vxrm); if(res < std::numeric_limits>::min()) { vd = std::numeric_limits>::min(); @@ -788,28 +718,26 @@ std::function g else if(funct3 == OPMVV || funct3 == OPMVX) switch(funct6) { case 0b001000: // VAADDU - return [](uint64_t vxrm, vtype_t vtype, dest_elem_t& vd, src2_elem_t vs2, src1_elem_T vs1) { - auto res = static_cast>(vs2) + static_cast>(vs1); + return [](uint64_t vxrm, vtype_t vtype, dest_elem_t& vd, src2_elem_t vs2, src1_elem_t vs1) { + auto res = static_cast(vs2) + static_cast>(vs1); vd = roundoff(res, 1, vxrm); return 0; }; case 0b001001: // VAADD - return [](uint64_t vxrm, vtype_t vtype, dest_elem_t& vd, src2_elem_t vs2, src1_elem_T vs1) { - auto res = static_cast>>(static_cast>(vs2)) + - static_cast>>(static_cast>(vs1)); + return [](uint64_t vxrm, vtype_t vtype, dest_elem_t& vd, src2_elem_t vs2, src1_elem_t vs1) { + auto res = sext>(vs2) + sext>(vs1); vd = roundoff(res, 1, vxrm); return 0; }; case 0b001010: // VASUBU - return [](uint64_t vxrm, vtype_t vtype, dest_elem_t& vd, src2_elem_t vs2, src1_elem_T vs1) { - auto res = static_cast>(vs2) - static_cast>(vs1); + return [](uint64_t vxrm, vtype_t vtype, dest_elem_t& vd, src2_elem_t vs2, src1_elem_t vs1) { + auto res = static_cast(vs2) - static_cast>(vs1); vd = roundoff(res, 1, vxrm); return 0; }; case 0b001011: // VASUB - return [](uint64_t vxrm, vtype_t vtype, dest_elem_t& vd, src2_elem_t vs2, src1_elem_T vs1) { - auto res = static_cast>>(static_cast>(vs2)) - - static_cast>>(static_cast>(vs1)); + return [](uint64_t vxrm, vtype_t vtype, dest_elem_t& vd, src2_elem_t vs2, src1_elem_t vs1) { + auto res = sext>(vs2) - sext>(vs1); vd = roundoff(res, 1, vxrm); return 0; }; @@ -819,19 +747,17 @@ std::function g else throw new std::runtime_error("Unknown funct3 in get_sat_funct"); } -template +template bool sat_vector_vector_op(uint8_t* V, unsigned funct6, unsigned funct3, uint64_t vl, uint64_t vstart, vtype_t vtype, int64_t vxrm, bool vm, unsigned vd, unsigned vs2, unsigned vs1) { - uint64_t elem_count = VLEN * vtype.lmul() / vtype.sew(); + uint64_t vlmax = VLEN * vtype.lmul() / vtype.sew(); bool saturated = false; - vmask_view mask_reg = read_vmask(V, elem_count); - auto vs1_view = get_vreg(V, vs1, 
elem_count); - auto vs2_view = get_vreg(V, vs2, elem_count); - auto vd_view = get_vreg(V, vd, elem_count); - auto fn = get_sat_funct(funct6, funct3); - // elements w/ index smaller than vstart are in the prestart and get skipped - // body is from vstart to min(elem_count, vl) - for(size_t idx = vstart; idx < std::min(elem_count, vl); idx++) { + vmask_view mask_reg = read_vmask(V, vlmax); + auto vs1_view = get_vreg(V, vs1, vlmax); + auto vs2_view = get_vreg(V, vs2, vlmax); + auto vd_view = get_vreg(V, vd, vlmax); + auto fn = get_sat_funct(funct6, funct3); + for(size_t idx = vstart; idx < vl; idx++) { bool mask_active = vm ? 1 : mask_reg[idx]; if(mask_active) { saturated |= fn(vxrm, vtype, vd_view[idx], vs2_view[idx], vs1_view[idx]); @@ -839,26 +765,22 @@ bool sat_vector_vector_op(uint8_t* V, unsigned funct6, unsigned funct3, uint64_t vd_view[idx] = vtype.vma() ? vd_view[idx] : vd_view[idx]; } } - // elements w/ index larger than elem_count are in the tail (fractional LMUL) - // elements w/ index larger than vl are in the tail - unsigned maximum_elems = VLEN * vtype.lmul() / (sizeof(dest_elem_t) * 8); - for(size_t idx = std::min(elem_count, vl); idx < maximum_elems; idx++) { - vd_view[idx] = vtype.vta() ? vd_view[idx] : vd_view[idx]; - } + if(vtype.vta()) + for(size_t idx = vl; idx < vlmax; idx++) { + vd_view[idx] = agnostic_behavior(vd_view[idx]); + } return saturated; } -template +template bool sat_vector_imm_op(uint8_t* V, unsigned funct6, unsigned funct3, uint64_t vl, uint64_t vstart, vtype_t vtype, int64_t vxrm, bool vm, - unsigned vd, unsigned vs2, typename std::make_signed::type imm) { - uint64_t elem_count = VLEN * vtype.lmul() / vtype.sew(); + unsigned vd, unsigned vs2, typename std::make_signed::type imm) { + uint64_t vlmax = VLEN * vtype.lmul() / vtype.sew(); bool saturated = false; - vmask_view mask_reg = read_vmask(V, elem_count); - auto vs2_view = get_vreg(V, vs2, elem_count); - auto vd_view = get_vreg(V, vd, elem_count); - auto fn = get_sat_funct(funct6, funct3); - // elements w/ index smaller than vstart are in the prestart and get skipped - // body is from vstart to min(elem_count, vl) - for(size_t idx = vstart; idx < std::min(elem_count, vl); idx++) { + vmask_view mask_reg = read_vmask(V, vlmax); + auto vs2_view = get_vreg(V, vs2, vlmax); + auto vd_view = get_vreg(V, vd, vlmax); + auto fn = get_sat_funct(funct6, funct3); + for(size_t idx = vstart; idx < vl; idx++) { bool mask_active = vm ? 1 : mask_reg[idx]; if(mask_active) { saturated |= fn(vxrm, vtype, vd_view[idx], vs2_view[idx], imm); @@ -866,12 +788,10 @@ bool sat_vector_imm_op(uint8_t* V, unsigned funct6, unsigned funct3, uint64_t vl vd_view[idx] = vtype.vma() ? vd_view[idx] : vd_view[idx]; } } - // elements w/ index larger than elem_count are in the tail (fractional LMUL) - // elements w/ index larger than vl are in the tail - unsigned maximum_elems = VLEN * vtype.lmul() / (sizeof(dest_elem_t) * 8); - for(size_t idx = std::min(elem_count, vl); idx < maximum_elems; idx++) { - vd_view[idx] = vtype.vta() ? 
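/* Each saturating helper returns 1 when its result was clamped; the
 * OR-accumulated `saturated` flag is returned to the caller, which can use
 * it to raise vxsat once per instruction. */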
vd_view[idx] : vd_view[idx]; - } + if(vtype.vta()) + for(size_t idx = vl; idx < vlmax; idx++) { + vd_view[idx] = agnostic_behavior(vd_view[idx]); + } return saturated; } template @@ -883,8 +803,7 @@ std::function get_red_funct(unsigned funct6, uns case 0b110001: // VWREDSUM return [](dest_elem_t& running_total, src_elem_t vs2) { // cast the signed vs2 elem to unsigned to enable wrap around on overflow - return running_total += static_cast( - static_cast>(static_cast>(vs2))); + return running_total += static_cast(sext(vs2)); }; default: throw new std::runtime_error("Unknown funct6 in get_red_funct"); @@ -900,8 +819,7 @@ std::function get_red_funct(unsigned funct6, uns case 0b000011: // VREDXOR return [](dest_elem_t& running_total, src_elem_t vs2) { running_total ^= vs2; }; case 0b000100: // VREDMINU - return - [](dest_elem_t& running_total, src_elem_t vs2) { running_total = std::min(running_total, static_cast(vs2)); }; + return [](dest_elem_t& running_total, src_elem_t vs2) { running_total = std::min(running_total, vs2); }; case 0b000101: // VREDMIN return [](dest_elem_t& running_total, src_elem_t vs2) { running_total = std::min(static_cast>(running_total), @@ -924,28 +842,25 @@ std::function get_red_funct(unsigned funct6, uns template void vector_red_op(uint8_t* V, unsigned funct6, unsigned funct3, uint64_t vl, uint64_t vstart, vtype_t vtype, bool vm, unsigned vd, unsigned vs2, unsigned vs1) { - if(vl == 0) - return; - uint64_t elem_count = VLEN * vtype.lmul() / vtype.sew(); - vmask_view mask_reg = read_vmask(V, elem_count); - auto vs1_elem = get_vreg(V, vs1, elem_count)[0]; - auto vs2_view = get_vreg(V, vs2, elem_count); - auto vd_view = get_vreg(V, vd, elem_count); + uint64_t vlmax = VLEN * vtype.lmul() / vtype.sew(); + vmask_view mask_reg = read_vmask(V, vlmax); + auto vs1_elem = get_vreg(V, vs1, vlmax)[0]; + auto vs2_view = get_vreg(V, vs2, vlmax); + auto vd_view = get_vreg(V, vd, vlmax); auto fn = get_red_funct(funct6, funct3); - dest_elem_t& running_total = {vs1_elem}; - for(size_t idx = vstart; idx < std::min(elem_count, vl); idx++) { + dest_elem_t& running_total = vd_view[0] = vs1_elem; + for(size_t idx = vstart; idx < vl; idx++) { bool mask_active = vm ? 1 : mask_reg[idx]; if(mask_active) { fn(running_total, vs2_view[idx]); } } - vd_view[0] = running_total; // the tail is all elements of the destination register beyond the first one - for(size_t idx = 1; idx < VLEN / (vtype.sew() * RFS); idx++) { - vd_view[idx] = vtype.vta() ? 
vd_view[idx] : vd_view[idx]; - } - return; + if(vtype.vta()) + for(size_t idx = 1; idx < VLEN / vtype.sew(); idx++) + vd_view[idx] = agnostic_behavior(vd_view[idx]); } + // might be that these exist somewhere in softfloat template constexpr bool isNaN(src_elem_t x); template <> constexpr bool isNaN(uint32_t x) { return ((x & 0x7F800000) == 0x7F800000) && ((x & 0x007FFFFF) != 0); } @@ -961,7 +876,7 @@ template <> constexpr bool isPosZero(uint64_t x) { return x == 0x00000 template dest_elem_t widen_float(src_elem_t val) { throw new std::runtime_error("Trying to widen a weird 'float'"); -}; +} template <> inline uint64_t widen_float(uint32_t val) { return f32_to_f64(float32_t{val}).v; } template elem_size_t fp_add(uint8_t, elem_size_t, elem_size_t); @@ -990,7 +905,6 @@ template <> inline uint32_t fp_nmsub(uint8_t mode, uint32_t v2, uint32 template <> inline uint64_t fp_nmsub(uint8_t mode, uint64_t v2, uint64_t v1, uint64_t v3) { return fmadd_d(v1, v2, v3, 3, mode); } template elem_size_t fp_min(elem_size_t, elem_size_t); template <> inline uint32_t fp_min(uint32_t v2, uint32_t v1) { - bool v1_lt_v2 = fcmp_s(v1, v2, 2); if(isNaN(v1) && isNaN(v2)) return defaultNaNF32UI; else if(isNaN(v1)) @@ -1001,7 +915,7 @@ template <> inline uint32_t fp_min(uint32_t v2, uint32_t v1) { return v1; else if(isNegZero(v2) && isNegZero(v1)) return v2; - else if(v1_lt_v2) + else if(fcmp_s(v1, v2, 2)) return v1; else return v2; @@ -1233,59 +1147,46 @@ std::function void fp_vector_vector_op(uint8_t* V, unsigned funct6, unsigned funct3, uint64_t vl, uint64_t vstart, vtype_t vtype, bool vm, unsigned vd, unsigned vs2, unsigned vs1, uint8_t rm) { - uint64_t elem_count = VLEN * vtype.lmul() / vtype.sew(); - vmask_view mask_reg = read_vmask(V, elem_count); - auto vs1_view = get_vreg(V, vs1, elem_count); - auto vs2_view = get_vreg(V, vs2, elem_count); - auto vd_view = get_vreg(V, vd, elem_count); + uint64_t vlmax = VLEN * vtype.lmul() / vtype.sew(); + vmask_view mask_reg = read_vmask(V, vlmax); + auto vs1_view = get_vreg(V, vs1, vlmax); + auto vs2_view = get_vreg(V, vs2, vlmax); + auto vd_view = get_vreg(V, vd, vlmax); auto fn = get_fp_funct(funct6, funct3); uint8_t accrued_flags = 0; - // elements w/ index smaller than vstart are in the prestart and get skipped - // body is from vstart to min(elem_count, vl) - for(size_t idx = vstart; idx < std::min(elem_count, vl); idx++) { + for(size_t idx = vstart; idx < vl; idx++) { bool mask_active = vm ? 1 : mask_reg[idx]; - if(mask_active) { + if(mask_active) vd_view[idx] = fn(rm, accrued_flags, vd_view[idx], vs2_view[idx], vs1_view[idx]); - } else { - vd_view[idx] = vtype.vma() ? vd_view[idx] : vd_view[idx]; - } + else if(vtype.vma()) + vd_view[idx] = agnostic_behavior(vd_view[idx]); } softfloat_exceptionFlags = accrued_flags; - // elements w/ index larger than elem_count are in the tail (fractional LMUL) - // elements w/ index larger than vl are in the tail - unsigned maximum_elems = VLEN * vtype.lmul() / (sizeof(dest_elem_t) * 8); - for(size_t idx = std::min(elem_count, vl); idx < maximum_elems; idx++) { - vd_view[idx] = vtype.vta() ? 
vd_view[idx] : vd_view[idx]; - } - return; + if(vtype.vta()) + for(size_t idx = vl; idx < vlmax; idx++) { + vd_view[idx] = agnostic_behavior(vd_view[idx]); + } } template void fp_vector_imm_op(uint8_t* V, unsigned funct6, unsigned funct3, uint64_t vl, uint64_t vstart, vtype_t vtype, bool vm, unsigned vd, unsigned vs2, src1_elem_t imm, uint8_t rm) { - uint64_t elem_count = VLEN * vtype.lmul() / vtype.sew(); - vmask_view mask_reg = read_vmask(V, elem_count); - auto vs2_view = get_vreg(V, vs2, elem_count); - auto vd_view = get_vreg(V, vd, elem_count); + uint64_t vlmax = VLEN * vtype.lmul() / vtype.sew(); + vmask_view mask_reg = read_vmask(V, vlmax); + auto vs2_view = get_vreg(V, vs2, vlmax); + auto vd_view = get_vreg(V, vd, vlmax); auto fn = get_fp_funct(funct6, funct3); uint8_t accrued_flags = 0; - // elements w/ index smaller than vstart are in the prestart and get skipped - // body is from vstart to min(elem_count, vl) - for(size_t idx = vstart; idx < std::min(elem_count, vl); idx++) { + for(size_t idx = vstart; idx < vl; idx++) { bool mask_active = vm ? 1 : mask_reg[idx]; - if(mask_active) { + if(mask_active) vd_view[idx] = fn(rm, accrued_flags, vd_view[idx], vs2_view[idx], imm); - } else { - vd_view[idx] = vtype.vma() ? vd_view[idx] : vd_view[idx]; - } + else if(vtype.vma()) + vd_view[idx] = agnostic_behavior(vd_view[idx]); } softfloat_exceptionFlags = accrued_flags; - // elements w/ index larger than elem_count are in the tail (fractional LMUL) - // elements w/ index larger than vl are in the tail - unsigned maximum_elems = VLEN * vtype.lmul() / (sizeof(dest_elem_t) * 8); - for(size_t idx = std::min(elem_count, vl); idx < maximum_elems; idx++) { - vd_view[idx] = vtype.vta() ? vd_view[idx] : vd_view[idx]; - } - return; + if(vtype.vta()) + for(size_t idx = vl; idx < vlmax; idx++) + vd_view[idx] = agnostic_behavior(vd_view[idx]); } template std::function get_fp_red_funct(unsigned funct6, unsigned funct3) { @@ -1328,29 +1229,24 @@ std::function get_fp_red_func template void fp_vector_red_op(uint8_t* V, unsigned funct6, unsigned funct3, uint64_t vl, uint64_t vstart, vtype_t vtype, bool vm, unsigned vd, unsigned vs2, unsigned vs1, uint8_t rm) { - if(vl == 0) - return; - uint64_t elem_count = VLEN * vtype.lmul() / vtype.sew(); - vmask_view mask_reg = read_vmask(V, elem_count); - auto vs1_elem = get_vreg(V, vs1, elem_count)[0]; - auto vs2_view = get_vreg(V, vs2, elem_count); - auto vd_view = get_vreg(V, vd, elem_count); + uint64_t vlmax = VLEN * vtype.lmul() / vtype.sew(); + vmask_view mask_reg = read_vmask(V, vlmax); + auto vs1_elem = get_vreg(V, vs1, vlmax)[0]; + auto vs2_view = get_vreg(V, vs2, vlmax); + auto vd_view = get_vreg(V, vd, vlmax); auto fn = get_fp_red_funct(funct6, funct3); - dest_elem_t& running_total = {vs1_elem}; + dest_elem_t& running_total = vd_view[0] = vs1_elem; uint8_t accrued_flags = 0; - for(size_t idx = vstart; idx < std::min(elem_count, vl); idx++) { + for(size_t idx = vstart; idx < vl; idx++) { bool mask_active = vm ? 1 : mask_reg[idx]; - if(mask_active) { + if(mask_active) fn(rm, accrued_flags, running_total, vs2_view[idx]); - } } - vd_view[0] = running_total; softfloat_exceptionFlags = accrued_flags; // the tail is all elements of the destination register beyond the first one - for(size_t idx = 1; idx < VLEN / (vtype.sew() * RFS); idx++) { - vd_view[idx] = vtype.vta() ? 
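/* The FP helpers accumulate softfloat exception flags per element in
 * accrued_flags and write them back to softfloat_exceptionFlags once after
 * the loop, so a vector op reports its flags in a single update. */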
vd_view[idx] : vd_view[idx]; - } - return; + if(vtype.vta()) + for(size_t idx = 1; idx < VLEN / vtype.sew(); idx++) + vd_view[idx] = agnostic_behavior(vd_view[idx]); } template elem_size_t fp_sqrt(uint8_t, elem_size_t); template <> inline uint16_t fp_sqrt(uint8_t mode, uint16_t v2) { return fsqrt_h(v2, mode); } @@ -1597,24 +1493,23 @@ std::function get_fp_unary_fn(unsign template void fp_vector_unary_op(uint8_t* V, unsigned encoding_space, unsigned unary_op, uint64_t vl, uint64_t vstart, vtype_t vtype, bool vm, unsigned vd, unsigned vs2, uint8_t rm) { - uint64_t elem_count = VLEN * vtype.lmul() / vtype.sew(); - vmask_view mask_reg = read_vmask(V, elem_count); - auto vs2_view = get_vreg(V, vs2, elem_count); - auto vd_view = get_vreg(V, vd, elem_count); + uint64_t vlmax = VLEN * vtype.lmul() / vtype.sew(); + vmask_view mask_reg = read_vmask(V, vlmax); + auto vs2_view = get_vreg(V, vs2, vlmax); + auto vd_view = get_vreg(V, vd, vlmax); auto fn = get_fp_unary_fn(encoding_space, unary_op); uint8_t accrued_flags = 0; for(size_t idx = vstart; idx < vl; idx++) { bool mask_active = vm ? 1 : mask_reg[idx]; - if(mask_active) { + if(mask_active) vd_view[idx] = fn(rm, accrued_flags, vs2_view[idx]); - } else { - vd_view[idx] = vtype.vma() ? vd_view[idx] : vd_view[idx]; - } + else if(vtype.vma()) + vd_view[idx] = agnostic_behavior(vd_view[idx]); } softfloat_exceptionFlags = accrued_flags; - for(size_t idx = vl; idx < elem_count; idx++) { - vd_view[idx] = vtype.vta() ? vd_view[idx] : vd_view[idx]; - } + if(vtype.vta()) + for(size_t idx = vl; idx < vlmax; idx++) + vd_view[idx] = agnostic_behavior(vd_view[idx]); } template bool fp_eq(elem_size_t, elem_size_t); template <> inline bool fp_eq(uint32_t v2, uint32_t v1) { return fcmp_s(v2, v1, 0); } @@ -1670,72 +1565,64 @@ template std::function void mask_fp_vector_vector_op(uint8_t* V, unsigned funct6, uint64_t vl, uint64_t vstart, vtype_t vtype, bool vm, unsigned vd, unsigned vs2, unsigned vs1, uint8_t rm) { - uint64_t elem_count = VLEN * vtype.lmul() / vtype.sew(); - vmask_view mask_reg = read_vmask(V, elem_count); - auto vs1_view = get_vreg(V, vs1, elem_count); - auto vs2_view = get_vreg(V, vs2, elem_count); + uint64_t vlmax = VLEN * vtype.lmul() / vtype.sew(); + vmask_view mask_reg = read_vmask(V, vlmax); + auto vs1_view = get_vreg(V, vs1, vlmax); + auto vs2_view = get_vreg(V, vs2, vlmax); vmask_view vd_mask_view = read_vmask(V, VLEN, vd); auto fn = get_fp_mask_funct(funct6); uint8_t accrued_flags = 0; for(size_t idx = vstart; idx < vl; idx++) { bool mask_active = vm ? 1 : mask_reg[idx]; - if(mask_active) { + if(mask_active) vd_mask_view[idx] = fn(rm, accrued_flags, vs2_view[idx], vs1_view[idx]); - } else { - vd_mask_view[idx] = vtype.vma() ? vd_mask_view[idx] : vd_mask_view[idx]; - } + else if(vtype.vma()) + vd_mask_view[idx] = agnostic_behavior(vd_mask_view[idx]); } softfloat_exceptionFlags = accrued_flags; - for(size_t idx = vl; idx < VLEN; idx++) { - vd_mask_view[idx] = vtype.vta() ? 
vd_mask_view[idx] : vd_mask_view[idx]; - } - return; + if(vtype.vta()) + for(size_t idx = vl; idx < VLEN; idx++) + vd_mask_view[idx] = agnostic_behavior(vd_mask_view[idx]); } template void mask_fp_vector_imm_op(uint8_t* V, unsigned funct6, uint64_t vl, uint64_t vstart, vtype_t vtype, bool vm, unsigned vd, unsigned vs2, elem_t imm, uint8_t rm) { - uint64_t elem_count = VLEN * vtype.lmul() / vtype.sew(); - vmask_view mask_reg = read_vmask(V, elem_count); - auto vs2_view = get_vreg(V, vs2, elem_count); + uint64_t vlmax = VLEN * vtype.lmul() / vtype.sew(); + vmask_view mask_reg = read_vmask(V, vlmax); + auto vs2_view = get_vreg(V, vs2, vlmax); vmask_view vd_mask_view = read_vmask(V, VLEN, vd); auto fn = get_fp_mask_funct(funct6); uint8_t accrued_flags = 0; for(size_t idx = vstart; idx < vl; idx++) { bool mask_active = vm ? 1 : mask_reg[idx]; - if(mask_active) { + if(mask_active) vd_mask_view[idx] = fn(rm, accrued_flags, vs2_view[idx], imm); - } else { - vd_mask_view[idx] = vtype.vma() ? vd_mask_view[idx] : vd_mask_view[idx]; - } + else if(vtype.vma()) + vd_mask_view[idx] = agnostic_behavior(vd_mask_view[idx]); } softfloat_exceptionFlags = accrued_flags; - for(size_t idx = vl; idx < VLEN; idx++) { - vd_mask_view[idx] = vtype.vta() ? vd_mask_view[idx] : vd_mask_view[idx]; - } - return; + if(vtype.vta()) + for(size_t idx = vl; idx < VLEN; idx++) + vd_mask_view[idx] = agnostic_behavior(vd_mask_view[idx]); } template void mask_mask_op(uint8_t* V, unsigned funct6, unsigned funct3, uint64_t vl, uint64_t vstart, unsigned vd, unsigned vs2, unsigned vs1) { - uint64_t elem_count = VLEN; - auto vs1_view = read_vmask(V, elem_count, vs1); - auto vs2_view = read_vmask(V, elem_count, vs2); - auto vd_view = read_vmask(V, elem_count, vd); + uint64_t vlmax = VLEN; + auto vs1_view = read_vmask(V, vlmax, vs1); + auto vs2_view = read_vmask(V, vlmax, vs2); + auto vd_view = read_vmask(V, vlmax, vd); auto fn = get_mask_funct(funct6, funct3); // could be bool, but would break the make_signed_t in get_mask_funct - for(size_t idx = vstart; idx < vl; idx++) { + for(size_t idx = vstart; idx < vl; idx++) vd_view[idx] = fn(vs2_view[idx], vs1_view[idx]); - } + // the tail is all elements of the destination register beyond the first one - for(size_t idx = 1; idx < VLEN; idx++) { - // always tail agnostic - // this is a nop, placeholder for vta behavior - vd_view[idx] = vd_view[idx]; - } - return; + for(size_t idx = 1; idx < VLEN; idx++) + vd_view[idx] = agnostic_behavior(vd_view[idx]); } template uint64_t vcpop(uint8_t* V, uint64_t vl, uint64_t vstart, bool vm, unsigned vs2) { - uint64_t elem_count = VLEN; - auto vs2_view = read_vmask(V, elem_count, vs2); - vmask_view mask_reg = read_vmask(V, elem_count); + uint64_t vlmax = VLEN; + auto vs2_view = read_vmask(V, vlmax, vs2); + vmask_view mask_reg = read_vmask(V, vlmax); unsigned running_total = 0; for(size_t idx = vstart; idx < vl; idx++) { bool mask_active = vm ? 1 : mask_reg[idx]; @@ -1745,9 +1632,9 @@ template uint64_t vcpop(uint8_t* V, uint64_t vl, uint64_t vstart return running_total; } template uint64_t vfirst(uint8_t* V, uint64_t vl, uint64_t vstart, bool vm, unsigned vs2) { - uint64_t elem_count = VLEN; - auto vs2_view = read_vmask(V, elem_count, vs2); - vmask_view mask_reg = read_vmask(V, elem_count); + uint64_t vlmax = VLEN; + auto vs2_view = read_vmask(V, vlmax, vs2); + vmask_view mask_reg = read_vmask(V, vlmax); for(size_t idx = vstart; idx < vl; idx++) { bool mask_active = vm ? 
1 : mask_reg[idx]; if(mask_active && vs2_view[idx]) return idx; } return static_cast(-1); } @@ -1792,33 +1679,29 @@ inline std::function get_mask_set_funct(unsigned enc) { } } template void mask_set_op(uint8_t* V, unsigned enc, uint64_t vl, uint64_t vstart, bool vm, unsigned vd, unsigned vs2) { - uint64_t elem_count = VLEN; - auto vs2_view = read_vmask(V, elem_count, vs2); - auto vd_view = read_vmask(V, elem_count, vd); - vmask_view mask_reg = read_vmask(V, elem_count); + uint64_t vlmax = VLEN; + auto vs2_view = read_vmask(V, vlmax, vs2); + auto vd_view = read_vmask(V, vlmax, vd); + vmask_view mask_reg = read_vmask(V, vlmax); auto fn = get_mask_set_funct(enc); bool marker = false; for(size_t idx = vstart; idx < vl; idx++) { bool mask_active = vm ? 1 : mask_reg[idx]; - if(mask_active) { + if(mask_active) vd_view[idx] = fn(marker, vs2_view[idx]); - } } // the tail is all mask destination elements past vl; mask destinations are always tail-agnostic - for(size_t idx = vl; idx < VLEN; idx++) { - // always tail agnostic - // this is a nop, placeholder for vta behavior - vd_view[idx] = vd_view[idx]; - } + for(size_t idx = vl; idx < VLEN; idx++) + vd_view[idx] = agnostic_behavior(vd_view[idx]); } template void viota(uint8_t* V, uint64_t vl, uint64_t vstart, vtype_t vtype, bool vm, unsigned vd, unsigned vs2) { - uint64_t elem_count = VLEN * vtype.lmul() / vtype.sew(); - auto vs2_view = read_vmask(V, elem_count, vs2); - auto vd_view = get_vreg(V, vd, elem_count); - vmask_view mask_reg = read_vmask(V, elem_count); + uint64_t vlmax = VLEN * vtype.lmul() / vtype.sew(); + auto vs2_view = read_vmask(V, vlmax, vs2); + auto vd_view = get_vreg(V, vd, vlmax); + vmask_view mask_reg = read_vmask(V, vlmax); unsigned current = 0; - for(size_t idx = vstart; idx < std::min(vl, elem_count); idx++) { + for(size_t idx = vstart; idx < std::min(vl, vlmax); idx++) { bool mask_active = vm ? 1 : mask_reg[idx]; if(mask_active) { vd_view[idx] = current; @@ -1826,68 +1709,61 @@ void viota(uint8_t* V, uint64_t vl, uint64_t vstart, vtype_t vtype, bool vm, uns current += 1; } } - return; } template void vid(uint8_t* V, uint64_t vl, uint64_t vstart, vtype_t vtype, bool vm, unsigned vd) { - uint64_t elem_count = VLEN * vtype.lmul() / vtype.sew(); - auto vd_view = get_vreg(V, vd, elem_count); - vmask_view mask_reg = read_vmask(V, elem_count); - for(size_t idx = vstart; idx < std::min(vl, elem_count); idx++) { + uint64_t vlmax = VLEN * vtype.lmul() / vtype.sew(); + auto vd_view = get_vreg(V, vd, vlmax); + vmask_view mask_reg = read_vmask(V, vlmax); + for(size_t idx = vstart; idx < std::min(vl, vlmax); idx++) { bool mask_active = vm ? 1 : mask_reg[idx]; - if(mask_active) { + if(mask_active) vd_view[idx] = idx; - } } - return; } template uint64_t scalar_move(uint8_t* V, vtype_t vtype, unsigned vd, uint64_t val, bool to_vector) { unsigned vlmax = VLEN * vtype.lmul() / vtype.sew(); auto vd_view = get_vreg(V, vd, vlmax); if(to_vector) { vd_view[0] = val; - for(size_t idx = 1; idx < vlmax; idx++) { - vd_view[idx] = vtype.vta() ? 
vd_view[idx] : vd_view[idx]; - } + if(vtype.vta()) + for(size_t idx = 1; idx < vlmax; idx++) + vd_view[idx] = agnostic_behavior(vd_view[idx]); } return static_cast(static_cast>(vd_view[0])); } template void vector_slideup(uint8_t* V, uint64_t vl, uint64_t vstart, vtype_t vtype, bool vm, unsigned vd, unsigned vs2, uint64_t imm) { - uint64_t elem_count = VLEN * vtype.lmul() / (sizeof(src_elem_t) * 8); - vmask_view mask_reg = read_vmask(V, elem_count); - auto vs2_view = get_vreg(V, vs2, elem_count); - auto vd_view = get_vreg(V, vd, elem_count); + uint64_t vlmax = VLEN * vtype.lmul() / (sizeof(src_elem_t) * 8); + vmask_view mask_reg = read_vmask(V, vlmax); + auto vs2_view = get_vreg(V, vs2, vlmax); + auto vd_view = get_vreg(V, vd, vlmax); for(size_t idx = std::max(vstart, imm); idx < vl; idx++) { bool mask_active = vm ? 1 : mask_reg[idx]; - if(mask_active) { - vd_view[idx] = idx - imm < elem_count ? vs2_view[idx - imm] : 0; - } else { - vd_view[idx] = vtype.vma() ? vd_view[idx] : vd_view[idx]; - } + if(mask_active) + vd_view[idx] = idx - imm < vlmax ? vs2_view[idx - imm] : 0; + else if(vtype.vma()) + vd_view[idx] = agnostic_behavior(vd_view[idx]); } - for(size_t idx = vl; idx < elem_count; idx++) { - vd_view[idx] = vtype.vta() ? vd_view[idx] : vd_view[idx]; - } - return; + if(vtype.vta()) + for(size_t idx = vl; idx < vlmax; idx++) + vd_view[idx] = agnostic_behavior(vd_view[idx]); } template void vector_slidedown(uint8_t* V, uint64_t vl, uint64_t vstart, vtype_t vtype, bool vm, unsigned vd, unsigned vs2, uint64_t imm) { - uint64_t elem_count = VLEN * vtype.lmul() / (sizeof(src_elem_t) * 8); - vmask_view mask_reg = read_vmask(V, elem_count); - auto vs2_view = get_vreg(V, vs2, elem_count); - auto vd_view = get_vreg(V, vd, elem_count); + uint64_t vlmax = VLEN * vtype.lmul() / (sizeof(src_elem_t) * 8); + vmask_view mask_reg = read_vmask(V, vlmax); + auto vs2_view = get_vreg(V, vs2, vlmax); + auto vd_view = get_vreg(V, vd, vlmax); for(size_t idx = vstart; idx < vl; idx++) { bool mask_active = vm ? 1 : mask_reg[idx]; - if(mask_active) { - vd_view[idx] = std::numeric_limits::max() - idx > imm && idx + imm < elem_count ? vs2_view[idx + imm] : 0; - } else { - vd_view[idx] = vtype.vma() ? vd_view[idx] : vd_view[idx]; - } + if(mask_active) + vd_view[idx] = std::numeric_limits::max() - idx > imm && idx + imm < vlmax ? vs2_view[idx + imm] : 0; + else if(vtype.vma()) + vd_view[idx] = agnostic_behavior(vd_view[idx]); } - for(size_t idx = vl; idx < elem_count; idx++) { - vd_view[idx] = vtype.vta() ? vd_view[idx] : vd_view[idx]; - } - return; + if(vtype.vta()) + for(size_t idx = vl; idx < vlmax; idx++) + vd_view[idx] = agnostic_behavior(vd_view[idx]); } template void vector_slide1up(uint8_t* V, uint64_t vl, uint64_t vstart, vtype_t vtype, bool vm, unsigned vd, unsigned vs2, uint64_t imm) { @@ -1896,20 +1772,19 @@ void vector_slide1up(uint8_t* V, uint64_t vl, uint64_t vstart, vtype_t vtype, bo auto vd_view = get_vreg(V, vd, 1); if(vm || mask_reg[0]) vd_view[0] = imm; - else - vd_view[0] = vtype.vma() ? vd_view[0] : vd_view[0]; + else if(vtype.vma()) + vd_view[0] = agnostic_behavior(vd_view[0]); } template void vector_slide1down(uint8_t* V, uint64_t vl, uint64_t vstart, vtype_t vtype, bool vm, unsigned vd, unsigned vs2, uint64_t imm) { vector_slidedown(V, vl, vstart, vtype, vm, vd, vs2, 1); - if(vl > 0) { - vmask_view mask_reg = read_vmask(V, vl); - auto vd_view = get_vreg(V, vd, vl); - if(vm || mask_reg[vl - 1]) - vd_view[vl - 1] = imm; - else - vd_view[0] = vtype.vma() ? 
vd_view[0] : vd_view[0]; - } + if(vl > 0) { + vmask_view mask_reg = read_vmask(V, vl); + auto vd_view = get_vreg(V, vd, vl); + if(vm || mask_reg[vl - 1]) + vd_view[vl - 1] = imm; + else if(vtype.vma()) + vd_view[vl - 1] = agnostic_behavior(vd_view[vl - 1]); + } } template void vector_vector_gather(uint8_t* V, uint64_t vl, uint64_t vstart, vtype_t vtype, bool vm, unsigned vd, unsigned vs2, unsigned vs1) { @@ -1918,18 +1793,16 @@ void vector_vector_gather(uint8_t* V, uint64_t vl, uint64_t vstart, vtype_t vtyp auto vs1_view = get_vreg(V, vs1, vlmax); auto vs2_view = get_vreg(V, vs2, vlmax); auto vd_view = get_vreg(V, vd, vlmax); - for(size_t idx = vstart; idx < std::min(vlmax, vl); idx++) { + for(size_t idx = vstart; idx < vl; idx++) { bool mask_active = vm ? 1 : mask_reg[idx]; - if(mask_active) { + if(mask_active) vd_view[idx] = (vs1_view[idx] >= vlmax) ? 0 : vs2_view[vs1_view[idx]]; - } else { - vd_view[idx] = vtype.vma() ? vd_view[idx] : vd_view[idx]; - } + else if(vtype.vma()) + vd_view[idx] = agnostic_behavior(vd_view[idx]); } - for(size_t idx = vl; idx < vlmax; idx++) { - vd_view[idx] = vtype.vta() ? vd_view[idx] : vd_view[idx]; - } - return; + if(vtype.vta()) + for(size_t idx = vl; idx < vlmax; idx++) + vd_view[idx] = agnostic_behavior(vd_view[idx]); } template void vector_imm_gather(uint8_t* V, uint64_t vl, uint64_t vstart, vtype_t vtype, bool vm, unsigned vd, unsigned vs2, uint64_t imm) { @@ -1937,18 +1810,16 @@ void vector_imm_gather(uint8_t* V, uint64_t vl, uint64_t vstart, vtype_t vtype, vmask_view mask_reg = read_vmask(V, vlmax); auto vs2_view = get_vreg(V, vs2, vlmax); auto vd_view = get_vreg(V, vd, vlmax); - for(size_t idx = vstart; idx < std::min(vlmax, vl); idx++) { + for(size_t idx = vstart; idx < vl; idx++) { bool mask_active = vm ? 1 : mask_reg[idx]; - if(mask_active) { + if(mask_active) vd_view[idx] = (imm >= vlmax) ? 0 : vs2_view[imm]; - } else { - vd_view[idx] = vtype.vma() ? vd_view[idx] : vd_view[idx]; - } + else if(vtype.vma()) + vd_view[idx] = agnostic_behavior(vd_view[idx]); } - for(size_t idx = vl; idx < vlmax; idx++) { - vd_view[idx] = vtype.vta() ? vd_view[idx] : vd_view[idx]; - } - return; + if(vtype.vta()) + for(size_t idx = vl; idx < vlmax; idx++) + vd_view[idx] = agnostic_behavior(vd_view[idx]); } template void vector_compress(uint8_t* V, uint64_t vl, uint64_t vstart, vtype_t vtype, unsigned vd, unsigned vs2, unsigned vs1) { @@ -1957,22 +1828,20 @@ void vector_compress(uint8_t* V, uint64_t vl, uint64_t vstart, vtype_t vtype, un auto vs2_view = get_vreg(V, vs2, vlmax); auto vd_view = get_vreg(V, vd, vlmax); unsigned current_pos = 0; - for(size_t idx = vstart; idx < std::min(vlmax, vl); idx++) { + for(size_t idx = vstart; idx < vl; idx++) if(mask_reg[idx]) { vd_view[current_pos] = vs2_view[idx]; current_pos += 1; } - } - for(size_t idx = current_pos; idx < vlmax; idx++) { - vd_view[idx] = vtype.vta() ? vd_view[idx] : vd_view[idx]; - } - return; + + if(vtype.vta()) + for(size_t idx = current_pos; idx < vlmax; idx++) + vd_view[idx] = agnostic_behavior(vd_view[idx]); } template void vector_whole_move(uint8_t* V, unsigned vd, unsigned vs2, unsigned count) { auto vd_view = get_vreg(V, vd, 1); auto vs2_view = get_vreg(V, vs2, 1); memcpy(vd_view.start, vs2_view.start, VLEN / 8 * count); - return; } } // namespace softvector \ No newline at end of file
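--
For reference: the vxrm behavior implemented by get_rounding_increment/roundoff above can be exercised in isolation. Below is a minimal, self-contained C++17 sketch of the same logic (the names are local to this example, not part of the patch); 11 >> 2, i.e. 2.75, rounds to 3/3/2/3 under rnu/rne/rdn/rod.

#include <cstdint>
#include <cstdio>

// Mirrors get_rounding_increment: vxrm 0 = rnu, 1 = rne, 2 = rdn, 3 = rod.
static bool rounding_increment(uint64_t v, uint64_t d, int64_t vxrm) {
    if(d == 0)
        return false;
    switch(vxrm & 0b11) {
    case 0b00: // rnu: round-to-nearest-up (add +0.5 LSB)
        return (v >> (d - 1)) & 1;
    case 0b01: // rne: round-to-nearest-even
        return ((v >> (d - 1)) & 1) && (((v & ((1ULL << (d - 1)) - 1)) != 0) || ((v >> d) & 1));
    case 0b10: // rdn: round-down (truncate)
        return false;
    default: // rod: round-to-odd (jam): set LSB if any discarded bit was 1
        return !((v >> d) & 1) && ((v & ((1ULL << d) - 1)) != 0);
    }
}

// Mirrors roundoff: shift right by d, then apply the rounding increment.
static uint64_t roundoff(uint64_t v, uint64_t d, int64_t vxrm) { return (v >> d) + rounding_increment(v, d, vxrm); }

int main() {
    for(int vxrm = 0; vxrm < 4; vxrm++) // prints 3, 3, 2, 3
        printf("vxrm=%d -> %llu\n", vxrm, (unsigned long long)roundoff(11, 2, vxrm));
}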