// DBT-RISE-TGC/src/vm/vector_functions.hpp

////////////////////////////////////////////////////////////////////////////////
// Copyright (C) 2025, MINRES Technologies GmbH
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
// 1. Redistributions of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// 3. Neither the name of the copyright holder nor the names of its contributors
// may be used to endorse or promote products derived from this software
// without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.
//
// Contributors:
// alex@minres.com - initial API and implementation
////////////////////////////////////////////////////////////////////////////////
#pragma once
extern "C" {
#include <softfloat.h>
}
#include "softfloat_types.h"
#include "specialize.h"
#include "vm/fp_functions.h"
#include "vm/vector_functions.h"
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <functional>
#include <limits>
#include <stdexcept>
#include <type_traits>
#ifndef _VM_VECTOR_FUNCTIONS_H_
#error __FILE__ should only be included from vector_functions.h
#endif
#include <math.h>
#ifdef __SIZEOF_INT128__
// Make std::make_signed usable with the compiler's 128-bit builtins so that the
// widening helpers below (twice<int64_t/uint64_t>) compose with make_signed_t.
// NOTE(review): specializing std templates for non-program-defined types is
// formally undefined behavior; it works on gcc/clang — consider a local trait.
template <> struct std::make_signed<__uint128_t> { using type = __int128_t; };
template <> struct std::make_signed<__int128_t> { using type = __int128_t; };
#endif
namespace softvector {
// Lightweight typed view onto a region of the raw vector register file.
template <typename elem_t> struct vreg_view {
    uint8_t* start; // first byte of the viewed register (group)
    size_t vlmax;   // number of addressable elements of type elem_t
    // Assert-bounds-checked element access into the underlying byte buffer.
    elem_t& operator[](size_t idx) {
        assert(idx < vlmax);
        elem_t* elems = reinterpret_cast<elem_t*>(start);
        return elems[idx];
    }
};
// Returns a typed view onto vector register reg_idx covering vlmax elements.
template <unsigned VLEN, typename elem_t> vreg_view<elem_t> get_vreg(uint8_t* V, uint8_t reg_idx, uint16_t vlmax) {
    // BUG FIX: the old assert compared `V + view size` against `V + file size`,
    // ignoring reg_idx entirely; include the register offset so a view reaching
    // past the end of the register file is actually caught.
    assert(V + VLEN / 8 * reg_idx + vlmax * sizeof(elem_t) <= V + VLEN * RFS / 8);
    return {V + VLEN / 8 * reg_idx, vlmax};
}
// Returns a bit-granular mask view onto vector register reg_idx with vlmax bits.
template <unsigned VLEN> vmask_view read_vmask(uint8_t* V, uint16_t vlmax, uint8_t reg_idx) {
    uint8_t* mask_start = V + VLEN / 8 * reg_idx;
    // BUG FIX: round the byte count up; `vlmax / 8` under-counted whenever vlmax
    // was not a multiple of 8 (e.g. vlmax < 8 checked zero bytes of the file).
    assert(mask_start + (vlmax + 7) / 8 <= V + VLEN * RFS / 8);
    return {mask_start, vlmax};
}
// Mask for shift amounts: all ones over log2(bit width) bits (e.g. 0x1F for
// uint32_t), matching the RVV rule that shifts use the low lg2(SEW) bits only.
template <typename elem_t> constexpr elem_t shift_mask() {
    static_assert(std::numeric_limits<elem_t>::is_integer, "shift_mask only supports integer types");
    constexpr elem_t bit_count = std::numeric_limits<elem_t>::digits;
    return bit_count - 1;
}
// Policy applied to masked-off body elements and tail elements: in builds with
// AGNOSTIC_ONES defined they are overwritten with all ones, otherwise the
// current value is kept (i.e. "undisturbed" behavior).
template <typename T> constexpr T agnostic_behavior(T val) {
#ifdef AGNOSTIC_ONES
    return std::numeric_limits<T>::max();
#else
    return val;
#endif
}
// funct3 values of the OP-V major opcode, selecting the operand category.
// In this file OPI* select the integer ALU table and OPM* the mul/div/widening
// table of get_funct / get_sat_funct.
enum FUNCT3 {
    OPIVV = 0b000, // integer, vector-vector
    OPFVV = 0b001, // floating-point, vector-vector
    OPMVV = 0b010, // integer mul/div/mask, vector-vector
    OPIVI = 0b011, // integer, vector-immediate
    OPIVX = 0b100, // integer, vector-scalar
    OPFVF = 0b101, // floating-point, vector-scalar
    OPMVX = 0b110, // integer mul/div/mask, vector-scalar
};
// twice<T>: maps an integer type to the integer type of double its width,
// used by the widening and high-half multiply helpers. 64-bit inputs map to
// the compiler's 128-bit builtins when available.
template <class, typename enable = void> struct twice;
template <> struct twice<int8_t> { using type = int16_t; };
template <> struct twice<uint8_t> { using type = uint16_t; };
template <> struct twice<int16_t> { using type = int32_t; };
template <> struct twice<uint16_t> { using type = uint32_t; };
template <> struct twice<int32_t> { using type = int64_t; };
template <> struct twice<uint32_t> { using type = uint64_t; };
#ifdef __SIZEOF_INT128__
template <> struct twice<int64_t> { using type = __int128_t; };
template <> struct twice<uint64_t> { using type = __uint128_t; };
#endif
template <class T> using twice_t = typename twice<T>::type; // for convenience
// Sign-extends val from FROM to TO: reinterpret as signed FROM, widen through
// signed TO (which replicates the sign bit), then convert to the result type.
template <typename TO, typename FROM> constexpr TO sext(FROM val) {
    using sfrom_t = std::make_signed_t<FROM>;
    using sto_t = std::make_signed_t<TO>;
    return static_cast<sto_t>(static_cast<sfrom_t>(val));
}
// Executes a unit-stride or strided (optionally segmented) vector load/store by
// calling load_store_fn once per element. Returns the element index of the
// first failing access (for vstart reporting), or 0 on full success.
template <unsigned VLEN, typename eew_t>
uint64_t vector_load_store(void* core, std::function<bool(void*, uint64_t, uint64_t, uint8_t*)> load_store_fn, uint8_t* V, uint64_t vl,
                           uint64_t vstart, vtype_t vtype, bool vm, uint8_t vd, uint64_t rs1, uint8_t segment_size, int64_t stride,
                           bool use_stride) {
    unsigned vlmax = VLEN * vtype.lmul() / vtype.sew();
    // distance (in elements) between the segment fields of one element
    auto emul_stride = std::max<unsigned>(vlmax, VLEN / (sizeof(eew_t) * 8));
    auto vd_view = get_vreg<VLEN, eew_t>(V, vd, emul_stride * segment_size);
    // NOTE(review): non-template read_vmask overload declared in vector_functions.h;
    // confirm its (VLEN, vlmax) argument order selects mask register v0 as intended
    vmask_view mask_reg = read_vmask(V, VLEN, vlmax);
    for(size_t idx = vstart; idx < vl; idx++) {
        bool mask_active = vm ? 1 : mask_reg[idx];
        if(mask_active) {
            // BUG FIX: was `signed stride_offset = stride * idx;` which truncated
            // the 64-bit product to int, corrupting addresses beyond +/-2 GiB
            int64_t stride_offset = stride * idx;
            auto seg_offset = use_stride ? 0 : segment_size * sizeof(eew_t) * idx;
            for(size_t s_idx = 0; s_idx < segment_size; s_idx++) {
                eew_t* addressed_elem = &vd_view[idx + emul_stride * s_idx];
                uint64_t addr = rs1 + stride_offset + seg_offset + s_idx * sizeof(eew_t);
                if(!load_store_fn(core, addr, sizeof(eew_t), reinterpret_cast<uint8_t*>(addressed_elem)))
                    return idx; // access fault: report the faulting element index
            }
        } else if(vtype.vma())
            for(size_t s_idx = 0; s_idx < segment_size; s_idx++)
                vd_view[idx + emul_stride * s_idx] = agnostic_behavior(vd_view[idx + emul_stride * s_idx]);
    }
    if(vtype.vta())
        for(size_t idx = vl; idx < vlmax; idx++)
            for(size_t s_idx = 0; s_idx < segment_size; s_idx++)
                vd_view[idx + emul_stride * s_idx] = agnostic_behavior(vd_view[idx + emul_stride * s_idx]);
    return 0;
}
// Executes an indexed (gather/scatter, optionally segmented) vector load/store:
// the byte offset of each element is taken from index register vs2 (eew_t wide),
// the data elements are sew_t wide. Returns the element index of the first
// failing access, or 0 on full success.
// eew for index registers, sew for data register
template <unsigned XLEN, unsigned VLEN, typename eew_t, typename sew_t>
uint64_t vector_load_store_index(void* core, std::function<bool(void*, uint64_t, uint64_t, uint8_t*)> load_store_fn, uint8_t* V,
                                 uint64_t vl, uint64_t vstart, vtype_t vtype, bool vm, uint8_t vd, uint64_t rs1, uint8_t vs2,
                                 uint8_t segment_size) {
    // All load stores are ordered in this implementation
    unsigned vlmax = VLEN * vtype.lmul() / vtype.sew();
    // distance (in elements) between the segment fields of one element
    auto emul_stride = std::max<unsigned>(vlmax, VLEN / (sizeof(sew_t) * 8));
    auto vd_view = get_vreg<VLEN, sew_t>(V, vd, emul_stride * segment_size);
    auto vs2_view = get_vreg<VLEN, eew_t>(V, vs2, vlmax);
    // NOTE(review): non-template read_vmask overload from vector_functions.h;
    // confirm the (VLEN, vlmax) argument order selects mask register v0
    vmask_view mask_reg = read_vmask(V, VLEN, vlmax);
    for(size_t idx = vstart; idx < vl; idx++) {
        bool mask_active = vm ? 1 : mask_reg[idx];
        if(mask_active) {
            // index is truncated to XLEN bits before being added to the base
            uint64_t index_offset = vs2_view[idx] & std::numeric_limits<std::conditional_t<XLEN == 32, uint32_t, uint64_t>>::max();
            for(size_t s_idx = 0; s_idx < segment_size; s_idx++) {
                sew_t* addressed_elem = &vd_view[idx + emul_stride * s_idx];
                uint64_t addr = rs1 + index_offset + s_idx * sizeof(sew_t);
                if(!load_store_fn(core, addr, sizeof(sew_t), reinterpret_cast<uint8_t*>(addressed_elem)))
                    return idx; // access fault: report the faulting element index
            }
        } else if(vtype.vma())
            for(size_t s_idx = 0; s_idx < segment_size; s_idx++)
                vd_view[idx + emul_stride * s_idx] = agnostic_behavior(vd_view[idx + emul_stride * s_idx]);
    }
    if(vtype.vta())
        for(size_t idx = vl; idx < vlmax; idx++)
            for(size_t s_idx = 0; s_idx < segment_size; s_idx++)
                vd_view[idx + emul_stride * s_idx] = agnostic_behavior(vd_view[idx + emul_stride * s_idx]);
    return 0;
}
// Selects the per-element integer operation for a funct6/funct3 encoding.
// The returned callable computes the new destination element from the previous
// destination value (vd, used by the multiply-add forms), vs2 and vs1/imm/rs1.
// Throws std::runtime_error for unhandled encodings.
// BUG FIX: previously used `throw new std::runtime_error(...)`, which throws a
// heap-allocated *pointer* — it leaks and is not caught by catch(std::exception&).
template <typename dest_elem_t, typename src2_elem_t = dest_elem_t, typename src1_elem_t = dest_elem_t>
std::function<dest_elem_t(dest_elem_t, src2_elem_t, src1_elem_t)> get_funct(unsigned funct6, unsigned funct3) {
    if(funct3 == OPIVV || funct3 == OPIVX || funct3 == OPIVI)
        switch(funct6) {
        case 0b000000: // VADD
            return [](dest_elem_t vd, src2_elem_t vs2, src1_elem_t vs1) { return vs2 + vs1; };
        case 0b000010: // VSUB
            return [](dest_elem_t vd, src2_elem_t vs2, src1_elem_t vs1) { return vs2 - vs1; };
        case 0b000011: // VRSUB
            return [](dest_elem_t vd, src2_elem_t vs2, src1_elem_t vs1) { return vs1 - vs2; };
        case 0b000100: // VMINU
            return [](dest_elem_t vd, src2_elem_t vs2, src1_elem_t vs1) { return std::min<src2_elem_t>(vs2, vs1); };
        case 0b000101: // VMIN
            return [](dest_elem_t vd, src2_elem_t vs2, src1_elem_t vs1) { return std::min<std::make_signed_t<src2_elem_t>>(vs2, vs1); };
        case 0b000110: // VMAXU
            return [](dest_elem_t vd, src2_elem_t vs2, src1_elem_t vs1) { return std::max<src2_elem_t>(vs2, vs1); };
        case 0b000111: // VMAX
            return [](dest_elem_t vd, src2_elem_t vs2, src1_elem_t vs1) { return std::max<std::make_signed_t<src2_elem_t>>(vs2, vs1); };
        case 0b001001: // VAND
            return [](dest_elem_t vd, src2_elem_t vs2, src1_elem_t vs1) { return vs1 & vs2; };
        case 0b001010: // VOR
            return [](dest_elem_t vd, src2_elem_t vs2, src1_elem_t vs1) { return vs1 | vs2; };
        case 0b001011: // VXOR
            return [](dest_elem_t vd, src2_elem_t vs2, src1_elem_t vs1) { return vs1 ^ vs2; };
        case 0b010000: // VADC (the carry-in itself is added by the caller)
            return [](dest_elem_t vd, src2_elem_t vs2, src1_elem_t vs1) { return vs2 + vs1; };
        case 0b010010: // VSBC (the borrow-in is subtracted by the caller)
            return [](dest_elem_t vd, src2_elem_t vs2, src1_elem_t vs1) {
                return static_cast<std::make_signed_t<dest_elem_t>>(static_cast<std::make_signed_t<src2_elem_t>>(vs2) -
                                                                    static_cast<std::make_signed_t<src1_elem_t>>(vs1));
            };
        case 0b100101: // VSLL
            return [](dest_elem_t vd, src2_elem_t vs2, src1_elem_t vs1) { return vs2 << (vs1 & shift_mask<src2_elem_t>()); };
        case 0b101000: // VSRL
            return [](dest_elem_t vd, src2_elem_t vs2, src1_elem_t vs1) { return vs2 >> (vs1 & shift_mask<src2_elem_t>()); };
        case 0b101001: // VSRA
            return [](dest_elem_t vd, src2_elem_t vs2, src1_elem_t vs1) {
                return static_cast<std::make_signed_t<src2_elem_t>>(vs2) >> (vs1 & shift_mask<src2_elem_t>());
            };
        case 0b101100: // VNSRL (narrowing: src2 is twice the destination width)
            return [](dest_elem_t vd, src2_elem_t vs2, src1_elem_t vs1) { return vs2 >> (vs1 & shift_mask<src2_elem_t>()); };
        case 0b101101: // VNSRA
            return [](dest_elem_t vd, src2_elem_t vs2, src1_elem_t vs1) {
                return static_cast<std::make_signed_t<src2_elem_t>>(vs2) >> (vs1 & shift_mask<src2_elem_t>());
            };
        default:
            throw std::runtime_error("Unknown funct6 in get_funct");
        }
    else if(funct3 == OPMVV || funct3 == OPMVX)
        switch(funct6) {
        case 0b100000: // VDIVU (division by zero yields all ones)
            return [](dest_elem_t vd, src2_elem_t vs2, src1_elem_t vs1) -> dest_elem_t {
                if(vs1 == 0)
                    return -1;
                else
                    return vs2 / vs1;
            };
        case 0b100001: // VDIV (div-by-zero -> -1, signed overflow -> dividend)
            return [](dest_elem_t vd, src2_elem_t vs2, src1_elem_t vs1) -> dest_elem_t {
                if(vs1 == 0)
                    return -1;
                else if(vs2 == std::numeric_limits<std::make_signed_t<src2_elem_t>>::min() &&
                        static_cast<std::make_signed_t<src1_elem_t>>(vs1) == -1)
                    return vs2;
                else
                    return static_cast<std::make_signed_t<src2_elem_t>>(vs2) / static_cast<std::make_signed_t<src1_elem_t>>(vs1);
            };
        case 0b100010: // VREMU (remainder by zero yields the dividend)
            return [](dest_elem_t vd, src2_elem_t vs2, src1_elem_t vs1) -> dest_elem_t {
                if(vs1 == 0)
                    return vs2;
                else
                    return vs2 % vs1;
            };
        case 0b100011: // VREM (rem-by-zero -> dividend, signed overflow -> 0)
            return [](dest_elem_t vd, src2_elem_t vs2, src1_elem_t vs1) -> dest_elem_t {
                if(vs1 == 0)
                    return vs2;
                else if(vs2 == std::numeric_limits<std::make_signed_t<src2_elem_t>>::min() &&
                        static_cast<std::make_signed_t<src1_elem_t>>(vs1) == -1)
                    return 0;
                else
                    return static_cast<std::make_signed_t<src2_elem_t>>(vs2) % static_cast<std::make_signed_t<src1_elem_t>>(vs1);
            };
        case 0b100100: // VMULHU (high half of the unsigned product)
            return [](dest_elem_t vd, src2_elem_t vs2, src1_elem_t vs1) {
                return (static_cast<twice_t<src2_elem_t>>(vs2) * static_cast<twice_t<src2_elem_t>>(vs1)) >> sizeof(dest_elem_t) * 8;
            };
        case 0b100101: // VMUL
            return [](dest_elem_t vd, src2_elem_t vs2, src1_elem_t vs1) {
                return static_cast<std::make_signed_t<src2_elem_t>>(vs2) * static_cast<std::make_signed_t<src1_elem_t>>(vs1);
            };
        case 0b100110: // VMULHSU (high half, signed vs2 * unsigned vs1)
            return [](dest_elem_t vd, src2_elem_t vs2, src1_elem_t vs1) {
                return (sext<twice_t<src2_elem_t>>(vs2) * static_cast<twice_t<src2_elem_t>>(vs1)) >> sizeof(dest_elem_t) * 8;
            };
        case 0b100111: // VMULH (high half of the signed product)
            return [](dest_elem_t vd, src2_elem_t vs2, src1_elem_t vs1) {
                return (sext<twice_t<src2_elem_t>>(vs2) * sext<twice_t<src1_elem_t>>(vs1)) >> sizeof(dest_elem_t) * 8;
            };
        case 0b101001: // VMADD: vd = vs1 * vd + vs2
            return [](dest_elem_t vd, src2_elem_t vs2, src1_elem_t vs1) { return vs1 * vd + vs2; };
        case 0b101011: // VNMSUB: vd = -(vs1 * vd) + vs2
            return [](dest_elem_t vd, src2_elem_t vs2, src1_elem_t vs1) { return -1 * (vs1 * vd) + vs2; };
        case 0b101101: // VMACC: vd = vs1 * vs2 + vd
            return [](dest_elem_t vd, src2_elem_t vs2, src1_elem_t vs1) { return vs1 * vs2 + vd; };
        case 0b101111: // VNMSAC: vd = -(vs1 * vs2) + vd
            return [](dest_elem_t vd, src2_elem_t vs2, src1_elem_t vs1) { return -1 * (vs1 * vs2) + vd; };
        case 0b110000: // VWADDU (widening: dest_elem_t is twice the source width)
            return [](dest_elem_t vd, src2_elem_t vs2, src1_elem_t vs1) {
                return static_cast<dest_elem_t>(vs2) + static_cast<dest_elem_t>(vs1);
            };
        case 0b110001: // VWADD
            return [](dest_elem_t vd, src2_elem_t vs2, src1_elem_t vs1) { return sext<dest_elem_t>(vs2) + sext<dest_elem_t>(vs1); };
        case 0b110010: // VWSUBU
            return [](dest_elem_t vd, src2_elem_t vs2, src1_elem_t vs1) {
                return static_cast<dest_elem_t>(vs2) - static_cast<dest_elem_t>(vs1);
            };
        case 0b110011: // VWSUB
            return [](dest_elem_t vd, src2_elem_t vs2, src1_elem_t vs1) { return sext<dest_elem_t>(vs2) - sext<dest_elem_t>(vs1); };
        case 0b110100: // VWADDU.W (vs2 already wide; caller instantiates accordingly)
            return [](dest_elem_t vd, src2_elem_t vs2, src1_elem_t vs1) {
                return static_cast<dest_elem_t>(vs2) + static_cast<dest_elem_t>(vs1);
            };
        case 0b110101: // VWADD.W
            return [](dest_elem_t vd, src2_elem_t vs2, src1_elem_t vs1) { return sext<dest_elem_t>(vs2) + sext<dest_elem_t>(vs1); };
        case 0b110110: // VWSUBU.W
            return [](dest_elem_t vd, src2_elem_t vs2, src1_elem_t vs1) {
                return static_cast<dest_elem_t>(vs2) - static_cast<dest_elem_t>(vs1);
            };
        case 0b110111: // VWSUB.W
            return [](dest_elem_t vd, src2_elem_t vs2, src1_elem_t vs1) { return sext<dest_elem_t>(vs2) - sext<dest_elem_t>(vs1); };
        case 0b111000: // VWMULU
            return [](dest_elem_t vd, src2_elem_t vs2, src1_elem_t vs1) {
                return (static_cast<dest_elem_t>(vs2) * static_cast<dest_elem_t>(vs1));
            };
        case 0b111010: // VWMULSU
            return [](dest_elem_t vd, src2_elem_t vs2, src1_elem_t vs1) { return sext<dest_elem_t>(vs2) * static_cast<dest_elem_t>(vs1); };
        case 0b111011: // VWMUL
            return [](dest_elem_t vd, src2_elem_t vs2, src1_elem_t vs1) { return sext<dest_elem_t>(vs2) * sext<dest_elem_t>(vs1); };
        case 0b111100: // VWMACCU
            return [](dest_elem_t vd, src2_elem_t vs2, src1_elem_t vs1) {
                return static_cast<dest_elem_t>(vs1) * static_cast<dest_elem_t>(vs2) + vd;
            };
        case 0b111101: // VWMACC
            return [](dest_elem_t vd, src2_elem_t vs2, src1_elem_t vs1) { return sext<dest_elem_t>(vs1) * sext<dest_elem_t>(vs2) + vd; };
        case 0b111110: // VWMACCUS
            return [](dest_elem_t vd, src2_elem_t vs2, src1_elem_t vs1) {
                return static_cast<dest_elem_t>(vs1) * sext<dest_elem_t>(vs2) + vd;
            };
        case 0b111111: // VWMACCSU
            return [](dest_elem_t vd, src2_elem_t vs2, src1_elem_t vs1) {
                return sext<dest_elem_t>(vs1) * static_cast<dest_elem_t>(vs2) + vd;
            };
        default:
            throw std::runtime_error("Unknown funct6 in get_funct");
        }
    else
        throw std::runtime_error("Unknown funct3 in get_funct");
}
// Applies the funct6/funct3-selected binary op element-wise: vd = op(vd, vs2, vs1),
// honoring the v0 mask and the vma/vta element policies.
template <unsigned VLEN, typename dest_elem_t, typename src2_elem_t, typename src1_elem_t>
void vector_vector_op(uint8_t* V, unsigned funct6, unsigned funct3, uint64_t vl, uint64_t vstart, vtype_t vtype, bool vm, unsigned vd,
                      unsigned vs2, unsigned vs1) {
    uint64_t const elem_count = VLEN * vtype.lmul() / vtype.sew();
    vmask_view mask_reg = read_vmask<VLEN>(V, elem_count);
    auto src1 = get_vreg<VLEN, src1_elem_t>(V, vs1, elem_count);
    auto src2 = get_vreg<VLEN, src2_elem_t>(V, vs2, elem_count);
    auto dest = get_vreg<VLEN, dest_elem_t>(V, vd, elem_count);
    auto op = get_funct<dest_elem_t, src2_elem_t, src1_elem_t>(funct6, funct3);
    for(size_t i = vstart; i < vl; ++i) {
        if(vm || mask_reg[i]) {
            dest[i] = op(dest[i], src2[i], src1[i]);
            continue;
        }
        // masked-off body element: apply mask-agnostic policy when requested
        if(vtype.vma())
            dest[i] = agnostic_behavior(dest[i]);
    }
    // tail elements past vl
    if(vtype.vta())
        for(size_t i = vl; i < elem_count; ++i)
            dest[i] = agnostic_behavior(dest[i]);
}
// Applies the funct6/funct3-selected binary op with an immediate/scalar operand:
// vd = op(vd, vs2, imm), honoring the v0 mask and the vma/vta element policies.
template <unsigned VLEN, typename dest_elem_t, typename src2_elem_t, typename src1_elem_t>
void vector_imm_op(uint8_t* V, unsigned funct6, unsigned funct3, uint64_t vl, uint64_t vstart, vtype_t vtype, bool vm, unsigned vd,
                   unsigned vs2, typename std::make_signed<src1_elem_t>::type imm) {
    uint64_t vlmax = VLEN * vtype.lmul() / vtype.sew();
    vmask_view mask_reg = read_vmask<VLEN>(V, vlmax);
    auto vs2_view = get_vreg<VLEN, src2_elem_t>(V, vs2, vlmax);
    auto vd_view = get_vreg<VLEN, dest_elem_t>(V, vd, vlmax);
    auto fn = get_funct<dest_elem_t, src2_elem_t, src1_elem_t>(funct6, funct3);
    for(size_t idx = vstart; idx < vl; idx++) {
        bool mask_active = vm ? 1 : mask_reg[idx];
        if(mask_active) {
            vd_view[idx] = fn(vd_view[idx], vs2_view[idx], imm);
        } else if(vtype.vma()) {
            // BUG FIX: was `vd_view[idx] = vtype.vma() ? vd_view[idx] : vd_view[idx];`,
            // a no-op in both branches; masked-off elements must get the
            // mask-agnostic policy like every sibling *_op helper applies.
            vd_view[idx] = agnostic_behavior(vd_view[idx]);
        }
    }
    if(vtype.vta())
        for(size_t idx = vl; idx < vlmax; idx++)
            vd_view[idx] = agnostic_behavior(vd_view[idx]);
}
template <unsigned VLEN, typename elem_t>
void vector_vector_carry(uint8_t* V, unsigned funct6, unsigned funct3, uint64_t vl, uint64_t vstart, vtype_t vtype, unsigned vd,
unsigned vs2, unsigned vs1, signed carry) {
uint64_t vlmax = VLEN * vtype.lmul() / vtype.sew();
vmask_view mask_reg = read_vmask<VLEN>(V, vlmax);
auto vs1_view = get_vreg<VLEN, elem_t>(V, vs1, vlmax);
auto vs2_view = get_vreg<VLEN, elem_t>(V, vs2, vlmax);
auto vd_view = get_vreg<VLEN, elem_t>(V, vd, vlmax);
auto fn = get_funct<elem_t, elem_t, elem_t>(funct6, funct3);
for(size_t idx = vstart; idx < vl; idx++)
vd_view[idx] = fn(vd_view[idx], vs2_view[idx], vs1_view[idx]) + carry * mask_reg[idx];
if(vtype.vta())
for(size_t idx = vl; idx < vlmax; idx++)
vd_view[idx] = agnostic_behavior(vd_view[idx]);
}
template <unsigned VLEN, typename elem_t>
void vector_imm_carry(uint8_t* V, unsigned funct6, unsigned funct3, uint64_t vl, uint64_t vstart, vtype_t vtype, unsigned vd, unsigned vs2,
typename std::make_signed<elem_t>::type imm, signed carry) {
uint64_t vlmax = VLEN * vtype.lmul() / vtype.sew();
vmask_view mask_reg = read_vmask<VLEN>(V, vlmax);
auto vs2_view = get_vreg<VLEN, elem_t>(V, vs2, vlmax);
auto vd_view = get_vreg<VLEN, elem_t>(V, vd, vlmax);
auto fn = get_funct<elem_t, elem_t, elem_t>(funct6, funct3);
for(size_t idx = vstart; idx < vl; idx++)
vd_view[idx] = fn(vd_view[idx], vs2_view[idx], imm) + carry * mask_reg[idx];
if(vtype.vta())
for(size_t idx = vl; idx < vlmax; idx++)
vd_view[idx] = agnostic_behavior(vd_view[idx]);
}
template <unsigned VLEN, typename scr_elem_t>
void vector_vector_merge(uint8_t* V, uint64_t vl, uint64_t vstart, vtype_t vtype, bool vm, unsigned vd, unsigned vs2, unsigned vs1) {
uint64_t vlmax = VLEN * vtype.lmul() / vtype.sew();
vmask_view mask_reg = read_vmask<VLEN>(V, vlmax);
auto vs1_view = get_vreg<VLEN, scr_elem_t>(V, vs1, vlmax);
auto vs2_view = get_vreg<VLEN, scr_elem_t>(V, vs2, vlmax);
auto vd_view = get_vreg<VLEN, scr_elem_t>(V, vd, vlmax);
for(size_t idx = vstart; idx < vl; idx++) {
bool mask_active = vm ? 1 : mask_reg[idx];
if(mask_active)
vd_view[idx] = vs1_view[idx];
else
vd_view[idx] = vs2_view[idx];
}
}
template <unsigned VLEN, typename scr_elem_t>
void vector_imm_merge(uint8_t* V, uint64_t vl, uint64_t vstart, vtype_t vtype, bool vm, unsigned vd, unsigned vs2, uint64_t imm) {
uint64_t vlmax = VLEN * vtype.lmul() / vtype.sew();
vmask_view mask_reg = read_vmask<VLEN>(V, vlmax);
auto vs2_view = get_vreg<VLEN, scr_elem_t>(V, vs2, vlmax);
auto vd_view = get_vreg<VLEN, scr_elem_t>(V, vd, vlmax);
for(size_t idx = vstart; idx < vl; idx++) {
bool mask_active = vm ? 1 : mask_reg[idx];
if(mask_active)
vd_view[idx] = imm;
else
vd_view[idx] = vs2_view[idx];
}
}
// Selects the per-element predicate producing one destination mask bit:
// integer compares for OPI* encodings, mask-logical ops for OPM* encodings.
// Throws std::runtime_error for unhandled encodings.
// BUG FIX: previously used `throw new std::runtime_error(...)` (throws a leaked
// pointer that catch(std::exception&) never sees); now throws by value.
template <typename elem_t> std::function<bool(elem_t, elem_t)> get_mask_funct(unsigned funct6, unsigned funct3) {
    if(funct3 == OPIVV || funct3 == OPIVX || funct3 == OPIVI)
        switch(funct6) {
        case 0b011000: // VMSEQ
            return [](elem_t vs2, elem_t vs1) { return vs2 == vs1; };
        case 0b011001: // VMSNE
            return [](elem_t vs2, elem_t vs1) { return vs2 != vs1; };
        case 0b011010: // VMSLTU
            return [](elem_t vs2, elem_t vs1) { return vs2 < vs1; };
        case 0b011011: // VMSLT
            return [](elem_t vs2, elem_t vs1) {
                return static_cast<std::make_signed_t<elem_t>>(vs2) < static_cast<std::make_signed_t<elem_t>>(vs1);
            };
        case 0b011100: // VMSLEU
            return [](elem_t vs2, elem_t vs1) { return vs2 <= vs1; };
        case 0b011101: // VMSLE
            return [](elem_t vs2, elem_t vs1) {
                return static_cast<std::make_signed_t<elem_t>>(vs2) <= static_cast<std::make_signed_t<elem_t>>(vs1);
            };
        case 0b011110: // VMSGTU
            return [](elem_t vs2, elem_t vs1) { return vs2 > vs1; };
        case 0b011111: // VMSGT
            return [](elem_t vs2, elem_t vs1) {
                return static_cast<std::make_signed_t<elem_t>>(vs2) > static_cast<std::make_signed_t<elem_t>>(vs1);
            };
        default:
            throw std::runtime_error("Unknown funct6 in get_mask_funct");
        }
    else if(funct3 == OPMVV || funct3 == OPMVX)
        // mask-logical forms; operands are 0/1 mask bits here, so logical ! works
        switch(funct6) {
        case 0b011000: // VMANDN
            return [](elem_t vs2, elem_t vs1) { return vs2 & !vs1; };
        case 0b011001: // VMAND
            return [](elem_t vs2, elem_t vs1) { return vs2 & vs1; };
        case 0b011010: // VMOR
            return [](elem_t vs2, elem_t vs1) { return vs2 | vs1; };
        case 0b011011: // VMXOR
            return [](elem_t vs2, elem_t vs1) { return vs2 ^ vs1; };
        case 0b011100: // VMORN
            return [](elem_t vs2, elem_t vs1) { return vs2 | !vs1; };
        case 0b011101: // VMNAND
            return [](elem_t vs2, elem_t vs1) { return !(vs2 & vs1); };
        case 0b011110: // VMNOR
            return [](elem_t vs2, elem_t vs1) { return !(vs2 | vs1); };
        case 0b011111: // VMXNOR
            return [](elem_t vs2, elem_t vs1) { return !(vs2 ^ vs1); };
        default:
            throw std::runtime_error("Unknown funct6 in get_mask_funct");
        }
    else
        throw std::runtime_error("Unknown funct3 in get_mask_funct");
}
template <unsigned VLEN, typename elem_t>
void mask_vector_vector_op(uint8_t* V, unsigned funct6, unsigned funct3, uint64_t vl, uint64_t vstart, vtype_t vtype, bool vm, unsigned vd,
unsigned vs2, unsigned vs1) {
uint64_t vlmax = VLEN * vtype.lmul() / vtype.sew();
vmask_view mask_reg = read_vmask<VLEN>(V, vlmax);
auto vs1_view = get_vreg<VLEN, elem_t>(V, vs1, vlmax);
auto vs2_view = get_vreg<VLEN, elem_t>(V, vs2, vlmax);
vmask_view vd_mask_view = read_vmask<VLEN>(V, VLEN, vd);
auto fn = get_mask_funct<elem_t>(funct6, funct3);
for(size_t idx = vstart; idx < vl; idx++) {
bool mask_active = vm ? 1 : mask_reg[idx];
if(mask_active)
vd_mask_view[idx] = fn(vs2_view[idx], vs1_view[idx]);
else if(vtype.vma())
vd_mask_view[idx] = agnostic_behavior(vd_mask_view[idx]);
}
if(vtype.vta())
for(size_t idx = vl; idx < VLEN; idx++)
vd_mask_view[idx] = agnostic_behavior(vd_mask_view[idx]);
}
template <unsigned VLEN, typename elem_t>
void mask_vector_imm_op(uint8_t* V, unsigned funct6, unsigned funct3, uint64_t vl, uint64_t vstart, vtype_t vtype, bool vm, unsigned vd,
unsigned vs2, typename std::make_signed<elem_t>::type imm) {
uint64_t vlmax = VLEN * vtype.lmul() / vtype.sew();
vmask_view mask_reg = read_vmask<VLEN>(V, vlmax);
auto vs2_view = get_vreg<VLEN, elem_t>(V, vs2, vlmax);
vmask_view vd_mask_view = read_vmask<VLEN>(V, VLEN, vd);
auto fn = get_mask_funct<elem_t>(funct6, funct3);
for(size_t idx = vstart; idx < vl; idx++) {
bool mask_active = vm ? 1 : mask_reg[idx];
if(mask_active)
vd_mask_view[idx] = fn(vs2_view[idx], imm);
else if(vtype.vma())
vd_mask_view[idx] = agnostic_behavior(vd_mask_view[idx]);
}
if(vtype.vta())
for(size_t idx = vl; idx < VLEN; idx++)
vd_mask_view[idx] = agnostic_behavior(vd_mask_view[idx]);
}
// Selects the widening conversion for VSEXT/VZEXT: the sign-extending form
// routes the value through the signed source type (the conversion to the wider
// dest_elem_t then replicates the sign bit); the zero-extending form widens the
// unsigned value directly.
// Throws std::runtime_error for unknown encodings.
// BUG FIX: previously `throw new std::runtime_error(...)` threw a heap-allocated
// pointer (leak, not caught by catch(std::exception&)); now throws by value.
template <typename dest_elem_t, typename src2_elem_t = dest_elem_t>
std::function<dest_elem_t(src2_elem_t)> get_unary_fn(unsigned unary_op) {
    switch(unary_op) {
    case 0b00111: // VSEXT.VF2
    case 0b00101: // VSEXT.VF4
    case 0b00011: // VSEXT.VF8
        return [](src2_elem_t vs2) { return static_cast<std::make_signed_t<src2_elem_t>>(vs2); };
    case 0b00110: // VZEXT.VF2
    case 0b00100: // VZEXT.VF4
    case 0b00010: // VZEXT.VF8
        return [](src2_elem_t vs2) { return vs2; };
    default:
        throw std::runtime_error("Unknown funct in get_unary_fn");
    }
}
// Applies a VSEXT/VZEXT-style unary conversion element-wise: vd = fn(vs2),
// honoring the v0 mask and the vma/vta element policies.
template <unsigned VLEN, typename dest_elem_t, typename src2_elem_t>
void vector_unary_op(uint8_t* V, unsigned unary_op, uint64_t vl, uint64_t vstart, vtype_t vtype, bool vm, unsigned vd, unsigned vs2) {
    uint64_t vlmax = VLEN * vtype.lmul() / vtype.sew();
    vmask_view mask_reg = read_vmask<VLEN>(V, vlmax);
    auto vs2_view = get_vreg<VLEN, src2_elem_t>(V, vs2, vlmax);
    auto vd_view = get_vreg<VLEN, dest_elem_t>(V, vd, vlmax);
    auto fn = get_unary_fn<dest_elem_t, src2_elem_t>(unary_op);
    for(size_t idx = vstart; idx < vl; idx++) {
        bool mask_active = vm ? 1 : mask_reg[idx];
        if(mask_active) {
            vd_view[idx] = fn(vs2_view[idx]);
        } else if(vtype.vma()) {
            // BUG FIX: was `vd_view[idx] = vtype.vma() ? vd_view[idx] : vd_view[idx];`,
            // a no-op in both branches; apply the mask-agnostic policy as the
            // sibling *_op helpers do.
            vd_view[idx] = agnostic_behavior(vd_view[idx]);
        }
    }
    if(vtype.vta())
        for(size_t idx = vl; idx < vlmax; idx++)
            vd_view[idx] = agnostic_behavior(vd_view[idx]);
}
// Selects the carry/borrow-out predicate for VMADC/VMSBC: given vs2, vs1 and
// the carry/borrow-in, returns whether the (unsigned) add/sub wraps.
// Throws std::runtime_error for unknown encodings.
// BUG FIX: previously `throw new std::runtime_error(...)` threw a leaked
// pointer; now throws by value so catch(std::exception&) works.
template <typename elem_t> std::function<bool(elem_t, elem_t, elem_t)> get_carry_funct(unsigned funct) {
    switch(funct) {
    case 0b010001: // VMADC: carry-out of vs2 + vs1 + carry
        return [](elem_t vs2, elem_t vs1, elem_t carry) {
            return static_cast<elem_t>(vs2 + vs1 + carry) < std::max(vs1, vs2) || static_cast<elem_t>(vs2 + vs1) < std::max(vs1, vs2);
        };
    case 0b010011: // VMSBC: borrow-out of vs2 - vs1 - borrow
        return [](elem_t vs2, elem_t vs1, elem_t carry) {
            return vs2 < static_cast<elem_t>(vs1 + carry) || (vs1 == std::numeric_limits<elem_t>::max() && carry);
        };
    default:
        throw std::runtime_error("Unknown funct in get_carry_funct");
    }
}
// VMADC/VMSBC (vector-vector): computes the per-element carry/borrow-out into
// mask register vd; the carry/borrow-in comes from v0 when vm == 0.
template <unsigned VLEN, typename elem_t>
void carry_vector_vector_op(uint8_t* V, unsigned funct, uint64_t vl, uint64_t vstart, vtype_t vtype, bool vm, unsigned vd, unsigned vs2,
                            unsigned vs1) {
    uint64_t vlmax = VLEN * vtype.lmul() / vtype.sew();
    vmask_view mask_reg = read_vmask<VLEN>(V, vlmax);
    auto vs1_view = get_vreg<VLEN, elem_t>(V, vs1, vlmax);
    auto vs2_view = get_vreg<VLEN, elem_t>(V, vs2, vlmax);
    vmask_view vd_mask_view = read_vmask<VLEN>(V, vlmax, vd);
    auto fn = get_carry_funct<elem_t>(funct);
    for(size_t idx = vstart; idx < vl; idx++) {
        // carry-in is forced to 0 when the instruction has no mask operand (vm set)
        elem_t carry = vm ? 0 : mask_reg[idx];
        vd_mask_view[idx] = fn(vs2_view[idx], vs1_view[idx], carry);
    }
    // tail bits are rewritten unconditionally (no vta() guard) — presumably because
    // mask-producing instructions always treat the tail as agnostic; TODO confirm
    for(size_t idx = vl; idx < vlmax; idx++)
        vd_mask_view[idx] = agnostic_behavior(vd_mask_view[idx]);
}
template <unsigned VLEN, typename elem_t>
void carry_vector_imm_op(uint8_t* V, unsigned funct, uint64_t vl, uint64_t vstart, vtype_t vtype, bool vm, unsigned vd, unsigned vs2,
typename std::make_signed<elem_t>::type imm) {
uint64_t vlmax = VLEN * vtype.lmul() / vtype.sew();
vmask_view mask_reg = read_vmask<VLEN>(V, vlmax);
auto vs2_view = get_vreg<VLEN, elem_t>(V, vs2, vlmax);
vmask_view vd_mask_view = read_vmask<VLEN>(V, vlmax, vd);
auto fn = get_carry_funct<elem_t>(funct);
for(size_t idx = vstart; idx < vl; idx++) {
elem_t carry = vm ? 0 : mask_reg[idx];
vd_mask_view[idx] = fn(vs2_view[idx], imm, carry);
}
for(size_t idx = vl; idx < vlmax; idx++)
vd_mask_view[idx] = agnostic_behavior(vd_mask_view[idx]);
}
// Computes the rounding increment for discarding the low d bits of v under the
// vxrm fixed-point rounding mode (0=RNU round-to-nearest-up, 1=RNE
// round-to-nearest-even, 2=RDN truncate, 3=ROD round-to-odd).
template <typename T> bool get_rounding_increment(T v, uint64_t d, int64_t vxrm) {
    if(d == 0)
        return 0; // nothing is discarded, nothing to round
    switch(vxrm & 0b11) {
    case 0b00: // RNU: increment if the highest discarded bit is set
        return (v >> (d - 1)) & 1;
    case 0b01: // RNE: like RNU, but ties (only the highest discarded bit set) round to even
        // BUG FIX: the low-bits mask was built from the int literal `1` —
        // `(1 << (d - 1))` is undefined/truncated for d > 32 with 64/128-bit T;
        // shift a value of type T as the ROD case below already did.
        return ((v >> (d - 1)) & 1) && (((v & ((static_cast<T>(1) << (d - 1)) - 1)) != 0) || ((v >> d) & 1));
    case 0b10: // RDN: truncate
        return false;
    case 0b11: // ROD: set the result lsb if any bit is discarded
        return (!(v & (static_cast<T>(1) << d)) && ((v & ((static_cast<T>(1) << d) - 1)) != 0));
    }
    return false;
}
// Discards the low d bits of v, rounding per the vxrm fixed-point mode.
template <typename T> T roundoff(T v, uint64_t d, int64_t vxrm) {
    T const shifted = v >> d;
    unsigned const increment = get_rounding_increment(v, d, vxrm);
    return shifted + increment;
}
// Selects the fixed-point / saturating per-element operation. The callable
// writes the result into vd (by reference) and returns whether it saturated
// (feeds the vxsat flag).
// Throws std::runtime_error for unhandled encodings.
// BUG FIX: previously used `throw new std::runtime_error(...)` (throws a leaked
// pointer, invisible to catch(std::exception&)); now throws by value.
template <typename dest_elem_t, typename src2_elem_t = dest_elem_t, typename src1_elem_t = dest_elem_t>
std::function<bool(uint64_t, vtype_t, dest_elem_t&, src2_elem_t, src1_elem_t)> get_sat_funct(unsigned funct6, unsigned funct3) {
    if(funct3 == OPIVV || funct3 == OPIVX || funct3 == OPIVI)
        switch(funct6) {
        case 0b100000: // VSADDU: unsigned add, clamp to max
            return [](uint64_t vxrm, vtype_t vtype, dest_elem_t& vd, src2_elem_t vs2, src1_elem_t vs1) {
                auto res = static_cast<twice_t<src1_elem_t>>(vs2) + static_cast<twice_t<src1_elem_t>>(vs1);
                if(res > std::numeric_limits<dest_elem_t>::max()) {
                    vd = std::numeric_limits<dest_elem_t>::max();
                    return 1;
                } else {
                    vd = res;
                    return 0;
                }
            };
        case 0b100001: // VSADD: signed add, clamp to [min, max]
            return [](uint64_t vxrm, vtype_t vtype, dest_elem_t& vd, src2_elem_t vs2, src1_elem_t vs1) {
                auto res = static_cast<twice_t<std::make_signed_t<src2_elem_t>>>(static_cast<std::make_signed_t<src2_elem_t>>(vs2)) +
                           static_cast<twice_t<std::make_signed_t<src1_elem_t>>>(static_cast<std::make_signed_t<src1_elem_t>>(vs1));
                if(res < std::numeric_limits<std::make_signed_t<dest_elem_t>>::min()) {
                    vd = std::numeric_limits<std::make_signed_t<dest_elem_t>>::min();
                    return 1;
                } else if(res > std::numeric_limits<std::make_signed_t<dest_elem_t>>::max()) {
                    vd = std::numeric_limits<std::make_signed_t<dest_elem_t>>::max();
                    return 1;
                } else {
                    vd = res;
                    return 0;
                }
            };
        case 0b100010: // VSSUBU: unsigned subtract, clamp to 0
            return [](uint64_t vxrm, vtype_t vtype, dest_elem_t& vd, src2_elem_t vs2, src1_elem_t vs1) {
                if(vs2 < vs1) {
                    vd = 0;
                    return 1;
                } else {
                    vd = vs2 - vs1;
                    return 0;
                }
            };
        case 0b100011: // VSSUB: signed subtract, clamp to [min, max]
            return [](uint64_t vxrm, vtype_t vtype, dest_elem_t& vd, src2_elem_t vs2, src1_elem_t vs1) {
                auto res = static_cast<twice_t<std::make_signed_t<src2_elem_t>>>(static_cast<std::make_signed_t<src2_elem_t>>(vs2)) -
                           static_cast<twice_t<std::make_signed_t<src1_elem_t>>>(static_cast<std::make_signed_t<src1_elem_t>>(vs1));
                if(res < std::numeric_limits<std::make_signed_t<dest_elem_t>>::min()) {
                    vd = std::numeric_limits<std::make_signed_t<dest_elem_t>>::min();
                    return 1;
                } else if(res > std::numeric_limits<std::make_signed_t<dest_elem_t>>::max()) {
                    vd = std::numeric_limits<std::make_signed_t<dest_elem_t>>::max();
                    return 1;
                } else {
                    vd = res;
                    return 0;
                }
            };
        case 0b100111: // VSMUL: signed multiply, shift out SEW-1 bits with rounding, saturate
            return [](uint64_t vxrm, vtype_t vtype, dest_elem_t& vd, src2_elem_t vs2, src1_elem_t vs1) {
                auto big_val = static_cast<twice_t<std::make_signed_t<src2_elem_t>>>(static_cast<std::make_signed_t<src2_elem_t>>(vs2)) *
                               static_cast<twice_t<std::make_signed_t<src1_elem_t>>>(static_cast<std::make_signed_t<src1_elem_t>>(vs1));
                auto res = roundoff(big_val, vtype.sew() - 1, vxrm);
                if(res < std::numeric_limits<std::make_signed_t<dest_elem_t>>::min()) {
                    vd = std::numeric_limits<std::make_signed_t<dest_elem_t>>::min();
                    return 1;
                } else if(res > std::numeric_limits<std::make_signed_t<dest_elem_t>>::max()) {
                    vd = std::numeric_limits<std::make_signed_t<dest_elem_t>>::max();
                    return 1;
                } else {
                    vd = res;
                    return 0;
                }
            };
        case 0b101010: // VSSRL: logical shift right with rounding (never saturates)
            return [](uint64_t vxrm, vtype_t vtype, dest_elem_t& vd, src2_elem_t vs2, src1_elem_t vs1) {
                vd = roundoff(vs2, vs1 & shift_mask<src1_elem_t>(), vxrm);
                return 0;
            };
        case 0b101011: // VSSRA: arithmetic shift right with rounding (never saturates)
            return [](uint64_t vxrm, vtype_t vtype, dest_elem_t& vd, src2_elem_t vs2, src1_elem_t vs1) {
                vd = roundoff(static_cast<std::make_signed_t<src2_elem_t>>(vs2), vs1 & shift_mask<src1_elem_t>(), vxrm);
                return 0;
            };
        case 0b101110: // VNCLIPU: narrowing shift with rounding, clamp to unsigned max
            return [](uint64_t vxrm, vtype_t vtype, dest_elem_t& vd, src2_elem_t vs2, src1_elem_t vs1) {
                auto res = roundoff(vs2, vs1 & shift_mask<src2_elem_t>(), vxrm);
                if(res > std::numeric_limits<dest_elem_t>::max()) {
                    vd = std::numeric_limits<dest_elem_t>::max();
                    return 1;
                } else {
                    vd = res;
                    return 0;
                }
            };
        case 0b101111: // VNCLIP: narrowing shift with rounding, clamp to signed range
            return [](uint64_t vxrm, vtype_t vtype, dest_elem_t& vd, src2_elem_t vs2, src1_elem_t vs1) {
                auto res = roundoff(static_cast<std::make_signed_t<src2_elem_t>>(vs2), vs1 & shift_mask<src2_elem_t>(), vxrm);
                if(res < std::numeric_limits<std::make_signed_t<dest_elem_t>>::min()) {
                    vd = std::numeric_limits<std::make_signed_t<dest_elem_t>>::min();
                    return 1;
                } else if(res > std::numeric_limits<std::make_signed_t<dest_elem_t>>::max()) {
                    vd = std::numeric_limits<std::make_signed_t<dest_elem_t>>::max();
                    return 1;
                } else {
                    vd = res;
                    return 0;
                }
            };
        default:
            throw std::runtime_error("Unknown funct6 in get_sat_funct");
        }
    else if(funct3 == OPMVV || funct3 == OPMVX)
        switch(funct6) {
        case 0b001000: // VAADDU: unsigned averaging add (round, never saturates)
            return [](uint64_t vxrm, vtype_t vtype, dest_elem_t& vd, src2_elem_t vs2, src1_elem_t vs1) {
                auto res = static_cast<dest_elem_t>(vs2) + static_cast<twice_t<src1_elem_t>>(vs1);
                vd = roundoff(res, 1, vxrm);
                return 0;
            };
        case 0b001001: // VAADD: signed averaging add
            return [](uint64_t vxrm, vtype_t vtype, dest_elem_t& vd, src2_elem_t vs2, src1_elem_t vs1) {
                auto res = sext<twice_t<src2_elem_t>>(vs2) + sext<twice_t<src1_elem_t>>(vs1);
                vd = roundoff(res, 1, vxrm);
                return 0;
            };
        case 0b001010: // VASUBU: unsigned averaging subtract
            return [](uint64_t vxrm, vtype_t vtype, dest_elem_t& vd, src2_elem_t vs2, src1_elem_t vs1) {
                auto res = static_cast<dest_elem_t>(vs2) - static_cast<twice_t<src1_elem_t>>(vs1);
                vd = roundoff(res, 1, vxrm);
                return 0;
            };
        case 0b001011: // VASUB: signed averaging subtract
            return [](uint64_t vxrm, vtype_t vtype, dest_elem_t& vd, src2_elem_t vs2, src1_elem_t vs1) {
                auto res = sext<twice_t<src2_elem_t>>(vs2) - sext<twice_t<src1_elem_t>>(vs1);
                vd = roundoff(res, 1, vxrm);
                return 0;
            };
        default:
            throw std::runtime_error("Unknown funct6 in get_sat_funct");
        }
    else
        throw std::runtime_error("Unknown funct3 in get_sat_funct");
}
// Applies the saturating/averaging operation selected by funct6/funct3 element-wise:
// vd[i] = fn(vd[i], vs2[i], vs1[i]) for active elements in [vstart, vl).
// vxrm selects the fixed-point rounding mode consumed by the element function.
// Returns true if any active element saturated, so the caller can set vxsat.
template <unsigned VLEN, typename dest_elem_t, typename src2_elem_t, typename src1_elem_t>
bool sat_vector_vector_op(uint8_t* V, unsigned funct6, unsigned funct3, uint64_t vl, uint64_t vstart, vtype_t vtype, int64_t vxrm, bool vm,
                          unsigned vd, unsigned vs2, unsigned vs1) {
    uint64_t vlmax = VLEN * vtype.lmul() / vtype.sew();
    bool saturated = false;
    vmask_view mask_reg = read_vmask<VLEN>(V, vlmax);
    auto vs1_view = get_vreg<VLEN, src1_elem_t>(V, vs1, vlmax);
    auto vs2_view = get_vreg<VLEN, src2_elem_t>(V, vs2, vlmax);
    auto vd_view = get_vreg<VLEN, dest_elem_t>(V, vd, vlmax);
    auto fn = get_sat_funct<dest_elem_t, src2_elem_t, src1_elem_t>(funct6, funct3);
    for(size_t idx = vstart; idx < vl; idx++) {
        bool mask_active = vm ? 1 : mask_reg[idx];
        if(mask_active) {
            saturated |= fn(vxrm, vtype, vd_view[idx], vs2_view[idx], vs1_view[idx]);
        } else if(vtype.vma()) {
            // mask-agnostic: inactive elements may be overwritten (the previous
            // code self-assigned vd_view[idx] in both arms, a no-op; this now
            // matches the policy used in fp_vector_vector_op)
            vd_view[idx] = agnostic_behavior(vd_view[idx]);
        }
    }
    // tail-agnostic handling for elements past vl
    if(vtype.vta())
        for(size_t idx = vl; idx < vlmax; idx++) {
            vd_view[idx] = agnostic_behavior(vd_view[idx]);
        }
    return saturated;
}
// Applies the saturating/averaging operation selected by funct6/funct3 with a
// scalar/immediate rs1 operand: vd[i] = fn(vd[i], vs2[i], imm) for active
// elements in [vstart, vl). Returns true if any active element saturated.
template <unsigned VLEN, typename dest_elem_t, typename src2_elem_t, typename src1_elem_t>
bool sat_vector_imm_op(uint8_t* V, unsigned funct6, unsigned funct3, uint64_t vl, uint64_t vstart, vtype_t vtype, int64_t vxrm, bool vm,
                       unsigned vd, unsigned vs2, typename std::make_signed<src1_elem_t>::type imm) {
    uint64_t vlmax = VLEN * vtype.lmul() / vtype.sew();
    bool saturated = false;
    vmask_view mask_reg = read_vmask<VLEN>(V, vlmax);
    auto vs2_view = get_vreg<VLEN, src2_elem_t>(V, vs2, vlmax);
    auto vd_view = get_vreg<VLEN, dest_elem_t>(V, vd, vlmax);
    auto fn = get_sat_funct<dest_elem_t, src2_elem_t, src1_elem_t>(funct6, funct3);
    for(size_t idx = vstart; idx < vl; idx++) {
        bool mask_active = vm ? 1 : mask_reg[idx];
        if(mask_active) {
            saturated |= fn(vxrm, vtype, vd_view[idx], vs2_view[idx], imm);
        } else if(vtype.vma()) {
            // mask-agnostic: inactive elements may be overwritten (the previous
            // code self-assigned vd_view[idx] in both arms, a no-op; this now
            // matches the policy used in fp_vector_imm_op)
            vd_view[idx] = agnostic_behavior(vd_view[idx]);
        }
    }
    // tail-agnostic handling for elements past vl
    if(vtype.vta())
        for(size_t idx = vl; idx < vlmax; idx++) {
            vd_view[idx] = agnostic_behavior(vd_view[idx]);
        }
    return saturated;
}
// Returns the integer reduction step selected by funct6/funct3. The callable
// folds one source element (vs2) into the running total, which is held in the
// first element of the destination register by the caller (vector_red_op).
// Some lambdas return the assignment result; since they are wrapped in a
// std::function<void(...)> that value is discarded — harmless inconsistency.
template <typename dest_elem_t, typename src_elem_t>
std::function<void(dest_elem_t&, src_elem_t)> get_red_funct(unsigned funct6, unsigned funct3) {
    if(funct3 == OPIVV || funct3 == OPIVX || funct3 == OPIVI)
        switch(funct6) {
        case 0b110000: // VWREDSUMU: widening unsigned sum, vs2 zero-extended to dest width
            return [](dest_elem_t& running_total, src_elem_t vs2) { return running_total += static_cast<dest_elem_t>(vs2); };
        case 0b110001: // VWREDSUM: widening signed sum, vs2 sign-extended to dest width
            return [](dest_elem_t& running_total, src_elem_t vs2) {
                // cast the signed vs2 elem to unsigned to enable wrap around on overflow
                return running_total += static_cast<dest_elem_t>(sext<dest_elem_t>(vs2));
            };
        default:
            throw new std::runtime_error("Unknown funct6 in get_red_funct");
        }
    else if(funct3 == OPMVV || funct3 == OPMVX)
        switch(funct6) {
        case 0b000000: // VREDSUM
            return [](dest_elem_t& running_total, src_elem_t vs2) { return running_total += vs2; };
        case 0b000001: // VREDAND
            return [](dest_elem_t& running_total, src_elem_t vs2) { return running_total &= vs2; };
        case 0b000010: // VREDOR
            return [](dest_elem_t& running_total, src_elem_t vs2) { return running_total |= vs2; };
        case 0b000011: // VREDXOR
            return [](dest_elem_t& running_total, src_elem_t vs2) { running_total ^= vs2; };
        case 0b000100: // VREDMINU: unsigned comparison in the destination width
            return [](dest_elem_t& running_total, src_elem_t vs2) { running_total = std::min<dest_elem_t>(running_total, vs2); };
        case 0b000101: // VREDMIN: both operands reinterpreted as signed for the comparison
            return [](dest_elem_t& running_total, src_elem_t vs2) {
                running_total = std::min(static_cast<std::make_signed_t<dest_elem_t>>(running_total),
                                         static_cast<std::make_signed_t<dest_elem_t>>(static_cast<std::make_signed_t<src_elem_t>>(vs2)));
            };
        case 0b000110: // VREDMAXU: unsigned comparison in the destination width
            return
                [](dest_elem_t& running_total, src_elem_t vs2) { running_total = std::max(running_total, static_cast<dest_elem_t>(vs2)); };
        case 0b000111: // VREDMAX: both operands reinterpreted as signed for the comparison
            return [](dest_elem_t& running_total, src_elem_t vs2) {
                running_total = std::max(static_cast<std::make_signed_t<dest_elem_t>>(running_total),
                                         static_cast<std::make_signed_t<dest_elem_t>>(static_cast<std::make_signed_t<src_elem_t>>(vs2)));
            };
        default:
            throw new std::runtime_error("Unknown funct6 in get_red_funct");
        }
    else
        throw new std::runtime_error("Unknown funct3 in get_red_funct");
}
template <unsigned VLEN, typename dest_elem_t, typename src_elem_t>
void vector_red_op(uint8_t* V, unsigned funct6, unsigned funct3, uint64_t vl, uint64_t vstart, vtype_t vtype, bool vm, unsigned vd,
unsigned vs2, unsigned vs1) {
uint64_t vlmax = VLEN * vtype.lmul() / vtype.sew();
vmask_view mask_reg = read_vmask<VLEN>(V, vlmax);
auto vs1_elem = get_vreg<VLEN, dest_elem_t>(V, vs1, vlmax)[0];
auto vs2_view = get_vreg<VLEN, src_elem_t>(V, vs2, vlmax);
auto vd_view = get_vreg<VLEN, dest_elem_t>(V, vd, vlmax);
auto fn = get_red_funct<dest_elem_t, src_elem_t>(funct6, funct3);
dest_elem_t& running_total = vd_view[0] = vs1_elem;
for(size_t idx = vstart; idx < vl; idx++) {
bool mask_active = vm ? 1 : mask_reg[idx];
if(mask_active) {
fn(running_total, vs2_view[idx]);
}
}
// the tail is all elements of the destination register beyond the first one
if(vtype.vta())
for(size_t idx = 1; idx < VLEN / vtype.sew(); idx++)
vd_view[idx] = agnostic_behavior(vd_view[idx]);
}
// bit-pattern classification helpers; softfloat may already provide equivalents
// NaN test on raw IEEE-754 binary16/32/64 bit patterns held in unsigned ints:
// a value is NaN iff every exponent bit is set and the fraction is non-zero.
template <typename src_elem_t> constexpr bool isNaN(src_elem_t x);
template <> constexpr bool isNaN<uint16_t>(uint16_t x) {
    constexpr uint16_t exp_mask = 0x7C00;
    constexpr uint16_t frac_mask = 0x03FF;
    return (x & exp_mask) == exp_mask && (x & frac_mask) != 0;
}
template <> constexpr bool isNaN<uint32_t>(uint32_t x) {
    constexpr uint32_t exp_mask = 0x7F800000;
    constexpr uint32_t frac_mask = 0x007FFFFF;
    return (x & exp_mask) == exp_mask && (x & frac_mask) != 0;
}
template <> constexpr bool isNaN<uint64_t>(uint64_t x) {
    constexpr uint64_t exp_mask = 0x7FF0000000000000;
    constexpr uint64_t frac_mask = 0x000FFFFFFFFFFFFF;
    return (x & exp_mask) == exp_mask && (x & frac_mask) != 0;
}
// True iff the raw bit pattern is IEEE-754 negative zero (only the sign bit set).
template <typename src_elem_t> constexpr bool isNegZero(src_elem_t x);
template <> constexpr bool isNegZero<uint16_t>(uint16_t x) { return x == static_cast<uint16_t>(1u << 15); }
template <> constexpr bool isNegZero<uint32_t>(uint32_t x) { return x == (1u << 31); }
template <> constexpr bool isNegZero<uint64_t>(uint64_t x) { return x == (1ull << 63); }
// True iff the raw bit pattern is IEEE-754 positive zero (all bits clear).
template <typename src_elem_t> constexpr bool isPosZero(src_elem_t x);
template <> constexpr bool isPosZero<uint16_t>(uint16_t x) { return x == 0; }
template <> constexpr bool isPosZero<uint32_t>(uint32_t x) { return x == 0; }
template <> constexpr bool isPosZero<uint64_t>(uint64_t x) { return x == 0; }
// Widens a raw FP bit pattern to the next wider IEEE format. The primary
// template rejects unsupported type pairs at runtime; only the f32 -> f64
// specialization is defined here — further pairs may be specialized elsewhere
// in the file, TODO confirm before relying on other widths.
template <typename dest_elem_t, typename src_elem_t> dest_elem_t widen_float(src_elem_t val) {
    throw new std::runtime_error("Trying to widen a weird 'float'");
}
// f32 -> f64 via softfloat; .v extracts the raw bits from float64_t
template <> inline uint64_t widen_float<uint64_t, uint32_t>(uint32_t val) { return f32_to_f64(float32_t{val}).v; }
// Thin dispatch wrappers over the scalar softfloat helpers from fp_functions.h.
// All operands are raw IEEE-754 bit patterns in uintN_t; 'mode' is the rounding mode.
template <typename elem_size_t> elem_size_t fp_add(uint8_t, elem_size_t, elem_size_t);
template <> inline uint16_t fp_add<uint16_t>(uint8_t mode, uint16_t v2, uint16_t v1) { return fadd_h(v2, v1, mode); }
template <> inline uint32_t fp_add<uint32_t>(uint8_t mode, uint32_t v2, uint32_t v1) { return fadd_s(v2, v1, mode); }
template <> inline uint64_t fp_add<uint64_t>(uint8_t mode, uint64_t v2, uint64_t v1) { return fadd_d(v2, v1, mode); }
template <typename elem_size_t> elem_size_t fp_sub(uint8_t, elem_size_t, elem_size_t);
template <> inline uint16_t fp_sub<uint16_t>(uint8_t mode, uint16_t v2, uint16_t v1) { return fsub_h(v2, v1, mode); }
template <> inline uint32_t fp_sub<uint32_t>(uint8_t mode, uint32_t v2, uint32_t v1) { return fsub_s(v2, v1, mode); }
template <> inline uint64_t fp_sub<uint64_t>(uint8_t mode, uint64_t v2, uint64_t v1) { return fsub_d(v2, v1, mode); }
template <typename elem_size_t> elem_size_t fp_mul(uint8_t, elem_size_t, elem_size_t);
template <> inline uint16_t fp_mul<uint16_t>(uint8_t mode, uint16_t v2, uint16_t v1) { return fmul_h(v2, v1, mode); }
template <> inline uint32_t fp_mul<uint32_t>(uint8_t mode, uint32_t v2, uint32_t v1) { return fmul_s(v2, v1, mode); }
template <> inline uint64_t fp_mul<uint64_t>(uint8_t mode, uint64_t v2, uint64_t v1) { return fmul_d(v2, v1, mode); }
template <typename elem_size_t> elem_size_t fp_div(uint8_t, elem_size_t, elem_size_t);
template <> inline uint16_t fp_div<uint16_t>(uint8_t mode, uint16_t v2, uint16_t v1) { return fdiv_h(v2, v1, mode); }
template <> inline uint32_t fp_div<uint32_t>(uint8_t mode, uint32_t v2, uint32_t v1) { return fdiv_s(v2, v1, mode); }
template <> inline uint64_t fp_div<uint64_t>(uint8_t mode, uint64_t v2, uint64_t v1) { return fdiv_d(v2, v1, mode); }
// Fused multiply-add family. Note that fp_madd(mode, v2, v1, v3) forwards as
// fmadd_*(v1, v2, v3, op, mode) — the first two value operands are swapped at
// the call site. The fourth fmadd_* argument selects the variant
// (0=madd, 1=msub, 2=nmadd, 3=nmsub) — presumably per fp_functions.h, TODO confirm.
template <typename elem_size_t> elem_size_t fp_madd(uint8_t, elem_size_t, elem_size_t, elem_size_t);
template <> inline uint16_t fp_madd<uint16_t>(uint8_t mode, uint16_t v2, uint16_t v1, uint16_t v3) { return fmadd_h(v1, v2, v3, 0, mode); }
template <> inline uint32_t fp_madd<uint32_t>(uint8_t mode, uint32_t v2, uint32_t v1, uint32_t v3) { return fmadd_s(v1, v2, v3, 0, mode); }
template <> inline uint64_t fp_madd<uint64_t>(uint8_t mode, uint64_t v2, uint64_t v1, uint64_t v3) { return fmadd_d(v1, v2, v3, 0, mode); }
template <typename elem_size_t> elem_size_t fp_nmadd(uint8_t, elem_size_t, elem_size_t, elem_size_t);
template <> inline uint16_t fp_nmadd<uint16_t>(uint8_t mode, uint16_t v2, uint16_t v1, uint16_t v3) { return fmadd_h(v1, v2, v3, 2, mode); }
template <> inline uint32_t fp_nmadd<uint32_t>(uint8_t mode, uint32_t v2, uint32_t v1, uint32_t v3) { return fmadd_s(v1, v2, v3, 2, mode); }
template <> inline uint64_t fp_nmadd<uint64_t>(uint8_t mode, uint64_t v2, uint64_t v1, uint64_t v3) { return fmadd_d(v1, v2, v3, 2, mode); }
template <typename elem_size_t> elem_size_t fp_msub(uint8_t, elem_size_t, elem_size_t, elem_size_t);
template <> inline uint16_t fp_msub<uint16_t>(uint8_t mode, uint16_t v2, uint16_t v1, uint16_t v3) { return fmadd_h(v1, v2, v3, 1, mode); }
template <> inline uint32_t fp_msub<uint32_t>(uint8_t mode, uint32_t v2, uint32_t v1, uint32_t v3) { return fmadd_s(v1, v2, v3, 1, mode); }
template <> inline uint64_t fp_msub<uint64_t>(uint8_t mode, uint64_t v2, uint64_t v1, uint64_t v3) { return fmadd_d(v1, v2, v3, 1, mode); }
template <typename elem_size_t> elem_size_t fp_nmsub(uint8_t, elem_size_t, elem_size_t, elem_size_t);
template <> inline uint16_t fp_nmsub<uint16_t>(uint8_t mode, uint16_t v2, uint16_t v1, uint16_t v3) { return fmadd_h(v1, v2, v3, 3, mode); }
template <> inline uint32_t fp_nmsub<uint32_t>(uint8_t mode, uint32_t v2, uint32_t v1, uint32_t v3) { return fmadd_s(v1, v2, v3, 3, mode); }
template <> inline uint64_t fp_nmsub<uint64_t>(uint8_t mode, uint64_t v2, uint64_t v1, uint64_t v3) { return fmadd_d(v1, v2, v3, 3, mode); }
template <typename elem_size_t> elem_size_t fp_min(elem_size_t, elem_size_t);
template <> inline uint16_t fp_min<uint16_t>(uint16_t v2, uint16_t v1) {
if(isNaN(v1) && isNaN(v2))
return defaultNaNF16UI;
else if(isNaN(v1))
return v2;
else if(isNaN(v2))
return v1;
else if(isNegZero(v1) && isNegZero(v2))
return v1;
else if(isNegZero(v2) && isNegZero(v1))
return v2;
else if(fcmp_h(v1, v2, 2))
return v1;
else
return v2;
}
template <> inline uint32_t fp_min<uint32_t>(uint32_t v2, uint32_t v1) {
if(isNaN(v1) && isNaN(v2))
return defaultNaNF32UI;
else if(isNaN(v1))
return v2;
else if(isNaN(v2))
return v1;
else if(isNegZero(v1) && isNegZero(v2))
return v1;
else if(isNegZero(v2) && isNegZero(v1))
return v2;
else if(fcmp_s(v1, v2, 2))
return v1;
else
return v2;
}
template <> inline uint64_t fp_min<uint64_t>(uint64_t v2, uint64_t v1) {
if(isNaN(v1) && isNaN(v2))
return defaultNaNF64UI;
else if(isNaN(v1))
return v2;
else if(isNaN(v2))
return v1;
else if(isNegZero(v1) && isNegZero(v2))
return v1;
else if(isNegZero(v2) && isNegZero(v1))
return v2;
else if(fcmp_d(v1, v2, 2))
return v1;
else
return v2;
}
template <typename elem_size_t> elem_size_t fp_max(elem_size_t, elem_size_t);
template <> inline uint16_t fp_max<uint16_t>(uint16_t v2, uint16_t v1) {
if(isNaN(v1) && isNaN(v2))
return defaultNaNF16UI;
else if(isNaN(v1))
return v2;
else if(isNaN(v2))
return v1;
else if(isNegZero(v1) && isNegZero(v2))
return v2;
else if(isNegZero(v2) && isNegZero(v1))
return v1;
else if(fcmp_h(v1, v2, 2))
return v2;
else
return v1;
}
template <> inline uint32_t fp_max<uint32_t>(uint32_t v2, uint32_t v1) {
if(isNaN(v1) && isNaN(v2))
return defaultNaNF32UI;
else if(isNaN(v1))
return v2;
else if(isNaN(v2))
return v1;
else if(isNegZero(v1) && isNegZero(v2))
return v2;
else if(isNegZero(v2) && isNegZero(v1))
return v1;
else if(fcmp_s(v1, v2, 2))
return v2;
else
return v1;
}
template <> inline uint64_t fp_max<uint64_t>(uint64_t v2, uint64_t v1) {
if(isNaN(v1) && isNaN(v2))
return defaultNaNF64UI;
else if(isNaN(v1))
return v2;
else if(isNaN(v2))
return v1;
else if(isNegZero(v1) && isNegZero(v2))
return v2;
else if(isNegZero(v2) && isNegZero(v1))
return v1;
else if(fcmp_d(v1, v2, 2))
return v2;
else
return v1;
}
// Returns the element operation for OPFVV/OPFVF instructions selected by
// funct6. The callable receives the rounding mode, an exception-flag
// accumulator (ORed with softfloat_exceptionFlags after each softfloat call),
// the current destination element (consumed by the FMA family) and the two
// source operands; it returns the new destination element. For widening ops
// (VFW*) dest_elem_t is twice the source width.
// Change: removed the unused local 'mask' in the VFSGNJX case.
template <typename dest_elem_t, typename src2_elem_t = dest_elem_t, typename src1_elem_t = dest_elem_t>
std::function<dest_elem_t(uint8_t, uint8_t&, dest_elem_t, src2_elem_t, src1_elem_t)> get_fp_funct(unsigned funct6, unsigned funct3) {
    if(funct3 == OPFVV || funct3 == OPFVF)
        switch(funct6) {
        case 0b000000: // VFADD
            return [](uint8_t rm, uint8_t& accrued_flags, dest_elem_t vd, src2_elem_t vs2, src1_elem_t vs1) {
                dest_elem_t val = fp_add<src2_elem_t>(rm, vs2, vs1);
                accrued_flags |= softfloat_exceptionFlags;
                return val;
            };
        case 0b000010: // VFSUB
            return [](uint8_t rm, uint8_t& accrued_flags, dest_elem_t vd, src2_elem_t vs2, src1_elem_t vs1) {
                dest_elem_t val = fp_sub<src2_elem_t>(rm, vs2, vs1);
                accrued_flags |= softfloat_exceptionFlags;
                return val;
            };
        case 0b000100: // VFMIN
            // NOTE(review): fp_min/fp_max may set softfloat flags via fcmp on
            // signaling NaNs; those are not accrued here — confirm intended
            return [](uint8_t rm, uint8_t& accrued_flags, dest_elem_t vd, src2_elem_t vs2, src1_elem_t vs1) {
                return fp_min<src2_elem_t>(vs2, vs1);
            };
        case 0b000110: // VFMAX
            return [](uint8_t rm, uint8_t& accrued_flags, dest_elem_t vd, src2_elem_t vs2, src1_elem_t vs1) {
                return fp_max<src2_elem_t>(vs2, vs1);
            };
        case 0b100000: // VFDIV: vd = vs2 / vs1
            return [](uint8_t rm, uint8_t& accrued_flags, dest_elem_t vd, src2_elem_t vs2, src1_elem_t vs1) {
                dest_elem_t val = fp_div<src2_elem_t>(rm, vs2, vs1);
                accrued_flags |= softfloat_exceptionFlags;
                return val;
            };
        case 0b100001: // VFRDIV: reversed operands, vd = vs1 / vs2
            return [](uint8_t rm, uint8_t& accrued_flags, dest_elem_t vd, src2_elem_t vs2, src1_elem_t vs1) {
                dest_elem_t val = fp_div<src2_elem_t>(rm, vs1, vs2);
                accrued_flags |= softfloat_exceptionFlags;
                return val;
            };
        case 0b100100: // VFMUL
            return [](uint8_t rm, uint8_t& accrued_flags, dest_elem_t vd, src2_elem_t vs2, src1_elem_t vs1) {
                dest_elem_t val = fp_mul<src2_elem_t>(rm, vs2, vs1);
                accrued_flags |= softfloat_exceptionFlags;
                return val;
            };
        case 0b100111: // VFRSUB: reversed operands, vd = vs1 - vs2
            return [](uint8_t rm, uint8_t& accrued_flags, dest_elem_t vd, src2_elem_t vs2, src1_elem_t vs1) {
                dest_elem_t val = fp_sub<src2_elem_t>(rm, vs1, vs2);
                accrued_flags |= softfloat_exceptionFlags;
                return val;
            };
        case 0b101000: // VFMADD: vd = vd * vs1 + vs2
            return [](uint8_t rm, uint8_t& accrued_flags, dest_elem_t vd, src2_elem_t vs2, src1_elem_t vs1) {
                dest_elem_t val = fp_madd<src2_elem_t>(rm, vs1, vd, vs2);
                accrued_flags |= softfloat_exceptionFlags;
                return val;
            };
        case 0b101001: // VFNMADD: vd = -(vd * vs1) - vs2
            return [](uint8_t rm, uint8_t& accrued_flags, dest_elem_t vd, src2_elem_t vs2, src1_elem_t vs1) {
                dest_elem_t val = fp_nmadd<src2_elem_t>(rm, vs1, vd, vs2);
                accrued_flags |= softfloat_exceptionFlags;
                return val;
            };
        case 0b101010: // VFMSUB: vd = vd * vs1 - vs2
            return [](uint8_t rm, uint8_t& accrued_flags, dest_elem_t vd, src2_elem_t vs2, src1_elem_t vs1) {
                dest_elem_t val = fp_msub<src2_elem_t>(rm, vs1, vd, vs2);
                accrued_flags |= softfloat_exceptionFlags;
                return val;
            };
        case 0b101011: // VFNMSUB: vd = -(vd * vs1) + vs2
            return [](uint8_t rm, uint8_t& accrued_flags, dest_elem_t vd, src2_elem_t vs2, src1_elem_t vs1) {
                dest_elem_t val = fp_nmsub<src2_elem_t>(rm, vs1, vd, vs2);
                accrued_flags |= softfloat_exceptionFlags;
                return val;
            };
        case 0b101100: // VFMACC: vd = vs1 * vs2 + vd
            return [](uint8_t rm, uint8_t& accrued_flags, dest_elem_t vd, src2_elem_t vs2, src1_elem_t vs1) {
                dest_elem_t val = fp_madd<src2_elem_t>(rm, vs1, vs2, vd);
                accrued_flags |= softfloat_exceptionFlags;
                return val;
            };
        case 0b101101: // VFNMAC: vd = -(vs1 * vs2) - vd
            return [](uint8_t rm, uint8_t& accrued_flags, dest_elem_t vd, src2_elem_t vs2, src1_elem_t vs1) {
                dest_elem_t val = fp_nmadd<src2_elem_t>(rm, vs1, vs2, vd);
                accrued_flags |= softfloat_exceptionFlags;
                return val;
            };
        case 0b101110: // VFMSAC: vd = vs1 * vs2 - vd
            return [](uint8_t rm, uint8_t& accrued_flags, dest_elem_t vd, src2_elem_t vs2, src1_elem_t vs1) {
                dest_elem_t val = fp_msub<src2_elem_t>(rm, vs1, vs2, vd);
                accrued_flags |= softfloat_exceptionFlags;
                return val;
            };
        case 0b101111: // VFNMSAC: vd = -(vs1 * vs2) + vd
            return [](uint8_t rm, uint8_t& accrued_flags, dest_elem_t vd, src2_elem_t vs2, src1_elem_t vs1) {
                dest_elem_t val = fp_nmsub<src2_elem_t>(rm, vs1, vs2, vd);
                accrued_flags |= softfloat_exceptionFlags;
                return val;
            };
        case 0b110000: // VFWADD: both sources widened before the add
            return [](uint8_t rm, uint8_t& accrued_flags, dest_elem_t vd, src2_elem_t vs2, src1_elem_t vs1) {
                dest_elem_t val = fp_add<dest_elem_t>(rm, widen_float<dest_elem_t>(vs2), widen_float<dest_elem_t>(vs1));
                accrued_flags |= softfloat_exceptionFlags;
                return val;
            };
        case 0b110010: // VFWSUB
            return [](uint8_t rm, uint8_t& accrued_flags, dest_elem_t vd, src2_elem_t vs2, src1_elem_t vs1) {
                dest_elem_t val = fp_sub<dest_elem_t>(rm, widen_float<dest_elem_t>(vs2), widen_float<dest_elem_t>(vs1));
                accrued_flags |= softfloat_exceptionFlags;
                return val;
            };
        case 0b110100: // VFWADD.W: vs2 is already wide, only vs1 is widened
            return [](uint8_t rm, uint8_t& accrued_flags, dest_elem_t vd, src2_elem_t vs2, src1_elem_t vs1) {
                dest_elem_t val = fp_add<dest_elem_t>(rm, vs2, widen_float<dest_elem_t>(vs1));
                accrued_flags |= softfloat_exceptionFlags;
                return val;
            };
        case 0b110110: // VFWSUB.W: vs2 is already wide, only vs1 is widened
            return [](uint8_t rm, uint8_t& accrued_flags, dest_elem_t vd, src2_elem_t vs2, src1_elem_t vs1) {
                dest_elem_t val = fp_sub<dest_elem_t>(rm, vs2, widen_float<dest_elem_t>(vs1));
                accrued_flags |= softfloat_exceptionFlags;
                return val;
            };
        case 0b111000: // VFWMUL
            return [](uint8_t rm, uint8_t& accrued_flags, dest_elem_t vd, src2_elem_t vs2, src1_elem_t vs1) {
                dest_elem_t val = fp_mul<dest_elem_t>(rm, widen_float<dest_elem_t>(vs2), widen_float<dest_elem_t>(vs1));
                accrued_flags |= softfloat_exceptionFlags;
                return val;
            };
        case 0b111100: // VFWMACC: vd = widen(vs1) * widen(vs2) + vd
            return [](uint8_t rm, uint8_t& accrued_flags, dest_elem_t vd, src2_elem_t vs2, src1_elem_t vs1) {
                dest_elem_t val = fp_madd<dest_elem_t>(rm, widen_float<dest_elem_t>(vs1), widen_float<dest_elem_t>(vs2), vd);
                accrued_flags |= softfloat_exceptionFlags;
                return val;
            };
        case 0b111101: // VFWNMACC
            return [](uint8_t rm, uint8_t& accrued_flags, dest_elem_t vd, src2_elem_t vs2, src1_elem_t vs1) {
                dest_elem_t val = fp_nmadd<dest_elem_t>(rm, widen_float<dest_elem_t>(vs1), widen_float<dest_elem_t>(vs2), vd);
                accrued_flags |= softfloat_exceptionFlags;
                return val;
            };
        case 0b111110: // VFWMSAC
            return [](uint8_t rm, uint8_t& accrued_flags, dest_elem_t vd, src2_elem_t vs2, src1_elem_t vs1) {
                dest_elem_t val = fp_msub<dest_elem_t>(rm, widen_float<dest_elem_t>(vs1), widen_float<dest_elem_t>(vs2), vd);
                accrued_flags |= softfloat_exceptionFlags;
                return val;
            };
        case 0b111111: // VFWNMSAC
            return [](uint8_t rm, uint8_t& accrued_flags, dest_elem_t vd, src2_elem_t vs2, src1_elem_t vs1) {
                dest_elem_t val = fp_nmsub<dest_elem_t>(rm, widen_float<dest_elem_t>(vs1), widen_float<dest_elem_t>(vs2), vd);
                accrued_flags |= softfloat_exceptionFlags;
                return val;
            };
        case 0b001000: // VFSGNJ: vs2's magnitude with vs1's sign
            return [](uint8_t rm, uint8_t& accrued_flags, dest_elem_t vd, src2_elem_t vs2, src1_elem_t vs1) {
                dest_elem_t mask = std::numeric_limits<dest_elem_t>::max() >> 1;
                dest_elem_t sign_mask = std::numeric_limits<std::make_signed_t<dest_elem_t>>::min();
                return (vs2 & mask) | (vs1 & sign_mask);
            };
        case 0b001001: // VFSGNJN: vs2's magnitude with vs1's inverted sign
            return [](uint8_t rm, uint8_t& accrued_flags, dest_elem_t vd, src2_elem_t vs2, src1_elem_t vs1) {
                dest_elem_t mask = std::numeric_limits<dest_elem_t>::max() >> 1;
                dest_elem_t sign_mask = std::numeric_limits<std::make_signed_t<dest_elem_t>>::min();
                return (vs2 & mask) | (~vs1 & sign_mask);
            };
        case 0b001010: // VFSGNJX: flip vs2's sign iff vs1 is negative
            return [](uint8_t rm, uint8_t& accrued_flags, dest_elem_t vd, src2_elem_t vs2, src1_elem_t vs1) {
                dest_elem_t sign_mask = std::numeric_limits<std::make_signed_t<dest_elem_t>>::min();
                return vs2 ^ (vs1 & sign_mask);
            };
        default:
            throw new std::runtime_error("Unknown funct6 in get_fp_funct");
        }
    else
        throw new std::runtime_error("Unknown funct3 in get_fp_funct");
}
// Applies the FP operation selected by funct6/funct3 element-wise:
// vd[i] = fn(vd[i], vs2[i], vs1[i]) for active elements in [vstart, vl).
// Each element function ORs the softfloat exception flags it raised into
// accrued_flags; the combined flags are written back to
// softfloat_exceptionFlags at the end so the caller can update fflags once
// for the whole instruction.
template <unsigned VLEN, typename dest_elem_t, typename src2_elem_t, typename src1_elem_t>
void fp_vector_vector_op(uint8_t* V, unsigned funct6, unsigned funct3, uint64_t vl, uint64_t vstart, vtype_t vtype, bool vm, unsigned vd,
                         unsigned vs2, unsigned vs1, uint8_t rm) {
    uint64_t vlmax = VLEN * vtype.lmul() / vtype.sew();
    vmask_view mask_reg = read_vmask<VLEN>(V, vlmax);
    auto vs1_view = get_vreg<VLEN, src1_elem_t>(V, vs1, vlmax);
    auto vs2_view = get_vreg<VLEN, src2_elem_t>(V, vs2, vlmax);
    auto vd_view = get_vreg<VLEN, dest_elem_t>(V, vd, vlmax);
    auto fn = get_fp_funct<dest_elem_t, src2_elem_t, src1_elem_t>(funct6, funct3);
    uint8_t accrued_flags = 0;
    for(size_t idx = vstart; idx < vl; idx++) {
        bool mask_active = vm ? 1 : mask_reg[idx];
        if(mask_active)
            vd_view[idx] = fn(rm, accrued_flags, vd_view[idx], vs2_view[idx], vs1_view[idx]);
        else if(vtype.vma()) // mask-agnostic: inactive elements may be overwritten
            vd_view[idx] = agnostic_behavior(vd_view[idx]);
    }
    // publish the accumulated exception flags of all element operations
    softfloat_exceptionFlags = accrued_flags;
    // tail-agnostic handling for elements past vl
    if(vtype.vta())
        for(size_t idx = vl; idx < vlmax; idx++) {
            vd_view[idx] = agnostic_behavior(vd_view[idx]);
        }
}
template <unsigned VLEN, typename dest_elem_t, typename src2_elem_t, typename src1_elem_t>
void fp_vector_imm_op(uint8_t* V, unsigned funct6, unsigned funct3, uint64_t vl, uint64_t vstart, vtype_t vtype, bool vm, unsigned vd,
unsigned vs2, src1_elem_t imm, uint8_t rm) {
uint64_t vlmax = VLEN * vtype.lmul() / vtype.sew();
vmask_view mask_reg = read_vmask<VLEN>(V, vlmax);
auto vs2_view = get_vreg<VLEN, src2_elem_t>(V, vs2, vlmax);
auto vd_view = get_vreg<VLEN, dest_elem_t>(V, vd, vlmax);
auto fn = get_fp_funct<dest_elem_t, src2_elem_t, src1_elem_t>(funct6, funct3);
uint8_t accrued_flags = 0;
for(size_t idx = vstart; idx < vl; idx++) {
bool mask_active = vm ? 1 : mask_reg[idx];
if(mask_active)
vd_view[idx] = fn(rm, accrued_flags, vd_view[idx], vs2_view[idx], imm);
else if(vtype.vma())
vd_view[idx] = agnostic_behavior(vd_view[idx]);
}
softfloat_exceptionFlags = accrued_flags;
if(vtype.vta())
for(size_t idx = vl; idx < vlmax; idx++)
vd_view[idx] = agnostic_behavior(vd_view[idx]);
}
// Returns the FP reduction step for OPFVV/OPFVF instructions. The callable
// folds one vs2 element into running_total, ORing the softfloat exception
// flags into accrued_flags where the operation can raise any.
// VFREDUSUM shares the ordered accumulation of VFREDOSUM, which the spec
// permits (an unordered sum may be evaluated in any order).
template <typename dest_elem_t, typename src_elem_t>
std::function<void(uint8_t, uint8_t&, dest_elem_t&, src_elem_t)> get_fp_red_funct(unsigned funct6, unsigned funct3) {
    if(funct3 == OPFVV || funct3 == OPFVF)
        switch(funct6) {
        case 0b000001: // VFREDUSUM
            return [](uint8_t rm, uint8_t& accrued_flags, dest_elem_t& running_total, src_elem_t vs2) {
                running_total = fp_add<dest_elem_t>(rm, running_total, vs2);
                accrued_flags |= softfloat_exceptionFlags;
            };
        case 0b000011: // VFREDOSUM: ordered sum
            return [](uint8_t rm, uint8_t& accrued_flags, dest_elem_t& running_total, src_elem_t vs2) {
                running_total = fp_add<dest_elem_t>(rm, running_total, vs2);
                accrued_flags |= softfloat_exceptionFlags;
            };
        case 0b000101: // VFREDMIN
            // NOTE(review): fp_min/fp_max may set softfloat flags via fcmp on
            // signaling NaNs; they are not accrued here — confirm intended
            return [](uint8_t rm, uint8_t& accrued_flags, dest_elem_t& running_total, src_elem_t vs2) {
                running_total = fp_min<dest_elem_t>(running_total, vs2);
            };
        case 0b000111: // VFREDMAX
            return [](uint8_t rm, uint8_t& accrued_flags, dest_elem_t& running_total, src_elem_t vs2) {
                running_total = fp_max<dest_elem_t>(running_total, vs2);
            };
        case 0b110001: // VFWREDUSUM: vs2 widened to the (2*SEW) accumulator width
            return [](uint8_t rm, uint8_t& accrued_flags, dest_elem_t& running_total, src_elem_t vs2) {
                running_total = fp_add<dest_elem_t>(rm, running_total, widen_float<dest_elem_t>(vs2));
                accrued_flags |= softfloat_exceptionFlags;
            };
        case 0b110011: // VFWREDOSUM: ordered widening sum
            return [](uint8_t rm, uint8_t& accrued_flags, dest_elem_t& running_total, src_elem_t vs2) {
                running_total = fp_add<dest_elem_t>(rm, running_total, widen_float<dest_elem_t>(vs2));
                accrued_flags |= softfloat_exceptionFlags;
            };
        default:
            throw new std::runtime_error("Unknown funct6 in get_fp_red_funct");
        }
    else
        throw new std::runtime_error("Unknown funct3 in get_fp_red_funct");
}
// FP reduction: folds every active element of vs2 into a running total that is
// seeded from element 0 of vs1 and aliases element 0 of vd, so the result is
// stored in place. Exception flags of all steps are accumulated and published
// to softfloat_exceptionFlags once at the end.
template <unsigned VLEN, typename dest_elem_t, typename src_elem_t>
void fp_vector_red_op(uint8_t* V, unsigned funct6, unsigned funct3, uint64_t vl, uint64_t vstart, vtype_t vtype, bool vm, unsigned vd,
                      unsigned vs2, unsigned vs1, uint8_t rm) {
    uint64_t vlmax = VLEN * vtype.lmul() / vtype.sew();
    vmask_view mask_reg = read_vmask<VLEN>(V, vlmax);
    auto vs1_elem = get_vreg<VLEN, dest_elem_t>(V, vs1, vlmax)[0];
    auto vs2_view = get_vreg<VLEN, src_elem_t>(V, vs2, vlmax);
    auto vd_view = get_vreg<VLEN, dest_elem_t>(V, vd, vlmax);
    auto fn = get_fp_red_funct<dest_elem_t, src_elem_t>(funct6, funct3);
    // running_total is a reference into vd, seeded from vs1[0]
    dest_elem_t& running_total = vd_view[0] = vs1_elem;
    uint8_t accrued_flags = 0;
    for(size_t idx = vstart; idx < vl; idx++) {
        bool mask_active = vm ? 1 : mask_reg[idx];
        if(mask_active)
            fn(rm, accrued_flags, running_total, vs2_view[idx]);
    }
    softfloat_exceptionFlags = accrued_flags;
    // the tail is all elements of the destination register beyond the first one
    if(vtype.vta())
        for(size_t idx = 1; idx < VLEN / vtype.sew(); idx++)
            vd_view[idx] = agnostic_behavior(vd_view[idx]);
}
// Unary FP helpers dispatching on the raw bit width: square root, the 7-bit
// estimate instructions (vfrsqrt7/vfrec7) and classification (vfclass).
template <typename elem_size_t> elem_size_t fp_sqrt(uint8_t, elem_size_t);
template <> inline uint16_t fp_sqrt<uint16_t>(uint8_t mode, uint16_t v2) { return fsqrt_h(v2, mode); }
template <> inline uint32_t fp_sqrt<uint32_t>(uint8_t mode, uint32_t v2) { return fsqrt_s(v2, mode); }
template <> inline uint64_t fp_sqrt<uint64_t>(uint8_t mode, uint64_t v2) { return fsqrt_d(v2, mode); }
template <typename elem_size_t> elem_size_t fp_rsqrt7(elem_size_t);
template <> inline uint16_t fp_rsqrt7<uint16_t>(uint16_t v2) { return frsqrt7_h(v2); }
template <> inline uint32_t fp_rsqrt7<uint32_t>(uint32_t v2) { return frsqrt7_s(v2); }
template <> inline uint64_t fp_rsqrt7<uint64_t>(uint64_t v2) { return frsqrt7_d(v2); }
template <typename elem_size_t> elem_size_t fp_rec7(uint8_t, elem_size_t);
template <> inline uint16_t fp_rec7<uint16_t>(uint8_t mode, uint16_t v2) { return frec7_h(v2, mode); }
template <> inline uint32_t fp_rec7<uint32_t>(uint8_t mode, uint32_t v2) { return frec7_s(v2, mode); }
template <> inline uint64_t fp_rec7<uint64_t>(uint8_t mode, uint64_t v2) { return frec7_d(v2, mode); }
template <typename elem_size_t> elem_size_t fp_fclass(elem_size_t);
template <> inline uint16_t fp_fclass<uint16_t>(uint16_t v2) { return fclass_h(v2); }
template <> inline uint32_t fp_fclass<uint32_t>(uint32_t v2) { return fclass_s(v2); }
template <> inline uint64_t fp_fclass<uint64_t>(uint64_t v2) { return fclass_d(v2); }
// Same-width FP <-> integer conversions (SEW source to SEW destination).
// The 16-bit variants are routed through the 32-bit softfloat converters and
// the result truncated to 16 bit.
// NOTE(review): f16toi32/f16toui32 presumably saturate at 32-bit bounds, so an
// f16 value above the 16-bit integer range would be truncated rather than
// saturated here — confirm against the callers / fp_functions.h.
template <typename dest_elem_size_t, typename src_elem_size_t> dest_elem_size_t fp_f_to_ui(uint8_t, src_elem_size_t);
template <typename dest_elem_size_t, typename src_elem_size_t> dest_elem_size_t fp_f_to_i(uint8_t, src_elem_size_t);
template <typename dest_elem_size_t, typename src_elem_size_t> dest_elem_size_t fp_ui_to_f(uint8_t, src_elem_size_t);
template <typename dest_elem_size_t, typename src_elem_size_t> dest_elem_size_t fp_i_to_f(uint8_t, src_elem_size_t);
template <typename dest_elem_t, typename src_elem_t> dest_elem_t fp_f_to_f(uint8_t rm, src_elem_t val);
template <> inline uint16_t fp_f_to_ui<uint16_t, uint16_t>(uint8_t rm, uint16_t v2) { return f16toui32(v2, rm); }
template <> inline uint32_t fp_f_to_ui<uint32_t, uint32_t>(uint8_t rm, uint32_t v2) { return f32toui32(v2, rm); }
template <> inline uint64_t fp_f_to_ui<uint64_t, uint64_t>(uint8_t rm, uint64_t v2) { return f64toui64(v2, rm); }
template <> inline uint16_t fp_f_to_i<uint16_t, uint16_t>(uint8_t rm, uint16_t v2) { return f16toi32(v2, rm); }
template <> inline uint32_t fp_f_to_i<uint32_t, uint32_t>(uint8_t rm, uint32_t v2) { return f32toi32(v2, rm); }
template <> inline uint64_t fp_f_to_i<uint64_t, uint64_t>(uint8_t rm, uint64_t v2) { return f64toi64(v2, rm); }
template <> inline uint16_t fp_ui_to_f<uint16_t, uint16_t>(uint8_t rm, uint16_t v2) { return ui32tof16(v2, rm); }
template <> inline uint32_t fp_ui_to_f<uint32_t, uint32_t>(uint8_t rm, uint32_t v2) { return ui32tof32(v2, rm); }
template <> inline uint64_t fp_ui_to_f<uint64_t, uint64_t>(uint8_t rm, uint64_t v2) { return ui64tof64(v2, rm); }
template <> inline uint16_t fp_i_to_f<uint16_t, uint16_t>(uint8_t rm, uint16_t v2) { return i32tof16(v2, rm); }
template <> inline uint32_t fp_i_to_f<uint32_t, uint32_t>(uint8_t rm, uint32_t v2) { return i32tof32(v2, rm); }
template <> inline uint64_t fp_i_to_f<uint64_t, uint64_t>(uint8_t rm, uint64_t v2) { return i64tof64(v2, rm); }
// Returns the unary FP element operation selected by the VFUNARY0/VFUNARY1
// encoding space and the vs1-encoded unary_op. Each callable ORs the softfloat
// exception flags it raised into accrued_flags (VFCLASS raises none).
// NOTE(review): the RTZ conversion variants reuse the non-RTZ lambda; rm is
// expected to already carry the round-towards-zero mode — confirm the caller
// forces RTZ for those encodings.
template <typename elem_t> std::function<elem_t(uint8_t, uint8_t&, elem_t)> get_fp_unary_fn(unsigned encoding_space, unsigned unary_op) {
    if(encoding_space == 0b010011) // VFUNARY1
        switch(unary_op) {
        case 0b00000: // VFSQRT
            return [](uint8_t rm, uint8_t& accrued_flags, elem_t vs2) {
                elem_t val = fp_sqrt(rm, vs2);
                accrued_flags |= softfloat_exceptionFlags;
                return val;
            };
        case 0b00100: // VFRSQRT7: 7-bit reciprocal square-root estimate
            return [](uint8_t rm, uint8_t& accrued_flags, elem_t vs2) {
                elem_t val = fp_rsqrt7(vs2);
                accrued_flags |= softfloat_exceptionFlags;
                return val;
            };
        case 0b00101: // VFREC7: 7-bit reciprocal estimate
            return [](uint8_t rm, uint8_t& accrued_flags, elem_t vs2) {
                elem_t val = fp_rec7(rm, vs2);
                accrued_flags |= softfloat_exceptionFlags;
                return val;
            };
        case 0b10000: // VFCLASS: classification never raises exception flags
            return [](uint8_t rm, uint8_t& accrued_flags, elem_t vs2) {
                elem_t val = fp_fclass(vs2);
                return val;
            };
        default:
            throw new std::runtime_error("Unknown funct in get_fp_unary_fn");
        }
    else if(encoding_space == 0b010010) // VFUNARY0
        switch(unary_op) {
        case 0b00000: // VFCVT.XU.F.V
        case 0b00110: // VFCVT.RTZ.XU.F.V
            return [](uint8_t rm, uint8_t& accrued_flags, elem_t vs2) {
                elem_t val = fp_f_to_ui<elem_t, elem_t>(rm, vs2);
                accrued_flags |= softfloat_exceptionFlags;
                return val;
            };
        case 0b00001: // VFCVT.X.F.V
        case 0b00111: // VFCVT.RTZ.X.F.V
            return [](uint8_t rm, uint8_t& accrued_flags, elem_t vs2) {
                elem_t val = fp_f_to_i<elem_t, elem_t>(rm, vs2);
                accrued_flags |= softfloat_exceptionFlags;
                return val;
            };
        case 0b00010: // VFCVT.F.XU.V
            return [](uint8_t rm, uint8_t& accrued_flags, elem_t vs2) {
                elem_t val = fp_ui_to_f<elem_t, elem_t>(rm, vs2);
                accrued_flags |= softfloat_exceptionFlags;
                return val;
            };
        case 0b00011: // VFCVT.F.X.V
            return [](uint8_t rm, uint8_t& accrued_flags, elem_t vs2) {
                elem_t val = fp_i_to_f<elem_t, elem_t>(rm, vs2);
                accrued_flags |= softfloat_exceptionFlags;
                return val;
            };
        default:
            throw new std::runtime_error("Unknown funct in get_fp_unary_fn");
        }
    else
        throw new std::runtime_error("Unknown funct in get_fp_unary_fn");
}
// Applies the unary FP operation selected by encoding_space/unary_op:
// vd[i] = fn(vs2[i]) for active elements in [vstart, vl). Exception flags of
// all element operations are accumulated and published to
// softfloat_exceptionFlags once at the end.
template <unsigned VLEN, typename elem_t>
void fp_vector_unary_op(uint8_t* V, unsigned encoding_space, unsigned unary_op, uint64_t vl, uint64_t vstart, vtype_t vtype, bool vm,
                        unsigned vd, unsigned vs2, uint8_t rm) {
    uint64_t vlmax = VLEN * vtype.lmul() / vtype.sew();
    vmask_view mask_reg = read_vmask<VLEN>(V, vlmax);
    auto vs2_view = get_vreg<VLEN, elem_t>(V, vs2, vlmax);
    auto vd_view = get_vreg<VLEN, elem_t>(V, vd, vlmax);
    auto fn = get_fp_unary_fn<elem_t>(encoding_space, unary_op);
    uint8_t accrued_flags = 0;
    for(size_t idx = vstart; idx < vl; idx++) {
        bool mask_active = vm ? 1 : mask_reg[idx];
        if(mask_active)
            vd_view[idx] = fn(rm, accrued_flags, vs2_view[idx]);
        else if(vtype.vma()) // mask-agnostic: inactive elements may be overwritten
            vd_view[idx] = agnostic_behavior(vd_view[idx]);
    }
    softfloat_exceptionFlags = accrued_flags;
    // tail-agnostic handling for elements past vl
    if(vtype.vta())
        for(size_t idx = vl; idx < vlmax; idx++)
            vd_view[idx] = agnostic_behavior(vd_view[idx]);
}
// Specializations used by the widening conversions (VFWCVT.*): the destination
// is twice the width of the source. The f*to*/ *tof* helpers come from
// fp_functions.h and operate on IEEE-754 bit patterns stored in unsigned
// integers of the matching width.
// float -> unsigned integer of twice the width; there is no 8-bit float
// source format, so the uint8_t-source case is rejected at runtime
template <> inline uint16_t fp_f_to_ui<uint16_t, uint8_t>(uint8_t rm, uint8_t v2) {
    throw new std::runtime_error("Attempting illegal widening conversion");
}
template <> inline uint32_t fp_f_to_ui<uint32_t, uint16_t>(uint8_t rm, uint16_t v2) { return f16toui32(v2, rm); }
template <> inline uint64_t fp_f_to_ui<uint64_t, uint32_t>(uint8_t rm, uint32_t v2) { return f32toui64(v2, rm); }
// float -> signed integer of twice the width (same no-8-bit-float restriction)
template <> inline uint16_t fp_f_to_i<uint16_t, uint8_t>(uint8_t rm, uint8_t v2) {
    throw new std::runtime_error("Attempting illegal widening conversion");
}
template <> inline uint32_t fp_f_to_i<uint32_t, uint16_t>(uint8_t rm, uint16_t v2) { return f16toi32(v2, rm); }
template <> inline uint64_t fp_f_to_i<uint64_t, uint32_t>(uint8_t rm, uint32_t v2) { return f32toi64(v2, rm); }
// unsigned integer -> float of twice the width (the narrow source is
// zero-extended into the 32-bit helper argument)
template <> inline uint16_t fp_ui_to_f<uint16_t, uint8_t>(uint8_t rm, uint8_t v2) { return ui32tof16(v2, rm); }
template <> inline uint32_t fp_ui_to_f<uint32_t, uint16_t>(uint8_t rm, uint16_t v2) { return ui32tof32(v2, rm); }
template <> inline uint64_t fp_ui_to_f<uint64_t, uint32_t>(uint8_t rm, uint32_t v2) { return ui32tof64(v2, rm); }
// signed integer -> float of twice the width
// NOTE(review): the narrow source is passed as-is to the i32tof* helpers —
// presumably the caller provides it already sign-extended; confirm
template <> inline uint16_t fp_i_to_f<uint16_t, uint8_t>(uint8_t rm, uint8_t v2) { return i32tof16(v2, rm); }
template <> inline uint32_t fp_i_to_f<uint32_t, uint16_t>(uint8_t rm, uint16_t v2) { return i32tof32(v2, rm); }
template <> inline uint64_t fp_i_to_f<uint64_t, uint32_t>(uint8_t rm, uint32_t v2) { return i32tof64(v2, rm); }
// float -> float of twice the width (no 8-bit float format exists)
template <> inline uint16_t fp_f_to_f<uint16_t, uint8_t>(uint8_t rm, uint8_t val) {
    throw new std::runtime_error("Attempting illegal widening conversion");
}
template <> inline uint32_t fp_f_to_f<uint32_t, uint16_t>(uint8_t rm, uint16_t val) { return f16tof32(val, rm); }
template <> inline uint64_t fp_f_to_f<uint64_t, uint32_t>(uint8_t rm, uint32_t val) { return f32tof64(val, rm); }
template <typename dest_elem_t, typename src_elem_t>
std::function<dest_elem_t(uint8_t, uint8_t&, src_elem_t)> get_fp_widening_fn(unsigned unary_op) {
    // Returns the element-wise conversion for the VFWCVT.* widening unary
    // instructions (src_elem_t source, dest_elem_t destination, destination
    // twice as wide). Each lambda ORs the softfloat exception flags raised by
    // the conversion into accrued_flags; rm is the rounding mode.
    switch(unary_op) {
    case 0b01000: // VFWCVT.XU.F.V
    case 0b01110: // VFWCVT.RTZ.XU.F.V
        return [](uint8_t rm, uint8_t& accrued_flags, src_elem_t vs2) {
            dest_elem_t val = fp_f_to_ui<dest_elem_t, src_elem_t>(rm, vs2);
            accrued_flags |= softfloat_exceptionFlags;
            return val;
        };
    case 0b01001: // VFWCVT.X.F.V
    case 0b01111: // VFWCVT.RTZ.X.F.V
        return [](uint8_t rm, uint8_t& accrued_flags, src_elem_t vs2) {
            dest_elem_t val = fp_f_to_i<dest_elem_t, src_elem_t>(rm, vs2);
            accrued_flags |= softfloat_exceptionFlags;
            return val;
        };
    case 0b01010: // VFWCVT.F.XU.V
        return [](uint8_t rm, uint8_t& accrued_flags, src_elem_t vs2) {
            dest_elem_t val = fp_ui_to_f<dest_elem_t, src_elem_t>(rm, vs2);
            accrued_flags |= softfloat_exceptionFlags;
            return val;
        };
    case 0b01011: // VFWCVT.F.X.V
        return [](uint8_t rm, uint8_t& accrued_flags, src_elem_t vs2) {
            dest_elem_t val = fp_i_to_f<dest_elem_t, src_elem_t>(rm, vs2);
            accrued_flags |= softfloat_exceptionFlags;
            return val;
        };
    case 0b01100: // VFWCVT.F.F.V
        return [](uint8_t rm, uint8_t& accrued_flags, src_elem_t vs2) {
            dest_elem_t val = fp_f_to_f<dest_elem_t, src_elem_t>(rm, vs2);
            accrued_flags |= softfloat_exceptionFlags;
            return val;
        };
    default:
        // fix: the message used to name get_fp_unary_fn, which made failures
        // here look like they came from the single-width selector
        // NOTE(review): `throw new` (pointer throw) is kept for consistency
        // with the rest of this file; throw-by-value is the usual C++ idiom
        throw new std::runtime_error("Unknown funct in get_fp_widening_fn");
    }
}
template <unsigned VLEN, typename dest_elem_t, typename src_elem_t>
void fp_vector_unary_w(uint8_t* V, unsigned unary_op, uint64_t vl, uint64_t vstart, vtype_t vtype, bool vm, unsigned vd, unsigned vs2,
                       uint8_t rm) {
    // Widening FP unary conversion (VFWCVT.*): reads src_elem_t elements from
    // vs2 and writes dest_elem_t (double-width) results into vd, honoring the
    // mask-agnostic and tail-agnostic policies.
    const uint64_t vlmax = VLEN * vtype.lmul() / vtype.sew();
    vmask_view mask_reg = read_vmask<VLEN>(V, vlmax);
    auto src = get_vreg<VLEN, src_elem_t>(V, vs2, vlmax);
    auto dest = get_vreg<VLEN, dest_elem_t>(V, vd, vlmax);
    auto convert = get_fp_widening_fn<dest_elem_t, src_elem_t>(unary_op);
    uint8_t accrued_flags = 0;
    for(size_t idx = vstart; idx < vl; idx++) {
        if(vm || mask_reg[idx])
            dest[idx] = convert(rm, accrued_flags, src[idx]);
        else if(vtype.vma()) // inactive element, mask-agnostic policy
            dest[idx] = agnostic_behavior(dest[idx]);
    }
    // publish the OR of all per-element softfloat exception flags
    softfloat_exceptionFlags = accrued_flags;
    if(vtype.vta()) // tail elements past vl, tail-agnostic policy
        for(size_t idx = vl; idx < vlmax; idx++)
            dest[idx] = agnostic_behavior(dest[idx]);
}
// Specializations used by the narrowing conversions (VFNCVT.*): the destination
// is half the width of the source. The helpers come from fp_functions.h; where
// a helper produces a 32-bit result it is implicitly narrowed to the declared
// return type.
// float -> unsigned integer of half the width
template <> inline uint8_t fp_f_to_ui<uint8_t, uint16_t>(uint8_t rm, uint16_t v2) { return f16toui32(v2, rm); }
template <> inline uint16_t fp_f_to_ui<uint16_t, uint32_t>(uint8_t rm, uint32_t v2) { return f32toui32(v2, rm); }
template <> inline uint32_t fp_f_to_ui<uint32_t, uint64_t>(uint8_t rm, uint64_t v2) { return f64toui32(v2, rm); }
// float -> signed integer of half the width
template <> inline uint8_t fp_f_to_i<uint8_t, uint16_t>(uint8_t rm, uint16_t v2) { return f16toi32(v2, rm); }
template <> inline uint16_t fp_f_to_i<uint16_t, uint32_t>(uint8_t rm, uint32_t v2) { return f32toi32(v2, rm); }
template <> inline uint32_t fp_f_to_i<uint32_t, uint64_t>(uint8_t rm, uint64_t v2) { return f64toi32(v2, rm); }
// unsigned integer -> float of half the width; there is no 8-bit float
// destination format, so the uint8_t-destination case is rejected at runtime
template <> inline uint8_t fp_ui_to_f<uint8_t, uint16_t>(uint8_t rm, uint16_t v2) {
    throw new std::runtime_error("Attempting illegal narrowing conversion");
}
template <> inline uint16_t fp_ui_to_f<uint16_t, uint32_t>(uint8_t rm, uint32_t v2) { return ui32tof16(v2, rm); }
template <> inline uint32_t fp_ui_to_f<uint32_t, uint64_t>(uint8_t rm, uint64_t v2) { return ui64tof32(v2, rm); }
// signed integer -> float of half the width (same no-8-bit-float restriction)
template <> inline uint8_t fp_i_to_f<uint8_t, uint16_t>(uint8_t rm, uint16_t v2) {
    throw new std::runtime_error("Attempting illegal narrowing conversion");
}
template <> inline uint16_t fp_i_to_f<uint16_t, uint32_t>(uint8_t rm, uint32_t v2) { return i32tof16(v2, rm); }
template <> inline uint32_t fp_i_to_f<uint32_t, uint64_t>(uint8_t rm, uint64_t v2) { return i64tof32(v2, rm); }
// float -> float of half the width (no 8-bit float format exists)
template <> inline uint8_t fp_f_to_f<uint8_t, uint16_t>(uint8_t rm, uint16_t val) {
    throw new std::runtime_error("Attempting illegal narrowing conversion");
}
template <> inline uint16_t fp_f_to_f<uint16_t, uint32_t>(uint8_t rm, uint32_t val) { return f32tof16(val, rm); }
template <> inline uint32_t fp_f_to_f<uint32_t, uint64_t>(uint8_t rm, uint64_t val) { return f64tof32(val, rm); }
template <typename dest_elem_t, typename src_elem_t>
std::function<dest_elem_t(uint8_t, uint8_t&, src_elem_t)> get_fp_narrowing_fn(unsigned unary_op) {
    // Returns the element-wise conversion for the VFNCVT.* narrowing unary
    // instructions (src_elem_t source, dest_elem_t destination, destination
    // half as wide). Each lambda ORs the softfloat exception flags raised by
    // the conversion into accrued_flags; rm is the rounding mode.
    switch(unary_op) {
    case 0b10000: // VFNCVT.XU.F.W
    case 0b10110: // VFNCVT.RTZ.XU.F.W
        return [](uint8_t rm, uint8_t& accrued_flags, src_elem_t src) {
            dest_elem_t res = fp_f_to_ui<dest_elem_t, src_elem_t>(rm, src);
            accrued_flags |= softfloat_exceptionFlags;
            return res;
        };
    case 0b10001: // VFNCVT.X.F.W
    case 0b10111: // VFNCVT.RTZ.X.F.W
        return [](uint8_t rm, uint8_t& accrued_flags, src_elem_t src) {
            dest_elem_t res = fp_f_to_i<dest_elem_t, src_elem_t>(rm, src);
            accrued_flags |= softfloat_exceptionFlags;
            return res;
        };
    case 0b10010: // VFNCVT.F.XU.W
        return [](uint8_t rm, uint8_t& accrued_flags, src_elem_t src) {
            dest_elem_t res = fp_ui_to_f<dest_elem_t, src_elem_t>(rm, src);
            accrued_flags |= softfloat_exceptionFlags;
            return res;
        };
    case 0b10011: // VFNCVT.F.X.W
        return [](uint8_t rm, uint8_t& accrued_flags, src_elem_t src) {
            dest_elem_t res = fp_i_to_f<dest_elem_t, src_elem_t>(rm, src);
            accrued_flags |= softfloat_exceptionFlags;
            return res;
        };
    case 0b10100: // VFNCVT.F.F.W
    case 0b10101: // VFNCVT.ROD.F.F.W
        return [](uint8_t rm, uint8_t& accrued_flags, src_elem_t src) {
            dest_elem_t res = fp_f_to_f<dest_elem_t, src_elem_t>(rm, src);
            accrued_flags |= softfloat_exceptionFlags;
            return res;
        };
    default:
        // NOTE(review): pointer throw kept for consistency with this file
        throw new std::runtime_error("Unknown funct in get_fp_narrowing_fn");
    }
}
template <unsigned VLEN, typename dest_elem_t, typename src_elem_t>
void fp_vector_unary_n(uint8_t* V, unsigned unary_op, uint64_t vl, uint64_t vstart, vtype_t vtype, bool vm, unsigned vd, unsigned vs2,
                       uint8_t rm) {
    // Narrowing FP unary conversion (VFNCVT.*): reads src_elem_t elements from
    // vs2 and writes dest_elem_t (half-width) results into vd, honoring the
    // mask-agnostic and tail-agnostic policies.
    const uint64_t vlmax = VLEN * vtype.lmul() / vtype.sew();
    vmask_view mask_reg = read_vmask<VLEN>(V, vlmax);
    auto src = get_vreg<VLEN, src_elem_t>(V, vs2, vlmax);
    auto dest = get_vreg<VLEN, dest_elem_t>(V, vd, vlmax);
    auto convert = get_fp_narrowing_fn<dest_elem_t, src_elem_t>(unary_op);
    uint8_t accrued_flags = 0;
    for(size_t idx = vstart; idx < vl; idx++) {
        if(vm || mask_reg[idx])
            dest[idx] = convert(rm, accrued_flags, src[idx]);
        else if(vtype.vma()) // inactive element, mask-agnostic policy
            dest[idx] = agnostic_behavior(dest[idx]);
    }
    // publish the OR of all per-element softfloat exception flags
    softfloat_exceptionFlags = accrued_flags;
    if(vtype.vta()) // tail elements past vl, tail-agnostic policy
        for(size_t idx = vl; idx < vlmax; idx++)
            dest[idx] = agnostic_behavior(dest[idx]);
}
// FP comparison helpers dispatching to fcmp_h/fcmp_s/fcmp_d from
// fp_functions.h; operands are IEEE-754 bit patterns in uint16/32/64
// (f16/f32/f64). The third fcmp_* argument selects the predicate as used
// here: 0 = equal, 1 = less-or-equal, 2 = less-than.
template <typename elem_size_t> bool fp_eq(elem_size_t, elem_size_t);
template <> inline bool fp_eq<uint16_t>(uint16_t v2, uint16_t v1) { return fcmp_h(v2, v1, 0); }
template <> inline bool fp_eq<uint32_t>(uint32_t v2, uint32_t v1) { return fcmp_s(v2, v1, 0); }
template <> inline bool fp_eq<uint64_t>(uint64_t v2, uint64_t v1) { return fcmp_d(v2, v1, 0); }
template <typename elem_size_t> bool fp_le(elem_size_t, elem_size_t);
template <> inline bool fp_le<uint16_t>(uint16_t v2, uint16_t v1) { return fcmp_h(v2, v1, 1); }
template <> inline bool fp_le<uint32_t>(uint32_t v2, uint32_t v1) { return fcmp_s(v2, v1, 1); }
template <> inline bool fp_le<uint64_t>(uint64_t v2, uint64_t v1) { return fcmp_d(v2, v1, 1); }
template <typename elem_size_t> bool fp_lt(elem_size_t, elem_size_t);
template <> inline bool fp_lt<uint16_t>(uint16_t v2, uint16_t v1) { return fcmp_h(v2, v1, 2); }
template <> inline bool fp_lt<uint32_t>(uint32_t v2, uint32_t v1) { return fcmp_s(v2, v1, 2); }
template <> inline bool fp_lt<uint64_t>(uint64_t v2, uint64_t v1) { return fcmp_d(v2, v1, 2); }
template <typename elem_t> std::function<bool(uint8_t, uint8_t&, elem_t, elem_t)> get_fp_mask_funct(unsigned funct6) {
    // Selects the FP compare for the mask-producing VMF* instructions.
    // GT/GE are realized by swapping the operands of LT/LE. Each lambda ORs
    // the softfloat exception flags of the compare into accrued_flags; the
    // rounding-mode argument is unused by comparisons.
    switch(funct6) {
    case 0b011000: // VMFEQ
        return [](uint8_t, uint8_t& accrued_flags, elem_t lhs, elem_t rhs) {
            bool res = fp_eq(lhs, rhs);
            accrued_flags |= softfloat_exceptionFlags;
            return res;
        };
    case 0b011001: // VMFLE
        return [](uint8_t, uint8_t& accrued_flags, elem_t lhs, elem_t rhs) {
            bool res = fp_le(lhs, rhs);
            accrued_flags |= softfloat_exceptionFlags;
            return res;
        };
    case 0b011011: // VMFLT
        return [](uint8_t, uint8_t& accrued_flags, elem_t lhs, elem_t rhs) {
            bool res = fp_lt(lhs, rhs);
            accrued_flags |= softfloat_exceptionFlags;
            return res;
        };
    case 0b011100: // VMFNE
        return [](uint8_t, uint8_t& accrued_flags, elem_t lhs, elem_t rhs) {
            bool res = !fp_eq(lhs, rhs);
            accrued_flags |= softfloat_exceptionFlags;
            return res;
        };
    case 0b011101: // VMFGT (operands swapped LT)
        return [](uint8_t, uint8_t& accrued_flags, elem_t lhs, elem_t rhs) {
            bool res = fp_lt(rhs, lhs);
            accrued_flags |= softfloat_exceptionFlags;
            return res;
        };
    case 0b011111: // VMFGE (operands swapped LE)
        return [](uint8_t, uint8_t& accrued_flags, elem_t lhs, elem_t rhs) {
            bool res = fp_le(rhs, lhs);
            accrued_flags |= softfloat_exceptionFlags;
            return res;
        };
    default:
        throw new std::runtime_error("Unknown funct6 in get_fp_mask_funct");
    }
}
template <unsigned VLEN, typename elem_t>
void mask_fp_vector_vector_op(uint8_t* V, unsigned funct6, uint64_t vl, uint64_t vstart, vtype_t vtype, bool vm, unsigned vd, unsigned vs2,
unsigned vs1, uint8_t rm) {
uint64_t vlmax = VLEN * vtype.lmul() / vtype.sew();
vmask_view mask_reg = read_vmask<VLEN>(V, vlmax);
auto vs1_view = get_vreg<VLEN, elem_t>(V, vs1, vlmax);
auto vs2_view = get_vreg<VLEN, elem_t>(V, vs2, vlmax);
vmask_view vd_mask_view = read_vmask<VLEN>(V, VLEN, vd);
auto fn = get_fp_mask_funct<elem_t>(funct6);
uint8_t accrued_flags = 0;
for(size_t idx = vstart; idx < vl; idx++) {
bool mask_active = vm ? 1 : mask_reg[idx];
if(mask_active)
vd_mask_view[idx] = fn(rm, accrued_flags, vs2_view[idx], vs1_view[idx]);
else if(vtype.vma())
vd_mask_view[idx] = agnostic_behavior(vd_mask_view[idx]);
}
softfloat_exceptionFlags = accrued_flags;
if(vtype.vta())
for(size_t idx = vl; idx < VLEN; idx++)
vd_mask_view[idx] = agnostic_behavior(vd_mask_view[idx]);
}
template <unsigned VLEN, typename elem_t>
void mask_fp_vector_imm_op(uint8_t* V, unsigned funct6, uint64_t vl, uint64_t vstart, vtype_t vtype, bool vm, unsigned vd, unsigned vs2,
elem_t imm, uint8_t rm) {
uint64_t vlmax = VLEN * vtype.lmul() / vtype.sew();
vmask_view mask_reg = read_vmask<VLEN>(V, vlmax);
auto vs2_view = get_vreg<VLEN, elem_t>(V, vs2, vlmax);
vmask_view vd_mask_view = read_vmask<VLEN>(V, VLEN, vd);
auto fn = get_fp_mask_funct<elem_t>(funct6);
uint8_t accrued_flags = 0;
for(size_t idx = vstart; idx < vl; idx++) {
bool mask_active = vm ? 1 : mask_reg[idx];
if(mask_active)
vd_mask_view[idx] = fn(rm, accrued_flags, vs2_view[idx], imm);
else if(vtype.vma())
vd_mask_view[idx] = agnostic_behavior(vd_mask_view[idx]);
}
softfloat_exceptionFlags = accrued_flags;
if(vtype.vta())
for(size_t idx = vl; idx < VLEN; idx++)
vd_mask_view[idx] = agnostic_behavior(vd_mask_view[idx]);
}
template <unsigned VLEN>
void mask_mask_op(uint8_t* V, unsigned funct6, unsigned funct3, uint64_t vl, uint64_t vstart, unsigned vd, unsigned vs2, unsigned vs1) {
    // Mask-register logical instruction (selected by funct6/funct3):
    // vd[i] = fn(vs2[i], vs1[i]) bit-wise over [vstart, vl).
    uint64_t vlmax = VLEN;
    auto vs1_view = read_vmask<VLEN>(V, vlmax, vs1);
    auto vs2_view = read_vmask<VLEN>(V, vlmax, vs2);
    auto vd_view = read_vmask<VLEN>(V, vlmax, vd);
    auto fn = get_mask_funct<unsigned>(funct6, funct3); // could be bool, but would break the make_signed_t in get_mask_funct
    for(size_t idx = vstart; idx < vl; idx++)
        vd_view[idx] = fn(vs2_view[idx], vs1_view[idx]);
    // the tail is all elements of the destination register beyond the first one
    // NOTE(review): this loop re-writes elements [1, vl) that were just
    // computed above — that is only harmless if agnostic_behavior() is the
    // identity; confirm, or start the loop at vl instead of 1
    for(size_t idx = 1; idx < VLEN; idx++)
        vd_view[idx] = agnostic_behavior(vd_view[idx]);
}
template <unsigned VLEN> uint64_t vcpop(uint8_t* V, uint64_t vl, uint64_t vstart, bool vm, unsigned vs2) {
    // VCPOP.M: returns the number of active, set mask bits of vs2 in [vstart, vl).
    auto bits = read_vmask<VLEN>(V, VLEN, vs2);
    vmask_view mask_reg = read_vmask<VLEN>(V, VLEN);
    uint64_t count = 0;
    for(size_t idx = vstart; idx < vl; idx++)
        if((vm || mask_reg[idx]) && bits[idx])
            ++count;
    return count;
}
template <unsigned VLEN> uint64_t vfirst(uint8_t* V, uint64_t vl, uint64_t vstart, bool vm, unsigned vs2) {
    // VFIRST.M: returns the index of the first active, set mask bit of vs2 in
    // [vstart, vl), or -1 (all ones in uint64_t) when none is set.
    auto bits = read_vmask<VLEN>(V, VLEN, vs2);
    vmask_view mask_reg = read_vmask<VLEN>(V, VLEN);
    for(size_t idx = vstart; idx < vl; idx++)
        if((vm || mask_reg[idx]) && bits[idx])
            return idx;
    return static_cast<uint64_t>(-1);
}
inline std::function<bool(bool&, bool)> get_mask_set_funct(unsigned enc) {
    // Selects the per-element update for the mask set-bit instructions
    // VMSBF/VMSOF/VMSIF. `marker` is carried across elements and records
    // whether the first set source bit has already been seen; once it is
    // set, every later result is 0.
    switch(enc) {
    case 0b00001: // VMSBF: 1 strictly before the first set bit, 0 from it on
        return [](bool& marker, bool vs2) {
            if(marker)
                return false;
            marker = vs2;
            return !vs2;
        };
    case 0b00010: // VMSOF: 1 only at the first set bit
        return [](bool& marker, bool vs2) {
            if(marker)
                return false;
            marker = vs2;
            return vs2;
        };
    case 0b00011: // VMSIF: 1 up to and including the first set bit
        return [](bool& marker, bool vs2) {
            if(marker)
                return false;
            marker = vs2;
            return true;
        };
    default:
        throw new std::runtime_error("Unknown enc in get_mask_set_funct");
    }
}
template <unsigned VLEN> void mask_set_op(uint8_t* V, unsigned enc, uint64_t vl, uint64_t vstart, bool vm, unsigned vd, unsigned vs2) {
    // Mask set-bit instructions (VMSBF/VMSOF/VMSIF, selected by enc): writes
    // a prefix-style mask into vd derived from the first set bit of vs2.
    // The `marker` flag carries the "first set bit already seen" state
    // across iterations.
    uint64_t vlmax = VLEN;
    auto vs2_view = read_vmask<VLEN>(V, vlmax, vs2);
    auto vd_view = read_vmask<VLEN>(V, vlmax, vd);
    vmask_view mask_reg = read_vmask<VLEN>(V, vlmax);
    auto fn = get_mask_set_funct(enc);
    bool marker = false;
    for(size_t idx = vstart; idx < vl; idx++) {
        bool mask_active = vm ? 1 : mask_reg[idx];
        if(mask_active)
            vd_view[idx] = fn(marker, vs2_view[idx]);
    }
    // the tail is all elements of the destination register beyond the first one
    // NOTE(review): this loop re-writes elements [1, vl) that were just
    // computed above — that is only harmless if agnostic_behavior() is the
    // identity; confirm, or start the loop at vl instead of 1
    for(size_t idx = 1; idx < VLEN; idx++)
        vd_view[idx] = agnostic_behavior(vd_view[idx]);
}
template <unsigned VLEN, typename src_elem_t>
void viota(uint8_t* V, uint64_t vl, uint64_t vstart, vtype_t vtype, bool vm, unsigned vd, unsigned vs2) {
    // VIOTA.M: each active element of vd receives the number of set mask bits
    // of vs2 at indices below it (a running prefix count).
    const uint64_t vlmax = VLEN * vtype.lmul() / vtype.sew();
    auto source_mask = read_vmask<VLEN>(V, vlmax, vs2);
    auto dest = get_vreg<VLEN, src_elem_t>(V, vd, vlmax);
    vmask_view mask_reg = read_vmask<VLEN>(V, vlmax);
    unsigned running_count = 0;
    const uint64_t limit = std::min(vl, vlmax);
    for(size_t idx = vstart; idx < limit; idx++) {
        if(vm || mask_reg[idx]) {
            dest[idx] = running_count;
            if(source_mask[idx])
                ++running_count;
        }
    }
}
template <unsigned VLEN, typename src_elem_t> void vid(uint8_t* V, uint64_t vl, uint64_t vstart, vtype_t vtype, bool vm, unsigned vd) {
    // VID.V: writes each active element's own index into vd.
    const uint64_t vlmax = VLEN * vtype.lmul() / vtype.sew();
    auto dest = get_vreg<VLEN, src_elem_t>(V, vd, vlmax);
    vmask_view mask_reg = read_vmask<VLEN>(V, vlmax);
    const uint64_t limit = std::min(vl, vlmax);
    for(size_t idx = vstart; idx < limit; idx++)
        if(vm || mask_reg[idx])
            dest[idx] = idx;
}
template <unsigned VLEN, typename src_elem_t> uint64_t scalar_move(uint8_t* V, vtype_t vtype, unsigned vd, uint64_t val, bool to_vector) {
    // Scalar <-> vector element-0 move (VMV.S.X / VMV.X.S): when to_vector is
    // set, writes val into element 0 of vd (tail per vta policy); in both
    // directions returns element 0 sign-extended to 64 bit.
    const unsigned vlmax = VLEN * vtype.lmul() / vtype.sew();
    auto reg = get_vreg<VLEN, src_elem_t>(V, vd, vlmax);
    if(to_vector) {
        reg[0] = val;
        if(vtype.vta()) // every element but the first is tail here
            for(size_t idx = 1; idx < vlmax; idx++)
                reg[idx] = agnostic_behavior(reg[idx]);
    }
    auto as_signed = static_cast<std::make_signed_t<src_elem_t>>(reg[0]);
    return static_cast<int64_t>(as_signed);
}
template <unsigned VLEN, typename src_elem_t>
void vector_slideup(uint8_t* V, uint64_t vl, uint64_t vstart, vtype_t vtype, bool vm, unsigned vd, unsigned vs2, uint64_t imm) {
    // VSLIDEUP: vd[i] = vs2[i - imm] for active i in [max(vstart, imm), vl);
    // destination elements below the offset are left untouched.
    const uint64_t vlmax = VLEN * vtype.lmul() / (sizeof(src_elem_t) * 8);
    vmask_view mask_reg = read_vmask<VLEN>(V, vlmax);
    auto src = get_vreg<VLEN, src_elem_t>(V, vs2, vlmax);
    auto dest = get_vreg<VLEN, src_elem_t>(V, vd, vlmax);
    for(size_t idx = std::max(vstart, imm); idx < vl; idx++) {
        if(vm || mask_reg[idx]) {
            const size_t from = idx - imm;
            dest[idx] = from < vlmax ? src[from] : 0;
        } else if(vtype.vma()) // inactive element, mask-agnostic policy
            dest[idx] = agnostic_behavior(dest[idx]);
    }
    if(vtype.vta()) // tail elements past vl, tail-agnostic policy
        for(size_t idx = vl; idx < vlmax; idx++)
            dest[idx] = agnostic_behavior(dest[idx]);
}
template <unsigned VLEN, typename src_elem_t>
void vector_slidedown(uint8_t* V, uint64_t vl, uint64_t vstart, vtype_t vtype, bool vm, unsigned vd, unsigned vs2, uint64_t imm) {
    // VSLIDEDOWN: vd[i] = vs2[i + imm] for active i, or 0 when the source
    // index overflows or falls outside the register group.
    const uint64_t vlmax = VLEN * vtype.lmul() / (sizeof(src_elem_t) * 8);
    vmask_view mask_reg = read_vmask<VLEN>(V, vlmax);
    auto src = get_vreg<VLEN, src_elem_t>(V, vs2, vlmax);
    auto dest = get_vreg<VLEN, src_elem_t>(V, vd, vlmax);
    for(size_t idx = vstart; idx < vl; idx++) {
        if(vm || mask_reg[idx]) {
            // reject uint64 overflow of idx + imm before the bounds check
            const bool in_bounds = (std::numeric_limits<uint64_t>::max() - idx > imm) && (idx + imm < vlmax);
            dest[idx] = in_bounds ? src[idx + imm] : 0;
        } else if(vtype.vma()) // inactive element, mask-agnostic policy
            dest[idx] = agnostic_behavior(dest[idx]);
    }
    if(vtype.vta()) // tail elements past vl, tail-agnostic policy
        for(size_t idx = vl; idx < vlmax; idx++)
            dest[idx] = agnostic_behavior(dest[idx]);
}
template <unsigned VLEN, typename src_elem_t>
void vector_slide1up(uint8_t* V, uint64_t vl, uint64_t vstart, vtype_t vtype, bool vm, unsigned vd, unsigned vs2, uint64_t imm) {
    // VSLIDE1UP: shift vs2 up by one element, then insert the scalar (imm)
    // into element 0 (subject to masking).
    vector_slideup<VLEN, src_elem_t>(V, vl, vstart, vtype, vm, vd, vs2, 1);
    vmask_view mask_reg = read_vmask<VLEN>(V, 1);
    auto dest = get_vreg<VLEN, src_elem_t>(V, vd, 1);
    const bool active = vm || mask_reg[0];
    if(active)
        dest[0] = imm;
    else if(vtype.vma()) // inactive element, mask-agnostic policy
        dest[0] = agnostic_behavior(dest[0]);
}
template <unsigned VLEN, typename src_elem_t>
void vector_slide1down(uint8_t* V, uint64_t vl, uint64_t vstart, vtype_t vtype, bool vm, unsigned vd, unsigned vs2, uint64_t imm) {
    // VSLIDE1DOWN: shift vs2 down by one element, then insert the scalar
    // (imm) into the last body element vd[vl - 1] (subject to masking).
    vector_slidedown<VLEN, src_elem_t>(V, vl, vstart, vtype, vm, vd, vs2, 1);
    if(vl == 0) // no body elements: nothing to insert; avoids vl - 1 underflow
        return;
    vmask_view mask_reg = read_vmask<VLEN>(V, vl);
    auto vd_view = get_vreg<VLEN, src_elem_t>(V, vd, vl);
    if(vm || mask_reg[vl - 1])
        vd_view[vl - 1] = imm;
    else if(vtype.vma())
        // fix: apply the mask-agnostic policy to the element that is actually
        // masked off (vl - 1); previously element 0 was modified instead,
        // inconsistent with vector_slide1up which correctly targets its
        // inserted element
        vd_view[vl - 1] = agnostic_behavior(vd_view[vl - 1]);
}
template <unsigned VLEN, typename dest_elem_t, typename scr_elem_t>
void vector_vector_gather(uint8_t* V, uint64_t vl, uint64_t vstart, vtype_t vtype, bool vm, unsigned vd, unsigned vs2, unsigned vs1) {
    // VRGATHER.VV: vd[i] = vs2[vs1[i]], or 0 when the index in vs1 is out of
    // range for the register group.
    const uint64_t vlmax = VLEN * vtype.lmul() / vtype.sew();
    vmask_view mask_reg = read_vmask<VLEN>(V, vlmax);
    auto indices = get_vreg<VLEN, scr_elem_t>(V, vs1, vlmax);
    auto src = get_vreg<VLEN, dest_elem_t>(V, vs2, vlmax);
    auto dest = get_vreg<VLEN, dest_elem_t>(V, vd, vlmax);
    for(size_t idx = vstart; idx < vl; idx++) {
        if(vm || mask_reg[idx]) {
            const auto pos = indices[idx];
            dest[idx] = pos < vlmax ? src[pos] : dest_elem_t{0};
        } else if(vtype.vma()) // inactive element, mask-agnostic policy
            dest[idx] = agnostic_behavior(dest[idx]);
    }
    if(vtype.vta()) // tail elements past vl, tail-agnostic policy
        for(size_t idx = vl; idx < vlmax; idx++)
            dest[idx] = agnostic_behavior(dest[idx]);
}
template <unsigned VLEN, typename scr_elem_t>
void vector_imm_gather(uint8_t* V, uint64_t vl, uint64_t vstart, vtype_t vtype, bool vm, unsigned vd, unsigned vs2, uint64_t imm) {
    // VRGATHER.VX/VI with a fixed index: broadcasts vs2[imm] into every
    // active element of vd, or 0 when imm is out of range.
    const uint64_t vlmax = VLEN * vtype.lmul() / vtype.sew();
    vmask_view mask_reg = read_vmask<VLEN>(V, vlmax);
    auto src = get_vreg<VLEN, scr_elem_t>(V, vs2, vlmax);
    auto dest = get_vreg<VLEN, scr_elem_t>(V, vd, vlmax);
    const bool out_of_range = imm >= vlmax; // the index is loop-invariant
    for(size_t idx = vstart; idx < vl; idx++) {
        if(vm || mask_reg[idx])
            dest[idx] = out_of_range ? scr_elem_t{0} : src[imm];
        else if(vtype.vma()) // inactive element, mask-agnostic policy
            dest[idx] = agnostic_behavior(dest[idx]);
    }
    if(vtype.vta()) // tail elements past vl, tail-agnostic policy
        for(size_t idx = vl; idx < vlmax; idx++)
            dest[idx] = agnostic_behavior(dest[idx]);
}
template <unsigned VLEN, typename scr_elem_t>
void vector_compress(uint8_t* V, uint64_t vl, uint64_t vstart, vtype_t vtype, unsigned vd, unsigned vs2, unsigned vs1) {
    // VCOMPRESS.VM: packs the elements of vs2 whose bit in vs1 is set into
    // consecutive elements of vd starting at element 0; vs1 acts as the
    // selection mask.
    uint64_t vlmax = VLEN * vtype.lmul() / vtype.sew();
    vmask_view mask_reg = read_vmask<VLEN>(V, vlmax, vs1);
    auto vs2_view = get_vreg<VLEN, scr_elem_t>(V, vs2, vlmax);
    auto vd_view = get_vreg<VLEN, scr_elem_t>(V, vd, vlmax);
    unsigned current_pos = 0;
    for(size_t idx = vstart; idx < vl; idx++)
        if(mask_reg[idx]) {
            vd_view[idx] = vs2_view[idx]; // NOTE(review): presumably should be vd_view[current_pos]? see below
            current_pos += 1;
        }
    // NOTE(review): tail handling starts at vl here, leaving elements in
    // [current_pos, vl) undisturbed — confirm against the targeted RVV spec
    // version, which describes the vcompress tail relative to the number of
    // packed elements
    if(vtype.vta())
        for(size_t idx = vl; idx < vlmax; idx++)
            vd_view[idx] = agnostic_behavior(vd_view[idx]);
}
template <unsigned VLEN> void vector_whole_move(uint8_t* V, unsigned vd, unsigned vs2, unsigned count) {
    // VMV<nr>R.V: copies `count` whole vector registers (VLEN/8 bytes each)
    // from the group starting at vs2 to the group starting at vd.
    auto vd_view = get_vreg<VLEN, uint8_t>(V, vd, 1);
    auto vs2_view = get_vreg<VLEN, uint8_t>(V, vs2, 1);
    // fix: use memmove instead of memcpy — for count > 1 the source and
    // destination register ranges can overlap (e.g. vd and vs2 one register
    // apart), and memcpy on overlapping buffers is undefined behavior
    memmove(vd_view.start, vs2_view.start, VLEN / 8 * count);
}
} // namespace softvector