updates indexed load to use vreg_views
This commit is contained in:
parent e24c1874c4
commit 9b7a9fa273
@@ -226,11 +226,27 @@ if(vector != null) {%>
throw std::runtime_error("Unsupported width bit value");
}
}
uint64_t vlxseg(uint8_t* V, uint8_t vd, uint8_t vs2, uint64_t rs1_val, uint64_t vl, uint64_t vstart, softvector::vtype_t vtype, bool vm, uint8_t elem_byte_size, uint16_t elem_count, uint8_t segment_size, bool ordered){
return softvector::vector_load_store_index(this->get_arch(), softvector::softvec_read, V, traits::VLEN, traits::XLEN, vd, vs2, rs1_val, vl, vstart, vtype, vm, elem_byte_size, elem_count, segment_size, ordered);
}
using indexed_load_store_t = std::function<uint64_t(void*, std::function<bool(void*, uint64_t, uint64_t, uint8_t*)>, uint8_t*, uint64_t, uint64_t, softvector::vtype_t, bool, uint8_t, uint64_t, uint8_t, uint8_t)>;
template <typename T1, typename T2> indexed_load_store_t getFunction() {
return [this](void* core, std::function<uint64_t(void*, uint64_t, uint64_t, uint8_t*)> load_store_fn, uint8_t* V, uint64_t vl,
uint64_t vstart, softvector::vtype_t vtype, bool vm, uint8_t vd, uint64_t rs1, uint8_t vs2, uint8_t segment_size) {
return softvector::vector_load_store_index<${xlen}, ${vlen}, T1, T2>(core, load_store_fn, V, vl, vstart, vtype, vm, vd, rs1, vs2, segment_size);
};
}
uint64_t vsxseg(uint8_t* V, uint8_t vs3, uint8_t vs2, uint64_t rs1_val, uint64_t vl, uint64_t vstart, softvector::vtype_t vtype, bool vm, uint8_t elem_byte_size, uint16_t elem_count, uint8_t segment_size, bool ordered){
return softvector::vector_load_store_index(this->get_arch(), softvector::softvec_write, V, traits::VLEN, traits::XLEN, vs3, vs2, rs1_val, vl, vstart, vtype, vm, elem_byte_size, elem_count, segment_size, ordered);
}
const std::array<std::array<indexed_load_store_t, 4>, 4> functionTable = {{
{getFunction<uint8_t, uint8_t>(), getFunction<uint8_t, uint16_t>(), getFunction<uint8_t, uint32_t>(), getFunction<uint8_t, uint64_t>()},
{getFunction<uint16_t, uint8_t>(), getFunction<uint16_t, uint16_t>(), getFunction<uint16_t, uint32_t>(), getFunction<uint16_t, uint64_t>()},
{getFunction<uint32_t, uint8_t>(), getFunction<uint32_t, uint16_t>(), getFunction<uint32_t, uint32_t>(), getFunction<uint32_t, uint64_t>()},
{getFunction<uint64_t, uint8_t>(), getFunction<uint64_t, uint16_t>(), getFunction<uint64_t, uint32_t>(), getFunction<uint64_t, uint64_t>()}
}};
const size_t map_index_size[9] = { 0, 0, 1, 0, 2, 0, 0, 0, 3 }; // translate number of bytes to index in functionTable
uint64_t vlxseg(uint8_t* V, uint64_t vl, uint64_t vstart, softvector::vtype_t vtype, bool vm, uint8_t vd, uint8_t vs2, uint64_t rs1_val, uint8_t segment_size, uint8_t index_byte_size, uint8_t data_byte_size, bool ordered){
return functionTable[map_index_size[index_byte_size]][data_byte_size](this->get_arch(), softvector::softvec_read, V, vl, vstart, vtype, vm, vd, rs1_val, vs2, segment_size);
}
uint64_t vsxseg(uint8_t* V, uint64_t vl, uint64_t vstart, softvector::vtype_t vtype, bool vm, uint8_t vs3, uint8_t vs2, uint64_t rs1_val, uint8_t segment_size, uint8_t index_byte_size, uint8_t data_byte_size, bool ordered){
return functionTable[map_index_size[index_byte_size]][data_byte_size](this->get_arch(), softvector::softvec_write, V, vl, vstart, vtype, vm, vs3, rs1_val, vs2, segment_size);
}
void vector_vector_op(uint8_t* V, uint8_t funct6, uint8_t funct3, uint64_t vl, uint64_t vstart, softvector::vtype_t vtype, bool vm, uint8_t vd, uint8_t vs2, uint8_t vs1, uint8_t sew_val, int8_t carry = 0){
switch(sew_val){
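In the hunk above, getFunction<T1, T2>() produces one std::function per (index element type, data element type) pair, functionTable holds all sixteen instantiations, and map_index_size turns a width in bytes (1, 2, 4, 8) into a table index (0..3), so the generated vlxseg/vsxseg wrappers only have to supply byte widths at run time. The following standalone sketch, with hypothetical names and not taken from the commit, shows the same dispatch pattern in isolation:

// Standalone sketch (hypothetical names, not part of the commit): dispatching to a
// templated implementation via a 4x4 table of std::function, selected by element widths.
#include <array>
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <functional>

using op_t = std::function<void(uint64_t /*base*/, uint64_t /*byte_offset*/)>;

// one instantiation per (index element type, data element type) pair
template <typename index_t, typename data_t> op_t make_op() {
    return [](uint64_t base, uint64_t byte_offset) {
        // a real op would load/store sizeof(data_t) bytes at base + byte_offset
        std::printf("index eew %zu B, data sew %zu B, addr 0x%llx\n", sizeof(index_t), sizeof(data_t),
                    static_cast<unsigned long long>(base + byte_offset));
    };
}

// translate a width in bytes (1, 2, 4, 8) into a table index (0..3), like map_index_size
constexpr size_t width_to_idx[9] = {0, 0, 1, 0, 2, 0, 0, 0, 3};

const std::array<std::array<op_t, 4>, 4> table = {{
    {make_op<uint8_t, uint8_t>(), make_op<uint8_t, uint16_t>(), make_op<uint8_t, uint32_t>(), make_op<uint8_t, uint64_t>()},
    {make_op<uint16_t, uint8_t>(), make_op<uint16_t, uint16_t>(), make_op<uint16_t, uint32_t>(), make_op<uint16_t, uint64_t>()},
    {make_op<uint32_t, uint8_t>(), make_op<uint32_t, uint16_t>(), make_op<uint32_t, uint32_t>(), make_op<uint32_t, uint64_t>()},
    {make_op<uint64_t, uint8_t>(), make_op<uint64_t, uint16_t>(), make_op<uint64_t, uint32_t>(), make_op<uint64_t, uint64_t>()}
}};

int main() {
    table[width_to_idx[2]][width_to_idx[4]](0x1000, 24); // 16-bit indices, 32-bit data elements
}

Building the table from a templated factory keeps the per-width implementations as compile-time instantiations while the decoder side only passes element widths as plain integers.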
@@ -84,82 +84,4 @@ vmask_view read_vmask(uint8_t* V, uint16_t VLEN, uint16_t elem_count, uint8_t re
assert(mask_start + elem_count / 8 <= V + VLEN * RFS / 8);
return {mask_start, elem_count};
}
uint64_t read_n_bits(uint8_t* V, unsigned n) {
switch(n) {
case 8:
return *reinterpret_cast<uint8_t*>(V);
case 16:
return *reinterpret_cast<uint16_t*>(V);
case 32:
return *reinterpret_cast<uint32_t*>(V);
case 64:
return *reinterpret_cast<uint64_t*>(V);
default:
throw std::invalid_argument("Invalid arg in read_n_bits");
}
}
// this function behaves similarly to vector_load_store(...), with the key difference that the SEW and LMUL from the parameters apply to the
// index registers (instead of the data registers) and the SEW and LMUL encoded in vtype apply to the data registers
uint64_t vector_load_store_index(void* core, std::function<bool(void*, uint64_t, uint64_t, uint8_t*)> load_store_fn, uint8_t* V,
uint16_t VLEN, uint8_t XLEN, uint8_t addressed_register, uint8_t index_register, uint64_t base_addr,
uint64_t vl, uint64_t vstart, vtype_t vtype, bool vm, uint8_t index_elem_size_byte, uint64_t elem_count,
uint8_t segment_size, bool ordered) {
// index_eew = index_elem_size_byte * 8
// for now ignore the ordered parameter, as all indexed operations are implemented as ordered
assert(segment_size > 0);
assert((elem_count & (elem_count - 1)) == 0); // check that elem_count is a power of 2
assert(elem_count <= VLEN * RFS / 8);
unsigned data_emul_stride = vtype.lmul() < 1 ? 1 : vtype.lmul();
assert(data_emul_stride * segment_size <= 8);
unsigned data_elem_size_byte = vtype.sew() / 8;
vmask_view mask_reg = read_vmask(V, VLEN, elem_count);
// elements w/ index smaller than vstart are in the prestart and get skipped
// body is from vstart to min(elem_count, vl)
for(unsigned idx = vstart; idx < std::min(elem_count, vl); idx++) {
unsigned trap_idx = idx;
bool mask_active = vm ? 1 : mask_reg[idx];
if(mask_active) {
uint8_t* offset_elem = V + (index_register * VLEN / 8) + (index_elem_size_byte * idx);
assert(offset_elem <= (V + VLEN * RFS / 8 - index_elem_size_byte)); // ensure reading index_elem_size_byte bytes is legal
// read sew bits from offset_elem, truncate / extend to XLEN bits
uint64_t offset_val = read_n_bits(offset_elem, index_elem_size_byte * 8);
assert(XLEN == 64 || XLEN == 32);
uint64_t mask = XLEN == 64 ? std::numeric_limits<uint64_t>::max() : std::numeric_limits<uint32_t>::max();
unsigned index_offset = offset_val & mask;
for(unsigned s_idx = 0; s_idx < segment_size; s_idx++) {
// base + selected register + current_elem + current_segment
uint8_t* addressed_elem =
V + (addressed_register * VLEN / 8) + (data_elem_size_byte * idx) + (VLEN / 8 * s_idx * data_emul_stride);
assert(addressed_elem <= V + VLEN * RFS / 8);
// base + offset + current_segment
uint64_t addr = base_addr + index_offset + s_idx * data_elem_size_byte;
if(!load_store_fn(core, addr, data_elem_size_byte, addressed_elem))
return trap_idx;
}
} else {
for(unsigned s_idx = 0; s_idx < segment_size; s_idx++) {
// base + selected register + current_elem + current_segment
uint8_t* addressed_elem =
V + (addressed_register * VLEN / 8) + (data_elem_size_byte * idx) + (VLEN / 8 * s_idx * data_emul_stride);
assert(addressed_elem <= V + VLEN * RFS / 8);
// this only updates the first 8 bits, so eew > 8 would not work correctly
*addressed_elem = vtype.vma() ? *addressed_elem : *addressed_elem;
}
}
}
// elements w/ index larger than elem_count are in the tail (fractional LMUL)
// elements w/ index larger than vl are in the tail
unsigned maximum_elems = VLEN * vtype.lmul() / (data_elem_size_byte * 8);
for(unsigned idx = std::min(elem_count, vl); idx < maximum_elems; idx++) {
for(unsigned s_idx = 0; s_idx < segment_size; s_idx++) {
// base + selected register + current_elem + current_segment
uint8_t* addressed_elem =
V + (addressed_register * VLEN / 8) + (data_elem_size_byte * idx) + (VLEN / 8 * s_idx * data_emul_stride);
assert(addressed_elem <= V + VLEN * RFS / 8);
// this only updates the first 8 bits, so eew > 8 would not work correctly
*addressed_elem = vtype.vta() ? *addressed_elem : *addressed_elem;
}
}
return 0;
}
} // namespace softvector
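Both the removed implementation above and the templated replacement in the last hunk form effective addresses the same way: the index element is zero-extended to XLEN, added to the base address from rs1, and the fields of one segment follow contiguously (addr = base_addr + index_offset + s_idx * data_elem_size_byte). A small standalone arithmetic sketch with made-up values, not from the commit:

#include <cstdint>
#include <cstdio>

int main() {
    uint64_t base_addr = 0x1000;      // rs1
    uint64_t index_offset = 24;       // zero-extended value read from the index register group
    unsigned data_elem_size_byte = 4; // data SEW = 32 bit
    unsigned segment_size = 2;        // two fields per segment
    for(unsigned s_idx = 0; s_idx < segment_size; s_idx++) {
        uint64_t addr = base_addr + index_offset + s_idx * data_elem_size_byte;
        std::printf("field %u of the segment -> 0x%llx\n", s_idx, static_cast<unsigned long long>(addr));
    }
    return 0; // prints 0x1018 and 0x101c
}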
@@ -76,6 +76,10 @@ template <unsigned VLEN, typename eew_t>
uint64_t vector_load_store(void* core, std::function<bool(void*, uint64_t, uint64_t, uint8_t*)> load_store_fn, uint8_t* V, uint64_t vl,
uint64_t vstart, vtype_t vtype, bool vm, uint8_t vd, uint64_t rs1, uint8_t segment_size, int64_t stride = 0,
bool use_stride = false);
template <unsigned XLEN, unsigned VLEN, typename eew_t, typename sew_t>
uint64_t vector_load_store_index(void* core, std::function<bool(void*, uint64_t, uint64_t, uint8_t*)> load_store_fn, uint8_t* V,
uint64_t vl, uint64_t vstart, vtype_t vtype, bool vm, uint8_t vd, uint64_t rs1, uint8_t vs2,
uint8_t segment_size);
template <unsigned VLEN, typename dest_elem_t, typename src2_elem_t = dest_elem_t, typename src1_elem_t = src2_elem_t>
void vector_vector_op(uint8_t* V, unsigned funct6, unsigned funct3, uint64_t vl, uint64_t vstart, vtype_t vtype, bool vm, unsigned vd,
unsigned vs2, unsigned vs1, carry_t carry = carry_t::NO_CARRY, bool merge = false);
@@ -121,10 +121,42 @@ uint64_t vector_load_store(void* core, std::function<bool(void*, uint64_t, uint6
}
}
for(size_t idx = vl; idx < vlmax; idx++) {
// vtype.vta();
}
return 0;
}
// eew for index registers, sew for data register
template <unsigned XLEN, unsigned VLEN, typename eew_t, typename sew_t>
uint64_t vector_load_store_index(void* core, std::function<bool(void*, uint64_t, uint64_t, uint8_t*)> load_store_fn, uint8_t* V,
uint64_t vl, uint64_t vstart, vtype_t vtype, bool vm, uint8_t vd, uint64_t rs1, uint8_t vs2,
uint8_t segment_size) {
// All load stores are ordered in this implementation
unsigned vlmax = VLEN * vtype.lmul() / vtype.sew();
auto emul_stride = std::max<unsigned>(vlmax, VLEN / (sizeof(sew_t) * 8));
auto vd_view = get_vreg<VLEN, sew_t>(V, vd, emul_stride * segment_size);
auto vs2_view = get_vreg<VLEN, eew_t>(V, vs2, vlmax);
vmask_view mask_reg = read_vmask(V, VLEN, vlmax);
for(size_t idx = vstart; idx < vl; idx++) {
bool mask_active = vm ? 1 : mask_reg[idx];
if(mask_active) {
uint64_t index_offset = vs2_view[idx] & std::numeric_limits<std::conditional_t<XLEN == 32, uint32_t, uint64_t>>::max();
for(size_t s_idx = 0; s_idx < segment_size; s_idx++) {
sew_t* addressed_elem = &vd_view[idx + emul_stride * s_idx];
uint64_t addr = rs1 + index_offset + s_idx * sizeof(sew_t);
if(!load_store_fn(core, addr, sizeof(sew_t), reinterpret_cast<uint8_t*>(addressed_elem)))
return idx;
}
} else {
for(size_t s_idx = 0; s_idx < segment_size; s_idx++) {
// vtype.vma();
}
}
}
for(size_t idx = vl; idx < vlmax; idx++) {
// vtype.vta();
}
return 0;
}

template <typename dest_elem_t, typename src2_elem_t = dest_elem_t, typename src1_elem_t = dest_elem_t>
std::function<dest_elem_t(dest_elem_t, src2_elem_t, src1_elem_t)> get_funct(unsigned funct6, unsigned funct3) {
if(funct3 == OPIVV || funct3 == OPIVX || funct3 == OPIVI)