corrects vector_slide, changes all loop index types

Eyck-Alexander Jentzsch 2025-03-06 15:25:29 +01:00
parent 42bf6ee380
commit fe9f2a5455
3 changed files with 93 additions and 75 deletions

View File

@ -629,13 +629,13 @@ if(vector != null) {%>
void vector_slideup(uint8_t* V, uint64_t vl, uint64_t vstart, softvector::vtype_t vtype, bool vm, unsigned vd, unsigned vs2, uint64_t imm, uint8_t sew_val) {
switch(sew_val){
case 0b000:
return softvector::vector_slide<${vlen}, uint8_t>(V, vl, vstart, vtype, vm, vd, vs2, imm);
return softvector::vector_slideup<${vlen}, uint8_t>(V, vl, vstart, vtype, vm, vd, vs2, imm);
case 0b001:
return softvector::vector_slide<${vlen}, uint16_t>(V, vl, vstart, vtype, vm, vd, vs2, imm);
return softvector::vector_slideup<${vlen}, uint16_t>(V, vl, vstart, vtype, vm, vd, vs2, imm);
case 0b010:
return softvector::vector_slide<${vlen}, uint32_t>(V, vl, vstart, vtype, vm, vd, vs2, imm);
return softvector::vector_slideup<${vlen}, uint32_t>(V, vl, vstart, vtype, vm, vd, vs2, imm);
case 0b011:
return softvector::vector_slide<${vlen}, uint64_t>(V, vl, vstart, vtype, vm, vd, vs2, imm);
return softvector::vector_slideup<${vlen}, uint64_t>(V, vl, vstart, vtype, vm, vd, vs2, imm);
default:
throw std::runtime_error("Unsupported sew bit value");
}
@ -643,13 +643,13 @@ if(vector != null) {%>
void vector_slidedown(uint8_t* V, uint64_t vl, uint64_t vstart, softvector::vtype_t vtype, bool vm, unsigned vd, unsigned vs2, uint64_t imm, uint8_t sew_val) {
switch(sew_val){
case 0b000:
return softvector::vector_slide<${vlen}, uint8_t>(V, vl, vstart, vtype, vm, vd, vs2, -imm);
return softvector::vector_slidedown<${vlen}, uint8_t>(V, vl, vstart, vtype, vm, vd, vs2, imm);
case 0b001:
return softvector::vector_slide<${vlen}, uint16_t>(V, vl, vstart, vtype, vm, vd, vs2, -imm);
return softvector::vector_slidedown<${vlen}, uint16_t>(V, vl, vstart, vtype, vm, vd, vs2, imm);
case 0b010:
return softvector::vector_slide<${vlen}, uint32_t>(V, vl, vstart, vtype, vm, vd, vs2, -imm);
return softvector::vector_slidedown<${vlen}, uint32_t>(V, vl, vstart, vtype, vm, vd, vs2, imm);
case 0b011:
return softvector::vector_slide<${vlen}, uint64_t>(V, vl, vstart, vtype, vm, vd, vs2, -imm);
return softvector::vector_slidedown<${vlen}, uint64_t>(V, vl, vstart, vtype, vm, vd, vs2, imm);
default:
throw std::runtime_error("Unsupported sew bit value");
}
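
The old dispatch reused a single vector_slide for both directions by negating the slide amount (the removed -imm calls above). With a 64-bit unsigned amount that trick can silently flip direction. A minimal standalone sketch of the hazard; the value is hypothetical, not taken from the commit:

#include <cstdint>
#include <cstdio>

int main() {
    uint64_t imm = (1ull << 63) + 5; // slide amount taken from a 64-bit register
    uint64_t neg = -imm;             // well-defined modular wrap: 2^63 - 5
    int64_t as_signed = static_cast<int64_t>(neg);
    // as_signed is a large *positive* value here, so a signed-offset
    // vector_slide would slide up instead of down for this amount
    std::printf("%lld\n", static_cast<long long>(as_signed));
}

Splitting the entry points into vector_slideup and vector_slidedown with uint64_t offsets removes the negation entirely.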

View File

@ -124,11 +124,13 @@ void viota(uint8_t* V, uint64_t vl, uint64_t vstart, vtype_t vtype, bool vm, uns
template <unsigned VLEN, typename src_elem_t> void vid(uint8_t* V, uint64_t vl, uint64_t vstart, vtype_t vtype, bool vm, unsigned vd);
template <unsigned VLEN, typename src_elem_t> uint64_t scalar_move(uint8_t* V, vtype_t vtype, unsigned vd, uint64_t val, bool to_vector);
template <unsigned VLEN, typename src_elem_t>
void vector_slide(uint8_t* V, uint64_t vl, uint64_t vstart, vtype_t vtype, bool vm, unsigned vd, unsigned vs2, int64_t imm);
void vector_slideup(uint8_t* V, uint64_t vl, uint64_t vstart, vtype_t vtype, bool vm, unsigned vd, unsigned vs2, uint64_t imm);
template <unsigned VLEN, typename src_elem_t>
void vector_slide1up(uint8_t* V, uint64_t vl, uint64_t vstart, vtype_t vtype, bool vm, unsigned vd, unsigned vs2, int64_t imm);
void vector_slidedown(uint8_t* V, uint64_t vl, uint64_t vstart, vtype_t vtype, bool vm, unsigned vd, unsigned vs2, uint64_t imm);
template <unsigned VLEN, typename src_elem_t>
void vector_slide1down(uint8_t* V, uint64_t vl, uint64_t vstart, vtype_t vtype, bool vm, unsigned vd, unsigned vs2, int64_t imm);
void vector_slide1up(uint8_t* V, uint64_t vl, uint64_t vstart, vtype_t vtype, bool vm, unsigned vd, unsigned vs2, uint64_t imm);
template <unsigned VLEN, typename src_elem_t>
void vector_slide1down(uint8_t* V, uint64_t vl, uint64_t vstart, vtype_t vtype, bool vm, unsigned vd, unsigned vs2, uint64_t imm);
template <unsigned VLEN, typename dest_elem_t, typename scr_elem_t = dest_elem_t>
void vector_vector_gather(uint8_t* V, uint64_t vl, uint64_t vstart, vtype_t vtype, bool vm, unsigned vd, unsigned vs2, unsigned vs1);
template <unsigned VLEN, typename scr_elem_t>
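
The header now declares separate slideup/slidedown entry points, and every slide offset becomes uint64_t. Besides matching the new dispatch, this restores the upper half of the offset range, which a signed parameter silently reinterprets; a two-line illustration with a hypothetical register value:

uint64_t rs1_val = 1ull << 63;                     // offset from a scalar register
int64_t old_param = static_cast<int64_t>(rs1_val); // wraps to INT64_MIN on two's-complement targets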

View File

@ -349,7 +349,7 @@ void vector_vector_op(uint8_t* V, unsigned funct6, unsigned funct3, uint64_t vl,
// elements w/ index smaller than vstart are in the prestart and get skipped
// body is from vstart to min(elem_count, vl)
if(carry == carry_t::NO_CARRY) {
for(unsigned idx = vstart; idx < std::min(elem_count, vl); idx++) {
for(size_t idx = vstart; idx < std::min(elem_count, vl); idx++) {
bool mask_active = vm ? 1 : mask_reg[idx];
if(mask_active) {
vd_view[idx] = fn(vd_view[idx], vs2_view[idx], vs1_view[idx]);
@ -358,18 +358,18 @@ void vector_vector_op(uint8_t* V, unsigned funct6, unsigned funct3, uint64_t vl,
}
}
} else if(carry == carry_t::SUB_CARRY) {
for(unsigned idx = vstart; idx < std::min(elem_count, vl); idx++) {
for(size_t idx = vstart; idx < std::min(elem_count, vl); idx++) {
vd_view[idx] = fn(vd_view[idx], vs2_view[idx], vs1_view[idx]) - mask_reg[idx];
}
} else {
for(unsigned idx = vstart; idx < std::min(elem_count, vl); idx++) {
for(size_t idx = vstart; idx < std::min(elem_count, vl); idx++) {
vd_view[idx] = fn(vd_view[idx], vs2_view[idx], vs1_view[idx]) + mask_reg[idx];
}
}
// elements w/ index larger than elem_count are in the tail (fractional LMUL)
// elements w/ index larger than vl are in the tail
unsigned maximum_elems = VLEN * vtype.lmul() / (sizeof(dest_elem_t) * 8);
for(unsigned idx = std::min(elem_count, vl); idx < maximum_elems; idx++) {
for(size_t idx = std::min(elem_count, vl); idx < maximum_elems; idx++) {
vd_view[idx] = vtype.vta() ? vd_view[idx] : vd_view[idx];
}
return;
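
The index type change is more than style: the loop bounds (vl, elem_count) are uint64_t, and on LP64 targets unsigned is only 32 bits wide. A minimal sketch of the failure class a 32-bit index invites; the bound is hypothetical and far larger than anything the VLEN values here produce:

#include <cstdint>

void count_to(uint64_t bound) {
    // if bound > UINT32_MAX: idx is widened to uint64_t for the comparison,
    // but the increment wraps idx back to 0 after UINT32_MAX, so the loop
    // never terminates
    for(unsigned idx = 0; idx < bound; idx++) {
    }
    // size_t is 64 bits on LP64 targets, so the same loop terminates
    for(size_t idx = 0; idx < bound; idx++) {
    }
}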
@ -385,7 +385,7 @@ void vector_imm_op(uint8_t* V, unsigned funct6, unsigned funct3, uint64_t vl, ui
// elements w/ index smaller than vstart are in the prestart and get skipped
// body is from vstart to min(elem_count, vl)
if(carry == carry_t::NO_CARRY) {
for(unsigned idx = vstart; idx < std::min(elem_count, vl); idx++) {
for(size_t idx = vstart; idx < std::min(elem_count, vl); idx++) {
bool mask_active = vm ? 1 : mask_reg[idx];
if(mask_active) {
vd_view[idx] = fn(vd_view[idx], vs2_view[idx], imm);
@ -394,18 +394,18 @@ void vector_imm_op(uint8_t* V, unsigned funct6, unsigned funct3, uint64_t vl, ui
}
}
} else if(carry == carry_t::SUB_CARRY) {
for(unsigned idx = vstart; idx < std::min(elem_count, vl); idx++) {
for(size_t idx = vstart; idx < std::min(elem_count, vl); idx++) {
vd_view[idx] = fn(vd_view[idx], vs2_view[idx], imm) - mask_reg[idx];
}
} else {
for(unsigned idx = vstart; idx < std::min(elem_count, vl); idx++) {
for(size_t idx = vstart; idx < std::min(elem_count, vl); idx++) {
vd_view[idx] = fn(vd_view[idx], vs2_view[idx], imm) + mask_reg[idx];
}
}
// elements w/ index larger than elem_count are in the tail (fractional LMUL)
// elements w/ index larger than vl are in the tail
unsigned maximum_elems = VLEN * vtype.lmul() / (sizeof(dest_elem_t) * 8);
for(unsigned idx = std::min(elem_count, vl); idx < maximum_elems; idx++) {
for(size_t idx = std::min(elem_count, vl); idx < maximum_elems; idx++) {
vd_view[idx] = vtype.vta() ? vd_view[idx] : vd_view[idx];
}
return;
@ -417,7 +417,7 @@ void vector_vector_merge(uint8_t* V, uint64_t vl, uint64_t vstart, vtype_t vtype
auto vs1_view = get_vreg<VLEN, scr_elem_t>(V, vs1, elem_count);
auto vs2_view = get_vreg<VLEN, scr_elem_t>(V, vs2, elem_count);
auto vd_view = get_vreg<VLEN, scr_elem_t>(V, vd, elem_count);
for(unsigned idx = vstart; idx < vl; idx++) {
for(size_t idx = vstart; idx < vl; idx++) {
bool mask_active = vm ? 1 : mask_reg[idx];
if(mask_active)
vd_view[idx] = vs1_view[idx];
@ -431,7 +431,7 @@ void vector_imm_merge(uint8_t* V, uint64_t vl, uint64_t vstart, vtype_t vtype, b
vmask_view mask_reg = read_vmask<VLEN>(V, elem_count);
auto vs2_view = get_vreg<VLEN, scr_elem_t>(V, vs2, elem_count);
auto vd_view = get_vreg<VLEN, scr_elem_t>(V, vd, elem_count);
for(unsigned idx = vstart; idx < vl; idx++) {
for(size_t idx = vstart; idx < vl; idx++) {
bool mask_active = vm ? 1 : mask_reg[idx];
if(mask_active)
vd_view[idx] = imm;
@ -503,7 +503,7 @@ void mask_vector_vector_op(uint8_t* V, unsigned funct6, unsigned funct3, uint64_
auto fn = get_mask_funct<elem_t>(funct6, funct3);
// elements w/ index smaller than vstart are in the prestart and get skipped
// body is from vstart to min(elem_count, vl)
for(unsigned idx = vstart; idx < std::min(elem_count, vl); idx++) {
for(size_t idx = vstart; idx < std::min(elem_count, vl); idx++) {
bool mask_active = vm ? 1 : mask_reg[idx];
if(mask_active) {
vd_mask_view[idx] = fn(vs2_view[idx], vs1_view[idx]);
@ -513,7 +513,7 @@ void mask_vector_vector_op(uint8_t* V, unsigned funct6, unsigned funct3, uint64_
}
// elements w/ index larger than elem_count are in the tail (fractional LMUL)
// elements w/ index larger than vl are in the tail
for(unsigned idx = std::min(elem_count, vl); idx < VLEN; idx++) {
for(size_t idx = std::min(elem_count, vl); idx < VLEN; idx++) {
vd_mask_view[idx] = vtype.vta() ? vd_mask_view[idx] : vd_mask_view[idx];
}
return;
@ -528,7 +528,7 @@ void mask_vector_imm_op(uint8_t* V, unsigned funct6, unsigned funct3, uint64_t v
auto fn = get_mask_funct<elem_t>(funct6, funct3);
// elements w/ index smaller than vstart are in the prestart and get skipped
// body is from vstart to min(elem_count, vl)
for(unsigned idx = vstart; idx < std::min(elem_count, vl); idx++) {
for(size_t idx = vstart; idx < std::min(elem_count, vl); idx++) {
bool mask_active = vm ? 1 : mask_reg[idx];
if(mask_active) {
vd_mask_view[idx] = fn(vs2_view[idx], imm);
@ -538,7 +538,7 @@ void mask_vector_imm_op(uint8_t* V, unsigned funct6, unsigned funct3, uint64_t v
}
// elements w/ index larger than elem_count are in the tail (fractional LMUL)
// elements w/ index larger than vl are in the tail
for(unsigned idx = std::min(elem_count, vl); idx < VLEN; idx++) {
for(size_t idx = std::min(elem_count, vl); idx < VLEN; idx++) {
vd_mask_view[idx] = vtype.vta() ? vd_mask_view[idx] : vd_mask_view[idx];
}
return;
@ -567,7 +567,7 @@ void vector_unary_op(uint8_t* V, unsigned unary_op, uint64_t vl, uint64_t vstart
auto fn = get_unary_fn<dest_elem_t, src2_elem_t>(unary_op);
// elements w/ index smaller than vstart are in the prestart and get skipped
// body is from vstart to min(elem_count, vl)
for(unsigned idx = vstart; idx < std::min(elem_count, vl); idx++) {
for(size_t idx = vstart; idx < std::min(elem_count, vl); idx++) {
bool mask_active = vm ? 1 : mask_reg[idx];
if(mask_active) {
vd_view[idx] = fn(vs2_view[idx]);
@ -578,7 +578,7 @@ void vector_unary_op(uint8_t* V, unsigned unary_op, uint64_t vl, uint64_t vstart
// elements w/ index larger than elem_count are in the tail (fractional LMUL)
// elements w/ index larger than vl are in the tail
unsigned maximum_elems = VLEN * vtype.lmul() / (sizeof(dest_elem_t) * 8);
for(unsigned idx = std::min(elem_count, vl); idx < maximum_elems; idx++) {
for(size_t idx = std::min(elem_count, vl); idx < maximum_elems; idx++) {
vd_view[idx] = vtype.vta() ? vd_view[idx] : vd_view[idx];
}
return;
@ -608,13 +608,13 @@ void carry_vector_vector_op(uint8_t* V, unsigned funct, uint64_t vl, uint64_t vs
auto fn = get_carry_funct<elem_t>(funct);
// elements w/ index smaller than vstart are in the prestart and get skipped
// body is from vstart to min(elem_count, vl)
for(unsigned idx = vstart; idx < std::min(elem_count, vl); idx++) {
for(size_t idx = vstart; idx < std::min(elem_count, vl); idx++) {
elem_t carry = vm ? 0 : mask_reg[idx];
vd_mask_view[idx] = fn(vs2_view[idx], vs1_view[idx], carry);
}
// elements w/ index larger than elem_count are in the tail (fractional LMUL)
// elements w/ index larger than vl are in the tail
for(unsigned idx = std::min(elem_count, vl); idx < VLEN; idx++) {
for(size_t idx = std::min(elem_count, vl); idx < VLEN; idx++) {
// always tail agnostic
}
return;
@ -629,13 +629,13 @@ void carry_vector_imm_op(uint8_t* V, unsigned funct, uint64_t vl, uint64_t vstar
auto fn = get_carry_funct<elem_t>(funct);
// elements w/ index smaller than vstart are in the prestart and get skipped
// body is from vstart to min(elem_count, vl)
for(unsigned idx = vstart; idx < std::min(elem_count, vl); idx++) {
for(size_t idx = vstart; idx < std::min(elem_count, vl); idx++) {
elem_t carry = vm ? 0 : mask_reg[idx];
vd_mask_view[idx] = fn(vs2_view[idx], imm, carry);
}
// elements w/ index larger than elem_count are in the tail (fractional LMUL)
// elements w/ index larger than vl are in the tail
for(unsigned idx = std::min(elem_count, vl); idx < VLEN; idx++) {
for(size_t idx = std::min(elem_count, vl); idx < VLEN; idx++) {
// always tail agnostic
}
return;
@ -814,7 +814,7 @@ bool sat_vector_vector_op(uint8_t* V, unsigned funct6, unsigned funct3, uint64_t
auto fn = get_sat_funct<dest_elem_t, src2_elem_t, src1_elem_T>(funct6, funct3);
// elements w/ index smaller than vstart are in the prestart and get skipped
// body is from vstart to min(elem_count, vl)
for(unsigned idx = vstart; idx < std::min(elem_count, vl); idx++) {
for(size_t idx = vstart; idx < std::min(elem_count, vl); idx++) {
bool mask_active = vm ? 1 : mask_reg[idx];
if(mask_active) {
saturated |= fn(vxrm, vtype, vd_view[idx], vs2_view[idx], vs1_view[idx]);
@ -825,7 +825,7 @@ bool sat_vector_vector_op(uint8_t* V, unsigned funct6, unsigned funct3, uint64_t
// elements w/ index larger than elem_count are in the tail (fractional LMUL)
// elements w/ index larger than vl are in the tail
unsigned maximum_elems = VLEN * vtype.lmul() / (sizeof(dest_elem_t) * 8);
for(unsigned idx = std::min(elem_count, vl); idx < maximum_elems; idx++) {
for(size_t idx = std::min(elem_count, vl); idx < maximum_elems; idx++) {
vd_view[idx] = vtype.vta() ? vd_view[idx] : vd_view[idx];
}
return saturated;
@ -841,7 +841,7 @@ bool sat_vector_imm_op(uint8_t* V, unsigned funct6, unsigned funct3, uint64_t vl
auto fn = get_sat_funct<dest_elem_t, src2_elem_t, src1_elem_T>(funct6, funct3);
// elements w/ index smaller than vstart are in the prestart and get skipped
// body is from vstart to min(elem_count, vl)
for(unsigned idx = vstart; idx < std::min(elem_count, vl); idx++) {
for(size_t idx = vstart; idx < std::min(elem_count, vl); idx++) {
bool mask_active = vm ? 1 : mask_reg[idx];
if(mask_active) {
saturated |= fn(vxrm, vtype, vd_view[idx], vs2_view[idx], imm);
@ -852,7 +852,7 @@ bool sat_vector_imm_op(uint8_t* V, unsigned funct6, unsigned funct3, uint64_t vl
// elements w/ index larger than elem_count are in the tail (fractional LMUL)
// elements w/ index larger than vl are in the tail
unsigned maximum_elems = VLEN * vtype.lmul() / (sizeof(dest_elem_t) * 8);
for(unsigned idx = std::min(elem_count, vl); idx < maximum_elems; idx++) {
for(size_t idx = std::min(elem_count, vl); idx < maximum_elems; idx++) {
vd_view[idx] = vtype.vta() ? vd_view[idx] : vd_view[idx];
}
return saturated;
@ -916,7 +916,7 @@ void vector_red_op(uint8_t* V, unsigned funct6, unsigned funct3, uint64_t vl, ui
auto vd_view = get_vreg<VLEN, dest_elem_t>(V, vd, elem_count);
auto fn = get_red_funct<dest_elem_t, src_elem_t>(funct6, funct3);
dest_elem_t& running_total = {vs1_elem};
for(unsigned idx = vstart; idx < std::min(elem_count, vl); idx++) {
for(size_t idx = vstart; idx < std::min(elem_count, vl); idx++) {
bool mask_active = vm ? 1 : mask_reg[idx];
if(mask_active) {
fn(running_total, vs2_view[idx]);
@ -924,7 +924,7 @@ void vector_red_op(uint8_t* V, unsigned funct6, unsigned funct3, uint64_t vl, ui
}
vd_view[0] = running_total;
// the tail is all elements of the destination register beyond the first one
for(unsigned idx = 1; idx < VLEN / (vtype.sew() * RFS); idx++) {
for(size_t idx = 1; idx < VLEN / (vtype.sew() * RFS); idx++) {
vd_view[idx] = vtype.vta() ? vd_view[idx] : vd_view[idx];
}
return;
@ -1228,7 +1228,7 @@ void fp_vector_vector_op(uint8_t* V, unsigned funct6, unsigned funct3, uint64_t
uint8_t accrued_flags = 0;
// elements w/ index smaller than vstart are in the prestart and get skipped
// body is from vstart to min(elem_count, vl)
for(unsigned idx = vstart; idx < std::min(elem_count, vl); idx++) {
for(size_t idx = vstart; idx < std::min(elem_count, vl); idx++) {
bool mask_active = vm ? 1 : mask_reg[idx];
if(mask_active) {
vd_view[idx] = fn(rm, accrued_flags, vd_view[idx], vs2_view[idx], vs1_view[idx]);
@ -1240,7 +1240,7 @@ void fp_vector_vector_op(uint8_t* V, unsigned funct6, unsigned funct3, uint64_t
// elements w/ index larger than elem_count are in the tail (fractional LMUL)
// elements w/ index larger than vl are in the tail
unsigned maximum_elems = VLEN * vtype.lmul() / (sizeof(dest_elem_t) * 8);
for(unsigned idx = std::min(elem_count, vl); idx < maximum_elems; idx++) {
for(size_t idx = std::min(elem_count, vl); idx < maximum_elems; idx++) {
vd_view[idx] = vtype.vta() ? vd_view[idx] : vd_view[idx];
}
return;
@ -1256,7 +1256,7 @@ void fp_vector_imm_op(uint8_t* V, unsigned funct6, unsigned funct3, uint64_t vl,
uint8_t accrued_flags = 0;
// elements w/ index smaller than vstart are in the prestart and get skipped
// body is from vstart to min(elem_count, vl)
for(unsigned idx = vstart; idx < std::min(elem_count, vl); idx++) {
for(size_t idx = vstart; idx < std::min(elem_count, vl); idx++) {
bool mask_active = vm ? 1 : mask_reg[idx];
if(mask_active) {
vd_view[idx] = fn(rm, accrued_flags, vd_view[idx], vs2_view[idx], imm);
@ -1268,7 +1268,7 @@ void fp_vector_imm_op(uint8_t* V, unsigned funct6, unsigned funct3, uint64_t vl,
// elements w/ index larger than elem_count are in the tail (fractional LMUL)
// elements w/ index larger than vl are in the tail
unsigned maximum_elems = VLEN * vtype.lmul() / (sizeof(dest_elem_t) * 8);
for(unsigned idx = std::min(elem_count, vl); idx < maximum_elems; idx++) {
for(size_t idx = std::min(elem_count, vl); idx < maximum_elems; idx++) {
vd_view[idx] = vtype.vta() ? vd_view[idx] : vd_view[idx];
}
return;
@ -1324,7 +1324,7 @@ void fp_vector_red_op(uint8_t* V, unsigned funct6, unsigned funct3, uint64_t vl,
auto fn = get_fp_red_funct<dest_elem_t, src_elem_t>(funct6, funct3);
dest_elem_t& running_total = {vs1_elem};
uint8_t accrued_flags = 0;
for(unsigned idx = vstart; idx < std::min(elem_count, vl); idx++) {
for(size_t idx = vstart; idx < std::min(elem_count, vl); idx++) {
bool mask_active = vm ? 1 : mask_reg[idx];
if(mask_active) {
fn(rm, accrued_flags, running_total, vs2_view[idx]);
@ -1333,7 +1333,7 @@ void fp_vector_red_op(uint8_t* V, unsigned funct6, unsigned funct3, uint64_t vl,
vd_view[0] = running_total;
softfloat_exceptionFlags = accrued_flags;
// the tail is all elements of the destination register beyond the first one
for(unsigned idx = 1; idx < VLEN / (vtype.sew() * RFS); idx++) {
for(size_t idx = 1; idx < VLEN / (vtype.sew() * RFS); idx++) {
vd_view[idx] = vtype.vta() ? vd_view[idx] : vd_view[idx];
}
return;
@ -1517,7 +1517,7 @@ void fp_vector_unary_op(uint8_t* V, unsigned encoding_space, unsigned unary_op,
auto vd_view = get_vreg<VLEN, dest_elem_t>(V, vd, elem_count);
auto fn = get_fp_unary_fn<dest_elem_t, src_elem_t>(encoding_space, unary_op);
uint8_t accrued_flags = 0;
for(unsigned idx = vstart; idx < std::min(elem_count, vl); idx++) {
for(size_t idx = vstart; idx < std::min(elem_count, vl); idx++) {
bool mask_active = vm ? 1 : mask_reg[idx];
if(mask_active) {
vd_view[idx] = fn(rm, accrued_flags, vs2_view[idx]);
@ -1527,7 +1527,7 @@ void fp_vector_unary_op(uint8_t* V, unsigned encoding_space, unsigned unary_op,
}
softfloat_exceptionFlags = accrued_flags;
unsigned maximum_elems = VLEN * vtype.lmul() / (sizeof(dest_elem_t) * 8);
for(unsigned idx = std::min(elem_count, vl); idx < maximum_elems; idx++) {
for(size_t idx = std::min(elem_count, vl); idx < maximum_elems; idx++) {
vd_view[idx] = vtype.vta() ? vd_view[idx] : vd_view[idx];
}
return;
@ -1593,7 +1593,7 @@ void mask_fp_vector_vector_op(uint8_t* V, unsigned funct6, uint64_t vl, uint64_t
vmask_view vd_mask_view = read_vmask<VLEN>(V, VLEN, vd);
auto fn = get_fp_mask_funct<elem_t>(funct6);
uint8_t accrued_flags = 0;
for(unsigned idx = vstart; idx < vl; idx++) {
for(size_t idx = vstart; idx < vl; idx++) {
bool mask_active = vm ? 1 : mask_reg[idx];
if(mask_active) {
vd_mask_view[idx] = fn(rm, accrued_flags, vs2_view[idx], vs1_view[idx]);
@ -1602,7 +1602,7 @@ void mask_fp_vector_vector_op(uint8_t* V, unsigned funct6, uint64_t vl, uint64_t
}
}
softfloat_exceptionFlags = accrued_flags;
for(unsigned idx = vl; idx < VLEN; idx++) {
for(size_t idx = vl; idx < VLEN; idx++) {
vd_mask_view[idx] = vtype.vta() ? vd_mask_view[idx] : vd_mask_view[idx];
}
return;
@ -1616,7 +1616,7 @@ void mask_fp_vector_imm_op(uint8_t* V, unsigned funct6, uint64_t vl, uint64_t vs
vmask_view vd_mask_view = read_vmask<VLEN>(V, VLEN, vd);
auto fn = get_fp_mask_funct<elem_t>(funct6);
uint8_t accrued_flags = 0;
for(unsigned idx = vstart; idx < vl; idx++) {
for(size_t idx = vstart; idx < vl; idx++) {
bool mask_active = vm ? 1 : mask_reg[idx];
if(mask_active) {
vd_mask_view[idx] = fn(rm, accrued_flags, vs2_view[idx], imm);
@ -1625,7 +1625,7 @@ void mask_fp_vector_imm_op(uint8_t* V, unsigned funct6, uint64_t vl, uint64_t vs
}
}
softfloat_exceptionFlags = accrued_flags;
for(unsigned idx = vl; idx < VLEN; idx++) {
for(size_t idx = vl; idx < VLEN; idx++) {
vd_mask_view[idx] = vtype.vta() ? vd_mask_view[idx] : vd_mask_view[idx];
}
return;
@ -1637,11 +1637,11 @@ void mask_mask_op(uint8_t* V, unsigned funct6, unsigned funct3, uint64_t vl, uin
auto vs2_view = read_vmask<VLEN>(V, elem_count, vs2);
auto vd_view = read_vmask<VLEN>(V, elem_count, vd);
auto fn = get_mask_funct<unsigned>(funct6, funct3); // could be bool, but would break the make_signed_t in get_mask_funct
for(unsigned idx = vstart; idx < vl; idx++) {
for(size_t idx = vstart; idx < vl; idx++) {
vd_view[idx] = fn(vs2_view[idx], vs1_view[idx]);
}
// the tail is all elements of the destination register beyond the first one
for(unsigned idx = 1; idx < VLEN; idx++) {
for(size_t idx = 1; idx < VLEN; idx++) {
// always tail agnostic
// this is a nop, placeholder for vta behavior
vd_view[idx] = vd_view[idx];
@ -1653,7 +1653,7 @@ template <unsigned VLEN> uint64_t vcpop(uint8_t* V, uint64_t vl, uint64_t vstart
auto vs2_view = read_vmask<VLEN>(V, elem_count, vs2);
vmask_view mask_reg = read_vmask<VLEN>(V, elem_count);
unsigned running_total = 0;
for(unsigned idx = vstart; idx < vl; idx++) {
for(size_t idx = vstart; idx < vl; idx++) {
bool mask_active = vm ? 1 : mask_reg[idx];
if(mask_active && vs2_view[idx])
running_total += 1;
@ -1664,7 +1664,7 @@ template <unsigned VLEN> uint64_t vfirst(uint8_t* V, uint64_t vl, uint64_t vstar
uint64_t elem_count = VLEN;
auto vs2_view = read_vmask<VLEN>(V, elem_count, vs2);
vmask_view mask_reg = read_vmask<VLEN>(V, elem_count);
for(unsigned idx = vstart; idx < vl; idx++) {
for(size_t idx = vstart; idx < vl; idx++) {
bool mask_active = vm ? 1 : mask_reg[idx];
if(mask_active && vs2_view[idx])
return idx;
@ -1714,14 +1714,14 @@ template <unsigned VLEN> void mask_set_op(uint8_t* V, unsigned enc, uint64_t vl,
vmask_view mask_reg = read_vmask<VLEN>(V, elem_count);
auto fn = get_mask_set_funct(enc);
bool marker = false;
for(unsigned idx = vstart; idx < vl; idx++) {
for(size_t idx = vstart; idx < vl; idx++) {
bool mask_active = vm ? 1 : mask_reg[idx];
if(mask_active) {
vd_view[idx] = fn(marker, vs2_view[idx]);
}
}
// elements w/ index larger than vl are in the tail
for(unsigned idx = vl; idx < VLEN; idx++) {
for(size_t idx = vl; idx < VLEN; idx++) {
// always tail agnostic
// this is a nop, placeholder for vta behavior
vd_view[idx] = vd_view[idx];
@ -1734,7 +1734,7 @@ void viota(uint8_t* V, uint64_t vl, uint64_t vstart, vtype_t vtype, bool vm, uns
auto vd_view = get_vreg<VLEN, src_elem_t>(V, vd, elem_count);
vmask_view mask_reg = read_vmask<VLEN>(V, elem_count);
unsigned current = 0;
for(unsigned idx = vstart; idx < std::min(vl, elem_count); idx++) {
for(size_t idx = vstart; idx < std::min(vl, elem_count); idx++) {
bool mask_active = vm ? 1 : mask_reg[idx];
if(mask_active) {
vd_view[idx] = current;
@ -1748,7 +1748,7 @@ template <unsigned VLEN, typename src_elem_t> void vid(uint8_t* V, uint64_t vl,
uint64_t elem_count = VLEN * vtype.lmul() / vtype.sew();
auto vd_view = get_vreg<VLEN, src_elem_t>(V, vd, elem_count);
vmask_view mask_reg = read_vmask<VLEN>(V, elem_count);
for(unsigned idx = vstart; idx < std::min(vl, elem_count); idx++) {
for(size_t idx = vstart; idx < std::min(vl, elem_count); idx++) {
bool mask_active = vm ? 1 : mask_reg[idx];
if(mask_active) {
vd_view[idx] = idx;
@ -1761,37 +1761,53 @@ template <unsigned VLEN, typename src_elem_t> uint64_t scalar_move(uint8_t* V, v
auto vd_view = get_vreg<VLEN, src_elem_t>(V, vd, vlmax);
if(to_vector) {
vd_view[0] = val;
for(unsigned idx = 1; idx < vlmax; idx++) {
for(size_t idx = 1; idx < vlmax; idx++) {
vd_view[idx] = vtype.vta() ? vd_view[idx] : vd_view[idx];
}
}
return static_cast<int64_t>(static_cast<std::make_signed_t<src_elem_t>>(vd_view[0]));
}
template <unsigned VLEN, typename src_elem_t>
void vector_slide(uint8_t* V, uint64_t vl, uint64_t vstart, vtype_t vtype, bool vm, unsigned vd, unsigned vs2, int64_t imm) {
void vector_slideup(uint8_t* V, uint64_t vl, uint64_t vstart, vtype_t vtype, bool vm, unsigned vd, unsigned vs2, uint64_t imm) {
uint64_t elem_count = VLEN * vtype.lmul() / (sizeof(src_elem_t) * 8);
vmask_view mask_reg = read_vmask<VLEN>(V, elem_count);
auto vs2_view = get_vreg<VLEN, src_elem_t>(V, vs2, elem_count);
auto vd_view = get_vreg<VLEN, src_elem_t>(V, vd, elem_count);
for(unsigned idx = std::max<int64_t>(vstart, imm); idx < vl; idx++) {
for(size_t idx = std::max(vstart, imm); idx < vl; idx++) {
bool mask_active = vm ? 1 : mask_reg[idx];
src_elem_t src_elem = 0;
if(imm >= 0 || (idx - imm < elem_count))
src_elem = vs2_view[idx - imm];
if(mask_active) {
vd_view[idx] = src_elem;
vd_view[idx] = idx - imm < elem_count ? vs2_view[idx - imm] : 0;
} else {
vd_view[idx] = vtype.vma() ? vd_view[idx] : vd_view[idx];
}
}
for(unsigned idx = vl; idx < elem_count; idx++) {
for(size_t idx = vl; idx < elem_count; idx++) {
vd_view[idx] = vtype.vta() ? vd_view[idx] : vd_view[idx];
}
return;
}
template <unsigned VLEN, typename src_elem_t>
void vector_slide1up(uint8_t* V, uint64_t vl, uint64_t vstart, vtype_t vtype, bool vm, unsigned vd, unsigned vs2, int64_t imm) {
vector_slide<VLEN, src_elem_t>(V, vl, vstart, vtype, vm, vd, vs2, 1);
void vector_slidedown(uint8_t* V, uint64_t vl, uint64_t vstart, vtype_t vtype, bool vm, unsigned vd, unsigned vs2, uint64_t imm) {
uint64_t elem_count = VLEN * vtype.lmul() / (sizeof(src_elem_t) * 8);
vmask_view mask_reg = read_vmask<VLEN>(V, elem_count);
auto vs2_view = get_vreg<VLEN, src_elem_t>(V, vs2, elem_count);
auto vd_view = get_vreg<VLEN, src_elem_t>(V, vd, elem_count);
for(size_t idx = vstart; idx < vl; idx++) {
bool mask_active = vm ? 1 : mask_reg[idx];
if(mask_active) {
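// idx + imm can wrap around UINT64_MAX before the range test, so check
// imm against the remaining headroom (max - idx) first; only then is
// idx + imm < elem_count a meaningful in-range comparison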
vd_view[idx] = std::numeric_limits<uint64_t>::max() - idx > imm && idx + imm < elem_count ? vs2_view[idx + imm] : 0;
} else {
vd_view[idx] = vtype.vma() ? vd_view[idx] : vd_view[idx];
}
}
for(size_t idx = vl; idx < elem_count; idx++) {
vd_view[idx] = vtype.vta() ? vd_view[idx] : vd_view[idx];
}
return;
}
template <unsigned VLEN, typename src_elem_t>
void vector_slide1up(uint8_t* V, uint64_t vl, uint64_t vstart, vtype_t vtype, bool vm, unsigned vd, unsigned vs2, uint64_t imm) {
vector_slideup<VLEN, src_elem_t>(V, vl, vstart, vtype, vm, vd, vs2, 1);
vmask_view mask_reg = read_vmask<VLEN>(V, 1);
auto vd_view = get_vreg<VLEN, src_elem_t>(V, vd, 1);
if(vm || mask_reg[0])
@ -1800,8 +1816,8 @@ void vector_slide1up(uint8_t* V, uint64_t vl, uint64_t vstart, vtype_t vtype, bo
vd_view[0] = vtype.vma() ? vd_view[0] : vd_view[0];
}
template <unsigned VLEN, typename src_elem_t>
void vector_slide1down(uint8_t* V, uint64_t vl, uint64_t vstart, vtype_t vtype, bool vm, unsigned vd, unsigned vs2, int64_t imm) {
vector_slide<VLEN, src_elem_t>(V, vl, vstart, vtype, vm, vd, vs2, -1);
void vector_slide1down(uint8_t* V, uint64_t vl, uint64_t vstart, vtype_t vtype, bool vm, unsigned vd, unsigned vs2, uint64_t imm) {
vector_slidedown<VLEN, src_elem_t>(V, vl, vstart, vtype, vm, vd, vs2, 1);
if(vl > 0) {
vmask_view mask_reg = read_vmask<VLEN>(V, vl);
auto vd_view = get_vreg<VLEN, src_elem_t>(V, vd, vl);
@ -1818,7 +1834,7 @@ void vector_vector_gather(uint8_t* V, uint64_t vl, uint64_t vstart, vtype_t vtyp
auto vs1_view = get_vreg<VLEN, scr_elem_t>(V, vs1, vlmax);
auto vs2_view = get_vreg<VLEN, dest_elem_t>(V, vs2, vlmax);
auto vd_view = get_vreg<VLEN, dest_elem_t>(V, vd, vlmax);
for(unsigned idx = vstart; idx < std::min(vlmax, vl); idx++) {
for(size_t idx = vstart; idx < std::min(vlmax, vl); idx++) {
bool mask_active = vm ? 1 : mask_reg[idx];
if(mask_active) {
vd_view[idx] = (vs1_view[idx] >= vlmax) ? 0 : vs2_view[vs1_view[idx]];
@ -1826,7 +1842,7 @@ void vector_vector_gather(uint8_t* V, uint64_t vl, uint64_t vstart, vtype_t vtyp
vd_view[idx] = vtype.vma() ? vd_view[idx] : vd_view[idx];
}
}
for(unsigned idx = vl; idx < vlmax; idx++) {
for(size_t idx = vl; idx < vlmax; idx++) {
vd_view[idx] = vtype.vta() ? vd_view[idx] : vd_view[idx];
}
return;
@ -1837,7 +1853,7 @@ void vector_imm_gather(uint8_t* V, uint64_t vl, uint64_t vstart, vtype_t vtype,
vmask_view mask_reg = read_vmask<VLEN>(V, vlmax);
auto vs2_view = get_vreg<VLEN, scr_elem_t>(V, vs2, vlmax);
auto vd_view = get_vreg<VLEN, scr_elem_t>(V, vd, vlmax);
for(unsigned idx = vstart; idx < std::min(vlmax, vl); idx++) {
for(size_t idx = vstart; idx < std::min(vlmax, vl); idx++) {
bool mask_active = vm ? 1 : mask_reg[idx];
if(mask_active) {
vd_view[idx] = (imm >= vlmax) ? 0 : vs2_view[imm];
@ -1845,7 +1861,7 @@ void vector_imm_gather(uint8_t* V, uint64_t vl, uint64_t vstart, vtype_t vtype,
vd_view[idx] = vtype.vma() ? vd_view[idx] : vd_view[idx];
}
}
for(unsigned idx = vl; idx < vlmax; idx++) {
for(size_t idx = vl; idx < vlmax; idx++) {
vd_view[idx] = vtype.vta() ? vd_view[idx] : vd_view[idx];
}
return;
@ -1857,13 +1873,13 @@ void vector_compress(uint8_t* V, uint64_t vl, uint64_t vstart, vtype_t vtype, un
auto vs2_view = get_vreg<VLEN, scr_elem_t>(V, vs2, vlmax);
auto vd_view = get_vreg<VLEN, scr_elem_t>(V, vd, vlmax);
unsigned current_pos = 0;
for(unsigned idx = vstart; idx < std::min(vlmax, vl); idx++) {
for(size_t idx = vstart; idx < std::min(vlmax, vl); idx++) {
if(mask_reg[idx]) {
vd_view[current_pos] = vs2_view[idx];
current_pos += 1;
}
}
for(unsigned idx = current_pos; idx < vlmax; idx++) {
for(size_t idx = current_pos; idx < vlmax; idx++) {
vd_view[idx] = vtype.vta() ? vd_view[idx] : vd_view[idx];
}
return;
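
Taken together, the new entry points implement plain slide semantics with unsigned offsets. The following is a simplified, self-contained model of those semantics for illustration only (masking, vstart, and tail policy omitted; names and values are hypothetical, not part of the commit):

#include <cstdint>
#include <cstdio>
#include <limits>
#include <vector>

// vslideup-style: vd[idx] = vs2[idx - imm] for idx >= imm; elements below
// imm keep their previous destination value
std::vector<uint8_t> model_slideup(std::vector<uint8_t> vd, const std::vector<uint8_t>& vs2, uint64_t imm) {
    for(size_t idx = imm; idx < vd.size(); idx++)
        vd[idx] = vs2[idx - imm];
    return vd;
}

// vslidedown-style: vd[idx] = vs2[idx + imm] when in range, else 0; the
// headroom test mirrors the guard in the committed vector_slidedown
std::vector<uint8_t> model_slidedown(std::vector<uint8_t> vd, const std::vector<uint8_t>& vs2, uint64_t imm) {
    for(size_t idx = 0; idx < vd.size(); idx++) {
        bool in_range = std::numeric_limits<uint64_t>::max() - idx > imm && idx + imm < vs2.size();
        vd[idx] = in_range ? vs2[idx + imm] : 0;
    }
    return vd;
}

int main() {
    std::vector<uint8_t> vs2{1, 2, 3, 4};
    std::vector<uint8_t> vd{9, 9, 9, 9};
    auto up = model_slideup(vd, vs2, 1);     // {9, 1, 2, 3}
    auto down = model_slidedown(vd, vs2, 1); // {2, 3, 4, 0}
    std::printf("%d %d %d %d / %d %d %d %d\n", up[0], up[1], up[2], up[3], down[0], down[1], down[2], down[3]);
}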