corrects vectorslide, changes all loop index types

parent 42bf6ee380
commit fe9f2a5455
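This commit splits the former signed-offset `vector_slide` helper into dedicated `vector_slideup` and `vector_slidedown` implementations and widens the loop indices throughout from `unsigned` to `size_t`. For orientation, here is a minimal reference sketch of the RVV slide semantics the corrected helpers implement; masking, `vstart`, and tail policy are omitted, and the names `slideup_ref`/`slidedown_ref` are illustrative, not part of the patch:

```cpp
#include <cstdint>
#include <iostream>
#include <vector>

// vslideup.vx:   vd[i] = vs2[i - offset] for offset <= i < vl (lower elements keep vd)
// vslidedown.vx: vd[i] = vs2[i + offset] when i + offset is in range, else 0
std::vector<uint64_t> slideup_ref(std::vector<uint64_t> vd, const std::vector<uint64_t>& vs2,
                                  uint64_t offset, size_t vl) {
    for(size_t i = offset; i < vl; ++i)
        vd[i] = vs2[i - offset]; // i >= offset, so the subtraction cannot wrap
    return vd;
}

std::vector<uint64_t> slidedown_ref(std::vector<uint64_t> vd, const std::vector<uint64_t>& vs2,
                                    uint64_t offset, size_t vl) {
    for(size_t i = 0; i < vl; ++i)
        vd[i] = (offset < vs2.size() - i) ? vs2[i + offset] : 0; // i < vs2.size(), so no overflow
    return vd;
}

int main() {
    std::vector<uint64_t> vs2{10, 11, 12, 13}, vd{0, 0, 0, 0};
    auto up = slideup_ref(vd, vs2, 2, 4);     // {0, 0, 10, 11}
    auto down = slidedown_ref(vd, vs2, 2, 4); // {12, 13, 0, 0}
    std::cout << up[2] << " " << down[0] << "\n"; // 10 12
}
```

The patched code performs the same mapping in place on register views, applying the mask (`vma`) and tail (`vta`) policies per element.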
@@ -629,13 +629,13 @@ if(vector != null) {%>
 void vector_slideup(uint8_t* V, uint64_t vl, uint64_t vstart, softvector::vtype_t vtype, bool vm, unsigned vd, unsigned vs2, uint64_t imm, uint8_t sew_val) {
     switch(sew_val){
     case 0b000:
-        return softvector::vector_slide<${vlen}, uint8_t>(V, vl, vstart, vtype, vm, vd, vs2, imm);
+        return softvector::vector_slideup<${vlen}, uint8_t>(V, vl, vstart, vtype, vm, vd, vs2, imm);
     case 0b001:
-        return softvector::vector_slide<${vlen}, uint16_t>(V, vl, vstart, vtype, vm, vd, vs2, imm);
+        return softvector::vector_slideup<${vlen}, uint16_t>(V, vl, vstart, vtype, vm, vd, vs2, imm);
     case 0b010:
-        return softvector::vector_slide<${vlen}, uint32_t>(V, vl, vstart, vtype, vm, vd, vs2, imm);
+        return softvector::vector_slideup<${vlen}, uint32_t>(V, vl, vstart, vtype, vm, vd, vs2, imm);
     case 0b011:
-        return softvector::vector_slide<${vlen}, uint64_t>(V, vl, vstart, vtype, vm, vd, vs2, imm);
+        return softvector::vector_slideup<${vlen}, uint64_t>(V, vl, vstart, vtype, vm, vd, vs2, imm);
     default:
         throw new std::runtime_error("Unsupported sew bit value");
     }
@@ -643,13 +643,13 @@ if(vector != null) {%>
 void vector_slidedown(uint8_t* V, uint64_t vl, uint64_t vstart, softvector::vtype_t vtype, bool vm, unsigned vd, unsigned vs2, uint64_t imm, uint8_t sew_val) {
     switch(sew_val){
     case 0b000:
-        return softvector::vector_slide<${vlen}, uint8_t>(V, vl, vstart, vtype, vm, vd, vs2, -imm);
+        return softvector::vector_slidedown<${vlen}, uint8_t>(V, vl, vstart, vtype, vm, vd, vs2, imm);
     case 0b001:
-        return softvector::vector_slide<${vlen}, uint16_t>(V, vl, vstart, vtype, vm, vd, vs2, -imm);
+        return softvector::vector_slidedown<${vlen}, uint16_t>(V, vl, vstart, vtype, vm, vd, vs2, imm);
     case 0b010:
-        return softvector::vector_slide<${vlen}, uint32_t>(V, vl, vstart, vtype, vm, vd, vs2, -imm);
+        return softvector::vector_slidedown<${vlen}, uint32_t>(V, vl, vstart, vtype, vm, vd, vs2, imm);
     case 0b011:
-        return softvector::vector_slide<${vlen}, uint64_t>(V, vl, vstart, vtype, vm, vd, vs2, -imm);
+        return softvector::vector_slidedown<${vlen}, uint64_t>(V, vl, vstart, vtype, vm, vd, vs2, imm);
     default:
         throw new std::runtime_error("Unsupported sew bit value");
     }
@@ -124,11 +124,13 @@ void viota(uint8_t* V, uint64_t vl, uint64_t vstart, vtype_t vtype, bool vm, uns
 template <unsigned VLEN, typename src_elem_t> void vid(uint8_t* V, uint64_t vl, uint64_t vstart, vtype_t vtype, bool vm, unsigned vd);
 template <unsigned VLEN, typename src_elem_t> uint64_t scalar_move(uint8_t* V, vtype_t vtype, unsigned vd, uint64_t val, bool to_vector);
 template <unsigned VLEN, typename src_elem_t>
-void vector_slide(uint8_t* V, uint64_t vl, uint64_t vstart, vtype_t vtype, bool vm, unsigned vd, unsigned vs2, int64_t imm);
+void vector_slideup(uint8_t* V, uint64_t vl, uint64_t vstart, vtype_t vtype, bool vm, unsigned vd, unsigned vs2, uint64_t imm);
 template <unsigned VLEN, typename src_elem_t>
-void vector_slide1up(uint8_t* V, uint64_t vl, uint64_t vstart, vtype_t vtype, bool vm, unsigned vd, unsigned vs2, int64_t imm);
+void vector_slidedown(uint8_t* V, uint64_t vl, uint64_t vstart, vtype_t vtype, bool vm, unsigned vd, unsigned vs2, uint64_t imm);
 template <unsigned VLEN, typename src_elem_t>
-void vector_slide1down(uint8_t* V, uint64_t vl, uint64_t vstart, vtype_t vtype, bool vm, unsigned vd, unsigned vs2, int64_t imm);
+void vector_slide1up(uint8_t* V, uint64_t vl, uint64_t vstart, vtype_t vtype, bool vm, unsigned vd, unsigned vs2, uint64_t imm);
+template <unsigned VLEN, typename src_elem_t>
+void vector_slide1down(uint8_t* V, uint64_t vl, uint64_t vstart, vtype_t vtype, bool vm, unsigned vd, unsigned vs2, uint64_t imm);
 template <unsigned VLEN, typename dest_elem_t, typename scr_elem_t = dest_elem_t>
 void vector_vector_gather(uint8_t* V, uint64_t vl, uint64_t vstart, vtype_t vtype, bool vm, unsigned vd, unsigned vs2, unsigned vs1);
 template <unsigned VLEN, typename scr_elem_t>
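In the header above, the single `vector_slide(..., int64_t imm)` declaration, which `vector_slide1up`/`vector_slide1down` drove with positive and negative offsets, becomes four declarations that all take `uint64_t imm`. Encoding the direction in the function name instead of in the sign of the offset keeps the index arithmetic purely unsigned. A standalone sketch of the signed/unsigned mixing this avoids (values are illustrative, not taken from the patch):

```cpp
#include <cstdint>
#include <iostream>

int main() {
    uint64_t idx = 2;
    int64_t imm = -3; // "slide down by 3" encoded as a negative slide-up offset
    // In idx - imm the signed offset is converted to uint64_t (2^64 - 3); the
    // result is the intended 5 only by virtue of modular wraparound:
    uint64_t src = idx - imm;
    std::cout << src << "\n"; // 5
    // With the direction in the function name the offset stays unsigned and the
    // arithmetic says what it means:
    uint64_t down = 3;
    std::cout << idx + down << "\n"; // 5
}
```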
@@ -349,7 +349,7 @@ void vector_vector_op(uint8_t* V, unsigned funct6, unsigned funct3, uint64_t vl,
     // elements w/ index smaller than vstart are in the prestart and get skipped
     // body is from vstart to min(elem_count, vl)
     if(carry == carry_t::NO_CARRY) {
-        for(unsigned idx = vstart; idx < std::min(elem_count, vl); idx++) {
+        for(size_t idx = vstart; idx < std::min(elem_count, vl); idx++) {
            bool mask_active = vm ? 1 : mask_reg[idx];
            if(mask_active) {
                vd_view[idx] = fn(vd_view[idx], vs2_view[idx], vs1_view[idx]);
@@ -358,18 +358,18 @@ void vector_vector_op(uint8_t* V, unsigned funct6, unsigned funct3, uint64_t vl,
            }
        }
    } else if(carry == carry_t::SUB_CARRY) {
-        for(unsigned idx = vstart; idx < std::min(elem_count, vl); idx++) {
+        for(size_t idx = vstart; idx < std::min(elem_count, vl); idx++) {
            vd_view[idx] = fn(vd_view[idx], vs2_view[idx], vs1_view[idx]) - mask_reg[idx];
        }
    } else {
-        for(unsigned idx = vstart; idx < std::min(elem_count, vl); idx++) {
+        for(size_t idx = vstart; idx < std::min(elem_count, vl); idx++) {
            vd_view[idx] = fn(vd_view[idx], vs2_view[idx], vs1_view[idx]) + mask_reg[idx];
        }
    }
    // elements w/ index larger than elem_count are in the tail (fractional LMUL)
    // elements w/ index larger than vl are in the tail
    unsigned maximum_elems = VLEN * vtype.lmul() / (sizeof(dest_elem_t) * 8);
-    for(unsigned idx = std::min(elem_count, vl); idx < maximum_elems; idx++) {
+    for(size_t idx = std::min(elem_count, vl); idx < maximum_elems; idx++) {
        vd_view[idx] = vtype.vta() ? vd_view[idx] : vd_view[idx];
    }
    return;
@@ -385,7 +385,7 @@ void vector_imm_op(uint8_t* V, unsigned funct6, unsigned funct3, uint64_t vl, ui
     // elements w/ index smaller than vstart are in the prestart and get skipped
     // body is from vstart to min(elem_count, vl)
     if(carry == carry_t::NO_CARRY) {
-        for(unsigned idx = vstart; idx < std::min(elem_count, vl); idx++) {
+        for(size_t idx = vstart; idx < std::min(elem_count, vl); idx++) {
            bool mask_active = vm ? 1 : mask_reg[idx];
            if(mask_active) {
                vd_view[idx] = fn(vd_view[idx], vs2_view[idx], imm);
@@ -394,18 +394,18 @@ void vector_imm_op(uint8_t* V, unsigned funct6, unsigned funct3, uint64_t vl, ui
            }
        }
    } else if(carry == carry_t::SUB_CARRY) {
-        for(unsigned idx = vstart; idx < std::min(elem_count, vl); idx++) {
+        for(size_t idx = vstart; idx < std::min(elem_count, vl); idx++) {
            vd_view[idx] = fn(vd_view[idx], vs2_view[idx], imm) - mask_reg[idx];
        }
    } else {
-        for(unsigned idx = vstart; idx < std::min(elem_count, vl); idx++) {
+        for(size_t idx = vstart; idx < std::min(elem_count, vl); idx++) {
            vd_view[idx] = fn(vd_view[idx], vs2_view[idx], imm) + mask_reg[idx];
        }
    }
    // elements w/ index larger than elem_count are in the tail (fractional LMUL)
    // elements w/ index larger than vl are in the tail
    unsigned maximum_elems = VLEN * vtype.lmul() / (sizeof(dest_elem_t) * 8);
-    for(unsigned idx = std::min(elem_count, vl); idx < maximum_elems; idx++) {
+    for(size_t idx = std::min(elem_count, vl); idx < maximum_elems; idx++) {
        vd_view[idx] = vtype.vta() ? vd_view[idx] : vd_view[idx];
    }
    return;
@@ -417,7 +417,7 @@ void vector_vector_merge(uint8_t* V, uint64_t vl, uint64_t vstart, vtype_t vtype
     auto vs1_view = get_vreg<VLEN, scr_elem_t>(V, vs1, elem_count);
     auto vs2_view = get_vreg<VLEN, scr_elem_t>(V, vs2, elem_count);
     auto vd_view = get_vreg<VLEN, scr_elem_t>(V, vd, elem_count);
-    for(unsigned idx = vstart; idx < vl; idx++) {
+    for(size_t idx = vstart; idx < vl; idx++) {
        bool mask_active = vm ? 1 : mask_reg[idx];
        if(mask_active)
            vd_view[idx] = vs1_view[idx];
@@ -431,7 +431,7 @@ void vector_imm_merge(uint8_t* V, uint64_t vl, uint64_t vstart, vtype_t vtype, b
     vmask_view mask_reg = read_vmask<VLEN>(V, elem_count);
     auto vs2_view = get_vreg<VLEN, scr_elem_t>(V, vs2, elem_count);
     auto vd_view = get_vreg<VLEN, scr_elem_t>(V, vd, elem_count);
-    for(unsigned idx = vstart; idx < vl; idx++) {
+    for(size_t idx = vstart; idx < vl; idx++) {
        bool mask_active = vm ? 1 : mask_reg[idx];
        if(mask_active)
            vd_view[idx] = imm;
@@ -503,7 +503,7 @@ void mask_vector_vector_op(uint8_t* V, unsigned funct6, unsigned funct3, uint64_
     auto fn = get_mask_funct<elem_t>(funct6, funct3);
     // elements w/ index smaller than vstart are in the prestart and get skipped
     // body is from vstart to min(elem_count, vl)
-    for(unsigned idx = vstart; idx < std::min(elem_count, vl); idx++) {
+    for(size_t idx = vstart; idx < std::min(elem_count, vl); idx++) {
        bool mask_active = vm ? 1 : mask_reg[idx];
        if(mask_active) {
            vd_mask_view[idx] = fn(vs2_view[idx], vs1_view[idx]);
@@ -513,7 +513,7 @@ void mask_vector_vector_op(uint8_t* V, unsigned funct6, unsigned funct3, uint64_
    }
    // elements w/ index larger than elem_count are in the tail (fractional LMUL)
    // elements w/ index larger than vl are in the tail
-    for(unsigned idx = std::min(elem_count, vl); idx < VLEN; idx++) {
+    for(size_t idx = std::min(elem_count, vl); idx < VLEN; idx++) {
        vd_mask_view[idx] = vtype.vta() ? vd_mask_view[idx] : vd_mask_view[idx];
    }
    return;
@@ -528,7 +528,7 @@ void mask_vector_imm_op(uint8_t* V, unsigned funct6, unsigned funct3, uint64_t v
     auto fn = get_mask_funct<elem_t>(funct6, funct3);
     // elements w/ index smaller than vstart are in the prestart and get skipped
     // body is from vstart to min(elem_count, vl)
-    for(unsigned idx = vstart; idx < std::min(elem_count, vl); idx++) {
+    for(size_t idx = vstart; idx < std::min(elem_count, vl); idx++) {
        bool mask_active = vm ? 1 : mask_reg[idx];
        if(mask_active) {
            vd_mask_view[idx] = fn(vs2_view[idx], imm);
@@ -538,7 +538,7 @@ void mask_vector_imm_op(uint8_t* V, unsigned funct6, unsigned funct3, uint64_t v
    }
    // elements w/ index larger than elem_count are in the tail (fractional LMUL)
    // elements w/ index larger than vl are in the tail
-    for(unsigned idx = std::min(elem_count, vl); idx < VLEN; idx++) {
+    for(size_t idx = std::min(elem_count, vl); idx < VLEN; idx++) {
        vd_mask_view[idx] = vtype.vta() ? vd_mask_view[idx] : vd_mask_view[idx];
    }
    return;
@@ -567,7 +567,7 @@ void vector_unary_op(uint8_t* V, unsigned unary_op, uint64_t vl, uint64_t vstart
     auto fn = get_unary_fn<dest_elem_t, src2_elem_t>(unary_op);
     // elements w/ index smaller than vstart are in the prestart and get skipped
     // body is from vstart to min(elem_count, vl)
-    for(unsigned idx = vstart; idx < std::min(elem_count, vl); idx++) {
+    for(size_t idx = vstart; idx < std::min(elem_count, vl); idx++) {
        bool mask_active = vm ? 1 : mask_reg[idx];
        if(mask_active) {
            vd_view[idx] = fn(vs2_view[idx]);
@@ -578,7 +578,7 @@ void vector_unary_op(uint8_t* V, unsigned unary_op, uint64_t vl, uint64_t vstart
    // elements w/ index larger than elem_count are in the tail (fractional LMUL)
    // elements w/ index larger than vl are in the tail
    unsigned maximum_elems = VLEN * vtype.lmul() / (sizeof(dest_elem_t) * 8);
-    for(unsigned idx = std::min(elem_count, vl); idx < maximum_elems; idx++) {
+    for(size_t idx = std::min(elem_count, vl); idx < maximum_elems; idx++) {
        vd_view[idx] = vtype.vta() ? vd_view[idx] : vd_view[idx];
    }
    return;
@@ -608,13 +608,13 @@ void carry_vector_vector_op(uint8_t* V, unsigned funct, uint64_t vl, uint64_t vs
     auto fn = get_carry_funct<elem_t>(funct);
     // elements w/ index smaller than vstart are in the prestart and get skipped
     // body is from vstart to min(elem_count, vl)
-    for(unsigned idx = vstart; idx < std::min(elem_count, vl); idx++) {
+    for(size_t idx = vstart; idx < std::min(elem_count, vl); idx++) {
        elem_t carry = vm ? 0 : mask_reg[idx];
        vd_mask_view[idx] = fn(vs2_view[idx], vs1_view[idx], carry);
    }
    // elements w/ index larger than elem_count are in the tail (fractional LMUL)
    // elements w/ index larger than vl are in the tail
-    for(unsigned idx = std::min(elem_count, vl); idx < VLEN; idx++) {
+    for(size_t idx = std::min(elem_count, vl); idx < VLEN; idx++) {
        // always tail agnostic
    }
    return;
@@ -629,13 +629,13 @@ void carry_vector_imm_op(uint8_t* V, unsigned funct, uint64_t vl, uint64_t vstar
     auto fn = get_carry_funct<elem_t>(funct);
     // elements w/ index smaller than vstart are in the prestart and get skipped
     // body is from vstart to min(elem_count, vl)
-    for(unsigned idx = vstart; idx < std::min(elem_count, vl); idx++) {
+    for(size_t idx = vstart; idx < std::min(elem_count, vl); idx++) {
        elem_t carry = vm ? 0 : mask_reg[idx];
        vd_mask_view[idx] = fn(vs2_view[idx], imm, carry);
    }
    // elements w/ index larger than elem_count are in the tail (fractional LMUL)
    // elements w/ index larger than vl are in the tail
-    for(unsigned idx = std::min(elem_count, vl); idx < VLEN; idx++) {
+    for(size_t idx = std::min(elem_count, vl); idx < VLEN; idx++) {
        // always tail agnostic
    }
    return;
@@ -814,7 +814,7 @@ bool sat_vector_vector_op(uint8_t* V, unsigned funct6, unsigned funct3, uint64_t
     auto fn = get_sat_funct<dest_elem_t, src2_elem_t, src1_elem_T>(funct6, funct3);
     // elements w/ index smaller than vstart are in the prestart and get skipped
     // body is from vstart to min(elem_count, vl)
-    for(unsigned idx = vstart; idx < std::min(elem_count, vl); idx++) {
+    for(size_t idx = vstart; idx < std::min(elem_count, vl); idx++) {
        bool mask_active = vm ? 1 : mask_reg[idx];
        if(mask_active) {
            saturated |= fn(vxrm, vtype, vd_view[idx], vs2_view[idx], vs1_view[idx]);
@@ -825,7 +825,7 @@ bool sat_vector_vector_op(uint8_t* V, unsigned funct6, unsigned funct3, uint64_t
    // elements w/ index larger than elem_count are in the tail (fractional LMUL)
    // elements w/ index larger than vl are in the tail
    unsigned maximum_elems = VLEN * vtype.lmul() / (sizeof(dest_elem_t) * 8);
-    for(unsigned idx = std::min(elem_count, vl); idx < maximum_elems; idx++) {
+    for(size_t idx = std::min(elem_count, vl); idx < maximum_elems; idx++) {
        vd_view[idx] = vtype.vta() ? vd_view[idx] : vd_view[idx];
    }
    return saturated;
@@ -841,7 +841,7 @@ bool sat_vector_imm_op(uint8_t* V, unsigned funct6, unsigned funct3, uint64_t vl
     auto fn = get_sat_funct<dest_elem_t, src2_elem_t, src1_elem_T>(funct6, funct3);
     // elements w/ index smaller than vstart are in the prestart and get skipped
     // body is from vstart to min(elem_count, vl)
-    for(unsigned idx = vstart; idx < std::min(elem_count, vl); idx++) {
+    for(size_t idx = vstart; idx < std::min(elem_count, vl); idx++) {
        bool mask_active = vm ? 1 : mask_reg[idx];
        if(mask_active) {
            saturated |= fn(vxrm, vtype, vd_view[idx], vs2_view[idx], imm);
@@ -852,7 +852,7 @@ bool sat_vector_imm_op(uint8_t* V, unsigned funct6, unsigned funct3, uint64_t vl
    // elements w/ index larger than elem_count are in the tail (fractional LMUL)
    // elements w/ index larger than vl are in the tail
    unsigned maximum_elems = VLEN * vtype.lmul() / (sizeof(dest_elem_t) * 8);
-    for(unsigned idx = std::min(elem_count, vl); idx < maximum_elems; idx++) {
+    for(size_t idx = std::min(elem_count, vl); idx < maximum_elems; idx++) {
        vd_view[idx] = vtype.vta() ? vd_view[idx] : vd_view[idx];
    }
    return saturated;
@@ -916,7 +916,7 @@ void vector_red_op(uint8_t* V, unsigned funct6, unsigned funct3, uint64_t vl, ui
     auto vd_view = get_vreg<VLEN, dest_elem_t>(V, vd, elem_count);
     auto fn = get_red_funct<dest_elem_t, src_elem_t>(funct6, funct3);
     dest_elem_t& running_total = {vs1_elem};
-    for(unsigned idx = vstart; idx < std::min(elem_count, vl); idx++) {
+    for(size_t idx = vstart; idx < std::min(elem_count, vl); idx++) {
        bool mask_active = vm ? 1 : mask_reg[idx];
        if(mask_active) {
            fn(running_total, vs2_view[idx]);
@@ -924,7 +924,7 @@ void vector_red_op(uint8_t* V, unsigned funct6, unsigned funct3, uint64_t vl, ui
    }
    vd_view[0] = running_total;
    // the tail is all elements of the destination register beyond the first one
-    for(unsigned idx = 1; idx < VLEN / (vtype.sew() * RFS); idx++) {
+    for(size_t idx = 1; idx < VLEN / (vtype.sew() * RFS); idx++) {
        vd_view[idx] = vtype.vta() ? vd_view[idx] : vd_view[idx];
    }
    return;
@@ -1228,7 +1228,7 @@ void fp_vector_vector_op(uint8_t* V, unsigned funct6, unsigned funct3, uint64_t
     uint8_t accrued_flags = 0;
     // elements w/ index smaller than vstart are in the prestart and get skipped
     // body is from vstart to min(elem_count, vl)
-    for(unsigned idx = vstart; idx < std::min(elem_count, vl); idx++) {
+    for(size_t idx = vstart; idx < std::min(elem_count, vl); idx++) {
        bool mask_active = vm ? 1 : mask_reg[idx];
        if(mask_active) {
            vd_view[idx] = fn(rm, accrued_flags, vd_view[idx], vs2_view[idx], vs1_view[idx]);
@@ -1240,7 +1240,7 @@ void fp_vector_vector_op(uint8_t* V, unsigned funct6, unsigned funct3, uint64_t
    // elements w/ index larger than elem_count are in the tail (fractional LMUL)
    // elements w/ index larger than vl are in the tail
    unsigned maximum_elems = VLEN * vtype.lmul() / (sizeof(dest_elem_t) * 8);
-    for(unsigned idx = std::min(elem_count, vl); idx < maximum_elems; idx++) {
+    for(size_t idx = std::min(elem_count, vl); idx < maximum_elems; idx++) {
        vd_view[idx] = vtype.vta() ? vd_view[idx] : vd_view[idx];
    }
    return;
@@ -1256,7 +1256,7 @@ void fp_vector_imm_op(uint8_t* V, unsigned funct6, unsigned funct3, uint64_t vl,
     uint8_t accrued_flags = 0;
     // elements w/ index smaller than vstart are in the prestart and get skipped
     // body is from vstart to min(elem_count, vl)
-    for(unsigned idx = vstart; idx < std::min(elem_count, vl); idx++) {
+    for(size_t idx = vstart; idx < std::min(elem_count, vl); idx++) {
        bool mask_active = vm ? 1 : mask_reg[idx];
        if(mask_active) {
            vd_view[idx] = fn(rm, accrued_flags, vd_view[idx], vs2_view[idx], imm);
@@ -1268,7 +1268,7 @@ void fp_vector_imm_op(uint8_t* V, unsigned funct6, unsigned funct3, uint64_t vl,
    // elements w/ index larger than elem_count are in the tail (fractional LMUL)
    // elements w/ index larger than vl are in the tail
    unsigned maximum_elems = VLEN * vtype.lmul() / (sizeof(dest_elem_t) * 8);
-    for(unsigned idx = std::min(elem_count, vl); idx < maximum_elems; idx++) {
+    for(size_t idx = std::min(elem_count, vl); idx < maximum_elems; idx++) {
        vd_view[idx] = vtype.vta() ? vd_view[idx] : vd_view[idx];
    }
    return;
@@ -1324,7 +1324,7 @@ void fp_vector_red_op(uint8_t* V, unsigned funct6, unsigned funct3, uint64_t vl,
     auto fn = get_fp_red_funct<dest_elem_t, src_elem_t>(funct6, funct3);
     dest_elem_t& running_total = {vs1_elem};
     uint8_t accrued_flags = 0;
-    for(unsigned idx = vstart; idx < std::min(elem_count, vl); idx++) {
+    for(size_t idx = vstart; idx < std::min(elem_count, vl); idx++) {
        bool mask_active = vm ? 1 : mask_reg[idx];
        if(mask_active) {
            fn(rm, accrued_flags, running_total, vs2_view[idx]);
@@ -1333,7 +1333,7 @@ void fp_vector_red_op(uint8_t* V, unsigned funct6, unsigned funct3, uint64_t vl,
    vd_view[0] = running_total;
    softfloat_exceptionFlags = accrued_flags;
    // the tail is all elements of the destination register beyond the first one
-    for(unsigned idx = 1; idx < VLEN / (vtype.sew() * RFS); idx++) {
+    for(size_t idx = 1; idx < VLEN / (vtype.sew() * RFS); idx++) {
        vd_view[idx] = vtype.vta() ? vd_view[idx] : vd_view[idx];
    }
    return;
@@ -1517,7 +1517,7 @@ void fp_vector_unary_op(uint8_t* V, unsigned encoding_space, unsigned unary_op,
     auto vd_view = get_vreg<VLEN, dest_elem_t>(V, vd, elem_count);
     auto fn = get_fp_unary_fn<dest_elem_t, src_elem_t>(encoding_space, unary_op);
     uint8_t accrued_flags = 0;
-    for(unsigned idx = vstart; idx < std::min(elem_count, vl); idx++) {
+    for(size_t idx = vstart; idx < std::min(elem_count, vl); idx++) {
        bool mask_active = vm ? 1 : mask_reg[idx];
        if(mask_active) {
            vd_view[idx] = fn(rm, accrued_flags, vs2_view[idx]);
@@ -1527,7 +1527,7 @@ void fp_vector_unary_op(uint8_t* V, unsigned encoding_space, unsigned unary_op,
    }
    softfloat_exceptionFlags = accrued_flags;
    unsigned maximum_elems = VLEN * vtype.lmul() / (sizeof(dest_elem_t) * 8);
-    for(unsigned idx = std::min(elem_count, vl); idx < maximum_elems; idx++) {
+    for(size_t idx = std::min(elem_count, vl); idx < maximum_elems; idx++) {
        vd_view[idx] = vtype.vta() ? vd_view[idx] : vd_view[idx];
    }
    return;
@@ -1593,7 +1593,7 @@ void mask_fp_vector_vector_op(uint8_t* V, unsigned funct6, uint64_t vl, uint64_t
     vmask_view vd_mask_view = read_vmask<VLEN>(V, VLEN, vd);
     auto fn = get_fp_mask_funct<elem_t>(funct6);
     uint8_t accrued_flags = 0;
-    for(unsigned idx = vstart; idx < vl; idx++) {
+    for(size_t idx = vstart; idx < vl; idx++) {
        bool mask_active = vm ? 1 : mask_reg[idx];
        if(mask_active) {
            vd_mask_view[idx] = fn(rm, accrued_flags, vs2_view[idx], vs1_view[idx]);
@@ -1602,7 +1602,7 @@ void mask_fp_vector_vector_op(uint8_t* V, unsigned funct6, uint64_t vl, uint64_t
        }
    }
    softfloat_exceptionFlags = accrued_flags;
-    for(unsigned idx = vl; idx < VLEN; idx++) {
+    for(size_t idx = vl; idx < VLEN; idx++) {
        vd_mask_view[idx] = vtype.vta() ? vd_mask_view[idx] : vd_mask_view[idx];
    }
    return;
@@ -1616,7 +1616,7 @@ void mask_fp_vector_imm_op(uint8_t* V, unsigned funct6, uint64_t vl, uint64_t vs
     vmask_view vd_mask_view = read_vmask<VLEN>(V, VLEN, vd);
     auto fn = get_fp_mask_funct<elem_t>(funct6);
     uint8_t accrued_flags = 0;
-    for(unsigned idx = vstart; idx < vl; idx++) {
+    for(size_t idx = vstart; idx < vl; idx++) {
        bool mask_active = vm ? 1 : mask_reg[idx];
        if(mask_active) {
            vd_mask_view[idx] = fn(rm, accrued_flags, vs2_view[idx], imm);
@@ -1625,7 +1625,7 @@ void mask_fp_vector_imm_op(uint8_t* V, unsigned funct6, uint64_t vl, uint64_t vs
        }
    }
    softfloat_exceptionFlags = accrued_flags;
-    for(unsigned idx = vl; idx < VLEN; idx++) {
+    for(size_t idx = vl; idx < VLEN; idx++) {
        vd_mask_view[idx] = vtype.vta() ? vd_mask_view[idx] : vd_mask_view[idx];
    }
    return;
@@ -1637,11 +1637,11 @@ void mask_mask_op(uint8_t* V, unsigned funct6, unsigned funct3, uint64_t vl, uin
     auto vs2_view = read_vmask<VLEN>(V, elem_count, vs2);
     auto vd_view = read_vmask<VLEN>(V, elem_count, vd);
     auto fn = get_mask_funct<unsigned>(funct6, funct3); // could be bool, but would break the make_signed_t in get_mask_funct
-    for(unsigned idx = vstart; idx < vl; idx++) {
+    for(size_t idx = vstart; idx < vl; idx++) {
        vd_view[idx] = fn(vs2_view[idx], vs1_view[idx]);
    }
    // the tail is all elements of the destination register beyond the first one
-    for(unsigned idx = 1; idx < VLEN; idx++) {
+    for(size_t idx = 1; idx < VLEN; idx++) {
        // always tail agnostic
        // this is a nop, placeholder for vta behavior
        vd_view[idx] = vd_view[idx];
@@ -1653,7 +1653,7 @@ template <unsigned VLEN> uint64_t vcpop(uint8_t* V, uint64_t vl, uint64_t vstart
     auto vs2_view = read_vmask<VLEN>(V, elem_count, vs2);
     vmask_view mask_reg = read_vmask<VLEN>(V, elem_count);
     unsigned running_total = 0;
-    for(unsigned idx = vstart; idx < vl; idx++) {
+    for(size_t idx = vstart; idx < vl; idx++) {
        bool mask_active = vm ? 1 : mask_reg[idx];
        if(mask_active && vs2_view[idx])
            running_total += 1;
@@ -1664,7 +1664,7 @@ template <unsigned VLEN> uint64_t vfirst(uint8_t* V, uint64_t vl, uint64_t vstar
     uint64_t elem_count = VLEN;
     auto vs2_view = read_vmask<VLEN>(V, elem_count, vs2);
     vmask_view mask_reg = read_vmask<VLEN>(V, elem_count);
-    for(unsigned idx = vstart; idx < vl; idx++) {
+    for(size_t idx = vstart; idx < vl; idx++) {
        bool mask_active = vm ? 1 : mask_reg[idx];
        if(mask_active && vs2_view[idx])
            return idx;
@@ -1714,14 +1714,14 @@ template <unsigned VLEN> void mask_set_op(uint8_t* V, unsigned enc, uint64_t vl,
     vmask_view mask_reg = read_vmask<VLEN>(V, elem_count);
     auto fn = get_mask_set_funct(enc);
     bool marker = false;
-    for(unsigned idx = vstart; idx < vl; idx++) {
+    for(size_t idx = vstart; idx < vl; idx++) {
        bool mask_active = vm ? 1 : mask_reg[idx];
        if(mask_active) {
            vd_view[idx] = fn(marker, vs2_view[idx]);
        }
    }
    // the tail is all elements of the destination register beyond the first one
-    for(unsigned idx = vl; idx < VLEN; idx++) {
+    for(size_t idx = vl; idx < VLEN; idx++) {
        // always tail agnostic
        // this is a nop, placeholder for vta behavior
        vd_view[idx] = vd_view[idx];
@@ -1734,7 +1734,7 @@ void viota(uint8_t* V, uint64_t vl, uint64_t vstart, vtype_t vtype, bool vm, uns
     auto vd_view = get_vreg<VLEN, src_elem_t>(V, vd, elem_count);
     vmask_view mask_reg = read_vmask<VLEN>(V, elem_count);
     unsigned current = 0;
-    for(unsigned idx = vstart; idx < std::min(vl, elem_count); idx++) {
+    for(size_t idx = vstart; idx < std::min(vl, elem_count); idx++) {
        bool mask_active = vm ? 1 : mask_reg[idx];
        if(mask_active) {
            vd_view[idx] = current;
@@ -1748,7 +1748,7 @@ template <unsigned VLEN, typename src_elem_t> void vid(uint8_t* V, uint64_t vl,
     uint64_t elem_count = VLEN * vtype.lmul() / vtype.sew();
     auto vd_view = get_vreg<VLEN, src_elem_t>(V, vd, elem_count);
     vmask_view mask_reg = read_vmask<VLEN>(V, elem_count);
-    for(unsigned idx = vstart; idx < std::min(vl, elem_count); idx++) {
+    for(size_t idx = vstart; idx < std::min(vl, elem_count); idx++) {
        bool mask_active = vm ? 1 : mask_reg[idx];
        if(mask_active) {
            vd_view[idx] = idx;
@@ -1761,37 +1761,53 @@ template <unsigned VLEN, typename src_elem_t> uint64_t scalar_move(uint8_t* V, v
     auto vd_view = get_vreg<VLEN, src_elem_t>(V, vd, vlmax);
     if(to_vector) {
        vd_view[0] = val;
-        for(unsigned idx = 1; idx < vlmax; idx++) {
+        for(size_t idx = 1; idx < vlmax; idx++) {
            vd_view[idx] = vtype.vta() ? vd_view[idx] : vd_view[idx];
        }
    }
    return static_cast<int64_t>(static_cast<std::make_signed_t<src_elem_t>>(vd_view[0]));
 }
 template <unsigned VLEN, typename src_elem_t>
-void vector_slide(uint8_t* V, uint64_t vl, uint64_t vstart, vtype_t vtype, bool vm, unsigned vd, unsigned vs2, int64_t imm) {
+void vector_slideup(uint8_t* V, uint64_t vl, uint64_t vstart, vtype_t vtype, bool vm, unsigned vd, unsigned vs2, uint64_t imm) {
    uint64_t elem_count = VLEN * vtype.lmul() / (sizeof(src_elem_t) * 8);
    vmask_view mask_reg = read_vmask<VLEN>(V, elem_count);
    auto vs2_view = get_vreg<VLEN, src_elem_t>(V, vs2, elem_count);
    auto vd_view = get_vreg<VLEN, src_elem_t>(V, vd, elem_count);
-    for(unsigned idx = std::max<int64_t>(vstart, imm); idx < vl; idx++) {
+    for(size_t idx = std::max(vstart, imm); idx < vl; idx++) {
        bool mask_active = vm ? 1 : mask_reg[idx];
-        src_elem_t src_elem = 0;
-        if(imm >= 0 || (idx - imm < elem_count))
-            src_elem = vs2_view[idx - imm];
        if(mask_active) {
-            vd_view[idx] = src_elem;
+            vd_view[idx] = idx - imm < elem_count ? vs2_view[idx - imm] : 0;
        } else {
            vd_view[idx] = vtype.vma() ? vd_view[idx] : vd_view[idx];
        }
    }
-    for(unsigned idx = vl; idx < elem_count; idx++) {
+    for(size_t idx = vl; idx < elem_count; idx++) {
        vd_view[idx] = vtype.vta() ? vd_view[idx] : vd_view[idx];
    }
    return;
 }
 template <unsigned VLEN, typename src_elem_t>
-void vector_slide1up(uint8_t* V, uint64_t vl, uint64_t vstart, vtype_t vtype, bool vm, unsigned vd, unsigned vs2, int64_t imm) {
-    vector_slide<VLEN, src_elem_t>(V, vl, vstart, vtype, vm, vd, vs2, 1);
+void vector_slidedown(uint8_t* V, uint64_t vl, uint64_t vstart, vtype_t vtype, bool vm, unsigned vd, unsigned vs2, uint64_t imm) {
+    uint64_t elem_count = VLEN * vtype.lmul() / (sizeof(src_elem_t) * 8);
+    vmask_view mask_reg = read_vmask<VLEN>(V, elem_count);
+    auto vs2_view = get_vreg<VLEN, src_elem_t>(V, vs2, elem_count);
+    auto vd_view = get_vreg<VLEN, src_elem_t>(V, vd, elem_count);
+    for(size_t idx = vstart; idx < vl; idx++) {
+        bool mask_active = vm ? 1 : mask_reg[idx];
+        if(mask_active) {
+            vd_view[idx] = std::numeric_limits<uint64_t>::max() - idx > imm && idx + imm < elem_count ? vs2_view[idx + imm] : 0;
+        } else {
+            vd_view[idx] = vtype.vma() ? vd_view[idx] : vd_view[idx];
+        }
+    }
+    for(size_t idx = vl; idx < elem_count; idx++) {
+        vd_view[idx] = vtype.vta() ? vd_view[idx] : vd_view[idx];
+    }
+    return;
+}
+template <unsigned VLEN, typename src_elem_t>
+void vector_slide1up(uint8_t* V, uint64_t vl, uint64_t vstart, vtype_t vtype, bool vm, unsigned vd, unsigned vs2, uint64_t imm) {
+    vector_slideup<VLEN, src_elem_t>(V, vl, vstart, vtype, vm, vd, vs2, 1);
    vmask_view mask_reg = read_vmask<VLEN>(V, 1);
    auto vd_view = get_vreg<VLEN, src_elem_t>(V, vd, 1);
    if(vm || mask_reg[0])
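Note the bounds test in the new `vector_slidedown` above: before evaluating `idx + imm < elem_count` it checks `std::numeric_limits<uint64_t>::max() - idx > imm`. Because `imm` may be any `uint64_t` (vslidedown.vx can pass a huge offset in rs1), `idx + imm` could wrap around and spuriously pass the range check; the guard rejects exactly that case. A small demonstration with contrived values:

```cpp
#include <cstdint>
#include <iostream>
#include <limits>

int main() {
    uint64_t elem_count = 16;
    uint64_t idx = 2;
    uint64_t imm = std::numeric_limits<uint64_t>::max() - 1; // enormous slide offset
    // Naive check: idx + imm wraps to 0, which would falsely select element 0.
    std::cout << "naive:   " << (idx + imm < elem_count) << "\n"; // 1 (wrongly in bounds)
    // Guarded check as in the patch: rule out wraparound first.
    bool ok = std::numeric_limits<uint64_t>::max() - idx > imm && idx + imm < elem_count;
    std::cout << "guarded: " << ok << "\n"; // 0 (correctly rejected)
}
```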
@@ -1800,8 +1816,8 @@ void vector_slide1up(uint8_t* V, uint64_t vl, uint64_t vstart, vtype_t vtype, bo
        vd_view[0] = vtype.vma() ? vd_view[0] : vd_view[0];
 }
 template <unsigned VLEN, typename src_elem_t>
-void vector_slide1down(uint8_t* V, uint64_t vl, uint64_t vstart, vtype_t vtype, bool vm, unsigned vd, unsigned vs2, int64_t imm) {
-    vector_slide<VLEN, src_elem_t>(V, vl, vstart, vtype, vm, vd, vs2, -1);
+void vector_slide1down(uint8_t* V, uint64_t vl, uint64_t vstart, vtype_t vtype, bool vm, unsigned vd, unsigned vs2, uint64_t imm) {
+    vector_slidedown<VLEN, src_elem_t>(V, vl, vstart, vtype, vm, vd, vs2, 1);
    if(vl > 0) {
        vmask_view mask_reg = read_vmask<VLEN>(V, vl);
        auto vd_view = get_vreg<VLEN, src_elem_t>(V, vd, vl);
@@ -1818,7 +1834,7 @@ void vector_vector_gather(uint8_t* V, uint64_t vl, uint64_t vstart, vtype_t vtyp
     auto vs1_view = get_vreg<VLEN, scr_elem_t>(V, vs1, vlmax);
     auto vs2_view = get_vreg<VLEN, dest_elem_t>(V, vs2, vlmax);
     auto vd_view = get_vreg<VLEN, dest_elem_t>(V, vd, vlmax);
-    for(unsigned idx = vstart; idx < std::min(vlmax, vl); idx++) {
+    for(size_t idx = vstart; idx < std::min(vlmax, vl); idx++) {
        bool mask_active = vm ? 1 : mask_reg[idx];
        if(mask_active) {
            vd_view[idx] = (vs1_view[idx] >= vlmax) ? 0 : vs2_view[vs1_view[idx]];
@@ -1826,7 +1842,7 @@ void vector_vector_gather(uint8_t* V, uint64_t vl, uint64_t vstart, vtype_t vtyp
            vd_view[idx] = vtype.vma() ? vd_view[idx] : vd_view[idx];
        }
    }
-    for(unsigned idx = vl; idx < vlmax; idx++) {
+    for(size_t idx = vl; idx < vlmax; idx++) {
        vd_view[idx] = vtype.vta() ? vd_view[idx] : vd_view[idx];
    }
    return;
@@ -1837,7 +1853,7 @@ void vector_imm_gather(uint8_t* V, uint64_t vl, uint64_t vstart, vtype_t vtype,
     vmask_view mask_reg = read_vmask<VLEN>(V, vlmax);
     auto vs2_view = get_vreg<VLEN, scr_elem_t>(V, vs2, vlmax);
     auto vd_view = get_vreg<VLEN, scr_elem_t>(V, vd, vlmax);
-    for(unsigned idx = vstart; idx < std::min(vlmax, vl); idx++) {
+    for(size_t idx = vstart; idx < std::min(vlmax, vl); idx++) {
        bool mask_active = vm ? 1 : mask_reg[idx];
        if(mask_active) {
            vd_view[idx] = (imm >= vlmax) ? 0 : vs2_view[imm];
@@ -1845,7 +1861,7 @@ void vector_imm_gather(uint8_t* V, uint64_t vl, uint64_t vstart, vtype_t vtype,
            vd_view[idx] = vtype.vma() ? vd_view[idx] : vd_view[idx];
        }
    }
-    for(unsigned idx = vl; idx < vlmax; idx++) {
+    for(size_t idx = vl; idx < vlmax; idx++) {
        vd_view[idx] = vtype.vta() ? vd_view[idx] : vd_view[idx];
    }
    return;
@@ -1857,13 +1873,13 @@ void vector_compress(uint8_t* V, uint64_t vl, uint64_t vstart, vtype_t vtype, un
     auto vs2_view = get_vreg<VLEN, scr_elem_t>(V, vs2, vlmax);
     auto vd_view = get_vreg<VLEN, scr_elem_t>(V, vd, vlmax);
     unsigned current_pos = 0;
-    for(unsigned idx = vstart; idx < std::min(vlmax, vl); idx++) {
+    for(size_t idx = vstart; idx < std::min(vlmax, vl); idx++) {
        if(mask_reg[idx]) {
            vd_view[current_pos] = vs2_view[idx];
            current_pos += 1;
        }
    }
-    for(unsigned idx = current_pos; idx < vlmax; idx++) {
+    for(size_t idx = current_pos; idx < vlmax; idx++) {
        vd_view[idx] = vtype.vta() ? vd_view[idx] : vd_view[idx];
    }
    return;
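The blanket `unsigned` to `size_t` change gives every loop index the same width (64 bits on LP64 targets) as the `uint64_t` bounds (`vl`, `elem_count`, `vlmax`) it is compared against. With a 32-bit `unsigned` index, a bound above `UINT32_MAX` would make the loop non-terminating, since the index wraps before ever reaching it. Realistic `vl` values sit far below that for any plausible VLEN, so this reads as a consistency and robustness fix rather than a live bug; a sketch of the hazard the wider type rules out:

```cpp
#include <cstdint>

// Hypothetical illustration, not code from the patch.
void never_terminates(uint64_t vl) { // imagine vl = 0x100000000
    for(unsigned idx = 0; idx < vl; idx++) {
        // idx wraps from 0xFFFFFFFF back to 0 and never reaches vl
    }
}

void terminates(uint64_t vl) {
    for(size_t idx = 0; idx < vl; idx++) {
        // 64-bit index on LP64: the comparison is exact and the loop ends
    }
}

int main() { terminates(42); }
```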