mirror of
https://github.com/riscv-software-src/opensbi.git
synced 2026-06-17 16:41:19 +01:00
lib: sbi: Rework misaligned vector load/store
Fix the following issues with misaligned vector load/store:

a. Stack overflow: the mask[VLEN_MAX / 8] variable consumes 8K of stack space, given VLEN_MAX=65536, overflowing the default-sized stack. There is no need to fetch the whole mask in one go; instead, fetch it on demand. Use a 128-byte local buffer to hold a sliding window of the mask. For RVV loads, this is allowed. From the spec: "The destination vector register group for a masked vector instruction cannot overlap the source mask register (v0), unless the destination vector register is being written with a mask value (e.g., compares) or the scalar result of a reduction". Hence we don't need to worry about the mask getting overwritten.

b. Maintain the value of vstart upon abort (uptrap) to avoid duplicate work. After fault resolution, the instruction can restart from the faulting vstart. For Fault-Only-First loads, reset vstart to 0, as was done previously, to conform to the spec.

c. Explicitly set VS dirty in VSSTATUS with SET_VS_DIRTY() if faulting from V=1, and if any vector register, including vstart/vl/vtype, gets changed in the handler. This can add one unnecessary operation to set VS dirty in M/SSTATUS (not VSSTATUS), where the hardware has already done so, but do it anyway for code simplicity. The overhead should be negligible.

Signed-off-by: Bo Gan <ganboing@gmail.com>
Tested-by: Anirudh Srinivasan <asrinivasan@oss.tenstorrent.com>
Reviewed-by: Anup Patel <anup@brainfault.org>
Link: https://lore.kernel.org/r/20260609060024.706-5-ganboing@gmail.com
Signed-off-by: Anup Patel <anup@brainfault.org>
This commit is contained in:
@@ -20,6 +20,12 @@ struct sbi_vector_context {
|
|||||||
uint8_t vregs[];
|
uint8_t vregs[];
|
||||||
};
|
};
|
||||||
|
|
||||||
|
/*
 * Mark the vector state (VS) as dirty after a trap handler has changed
 * any vector register or vector CSR (including vstart/vl/vtype).
 *
 * @regs: pointer to the trapped register context. The MSTATUS_VS bits are
 *        OR-ed into its saved mstatus field.
 *
 * If sbi_regs_from_virt(regs) is true (the trap came from V=1), the
 * MSTATUS_VS bits are additionally set in CSR_VSSTATUS.
 *
 * Note: the argument is evaluated more than once, so it must not have
 * side effects. It is parenthesized where dereferenced so that expressions
 * such as `&ctx` expand correctly.
 */
#define SET_VS_DIRTY(regs) do {					\
	if (sbi_regs_from_virt(regs))				\
		csr_set(CSR_VSSTATUS, MSTATUS_VS);		\
	(regs)->mstatus |= MSTATUS_VS;				\
} while (0)
|
||||||
|
|
||||||
#ifdef OPENSBI_CC_SUPPORT_VECTOR
|
#ifdef OPENSBI_CC_SUPPORT_VECTOR
|
||||||
void sbi_vector_save(struct sbi_vector_context *dst);
|
void sbi_vector_save(struct sbi_vector_context *dst);
|
||||||
void sbi_vector_restore(const struct sbi_vector_context *src);
|
void sbi_vector_restore(const struct sbi_vector_context *src);
|
||||||
|
|||||||
+58
-25
@@ -16,11 +16,11 @@
|
|||||||
#include <sbi/sbi_trap_ldst.h>
|
#include <sbi/sbi_trap_ldst.h>
|
||||||
#include <sbi/sbi_trap.h>
|
#include <sbi/sbi_trap.h>
|
||||||
#include <sbi/sbi_unpriv.h>
|
#include <sbi/sbi_unpriv.h>
|
||||||
#include <sbi/sbi_trap.h>
|
#include <sbi/sbi_vector.h>
|
||||||
|
|
||||||
#ifdef OPENSBI_CC_SUPPORT_VECTOR
|
#ifdef OPENSBI_CC_SUPPORT_VECTOR
|
||||||
|
|
||||||
#define VLEN_MAX 65536
|
#define MASK_BUFFLEN 1024
|
||||||
|
|
||||||
static inline void set_vreg(ulong vlenb, ulong which,
|
static inline void set_vreg(ulong vlenb, ulong which,
|
||||||
ulong pos, ulong size, const uint8_t *bytes)
|
ulong pos, ulong size, const uint8_t *bytes)
|
||||||
@@ -168,7 +168,7 @@ int sbi_misaligned_v_ld_emulator(ulong insn, struct sbi_trap_context *tcntx)
|
|||||||
ulong vl = csr_read(CSR_VL);
|
ulong vl = csr_read(CSR_VL);
|
||||||
ulong vtype = csr_read(CSR_VTYPE);
|
ulong vtype = csr_read(CSR_VTYPE);
|
||||||
ulong vlenb = csr_read(CSR_VLENB);
|
ulong vlenb = csr_read(CSR_VLENB);
|
||||||
ulong vstart = csr_read(CSR_VSTART);
|
ulong vstart = csr_read(CSR_VSTART), orig_vstart = vstart;
|
||||||
ulong base = GET_RS1(insn, regs);
|
ulong base = GET_RS1(insn, regs);
|
||||||
ulong stride = GET_RS2(insn, regs);
|
ulong stride = GET_RS2(insn, regs);
|
||||||
ulong vd = GET_VD(insn);
|
ulong vd = GET_VD(insn);
|
||||||
@@ -178,8 +178,9 @@ int sbi_misaligned_v_ld_emulator(ulong insn, struct sbi_trap_context *tcntx)
|
|||||||
ulong vlmul = GET_VLMUL(vtype);
|
ulong vlmul = GET_VLMUL(vtype);
|
||||||
bool illegal = GET_MEW(insn);
|
bool illegal = GET_MEW(insn);
|
||||||
bool masked = IS_MASKED(insn);
|
bool masked = IS_MASKED(insn);
|
||||||
uint8_t mask[VLEN_MAX / 8];
|
uint8_t mask[MASK_BUFFLEN / 8];
|
||||||
uint8_t bytes[8 * sizeof(uint64_t)];
|
uint8_t bytes[8 * sizeof(uint64_t)];
|
||||||
|
ulong mask_len = MASK_BUFFLEN < vlenb * 8 ? MASK_BUFFLEN : vlenb * 8;
|
||||||
ulong len = GET_LEN(view);
|
ulong len = GET_LEN(view);
|
||||||
ulong nf = GET_NF(insn);
|
ulong nf = GET_NF(insn);
|
||||||
ulong vemul = GET_VEMUL(vlmul, view, vsew);
|
ulong vemul = GET_VEMUL(vlmul, view, vsew);
|
||||||
@@ -200,7 +201,7 @@ int sbi_misaligned_v_ld_emulator(ulong insn, struct sbi_trap_context *tcntx)
|
|||||||
stride = nf * len;
|
stride = nf * len;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (illegal || vlenb > VLEN_MAX / 8) {
|
if (illegal) {
|
||||||
struct sbi_trap_info trap = {
|
struct sbi_trap_info trap = {
|
||||||
uptrap.cause = CAUSE_ILLEGAL_INSTRUCTION,
|
uptrap.cause = CAUSE_ILLEGAL_INSTRUCTION,
|
||||||
uptrap.tval = insn,
|
uptrap.tval = insn,
|
||||||
@@ -208,12 +209,16 @@ int sbi_misaligned_v_ld_emulator(ulong insn, struct sbi_trap_context *tcntx)
|
|||||||
return sbi_trap_redirect(regs, &trap);
|
return sbi_trap_redirect(regs, &trap);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (masked)
|
|
||||||
get_vreg(vlenb, 0, 0, vlenb, mask);
|
|
||||||
|
|
||||||
do {
|
do {
|
||||||
if (masked && (~mask[vstart / 8] & BIT(vstart % 8)))
|
if (masked) {
|
||||||
|
if (vstart == orig_vstart || vstart % mask_len == 0)
|
||||||
|
/* Fetch a mask_len chunk of mask */
|
||||||
|
get_vreg(vlenb, 0, vstart / mask_len * mask_len,
|
||||||
|
mask_len, mask);
|
||||||
|
|
||||||
|
if (~mask[vstart % mask_len / 8] & BIT(vstart % 8))
|
||||||
continue;
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
/* compute element address */
|
/* compute element address */
|
||||||
ulong addr = base + vstart * stride;
|
ulong addr = base + vstart * stride;
|
||||||
@@ -232,16 +237,22 @@ int sbi_misaligned_v_ld_emulator(ulong insn, struct sbi_trap_context *tcntx)
|
|||||||
sbi_load_loop(bytes + seg * len,
|
sbi_load_loop(bytes + seg * len,
|
||||||
addr + seg * len, len, &uptrap);
|
addr + seg * len, len, &uptrap);
|
||||||
|
|
||||||
if (uptrap.cause) {
|
if (!uptrap.cause)
|
||||||
|
continue;
|
||||||
|
|
||||||
if (IS_FAULT_ONLY_FIRST_LOAD(insn) && vstart != 0) {
|
if (IS_FAULT_ONLY_FIRST_LOAD(insn) && vstart != 0) {
|
||||||
vl = vstart;
|
vl = vstart;
|
||||||
break;
|
goto done;
|
||||||
}
|
}
|
||||||
|
|
||||||
vsetvl(vl, vtype);
|
vsetvl(vl, vtype);
|
||||||
|
csr_write(CSR_VSTART, vstart);
|
||||||
|
/* Don't forget to set dirty if vstart has changed */
|
||||||
|
if (vstart != orig_vstart)
|
||||||
|
SET_VS_DIRTY(regs);
|
||||||
sbi_misaligned_v_tinst_fixup(&uptrap);
|
sbi_misaligned_v_tinst_fixup(&uptrap);
|
||||||
return sbi_trap_redirect(regs, &uptrap);
|
return sbi_trap_redirect(regs, &uptrap);
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
/* write load data to regfile */
|
/* write load data to regfile */
|
||||||
for (ulong seg = 0; seg < nf; seg++)
|
for (ulong seg = 0; seg < nf; seg++)
|
||||||
@@ -249,8 +260,15 @@ int sbi_misaligned_v_ld_emulator(ulong insn, struct sbi_trap_context *tcntx)
|
|||||||
len, &bytes[seg * len]);
|
len, &bytes[seg * len]);
|
||||||
} while (++vstart < vl);
|
} while (++vstart < vl);
|
||||||
|
|
||||||
|
done:
|
||||||
/* restore clobbered vl/vtype */
|
/* restore clobbered vl/vtype */
|
||||||
vsetvl(vl, vtype);
|
vsetvl(vl, vtype); // VSTART resets to 0
|
||||||
|
|
||||||
|
/*
|
||||||
|
* At least 1 element is processed, or vl is changed above in
|
||||||
|
* the FAULT_ONLY_FIRST_LOAD path, thus set dirty.
|
||||||
|
*/
|
||||||
|
SET_VS_DIRTY(regs);
|
||||||
|
|
||||||
/* Return a >0 value for the caller to advance mepc */
|
/* Return a >0 value for the caller to advance mepc */
|
||||||
return 1;
|
return 1;
|
||||||
@@ -263,7 +281,7 @@ int sbi_misaligned_v_st_emulator(ulong insn, struct sbi_trap_context *tcntx)
|
|||||||
ulong vl = csr_read(CSR_VL);
|
ulong vl = csr_read(CSR_VL);
|
||||||
ulong vtype = csr_read(CSR_VTYPE);
|
ulong vtype = csr_read(CSR_VTYPE);
|
||||||
ulong vlenb = csr_read(CSR_VLENB);
|
ulong vlenb = csr_read(CSR_VLENB);
|
||||||
ulong vstart = csr_read(CSR_VSTART);
|
ulong vstart = csr_read(CSR_VSTART), orig_vstart = vstart;
|
||||||
ulong base = GET_RS1(insn, regs);
|
ulong base = GET_RS1(insn, regs);
|
||||||
ulong stride = GET_RS2(insn, regs);
|
ulong stride = GET_RS2(insn, regs);
|
||||||
ulong vd = GET_VD(insn);
|
ulong vd = GET_VD(insn);
|
||||||
@@ -273,8 +291,9 @@ int sbi_misaligned_v_st_emulator(ulong insn, struct sbi_trap_context *tcntx)
|
|||||||
ulong vlmul = GET_VLMUL(vtype);
|
ulong vlmul = GET_VLMUL(vtype);
|
||||||
bool illegal = GET_MEW(insn);
|
bool illegal = GET_MEW(insn);
|
||||||
bool masked = IS_MASKED(insn);
|
bool masked = IS_MASKED(insn);
|
||||||
uint8_t mask[VLEN_MAX / 8];
|
uint8_t mask[MASK_BUFFLEN / 8];
|
||||||
uint8_t bytes[8 * sizeof(uint64_t)];
|
uint8_t bytes[8 * sizeof(uint64_t)];
|
||||||
|
ulong mask_len = MASK_BUFFLEN < vlenb * 8 ? MASK_BUFFLEN : vlenb * 8;
|
||||||
ulong len = GET_LEN(view);
|
ulong len = GET_LEN(view);
|
||||||
ulong nf = GET_NF(insn);
|
ulong nf = GET_NF(insn);
|
||||||
ulong vemul = GET_VEMUL(vlmul, view, vsew);
|
ulong vemul = GET_VEMUL(vlmul, view, vsew);
|
||||||
@@ -295,7 +314,7 @@ int sbi_misaligned_v_st_emulator(ulong insn, struct sbi_trap_context *tcntx)
|
|||||||
stride = nf * len;
|
stride = nf * len;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (illegal || vlenb > VLEN_MAX / 8) {
|
if (illegal) {
|
||||||
struct sbi_trap_info trap = {
|
struct sbi_trap_info trap = {
|
||||||
uptrap.cause = CAUSE_ILLEGAL_INSTRUCTION,
|
uptrap.cause = CAUSE_ILLEGAL_INSTRUCTION,
|
||||||
uptrap.tval = insn,
|
uptrap.tval = insn,
|
||||||
@@ -303,12 +322,16 @@ int sbi_misaligned_v_st_emulator(ulong insn, struct sbi_trap_context *tcntx)
|
|||||||
return sbi_trap_redirect(regs, &trap);
|
return sbi_trap_redirect(regs, &trap);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (masked)
|
|
||||||
get_vreg(vlenb, 0, 0, vlenb, mask);
|
|
||||||
|
|
||||||
do {
|
do {
|
||||||
if (masked && (~mask[vstart / 8] & BIT(vstart % 8)))
|
if (masked) {
|
||||||
|
if (vstart == orig_vstart || vstart % mask_len == 0)
|
||||||
|
/* Fetch a mask_len chunk of mask */
|
||||||
|
get_vreg(vlenb, 0, vstart / mask_len * mask_len,
|
||||||
|
mask_len, mask);
|
||||||
|
|
||||||
|
if (~mask[vstart % mask_len / 8] & BIT(vstart % 8))
|
||||||
continue;
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
/* compute element address */
|
/* compute element address */
|
||||||
ulong addr = base + vstart * stride;
|
ulong addr = base + vstart * stride;
|
||||||
@@ -325,23 +348,33 @@ int sbi_misaligned_v_st_emulator(ulong insn, struct sbi_trap_context *tcntx)
|
|||||||
get_vreg(vlenb, vd + seg * emul, vstart * len,
|
get_vreg(vlenb, vd + seg * emul, vstart * len,
|
||||||
len, &bytes[seg * len]);
|
len, &bytes[seg * len]);
|
||||||
|
|
||||||
csr_write(CSR_VSTART, vstart);
|
|
||||||
|
|
||||||
/* write store data to memory */
|
/* write store data to memory */
|
||||||
for (ulong seg = 0; seg < nf; seg++) {
|
for (ulong seg = 0; seg < nf; seg++) {
|
||||||
sbi_store_loop(bytes + seg * len,
|
sbi_store_loop(bytes + seg * len,
|
||||||
addr + seg * len, len, &uptrap);
|
addr + seg * len, len, &uptrap);
|
||||||
|
|
||||||
if (uptrap.cause) {
|
if (!uptrap.cause)
|
||||||
|
continue;
|
||||||
|
|
||||||
vsetvl(vl, vtype);
|
vsetvl(vl, vtype);
|
||||||
|
csr_write(CSR_VSTART, vstart);
|
||||||
|
/* Don't forget to set dirty if vstart has changed */
|
||||||
|
if (vstart != orig_vstart)
|
||||||
|
SET_VS_DIRTY(regs);
|
||||||
sbi_misaligned_v_tinst_fixup(&uptrap);
|
sbi_misaligned_v_tinst_fixup(&uptrap);
|
||||||
return sbi_trap_redirect(regs, &uptrap);
|
return sbi_trap_redirect(regs, &uptrap);
|
||||||
}
|
}
|
||||||
}
|
|
||||||
} while (++vstart < vl);
|
} while (++vstart < vl);
|
||||||
|
|
||||||
/* restore clobbered vl/vtype */
|
/* restore clobbered vl/vtype */
|
||||||
vsetvl(vl, vtype);
|
vsetvl(vl, vtype); // VSTART resets to 0
|
||||||
|
|
||||||
|
/*
|
||||||
|
* No need to set dirty for memory store, but as VSTART resets to
|
||||||
|
* 0 above, need to set dirty if it's originally not 0.
|
||||||
|
*/
|
||||||
|
if (orig_vstart != 0)
|
||||||
|
SET_VS_DIRTY(regs);
|
||||||
|
|
||||||
/* Return a >0 value for the caller to advance mepc */
|
/* Return a >0 value for the caller to advance mepc */
|
||||||
return 1;
|
return 1;
|
||||||
|
|||||||
Reference in New Issue
Block a user