From 2b857482790d216d773c1278d6feb54c58c846fb Mon Sep 17 00:00:00 2001
From: Eyck-Alexander Jentzsch <eyck@minres.com>
Date: Fri, 7 Feb 2025 11:40:59 +0100
Subject: [PATCH] adds load_store_index to vector_functions

---
 gen_input/templates/interp/CORENAME.cpp.gtl | 27 ++++---
 src/vm/vector_functions.cpp                 | 81 ++++++++++++++++++++-
 src/vm/vector_functions.h                   | 10 ++-
 3 files changed, 104 insertions(+), 14 deletions(-)

diff --git a/gen_input/templates/interp/CORENAME.cpp.gtl b/gen_input/templates/interp/CORENAME.cpp.gtl
index d39b9bc..fda1120 100644
--- a/gen_input/templates/interp/CORENAME.cpp.gtl
+++ b/gen_input/templates/interp/CORENAME.cpp.gtl
@@ -169,17 +169,24 @@ if(vector != null) {%>
         this->core.reg.trap_state = 0;
     }
     uint64_t vlseg(uint8_t* V, uint8_t vd, uint64_t rs1_val, uint64_t vl, uint64_t vstart, softvector::vtype_t vtype, bool vm, uint8_t elem_byte_size, uint16_t elem_count, int8_t EMUL_pow, uint8_t segment_size){
-        return softvector::vector_load_store(this->get_arch(), softvector::softvec_read, V, traits<ARCH>::VLEN, vd, rs1_val, vl, vstart, vtype, vm, elem_byte_size, elem_count, EMUL_pow, segment_size, 1);
-    }
-    uint64_t vsseg(uint8_t* V, uint8_t vd, uint64_t rs1_val, uint64_t vl, uint64_t vstart, softvector::vtype_t vtype, bool vm, uint8_t elem_byte_size, uint16_t elem_count, int8_t EMUL_pow, uint8_t segment_size){
-        return softvector::vector_load_store(this->get_arch(), softvector::softvec_write, V, traits<ARCH>::VLEN, vd, rs1_val, vl, vstart, vtype, vm, elem_byte_size, elem_count, EMUL_pow, segment_size, 1);
-    }
+        return softvector::vector_load_store(this->get_arch(), softvector::softvec_read, V, traits<ARCH>::VLEN, vd, rs1_val, vl, vstart, vtype, vm, elem_byte_size, elem_count, EMUL_pow, segment_size);
+    }
+    uint64_t vsseg(uint8_t* V, uint8_t vs3, uint64_t rs1_val, uint64_t vl, uint64_t vstart, softvector::vtype_t vtype, bool vm, uint8_t elem_byte_size, uint16_t elem_count, int8_t EMUL_pow, uint8_t segment_size){
+        return softvector::vector_load_store(this->get_arch(), softvector::softvec_write, V, traits<ARCH>::VLEN, vs3, rs1_val, vl, vstart, vtype, vm, elem_byte_size, elem_count, EMUL_pow, segment_size);
+    }
     uint64_t vlsseg(uint8_t* V, uint8_t vd, uint64_t rs1_val, uint64_t vl, uint64_t vstart, softvector::vtype_t vtype, bool vm, uint8_t elem_byte_size, uint16_t elem_count, int8_t EMUL_pow, uint8_t segment_size, int64_t stride){
-        return softvector::vector_load_store(this->get_arch(), softvector::softvec_read, V, traits<ARCH>::VLEN, vd, rs1_val, vl, vstart, vtype, vm, elem_byte_size, elem_count, EMUL_pow, segment_size, stride);
-    }
-    uint64_t vssseg(uint8_t* V, uint8_t vd, uint64_t rs1_val, uint64_t vl, uint64_t vstart, softvector::vtype_t vtype, bool vm, uint8_t elem_byte_size, uint16_t elem_count, int8_t EMUL_pow, uint8_t segment_size, int64_t stride){
-        return softvector::vector_load_store(this->get_arch(), softvector::softvec_write, V, traits<ARCH>::VLEN, vd, rs1_val, vl, vstart, vtype, vm, elem_byte_size, elem_count, EMUL_pow, segment_size, stride);
-    }
+        return softvector::vector_load_store(this->get_arch(), softvector::softvec_read, V, traits<ARCH>::VLEN, vd, rs1_val, vl, vstart, vtype, vm, elem_byte_size, elem_count, EMUL_pow, segment_size, stride);
+    }
+    uint64_t vssseg(uint8_t* V, uint8_t vs3, uint64_t rs1_val, uint64_t vl, uint64_t vstart, softvector::vtype_t vtype, bool vm, uint8_t elem_byte_size, uint16_t elem_count, int8_t EMUL_pow, uint8_t segment_size, int64_t stride){
+        return softvector::vector_load_store(this->get_arch(), softvector::softvec_write, V, traits<ARCH>::VLEN, vs3, rs1_val, vl, vstart, vtype, vm, elem_byte_size, elem_count, EMUL_pow, segment_size, stride);
+    }
+    uint64_t vlxseg(uint8_t* V, uint8_t vd, uint8_t vs2, uint64_t rs1_val, uint64_t vl, uint64_t vstart, softvector::vtype_t vtype, bool vm, uint8_t elem_byte_size, uint16_t elem_count, uint8_t segment_size, bool ordered){
+        return softvector::vector_load_store_index(this->get_arch(), softvector::softvec_read, V, traits<ARCH>::VLEN, traits<ARCH>::XLEN, vd, vs2, rs1_val, vl, vstart, vtype, vm, elem_byte_size, elem_count, segment_size, ordered);
+    }
+    uint64_t vsxseg(uint8_t* V, uint8_t vs3, uint8_t vs2, uint64_t rs1_val, uint64_t vl, uint64_t vstart, softvector::vtype_t vtype, bool vm, uint8_t elem_byte_size, uint16_t elem_count, uint8_t segment_size, bool ordered){
+        return softvector::vector_load_store_index(this->get_arch(), softvector::softvec_write, V, traits<ARCH>::VLEN, traits<ARCH>::XLEN, vs3, vs2, rs1_val, vl, vstart, vtype, vm, elem_byte_size, elem_count, segment_size, ordered);
+    }
     <%}%>
     uint64_t fetch_count{0};
     uint64_t tval{0};
diff --git a/src/vm/vector_functions.cpp b/src/vm/vector_functions.cpp
index fbf166d..7f5a9d3 100644
--- a/src/vm/vector_functions.cpp
+++ b/src/vm/vector_functions.cpp
@@ -131,7 +131,7 @@ uint64_t vector_load_store(void* core, std::function 8 would not work correctly
@@ -140,4 +140,83 @@ uint64_t vector_load_store(void* core, std::function<bool(void*, uint64_t, uint64_t, uint8_t*)> load_store_fn, uint8_t* V,
+// reads an elem_size_bits wide element from V and sign-extends it into an int64_t
+int64_t read_n_bits(uint8_t* V, unsigned elem_size_bits) {
+    switch(elem_size_bits) {
+    case 8:
+        return static_cast<int64_t>(*reinterpret_cast<int8_t*>(V));
+    case 16:
+        return static_cast<int64_t>(*reinterpret_cast<int16_t*>(V));
+    case 32:
+        return static_cast<int64_t>(*reinterpret_cast<int32_t*>(V));
+    case 64:
+        return static_cast<int64_t>(*reinterpret_cast<int64_t*>(V));
+    default:
+        // throw by value, not `throw new ...` (throwing a pointer leaks and is
+        // not caught by `catch(const std::invalid_argument&)`)
+        throw std::invalid_argument("Invalid arg in read_n_bits");
+    }
+}
+// this function behaves similar to vector_load_store(...)
+// with the key difference that the SEW and LMUL from the parameters apply to the
+// index registers (instead of the data registers) and the SEW and LMUL encoded in vtype apply to the data registers
+uint64_t vector_load_store_index(void* core, std::function<bool(void*, uint64_t, uint64_t, uint8_t*)> load_store_fn, uint8_t* V,
+                                 uint16_t VLEN, uint8_t XLEN, uint8_t addressed_register, uint8_t index_register, uint64_t base_addr,
+                                 uint64_t vl, uint64_t vstart, vtype_t vtype, bool vm, uint8_t index_elem_size_byte, uint64_t elem_count,
+                                 uint8_t segment_size, bool ordered) {
+    // index_eew = index_elem_size_byte * 8
+    // for now ignore the ordered parameter, as all indexed operations are implemented as ordered
+    assert(segment_size > 0);
+    assert((elem_count & (elem_count - 1)) == 0); // check that elem_count is power of 2
+    assert(elem_count <= VLEN * RFS / 8);
+    // stride between the registers of one segment; 0 for fractional LMUL (whole
+    // data group fits into a single register)
+    unsigned data_emul_stride = vtype.lmul() < 0 ? 0 : vtype.lmul();
+    assert(data_emul_stride * segment_size <= 8);
+    unsigned data_elem_size_byte = vtype.sew() / 8;
+    // guard the modulo: data_emul_stride can be 0 (fractional LMUL) and x % 0 is UB
+    assert(data_emul_stride == 0 || !(addressed_register % data_emul_stride));
+    vreg_view<uint8_t> mask_view = read_vmask(V, VLEN, elem_count, 0);
+    // elements w/ index smaller than vstart are in the prestart and get skipped
+    // body is from vstart to min(elem_count, vl)
+    for(unsigned idx = vstart; idx < std::min(elem_count, vl); idx++) {
+        unsigned trap_idx = idx;
+        uint8_t current_mask_byte = mask_view.get(idx / 8);
+        bool mask_active = vm ? 1 : current_mask_byte & (1 << idx % 8);
+        if(mask_active) {
+            uint8_t* offset_elem = V + (index_register * VLEN / 8) + (index_elem_size_byte * idx);
+            assert(offset_elem <= (V + VLEN * RFS / 8 - index_elem_size_byte)); // ensure reading index_elem_size_bytes is legal
+            // read index_eew bits from offset_elem, truncate / extend to XLEN bits
+            int64_t offset_val = read_n_bits(offset_elem, index_elem_size_byte * 8);
+            assert(XLEN == 64 || XLEN == 32); // logical ||, not bitwise |
+            uint64_t mask = XLEN == 64 ? std::numeric_limits<uint64_t>::max() : std::numeric_limits<uint32_t>::max();
+            // uint64_t, not unsigned: a 32-bit local would silently truncate offsets when XLEN == 64
+            uint64_t index_offset = offset_val & mask;
+            for(unsigned s_idx = 0; s_idx < segment_size; s_idx++) {
+                // base + selected register + current_elem + current_segment
+                uint8_t* addressed_elem =
+                    V + (addressed_register * VLEN / 8) + (data_elem_size_byte * idx) + (VLEN / 8 * s_idx * data_emul_stride);
+                assert(addressed_elem <= V + VLEN * RFS / 8);
+                // base + offset + current_segment
+                uint64_t addr = base_addr + index_offset + s_idx * data_elem_size_byte;
+                if(!load_store_fn(core, addr, data_elem_size_byte, addressed_elem))
+                    return trap_idx; // report the trapping element index to the caller
+            }
+        } else {
+            // masked-off element: leave it undisturbed, which is legal for both
+            // mask-undisturbed and mask-agnostic (vma) policies
+            for(unsigned s_idx = 0; s_idx < segment_size; s_idx++) {
+                // base + selected register + current_elem + current_segment
+                uint8_t* addressed_elem =
+                    V + (addressed_register * VLEN / 8) + (data_elem_size_byte * idx) + (VLEN / 8 * s_idx * data_emul_stride);
+                assert(addressed_elem <= V + VLEN * RFS / 8);
+                // this only updates the first 8 bits, so eew > 8 would not work correctly
+                *addressed_elem = vtype.vma() ? *addressed_elem : *addressed_elem;
+            }
+        }
+    }
+    // elements w/ index larger than elem_count are in the tail (fractional LMUL)
+    // elements w/ index larger than vl are in the tail
+    for(unsigned idx = std::min(elem_count, vl); idx < VLEN / 8; idx++) {
+        for(unsigned s_idx = 0; s_idx < segment_size; s_idx++) {
+            // base + selected register + current_elem + current_segment
+            uint8_t* addressed_elem =
+                V + (addressed_register * VLEN / 8) + (data_elem_size_byte * idx) + (VLEN / 8 * s_idx * data_emul_stride);
+            assert(addressed_elem <= V + VLEN * RFS / 8);
+            // this only updates the first 8 bits, so eew > 8 would not work correctly
+            *addressed_elem = vtype.vta() ? *addressed_elem : *addressed_elem;
+        }
+    }
+    return 0;
+}
 } // namespace softvector
\ No newline at end of file
diff --git a/src/vm/vector_functions.h b/src/vm/vector_functions.h
index d1d764e..23dadfe 100644
--- a/src/vm/vector_functions.h
+++ b/src/vm/vector_functions.h
@@ -54,8 +54,12 @@ struct vtype_t {
 };
 bool softvec_read(void* core, uint64_t addr, uint64_t length, uint8_t* data);
 bool softvec_write(void* core, uint64_t addr, uint64_t length, uint8_t* data);
-uint64_t vector_load_store(void* core, std::function<bool(void*, uint64_t, uint64_t, uint8_t*)> load_store_fn, uint8_t* V, uint8_t VLEN,
-                           uint8_t vd, uint64_t base_addr, uint64_t vl, uint64_t vstart, vtype_t vtype, bool vm, uint8_t elem_size_byte,
-                           uint64_t elem_count, int8_t EMUL_pow, uint8_t segment_size, int64_t stride);
+uint64_t vector_load_store(void* core, std::function<bool(void*, uint64_t, uint64_t, uint8_t*)> load_store_fn, uint8_t* V, uint16_t VLEN,
+                           uint8_t addressed_register, uint64_t base_addr, uint64_t vl, uint64_t vstart, vtype_t vtype, bool vm,
+                           uint8_t elem_size_byte, uint64_t elem_count, int8_t EMUL_pow, uint8_t segment_size = 1, int64_t stride = 1);
+uint64_t vector_load_store_index(void* core, std::function<bool(void*, uint64_t, uint64_t, uint8_t*)> load_store_fn, uint8_t* V,
+                                 uint16_t VLEN, uint8_t XLEN, uint8_t addressed_register, uint8_t index_register, uint64_t base_addr,
+                                 uint64_t vl, uint64_t vstart, vtype_t vtype, bool vm, uint8_t elem_size_byte, uint64_t elem_count,
+                                 uint8_t segment_size, bool ordered);
 } // namespace softvector
 #endif /* _VM_VECTOR_FUNCTIONS_H_ */