diff --git a/src/vm/vector_functions.cpp b/src/vm/vector_functions.cpp
index 556f0ec..fbf166d 100644
--- a/src/vm/vector_functions.cpp
+++ b/src/vm/vector_functions.cpp
@@ -41,6 +41,7 @@
 #include <functional>
 #include <limits>
 #include <math.h>
+#include <stdexcept>
 
 namespace softvector {
 unsigned RFS = 32;
@@ -85,40 +86,44 @@ double vtype_t::lmul() {
     return pow(2, signed_vlmul);
 }
 
-vreg_view read_vmask(uint8_t* V, uint8_t VLEN, uint16_t elem_count, uint8_t reg_idx) {
+vreg_view read_vmask(uint8_t* V, uint16_t VLEN, uint16_t elem_count, uint8_t reg_idx) { // view starts VLEN / 8 * reg_idx bytes into V
     uint8_t* mask_start = V + VLEN / 8 * reg_idx;
     return {mask_start, elem_count / 8u}; // this can return size==0 as elem_count can be as low as 1
 }
-uint64_t vector_load_store(void* core, std::function<bool(void*, uint64_t, uint64_t, uint8_t*)> load_store_fn, uint8_t* V, uint8_t VLEN,
-                           uint8_t vd, uint64_t base_addr, uint64_t vl, uint64_t vstart, vtype_t vtype, bool vm, uint8_t elem_byte_size,
-                           uint64_t elem_count, int8_t EMUL_pow, uint8_t segment_size, int64_t stride) {
+uint64_t vector_load_store(void* core, std::function<bool(void*, uint64_t, uint64_t, uint8_t*)> load_store_fn, uint8_t* V, uint16_t VLEN,
+                           uint8_t addressed_register, uint64_t base_addr, uint64_t vl, uint64_t vstart, vtype_t vtype, bool vm,
+                           uint8_t elem_size_byte, uint64_t elem_count, int8_t EMUL_pow, uint8_t segment_size, int64_t stride) {
+    // EEW (effective element width, in bits) = elem_size_byte * 8
     assert(pow(2, EMUL_pow) * segment_size <= 8);
     assert(segment_size > 0);
     assert((elem_count & (elem_count - 1)) == 0); // check that elem_count is power of 2
     assert(elem_count <= VLEN * RFS / 8);
-    unsigned eew = elem_byte_size * 8;
     unsigned emul_stride = EMUL_pow <= 0 ? 1 : pow(2, EMUL_pow);
     assert(emul_stride * segment_size <= 8);
-    assert(!(vd % emul_stride));
+    assert(!(addressed_register % emul_stride));
     vreg_view mask_view = read_vmask(V, VLEN, elem_count, 0);
     // elements w/ index smaller than vstart are in the prestart and get skipped
     // body is from vstart to min(elem_count, vl)
     for(unsigned idx = vstart; idx < std::min(elem_count, vl); idx++) {
         unsigned trap_idx = idx;
-        // vm decides active body element
         uint8_t current_mask_byte = mask_view.get<uint8_t>(idx / 8);
         bool mask_active = vm ? 1 : current_mask_byte & (1 << idx % 8);
-        for(unsigned s_idx = 0; s_idx < segment_size; s_idx++) {
-            // base + selected vd + current_elem + current_segment
-            uint8_t* dest_elem = V + (vd * VLEN / 8) + (eew / 8 * idx) + (VLEN / 8 * s_idx * emul_stride);
-            assert(dest_elem <= V + VLEN * RFS / 8);
-            if(mask_active) {
-                uint64_t addr = base_addr + (eew / 8) * (idx * segment_size + s_idx) * stride;
-                if(!load_store_fn(core, addr, eew / 8, dest_elem))
+        if(mask_active) {
+            for(unsigned s_idx = 0; s_idx < segment_size; s_idx++) {
+                // base + selected register + current_elem + current_segment
+                uint8_t* addressed_elem = V + (addressed_register * VLEN / 8) + (elem_size_byte * idx) + (VLEN / 8 * s_idx * emul_stride);
+                assert(addressed_elem <= V + VLEN * RFS / 8);
+                uint64_t addr = base_addr + (elem_size_byte) * (idx * segment_size + s_idx) * stride;
+                if(!load_store_fn(core, addr, elem_size_byte, addressed_elem))
                     return trap_idx;
-            } else {
+            }
+        } else {
+            for(unsigned s_idx = 0; s_idx < segment_size; s_idx++) {
+                // base + selected register + current_elem + current_segment
+                uint8_t* addressed_elem = V + (addressed_register * VLEN / 8) + (elem_size_byte * idx) + (VLEN / 8 * s_idx * emul_stride);
+                assert(addressed_elem <= V + VLEN * RFS / 8);
                 // this only updates the first 8 bits, so eew > 8 would not work correctly
-                *dest_elem = vtype.vma() ? *dest_elem : *dest_elem;
+                *addressed_elem = vtype.vma() ? *addressed_elem : *addressed_elem;
             }
         }
     }
@@ -127,10 +132,10 @@ uint64_t vector_load_store(void* core, std::function<bool(void*, uint64_t, uint6
     for(unsigned idx = std::min(elem_count, vl); idx < VLEN / 8; idx++) {
         for(unsigned s_idx = 0; s_idx < segment_size; s_idx++) {
             // base + selected vd + current_elem + current_segment
-            uint8_t* dest_elem = V + (vd * VLEN / 8) + (eew / 8 * idx) + (VLEN / 8 * s_idx * emul_stride);
-            assert(dest_elem <= V + VLEN * RFS / 8);
+            uint8_t* addressed_elem = V + (addressed_register * VLEN / 8) + (elem_size_byte * idx) + (VLEN / 8 * s_idx * emul_stride);
+            assert(addressed_elem <= V + VLEN * RFS / 8);
             // this only updates the first 8 bits, so eew > 8 would not work correctly
-            *dest_elem = vtype.vta() ? *dest_elem : *dest_elem;
+            *addressed_elem = vtype.vta() ? *addressed_elem : *addressed_elem;
         }
     }
     return 0;