diff --git a/gen_input/templates/interp/CORENAME.cpp.gtl b/gen_input/templates/interp/CORENAME.cpp.gtl index 0b3ed96..2cb0e3c 100644 --- a/gen_input/templates/interp/CORENAME.cpp.gtl +++ b/gen_input/templates/interp/CORENAME.cpp.gtl @@ -629,13 +629,13 @@ if(vector != null) {%> void vector_slideup(uint8_t* V, uint64_t vl, uint64_t vstart, softvector::vtype_t vtype, bool vm, unsigned vd, unsigned vs2, uint64_t imm, uint8_t sew_val) { switch(sew_val){ case 0b000: - return softvector::vector_slide<${vlen}, uint8_t>(V, vl, vstart, vtype, vm, vd, vs2, imm); + return softvector::vector_slideup<${vlen}, uint8_t>(V, vl, vstart, vtype, vm, vd, vs2, imm); case 0b001: - return softvector::vector_slide<${vlen}, uint16_t>(V, vl, vstart, vtype, vm, vd, vs2, imm); + return softvector::vector_slideup<${vlen}, uint16_t>(V, vl, vstart, vtype, vm, vd, vs2, imm); case 0b010: - return softvector::vector_slide<${vlen}, uint32_t>(V, vl, vstart, vtype, vm, vd, vs2, imm); + return softvector::vector_slideup<${vlen}, uint32_t>(V, vl, vstart, vtype, vm, vd, vs2, imm); case 0b011: - return softvector::vector_slide<${vlen}, uint64_t>(V, vl, vstart, vtype, vm, vd, vs2, imm); + return softvector::vector_slideup<${vlen}, uint64_t>(V, vl, vstart, vtype, vm, vd, vs2, imm); default: throw new std::runtime_error("Unsupported sew bit value"); } @@ -643,13 +643,13 @@ if(vector != null) {%> void vector_slidedown(uint8_t* V, uint64_t vl, uint64_t vstart, softvector::vtype_t vtype, bool vm, unsigned vd, unsigned vs2, uint64_t imm, uint8_t sew_val) { switch(sew_val){ case 0b000: - return softvector::vector_slide<${vlen}, uint8_t>(V, vl, vstart, vtype, vm, vd, vs2, -imm); + return softvector::vector_slidedown<${vlen}, uint8_t>(V, vl, vstart, vtype, vm, vd, vs2, imm); case 0b001: - return softvector::vector_slide<${vlen}, uint16_t>(V, vl, vstart, vtype, vm, vd, vs2, -imm); + return softvector::vector_slidedown<${vlen}, uint16_t>(V, vl, vstart, vtype, vm, vd, vs2, imm); case 0b010: - return 
softvector::vector_slide<${vlen}, uint32_t>(V, vl, vstart, vtype, vm, vd, vs2, -imm); + return softvector::vector_slidedown<${vlen}, uint32_t>(V, vl, vstart, vtype, vm, vd, vs2, imm); case 0b011: - return softvector::vector_slide<${vlen}, uint64_t>(V, vl, vstart, vtype, vm, vd, vs2, -imm); + return softvector::vector_slidedown<${vlen}, uint64_t>(V, vl, vstart, vtype, vm, vd, vs2, imm); default: throw new std::runtime_error("Unsupported sew bit value"); } diff --git a/src/vm/vector_functions.h b/src/vm/vector_functions.h index b8d0201..1a51262 100644 --- a/src/vm/vector_functions.h +++ b/src/vm/vector_functions.h @@ -124,11 +124,13 @@ void viota(uint8_t* V, uint64_t vl, uint64_t vstart, vtype_t vtype, bool vm, uns template void vid(uint8_t* V, uint64_t vl, uint64_t vstart, vtype_t vtype, bool vm, unsigned vd); template uint64_t scalar_move(uint8_t* V, vtype_t vtype, unsigned vd, uint64_t val, bool to_vector); template -void vector_slide(uint8_t* V, uint64_t vl, uint64_t vstart, vtype_t vtype, bool vm, unsigned vd, unsigned vs2, int64_t imm); +void vector_slideup(uint8_t* V, uint64_t vl, uint64_t vstart, vtype_t vtype, bool vm, unsigned vd, unsigned vs2, uint64_t imm); template -void vector_slide1up(uint8_t* V, uint64_t vl, uint64_t vstart, vtype_t vtype, bool vm, unsigned vd, unsigned vs2, int64_t imm); +void vector_slidedown(uint8_t* V, uint64_t vl, uint64_t vstart, vtype_t vtype, bool vm, unsigned vd, unsigned vs2, uint64_t imm); template -void vector_slide1down(uint8_t* V, uint64_t vl, uint64_t vstart, vtype_t vtype, bool vm, unsigned vd, unsigned vs2, int64_t imm); +void vector_slide1up(uint8_t* V, uint64_t vl, uint64_t vstart, vtype_t vtype, bool vm, unsigned vd, unsigned vs2, uint64_t imm); +template +void vector_slide1down(uint8_t* V, uint64_t vl, uint64_t vstart, vtype_t vtype, bool vm, unsigned vd, unsigned vs2, uint64_t imm); template void vector_vector_gather(uint8_t* V, uint64_t vl, uint64_t vstart, vtype_t vtype, bool vm, unsigned vd, unsigned vs2, 
unsigned vs1); template diff --git a/src/vm/vector_functions.hpp b/src/vm/vector_functions.hpp index 609dc62..ac47eca 100644 --- a/src/vm/vector_functions.hpp +++ b/src/vm/vector_functions.hpp @@ -349,7 +349,7 @@ void vector_vector_op(uint8_t* V, unsigned funct6, unsigned funct3, uint64_t vl, // elements w/ index smaller than vstart are in the prestart and get skipped // body is from vstart to min(elem_count, vl) if(carry == carry_t::NO_CARRY) { - for(unsigned idx = vstart; idx < std::min(elem_count, vl); idx++) { + for(size_t idx = vstart; idx < std::min(elem_count, vl); idx++) { bool mask_active = vm ? 1 : mask_reg[idx]; if(mask_active) { vd_view[idx] = fn(vd_view[idx], vs2_view[idx], vs1_view[idx]); @@ -358,18 +358,18 @@ void vector_vector_op(uint8_t* V, unsigned funct6, unsigned funct3, uint64_t vl, } } } else if(carry == carry_t::SUB_CARRY) { - for(unsigned idx = vstart; idx < std::min(elem_count, vl); idx++) { + for(size_t idx = vstart; idx < std::min(elem_count, vl); idx++) { vd_view[idx] = fn(vd_view[idx], vs2_view[idx], vs1_view[idx]) - mask_reg[idx]; } } else { - for(unsigned idx = vstart; idx < std::min(elem_count, vl); idx++) { + for(size_t idx = vstart; idx < std::min(elem_count, vl); idx++) { vd_view[idx] = fn(vd_view[idx], vs2_view[idx], vs1_view[idx]) + mask_reg[idx]; } } // elements w/ index larger than elem_count are in the tail (fractional LMUL) // elements w/ index larger than vl are in the tail unsigned maximum_elems = VLEN * vtype.lmul() / (sizeof(dest_elem_t) * 8); - for(unsigned idx = std::min(elem_count, vl); idx < maximum_elems; idx++) { + for(size_t idx = std::min(elem_count, vl); idx < maximum_elems; idx++) { vd_view[idx] = vtype.vta() ? 
vd_view[idx] : vd_view[idx]; } return; @@ -385,7 +385,7 @@ void vector_imm_op(uint8_t* V, unsigned funct6, unsigned funct3, uint64_t vl, ui // elements w/ index smaller than vstart are in the prestart and get skipped // body is from vstart to min(elem_count, vl) if(carry == carry_t::NO_CARRY) { - for(unsigned idx = vstart; idx < std::min(elem_count, vl); idx++) { + for(size_t idx = vstart; idx < std::min(elem_count, vl); idx++) { bool mask_active = vm ? 1 : mask_reg[idx]; if(mask_active) { vd_view[idx] = fn(vd_view[idx], vs2_view[idx], imm); @@ -394,18 +394,18 @@ void vector_imm_op(uint8_t* V, unsigned funct6, unsigned funct3, uint64_t vl, ui } } } else if(carry == carry_t::SUB_CARRY) { - for(unsigned idx = vstart; idx < std::min(elem_count, vl); idx++) { + for(size_t idx = vstart; idx < std::min(elem_count, vl); idx++) { vd_view[idx] = fn(vd_view[idx], vs2_view[idx], imm) - mask_reg[idx]; } } else { - for(unsigned idx = vstart; idx < std::min(elem_count, vl); idx++) { + for(size_t idx = vstart; idx < std::min(elem_count, vl); idx++) { vd_view[idx] = fn(vd_view[idx], vs2_view[idx], imm) + mask_reg[idx]; } } // elements w/ index larger than elem_count are in the tail (fractional LMUL) // elements w/ index larger than vl are in the tail unsigned maximum_elems = VLEN * vtype.lmul() / (sizeof(dest_elem_t) * 8); - for(unsigned idx = std::min(elem_count, vl); idx < maximum_elems; idx++) { + for(size_t idx = std::min(elem_count, vl); idx < maximum_elems; idx++) { vd_view[idx] = vtype.vta() ? vd_view[idx] : vd_view[idx]; } return; @@ -417,7 +417,7 @@ void vector_vector_merge(uint8_t* V, uint64_t vl, uint64_t vstart, vtype_t vtype auto vs1_view = get_vreg(V, vs1, elem_count); auto vs2_view = get_vreg(V, vs2, elem_count); auto vd_view = get_vreg(V, vd, elem_count); - for(unsigned idx = vstart; idx < vl; idx++) { + for(size_t idx = vstart; idx < vl; idx++) { bool mask_active = vm ? 
1 : mask_reg[idx]; if(mask_active) vd_view[idx] = vs1_view[idx]; @@ -431,7 +431,7 @@ void vector_imm_merge(uint8_t* V, uint64_t vl, uint64_t vstart, vtype_t vtype, b vmask_view mask_reg = read_vmask(V, elem_count); auto vs2_view = get_vreg(V, vs2, elem_count); auto vd_view = get_vreg(V, vd, elem_count); - for(unsigned idx = vstart; idx < vl; idx++) { + for(size_t idx = vstart; idx < vl; idx++) { bool mask_active = vm ? 1 : mask_reg[idx]; if(mask_active) vd_view[idx] = imm; @@ -503,7 +503,7 @@ void mask_vector_vector_op(uint8_t* V, unsigned funct6, unsigned funct3, uint64_ auto fn = get_mask_funct(funct6, funct3); // elements w/ index smaller than vstart are in the prestart and get skipped // body is from vstart to min(elem_count, vl) - for(unsigned idx = vstart; idx < std::min(elem_count, vl); idx++) { + for(size_t idx = vstart; idx < std::min(elem_count, vl); idx++) { bool mask_active = vm ? 1 : mask_reg[idx]; if(mask_active) { vd_mask_view[idx] = fn(vs2_view[idx], vs1_view[idx]); @@ -513,7 +513,7 @@ void mask_vector_vector_op(uint8_t* V, unsigned funct6, unsigned funct3, uint64_ } // elements w/ index larger than elem_count are in the tail (fractional LMUL) // elements w/ index larger than vl are in the tail - for(unsigned idx = std::min(elem_count, vl); idx < VLEN; idx++) { + for(size_t idx = std::min(elem_count, vl); idx < VLEN; idx++) { vd_mask_view[idx] = vtype.vta() ? vd_mask_view[idx] : vd_mask_view[idx]; } return; @@ -528,7 +528,7 @@ void mask_vector_imm_op(uint8_t* V, unsigned funct6, unsigned funct3, uint64_t v auto fn = get_mask_funct(funct6, funct3); // elements w/ index smaller than vstart are in the prestart and get skipped // body is from vstart to min(elem_count, vl) - for(unsigned idx = vstart; idx < std::min(elem_count, vl); idx++) { + for(size_t idx = vstart; idx < std::min(elem_count, vl); idx++) { bool mask_active = vm ? 
1 : mask_reg[idx]; if(mask_active) { vd_mask_view[idx] = fn(vs2_view[idx], imm); @@ -538,7 +538,7 @@ void mask_vector_imm_op(uint8_t* V, unsigned funct6, unsigned funct3, uint64_t v } // elements w/ index larger than elem_count are in the tail (fractional LMUL) // elements w/ index larger than vl are in the tail - for(unsigned idx = std::min(elem_count, vl); idx < VLEN; idx++) { + for(size_t idx = std::min(elem_count, vl); idx < VLEN; idx++) { vd_mask_view[idx] = vtype.vta() ? vd_mask_view[idx] : vd_mask_view[idx]; } return; @@ -567,7 +567,7 @@ void vector_unary_op(uint8_t* V, unsigned unary_op, uint64_t vl, uint64_t vstart auto fn = get_unary_fn(unary_op); // elements w/ index smaller than vstart are in the prestart and get skipped // body is from vstart to min(elem_count, vl) - for(unsigned idx = vstart; idx < std::min(elem_count, vl); idx++) { + for(size_t idx = vstart; idx < std::min(elem_count, vl); idx++) { bool mask_active = vm ? 1 : mask_reg[idx]; if(mask_active) { vd_view[idx] = fn(vs2_view[idx]); @@ -578,7 +578,7 @@ void vector_unary_op(uint8_t* V, unsigned unary_op, uint64_t vl, uint64_t vstart // elements w/ index larger than elem_count are in the tail (fractional LMUL) // elements w/ index larger than vl are in the tail unsigned maximum_elems = VLEN * vtype.lmul() / (sizeof(dest_elem_t) * 8); - for(unsigned idx = std::min(elem_count, vl); idx < maximum_elems; idx++) { + for(size_t idx = std::min(elem_count, vl); idx < maximum_elems; idx++) { vd_view[idx] = vtype.vta() ? vd_view[idx] : vd_view[idx]; } return; @@ -608,13 +608,13 @@ void carry_vector_vector_op(uint8_t* V, unsigned funct, uint64_t vl, uint64_t vs auto fn = get_carry_funct(funct); // elements w/ index smaller than vstart are in the prestart and get skipped // body is from vstart to min(elem_count, vl) - for(unsigned idx = vstart; idx < std::min(elem_count, vl); idx++) { + for(size_t idx = vstart; idx < std::min(elem_count, vl); idx++) { elem_t carry = vm ? 
0 : mask_reg[idx]; vd_mask_view[idx] = fn(vs2_view[idx], vs1_view[idx], carry); } // elements w/ index larger than elem_count are in the tail (fractional LMUL) // elements w/ index larger than vl are in the tail - for(unsigned idx = std::min(elem_count, vl); idx < VLEN; idx++) { + for(size_t idx = std::min(elem_count, vl); idx < VLEN; idx++) { // always tail agnostic } return; @@ -629,13 +629,13 @@ void carry_vector_imm_op(uint8_t* V, unsigned funct, uint64_t vl, uint64_t vstar auto fn = get_carry_funct(funct); // elements w/ index smaller than vstart are in the prestart and get skipped // body is from vstart to min(elem_count, vl) - for(unsigned idx = vstart; idx < std::min(elem_count, vl); idx++) { + for(size_t idx = vstart; idx < std::min(elem_count, vl); idx++) { elem_t carry = vm ? 0 : mask_reg[idx]; vd_mask_view[idx] = fn(vs2_view[idx], imm, carry); } // elements w/ index larger than elem_count are in the tail (fractional LMUL) // elements w/ index larger than vl are in the tail - for(unsigned idx = std::min(elem_count, vl); idx < VLEN; idx++) { + for(size_t idx = std::min(elem_count, vl); idx < VLEN; idx++) { // always tail agnostic } return; @@ -814,7 +814,7 @@ bool sat_vector_vector_op(uint8_t* V, unsigned funct6, unsigned funct3, uint64_t auto fn = get_sat_funct(funct6, funct3); // elements w/ index smaller than vstart are in the prestart and get skipped // body is from vstart to min(elem_count, vl) - for(unsigned idx = vstart; idx < std::min(elem_count, vl); idx++) { + for(size_t idx = vstart; idx < std::min(elem_count, vl); idx++) { bool mask_active = vm ? 
1 : mask_reg[idx]; if(mask_active) { saturated |= fn(vxrm, vtype, vd_view[idx], vs2_view[idx], vs1_view[idx]); @@ -825,7 +825,7 @@ bool sat_vector_vector_op(uint8_t* V, unsigned funct6, unsigned funct3, uint64_t // elements w/ index larger than elem_count are in the tail (fractional LMUL) // elements w/ index larger than vl are in the tail unsigned maximum_elems = VLEN * vtype.lmul() / (sizeof(dest_elem_t) * 8); - for(unsigned idx = std::min(elem_count, vl); idx < maximum_elems; idx++) { + for(size_t idx = std::min(elem_count, vl); idx < maximum_elems; idx++) { vd_view[idx] = vtype.vta() ? vd_view[idx] : vd_view[idx]; } return saturated; @@ -841,7 +841,7 @@ bool sat_vector_imm_op(uint8_t* V, unsigned funct6, unsigned funct3, uint64_t vl auto fn = get_sat_funct(funct6, funct3); // elements w/ index smaller than vstart are in the prestart and get skipped // body is from vstart to min(elem_count, vl) - for(unsigned idx = vstart; idx < std::min(elem_count, vl); idx++) { + for(size_t idx = vstart; idx < std::min(elem_count, vl); idx++) { bool mask_active = vm ? 1 : mask_reg[idx]; if(mask_active) { saturated |= fn(vxrm, vtype, vd_view[idx], vs2_view[idx], imm); @@ -852,7 +852,7 @@ bool sat_vector_imm_op(uint8_t* V, unsigned funct6, unsigned funct3, uint64_t vl // elements w/ index larger than elem_count are in the tail (fractional LMUL) // elements w/ index larger than vl are in the tail unsigned maximum_elems = VLEN * vtype.lmul() / (sizeof(dest_elem_t) * 8); - for(unsigned idx = std::min(elem_count, vl); idx < maximum_elems; idx++) { + for(size_t idx = std::min(elem_count, vl); idx < maximum_elems; idx++) { vd_view[idx] = vtype.vta() ? 
vd_view[idx] : vd_view[idx]; } return saturated; @@ -916,7 +916,7 @@ void vector_red_op(uint8_t* V, unsigned funct6, unsigned funct3, uint64_t vl, ui auto vd_view = get_vreg(V, vd, elem_count); auto fn = get_red_funct(funct6, funct3); dest_elem_t& running_total = {vs1_elem}; - for(unsigned idx = vstart; idx < std::min(elem_count, vl); idx++) { + for(size_t idx = vstart; idx < std::min(elem_count, vl); idx++) { bool mask_active = vm ? 1 : mask_reg[idx]; if(mask_active) { fn(running_total, vs2_view[idx]); @@ -924,7 +924,7 @@ void vector_red_op(uint8_t* V, unsigned funct6, unsigned funct3, uint64_t vl, ui } vd_view[0] = running_total; // the tail is all elements of the destination register beyond the first one - for(unsigned idx = 1; idx < VLEN / (vtype.sew() * RFS); idx++) { + for(size_t idx = 1; idx < VLEN / (vtype.sew() * RFS); idx++) { vd_view[idx] = vtype.vta() ? vd_view[idx] : vd_view[idx]; } return; @@ -1228,7 +1228,7 @@ void fp_vector_vector_op(uint8_t* V, unsigned funct6, unsigned funct3, uint64_t uint8_t accrued_flags = 0; // elements w/ index smaller than vstart are in the prestart and get skipped // body is from vstart to min(elem_count, vl) - for(unsigned idx = vstart; idx < std::min(elem_count, vl); idx++) { + for(size_t idx = vstart; idx < std::min(elem_count, vl); idx++) { bool mask_active = vm ? 1 : mask_reg[idx]; if(mask_active) { vd_view[idx] = fn(rm, accrued_flags, vd_view[idx], vs2_view[idx], vs1_view[idx]); @@ -1240,7 +1240,7 @@ void fp_vector_vector_op(uint8_t* V, unsigned funct6, unsigned funct3, uint64_t // elements w/ index larger than elem_count are in the tail (fractional LMUL) // elements w/ index larger than vl are in the tail unsigned maximum_elems = VLEN * vtype.lmul() / (sizeof(dest_elem_t) * 8); - for(unsigned idx = std::min(elem_count, vl); idx < maximum_elems; idx++) { + for(size_t idx = std::min(elem_count, vl); idx < maximum_elems; idx++) { vd_view[idx] = vtype.vta() ? 
vd_view[idx] : vd_view[idx]; } return; @@ -1256,7 +1256,7 @@ void fp_vector_imm_op(uint8_t* V, unsigned funct6, unsigned funct3, uint64_t vl, uint8_t accrued_flags = 0; // elements w/ index smaller than vstart are in the prestart and get skipped // body is from vstart to min(elem_count, vl) - for(unsigned idx = vstart; idx < std::min(elem_count, vl); idx++) { + for(size_t idx = vstart; idx < std::min(elem_count, vl); idx++) { bool mask_active = vm ? 1 : mask_reg[idx]; if(mask_active) { vd_view[idx] = fn(rm, accrued_flags, vd_view[idx], vs2_view[idx], imm); @@ -1268,7 +1268,7 @@ void fp_vector_imm_op(uint8_t* V, unsigned funct6, unsigned funct3, uint64_t vl, // elements w/ index larger than elem_count are in the tail (fractional LMUL) // elements w/ index larger than vl are in the tail unsigned maximum_elems = VLEN * vtype.lmul() / (sizeof(dest_elem_t) * 8); - for(unsigned idx = std::min(elem_count, vl); idx < maximum_elems; idx++) { + for(size_t idx = std::min(elem_count, vl); idx < maximum_elems; idx++) { vd_view[idx] = vtype.vta() ? vd_view[idx] : vd_view[idx]; } return; @@ -1324,7 +1324,7 @@ void fp_vector_red_op(uint8_t* V, unsigned funct6, unsigned funct3, uint64_t vl, auto fn = get_fp_red_funct(funct6, funct3); dest_elem_t& running_total = {vs1_elem}; uint8_t accrued_flags = 0; - for(unsigned idx = vstart; idx < std::min(elem_count, vl); idx++) { + for(size_t idx = vstart; idx < std::min(elem_count, vl); idx++) { bool mask_active = vm ? 1 : mask_reg[idx]; if(mask_active) { fn(rm, accrued_flags, running_total, vs2_view[idx]); @@ -1333,7 +1333,7 @@ void fp_vector_red_op(uint8_t* V, unsigned funct6, unsigned funct3, uint64_t vl, vd_view[0] = running_total; softfloat_exceptionFlags = accrued_flags; // the tail is all elements of the destination register beyond the first one - for(unsigned idx = 1; idx < VLEN / (vtype.sew() * RFS); idx++) { + for(size_t idx = 1; idx < VLEN / (vtype.sew() * RFS); idx++) { vd_view[idx] = vtype.vta() ? 
vd_view[idx] : vd_view[idx]; } return; @@ -1517,7 +1517,7 @@ void fp_vector_unary_op(uint8_t* V, unsigned encoding_space, unsigned unary_op, auto vd_view = get_vreg(V, vd, elem_count); auto fn = get_fp_unary_fn(encoding_space, unary_op); uint8_t accrued_flags = 0; - for(unsigned idx = vstart; idx < std::min(elem_count, vl); idx++) { + for(size_t idx = vstart; idx < std::min(elem_count, vl); idx++) { bool mask_active = vm ? 1 : mask_reg[idx]; if(mask_active) { vd_view[idx] = fn(rm, accrued_flags, vs2_view[idx]); @@ -1527,7 +1527,7 @@ void fp_vector_unary_op(uint8_t* V, unsigned encoding_space, unsigned unary_op, } softfloat_exceptionFlags = accrued_flags; unsigned maximum_elems = VLEN * vtype.lmul() / (sizeof(dest_elem_t) * 8); - for(unsigned idx = std::min(elem_count, vl); idx < maximum_elems; idx++) { + for(size_t idx = std::min(elem_count, vl); idx < maximum_elems; idx++) { vd_view[idx] = vtype.vta() ? vd_view[idx] : vd_view[idx]; } return; @@ -1593,7 +1593,7 @@ void mask_fp_vector_vector_op(uint8_t* V, unsigned funct6, uint64_t vl, uint64_t vmask_view vd_mask_view = read_vmask(V, VLEN, vd); auto fn = get_fp_mask_funct(funct6); uint8_t accrued_flags = 0; - for(unsigned idx = vstart; idx < vl; idx++) { + for(size_t idx = vstart; idx < vl; idx++) { bool mask_active = vm ? 1 : mask_reg[idx]; if(mask_active) { vd_mask_view[idx] = fn(rm, accrued_flags, vs2_view[idx], vs1_view[idx]); @@ -1602,7 +1602,7 @@ void mask_fp_vector_vector_op(uint8_t* V, unsigned funct6, uint64_t vl, uint64_t } } softfloat_exceptionFlags = accrued_flags; - for(unsigned idx = vl; idx < VLEN; idx++) { + for(size_t idx = vl; idx < VLEN; idx++) { vd_mask_view[idx] = vtype.vta() ? 
vd_mask_view[idx] : vd_mask_view[idx]; } return; @@ -1616,7 +1616,7 @@ void mask_fp_vector_imm_op(uint8_t* V, unsigned funct6, uint64_t vl, uint64_t vs vmask_view vd_mask_view = read_vmask(V, VLEN, vd); auto fn = get_fp_mask_funct(funct6); uint8_t accrued_flags = 0; - for(unsigned idx = vstart; idx < vl; idx++) { + for(size_t idx = vstart; idx < vl; idx++) { bool mask_active = vm ? 1 : mask_reg[idx]; if(mask_active) { vd_mask_view[idx] = fn(rm, accrued_flags, vs2_view[idx], imm); @@ -1625,7 +1625,7 @@ void mask_fp_vector_imm_op(uint8_t* V, unsigned funct6, uint64_t vl, uint64_t vs } } softfloat_exceptionFlags = accrued_flags; - for(unsigned idx = vl; idx < VLEN; idx++) { + for(size_t idx = vl; idx < VLEN; idx++) { vd_mask_view[idx] = vtype.vta() ? vd_mask_view[idx] : vd_mask_view[idx]; } return; @@ -1637,11 +1637,11 @@ void mask_mask_op(uint8_t* V, unsigned funct6, unsigned funct3, uint64_t vl, uin auto vs2_view = read_vmask(V, elem_count, vs2); auto vd_view = read_vmask(V, elem_count, vd); auto fn = get_mask_funct(funct6, funct3); // could be bool, but would break the make_signed_t in get_mask_funct - for(unsigned idx = vstart; idx < vl; idx++) { + for(size_t idx = vstart; idx < vl; idx++) { vd_view[idx] = fn(vs2_view[idx], vs1_view[idx]); } // the tail is all elements of the destination register beyond the first one - for(unsigned idx = 1; idx < VLEN; idx++) { + for(size_t idx = 1; idx < VLEN; idx++) { // always tail agnostic // this is a nop, placeholder for vta behavior vd_view[idx] = vd_view[idx]; @@ -1653,7 +1653,7 @@ template uint64_t vcpop(uint8_t* V, uint64_t vl, uint64_t vstart auto vs2_view = read_vmask(V, elem_count, vs2); vmask_view mask_reg = read_vmask(V, elem_count); unsigned running_total = 0; - for(unsigned idx = vstart; idx < vl; idx++) { + for(size_t idx = vstart; idx < vl; idx++) { bool mask_active = vm ? 
1 : mask_reg[idx]; if(mask_active && vs2_view[idx]) running_total += 1; @@ -1664,7 +1664,7 @@ template uint64_t vfirst(uint8_t* V, uint64_t vl, uint64_t vstar uint64_t elem_count = VLEN; auto vs2_view = read_vmask(V, elem_count, vs2); vmask_view mask_reg = read_vmask(V, elem_count); - for(unsigned idx = vstart; idx < vl; idx++) { + for(size_t idx = vstart; idx < vl; idx++) { bool mask_active = vm ? 1 : mask_reg[idx]; if(mask_active && vs2_view[idx]) return idx; @@ -1714,14 +1714,14 @@ template void mask_set_op(uint8_t* V, unsigned enc, uint64_t vl, vmask_view mask_reg = read_vmask(V, elem_count); auto fn = get_mask_set_funct(enc); bool marker = false; - for(unsigned idx = vstart; idx < vl; idx++) { + for(size_t idx = vstart; idx < vl; idx++) { bool mask_active = vm ? 1 : mask_reg[idx]; if(mask_active) { vd_view[idx] = fn(marker, vs2_view[idx]); } } // the tail is all elements of the destination register beyond the first one - for(unsigned idx = vl; idx < VLEN; idx++) { + for(size_t idx = vl; idx < VLEN; idx++) { // always tail agnostic // this is a nop, placeholder for vta behavior vd_view[idx] = vd_view[idx]; @@ -1734,7 +1734,7 @@ void viota(uint8_t* V, uint64_t vl, uint64_t vstart, vtype_t vtype, bool vm, uns auto vd_view = get_vreg(V, vd, elem_count); vmask_view mask_reg = read_vmask(V, elem_count); unsigned current = 0; - for(unsigned idx = vstart; idx < std::min(vl, elem_count); idx++) { + for(size_t idx = vstart; idx < std::min(vl, elem_count); idx++) { bool mask_active = vm ? 1 : mask_reg[idx]; if(mask_active) { vd_view[idx] = current; @@ -1748,7 +1748,7 @@ template void vid(uint8_t* V, uint64_t vl, uint64_t elem_count = VLEN * vtype.lmul() / vtype.sew(); auto vd_view = get_vreg(V, vd, elem_count); vmask_view mask_reg = read_vmask(V, elem_count); - for(unsigned idx = vstart; idx < std::min(vl, elem_count); idx++) { + for(size_t idx = vstart; idx < std::min(vl, elem_count); idx++) { bool mask_active = vm ? 
1 : mask_reg[idx]; if(mask_active) { vd_view[idx] = idx; @@ -1761,37 +1761,53 @@ template uint64_t scalar_move(uint8_t* V, v auto vd_view = get_vreg(V, vd, vlmax); if(to_vector) { vd_view[0] = val; - for(unsigned idx = 1; idx < vlmax; idx++) { + for(size_t idx = 1; idx < vlmax; idx++) { vd_view[idx] = vtype.vta() ? vd_view[idx] : vd_view[idx]; } } return static_cast(static_cast>(vd_view[0])); } template -void vector_slide(uint8_t* V, uint64_t vl, uint64_t vstart, vtype_t vtype, bool vm, unsigned vd, unsigned vs2, int64_t imm) { +void vector_slideup(uint8_t* V, uint64_t vl, uint64_t vstart, vtype_t vtype, bool vm, unsigned vd, unsigned vs2, uint64_t imm) { uint64_t elem_count = VLEN * vtype.lmul() / (sizeof(src_elem_t) * 8); vmask_view mask_reg = read_vmask(V, elem_count); auto vs2_view = get_vreg(V, vs2, elem_count); auto vd_view = get_vreg(V, vd, elem_count); - for(unsigned idx = std::max(vstart, imm); idx < vl; idx++) { + for(size_t idx = std::max(vstart, imm); idx < vl; idx++) { bool mask_active = vm ? 1 : mask_reg[idx]; - src_elem_t src_elem = 0; - if(imm >= 0 || (idx - imm < elem_count)) - src_elem = vs2_view[idx - imm]; if(mask_active) { - vd_view[idx] = src_elem; + vd_view[idx] = idx - imm < elem_count ? vs2_view[idx - imm] : 0; } else { vd_view[idx] = vtype.vma() ? vd_view[idx] : vd_view[idx]; } } - for(unsigned idx = vl; idx < elem_count; idx++) { + for(size_t idx = vl; idx < elem_count; idx++) { vd_view[idx] = vtype.vta() ? 
vd_view[idx] : vd_view[idx]; } return; } template -void vector_slide1up(uint8_t* V, uint64_t vl, uint64_t vstart, vtype_t vtype, bool vm, unsigned vd, unsigned vs2, int64_t imm) { - vector_slide(V, vl, vstart, vtype, vm, vd, vs2, 1); +void vector_slidedown(uint8_t* V, uint64_t vl, uint64_t vstart, vtype_t vtype, bool vm, unsigned vd, unsigned vs2, uint64_t imm) { + uint64_t elem_count = VLEN * vtype.lmul() / (sizeof(src_elem_t) * 8); + vmask_view mask_reg = read_vmask(V, elem_count); + auto vs2_view = get_vreg(V, vs2, elem_count); + auto vd_view = get_vreg(V, vd, elem_count); + for(size_t idx = vstart; idx < vl; idx++) { + bool mask_active = vm ? 1 : mask_reg[idx]; + if(mask_active) { + vd_view[idx] = std::numeric_limits<uint64_t>::max() - idx > imm && idx + imm < elem_count ? vs2_view[idx + imm] : 0; + } else { + vd_view[idx] = vtype.vma() ? vd_view[idx] : vd_view[idx]; + } + } + for(size_t idx = vl; idx < elem_count; idx++) { + vd_view[idx] = vtype.vta() ? vd_view[idx] : vd_view[idx]; + } + return; +} +template +void vector_slide1up(uint8_t* V, uint64_t vl, uint64_t vstart, vtype_t vtype, bool vm, unsigned vd, unsigned vs2, uint64_t imm) { + vector_slideup(V, vl, vstart, vtype, vm, vd, vs2, 1); vmask_view mask_reg = read_vmask(V, 1); auto vd_view = get_vreg(V, vd, 1); if(vm || mask_reg[0]) @@ -1800,8 +1816,8 @@ void vector_slide1up(uint8_t* V, uint64_t vl, uint64_t vstart, vtype_t vtype, bo vd_view[0] = vtype.vma() ? 
vd_view[0] : vd_view[0]; } template -void vector_slide1down(uint8_t* V, uint64_t vl, uint64_t vstart, vtype_t vtype, bool vm, unsigned vd, unsigned vs2, int64_t imm) { - vector_slide(V, vl, vstart, vtype, vm, vd, vs2, -1); +void vector_slide1down(uint8_t* V, uint64_t vl, uint64_t vstart, vtype_t vtype, bool vm, unsigned vd, unsigned vs2, uint64_t imm) { + vector_slidedown(V, vl, vstart, vtype, vm, vd, vs2, 1); if(vl > 0) { vmask_view mask_reg = read_vmask(V, vl); auto vd_view = get_vreg(V, vd, vl); @@ -1818,7 +1834,7 @@ void vector_vector_gather(uint8_t* V, uint64_t vl, uint64_t vstart, vtype_t vtyp auto vs1_view = get_vreg(V, vs1, vlmax); auto vs2_view = get_vreg(V, vs2, vlmax); auto vd_view = get_vreg(V, vd, vlmax); - for(unsigned idx = vstart; idx < std::min(vlmax, vl); idx++) { + for(size_t idx = vstart; idx < std::min(vlmax, vl); idx++) { bool mask_active = vm ? 1 : mask_reg[idx]; if(mask_active) { vd_view[idx] = (vs1_view[idx] >= vlmax) ? 0 : vs2_view[vs1_view[idx]]; @@ -1826,7 +1842,7 @@ void vector_vector_gather(uint8_t* V, uint64_t vl, uint64_t vstart, vtype_t vtyp vd_view[idx] = vtype.vma() ? vd_view[idx] : vd_view[idx]; } } - for(unsigned idx = vl; idx < vlmax; idx++) { + for(size_t idx = vl; idx < vlmax; idx++) { vd_view[idx] = vtype.vta() ? vd_view[idx] : vd_view[idx]; } return; @@ -1837,7 +1853,7 @@ void vector_imm_gather(uint8_t* V, uint64_t vl, uint64_t vstart, vtype_t vtype, vmask_view mask_reg = read_vmask(V, vlmax); auto vs2_view = get_vreg(V, vs2, vlmax); auto vd_view = get_vreg(V, vd, vlmax); - for(unsigned idx = vstart; idx < std::min(vlmax, vl); idx++) { + for(size_t idx = vstart; idx < std::min(vlmax, vl); idx++) { bool mask_active = vm ? 1 : mask_reg[idx]; if(mask_active) { vd_view[idx] = (imm >= vlmax) ? 0 : vs2_view[imm]; @@ -1845,7 +1861,7 @@ void vector_imm_gather(uint8_t* V, uint64_t vl, uint64_t vstart, vtype_t vtype, vd_view[idx] = vtype.vma() ? 
vd_view[idx] : vd_view[idx]; } } - for(unsigned idx = vl; idx < vlmax; idx++) { + for(size_t idx = vl; idx < vlmax; idx++) { vd_view[idx] = vtype.vta() ? vd_view[idx] : vd_view[idx]; } return; @@ -1857,13 +1873,13 @@ void vector_compress(uint8_t* V, uint64_t vl, uint64_t vstart, vtype_t vtype, un auto vs2_view = get_vreg(V, vs2, vlmax); auto vd_view = get_vreg(V, vd, vlmax); unsigned current_pos = 0; - for(unsigned idx = vstart; idx < std::min(vlmax, vl); idx++) { + for(size_t idx = vstart; idx < std::min(vlmax, vl); idx++) { if(mask_reg[idx]) { vd_view[current_pos] = vs2_view[idx]; current_pos += 1; } } - for(unsigned idx = current_pos; idx < vlmax; idx++) { + for(size_t idx = current_pos; idx < vlmax; idx++) { vd_view[idx] = vtype.vta() ? vd_view[idx] : vd_view[idx]; } return;