5 Commits

Author SHA1 Message Date
9fcbeb478b adds functionality for all Zvk Instructions 2025-07-01 20:36:46 +02:00
a768bde7f2 adds all arithmetic Zvk extensions 2025-06-30 10:53:48 +02:00
cd866fd74d cleans up agnostic behaviour for softvector 2025-06-30 09:04:46 +02:00
67f364049c adds some message if disass will be in the trace file 2025-05-23 20:28:01 +02:00
047e2e12b0 fixes include issue in LLVM vm_base
vm_base.h needs to be included before gdb_session.h as termios.h (via
boost and gdb_server) has a define which clashes with a variable name in
ConstantRange.h (via iss/llvm/vm_base.h)
2025-05-09 20:14:09 +02:00
7 changed files with 706 additions and 19 deletions

View File

@ -962,6 +962,20 @@ if(vector != null) {%>
throw new std::runtime_error("Unsupported sew bit value");
}
}
// Dispatches a vector unary operation to the element-width-specialized softvector
// implementation, selecting the element type from the 3-bit SEW encoding.
// ${vlen} is substituted by the code generator with the configured VLEN.
void vector_unary_op(uint8_t* V, uint8_t unary_op, uint64_t vl, uint64_t vstart, softvector::vtype_t vtype, bool vm, uint8_t vd, uint8_t vs2, uint8_t sew_val){
    switch(sew_val){
    case 0b000: // SEW = 8
        return softvector::vector_unary_op<${vlen}, uint8_t>(V, unary_op, vl, vstart, vtype, vm, vd, vs2);
    case 0b001: // SEW = 16
        return softvector::vector_unary_op<${vlen}, uint16_t>(V, unary_op, vl, vstart, vtype, vm, vd, vs2);
    case 0b010: // SEW = 32
        return softvector::vector_unary_op<${vlen}, uint32_t>(V, unary_op, vl, vstart, vtype, vm, vd, vs2);
    case 0b011: // SEW = 64
        return softvector::vector_unary_op<${vlen}, uint64_t>(V, unary_op, vl, vstart, vtype, vm, vd, vs2);
    default:
        // NOTE(review): `throw new` raises a *pointer* to the exception (project-wide
        // convention in this file) — confirm catch sites expect a pointer.
        throw new std::runtime_error("Unsupported sew_val");
    }
}
<%}%>
uint64_t fetch_count{0};
uint64_t tval{0};

View File

@ -31,12 +31,14 @@
*******************************************************************************/
// clang-format off
#include <iss/arch/${coreDef.name.toLowerCase()}.h>
// vm_base needs to be included before gdb_session as termios.h (via boost and gdb_server) has a define which clashes with a variable
// name in ConstantRange.h
#include <iss/llvm/vm_base.h>
#include <iss/iss.h>
#include <iss/debugger/gdb_session.h>
#include <iss/debugger/server.h>
#include <iss/iss.h>
#include <iss/llvm/vm_base.h>
#include <util/logging.h>
#include <iss/instruction_decoder.h>
#include <util/logging.h>
<%def fcsr = registers.find {it.name=='FCSR'}
if(fcsr != null) {%>
#include <vm/fp_functions.h><%}%>

View File

@ -267,7 +267,10 @@ template <unsigned int BUSWIDTH> void core_complex<BUSWIDTH>::before_end_of_elab
cpu = new core_wrapper(this);
cpu->create_cpu(GET_PROP_VALUE(core_type), GET_PROP_VALUE(backend), GET_PROP_VALUE(gdb_server_port), GET_PROP_VALUE(mhartid));
sc_assert(cpu->vm != nullptr);
cpu->vm->setDisassEnabled(GET_PROP_VALUE(enable_disass) || trc->m_db != nullptr);
auto disass = GET_PROP_VALUE(enable_disass);
if(disass && trc->m_db)
SCCINFO(SCMOD)<<"Disasssembly will only be in transaction trace database!";
cpu->vm->setDisassEnabled(disass || trc->m_db != nullptr);
if(GET_PROP_VALUE(plugins).length()) {
auto p = util::split(GET_PROP_VALUE(plugins), ';');
for(std::string const& opt_val : p) {

View File

@ -31,10 +31,12 @@
*******************************************************************************/
// clang-format off
#include <iss/arch/tgc5c.h>
// vm_base needs to be included before gdb_session as termios.h (via boost and gdb_server) has a define which clashes with a variable
// name in ConstantRange.h
#include <iss/llvm/vm_base.h>
#include <iss/debugger/gdb_session.h>
#include <iss/debugger/server.h>
#include <iss/iss.h>
#include <iss/llvm/vm_base.h>
#include <util/logging.h>
#include <iss/instruction_decoder.h>

View File

@ -34,6 +34,7 @@
#include "vector_functions.h"
#include "iss/vm_types.h"
#include "vm/aes_sbox.h"
#include <algorithm>
#include <cassert>
#include <cstddef>
@ -98,4 +99,309 @@ vmask_view read_vmask(uint8_t* V, uint16_t VLEN, uint16_t elem_count, uint8_t re
assert(mask_start + elem_count / 8 <= V + VLEN * RFS / 8);
return {mask_start, elem_count};
}
// AES xtime: multiply by 2 in GF(2^8), reducing by the AES polynomial (0x11b -> XOR 27).
uint8_t xt2(uint8_t x) {
    const bool msb_set = (x & 0x80) != 0;
    return static_cast<uint8_t>((x << 1) ^ (msb_set ? 27 : 0));
}
// Multiply by 3 in GF(2^8): x*3 = x ^ x*2, with the xtime step written out inline.
uint8_t xt3(uint8_t x) {
    const uint8_t doubled = static_cast<uint8_t>((x << 1) ^ ((x & 0x80) ? 27 : 0));
    return static_cast<uint8_t>(x ^ doubled);
}
// GF(2^8) multiply of x by a 4-bit multiplier y — sufficient for the AES (Inv)MixColumns
// coefficients (2,3,9,11,13,14). Bits of y above bit 3 are ignored, as in the original.
uint8_t gfmul(uint8_t x, uint8_t y) {
    uint8_t acc = 0;
    uint8_t term = x; // x * 2^bit, maintained via repeated xtime
    for(unsigned bit = 0; bit < 4; ++bit) {
        if((y >> bit) & 1)
            acc ^= term;
        term = static_cast<uint8_t>((term << 1) ^ ((term & 0x80) ? 27 : 0));
    }
    return acc;
}
// Forward MixColumns contribution of a single byte: packs {3*so, so, so, 2*so}
// (most- to least-significant byte) into one 32-bit column.
uint32_t aes_mixcolumn_byte_fwd(uint8_t so) {
    const uint32_t by3 = gfmul(so, 3);
    const uint32_t by2 = gfmul(so, 2);
    const uint32_t by1 = so;
    return (by3 << 24) | (by1 << 16) | (by1 << 8) | by2;
}
// Inverse MixColumns contribution of a single byte: packs {11*so, 13*so, 9*so, 14*so}
// (most- to least-significant byte) into one 32-bit column.
uint32_t aes_mixcolumn_byte_inv(uint8_t so) {
    const uint32_t by11 = gfmul(so, 11);
    const uint32_t by13 = gfmul(so, 13);
    const uint32_t by9 = gfmul(so, 9);
    const uint32_t by14 = gfmul(so, 14);
    return (by11 << 24) | (by13 << 16) | (by9 << 8) | by14;
}
// Forward MixColumns on one 32-bit column: multiply the byte vector by the AES matrix
// [2 3 1 1; 1 2 3 1; 1 1 2 3; 3 1 1 2] over GF(2^8).
uint32_t aes_mixcolumn_fwd(uint32_t x) {
    const uint8_t s0 = static_cast<uint8_t>(x);
    const uint8_t s1 = static_cast<uint8_t>(x >> 8);
    const uint8_t s2 = static_cast<uint8_t>(x >> 16);
    const uint8_t s3 = static_cast<uint8_t>(x >> 24);
    const uint8_t b0 = xt2(s0) ^ xt3(s1) ^ s2 ^ s3;
    const uint8_t b1 = s0 ^ xt2(s1) ^ xt3(s2) ^ s3;
    const uint8_t b2 = s0 ^ s1 ^ xt2(s2) ^ xt3(s3);
    const uint8_t b3 = xt3(s0) ^ s1 ^ s2 ^ xt2(s3);
    return (static_cast<uint32_t>(b3) << 24) | (static_cast<uint32_t>(b2) << 16) | (static_cast<uint32_t>(b1) << 8) | b0;
}
// Inverse MixColumns on one 32-bit column: multiply the byte vector by the AES matrix
// [14 11 13 9; 9 14 11 13; 13 9 14 11; 11 13 9 14] over GF(2^8).
uint32_t aes_mixcolumn_inv(uint32_t x) {
    const uint8_t s0 = static_cast<uint8_t>(x);
    const uint8_t s1 = static_cast<uint8_t>(x >> 8);
    const uint8_t s2 = static_cast<uint8_t>(x >> 16);
    const uint8_t s3 = static_cast<uint8_t>(x >> 24);
    const uint8_t b0 = gfmul(s0, 14) ^ gfmul(s1, 11) ^ gfmul(s2, 13) ^ gfmul(s3, 9);
    const uint8_t b1 = gfmul(s0, 9) ^ gfmul(s1, 14) ^ gfmul(s2, 11) ^ gfmul(s3, 13);
    const uint8_t b2 = gfmul(s0, 13) ^ gfmul(s1, 9) ^ gfmul(s2, 14) ^ gfmul(s3, 11);
    const uint8_t b3 = gfmul(s0, 11) ^ gfmul(s1, 13) ^ gfmul(s2, 9) ^ gfmul(s3, 14);
    return (static_cast<uint32_t>(b3) << 24) | (static_cast<uint32_t>(b2) << 16) | (static_cast<uint32_t>(b1) << 8) | b0;
}
// AES key-schedule round constants (rcon) for rounds 0..9; any other input decodes to 0,
// matching the original switch's fall-through.
uint32_t aes_decode_rcon(uint8_t r) {
    static constexpr uint32_t rcon_table[10] = {1, 2, 4, 8, 16, 32, 64, 128, 27, 54};
    return r < 10 ? rcon_table[r] : 0;
}
// Applies the forward AES S-box to each of the four bytes of x, preserving byte positions.
uint32_t aes_subword_fwd(uint32_t x) {
    uint32_t result = 0;
    for(unsigned shift = 0; shift < 32; shift += 8)
        result |= static_cast<uint32_t>(aes_sbox_fwd(static_cast<uint8_t>(x >> shift))) << shift;
    return result;
}
// Applies the inverse AES S-box to each of the four bytes of x, preserving byte positions.
uint32_t aes_subword_inv(uint32_t x) {
    uint32_t result = 0;
    for(unsigned shift = 0; shift < 32; shift += 8)
        result |= static_cast<uint32_t>(aes_sbox_inv(static_cast<uint8_t>(x >> shift))) << shift;
    return result;
}
// Extracts 32-bit column c (0 = least significant) from a 128-bit AES state.
uint32_t aes_get_column(__uint128_t state, unsigned c) {
    assert(c < 4);
    return static_cast<uint32_t>(state >> (c * 32));
}
// Applies the forward AES S-box to each of the eight bytes of x, preserving byte positions.
uint64_t aes_apply_fwd_sbox_to_each_byte(uint64_t x) {
    uint64_t result = 0;
    for(unsigned shift = 0; shift < 64; shift += 8)
        result |= static_cast<uint64_t>(aes_sbox_fwd(static_cast<uint8_t>(x >> shift))) << shift;
    return result;
}
// Applies the inverse AES S-box to each of the eight bytes of x, preserving byte positions.
uint64_t aes_apply_inv_sbox_to_each_byte(uint64_t x) {
    uint64_t result = 0;
    for(unsigned shift = 0; shift < 64; shift += 8)
        result |= static_cast<uint64_t>(aes_sbox_inv(static_cast<uint8_t>(x >> shift))) << shift;
    return result;
}
// Forward AES ShiftRows for the RV64 scalar-crypto form: rs2:rs1 together hold the
// 128-bit state (presumably rs1 = low doubleword — TODO(review): confirm against the
// RISC-V scalar crypto (Zkn) spec / Sail model) and the result is one doubleword of the
// row-shifted state. Each output byte is picked from a fixed byte position of rs1/rs2.
uint64_t aes_rv64_shiftrows_fwd(uint64_t rs2, uint64_t rs1) {
    return ((uint64_t)bit_sub<24, 31 - 24 + 1>(rs1) << 56) | ((uint64_t)bit_sub<48, 55 - 48 + 1>(rs2) << 48) |
           ((uint64_t)bit_sub<8, 15 - 8 + 1>(rs2) << 40) | ((uint64_t)bit_sub<32, 39 - 32 + 1>(rs1) << 32) |
           ((uint64_t)bit_sub<56, 63 - 56 + 1>(rs2) << 24) | ((uint64_t)bit_sub<16, 23 - 16 + 1>(rs2) << 16) |
           ((uint64_t)bit_sub<40, 47 - 40 + 1>(rs1) << 8) | bit_sub<0, 7 - 0 + 1>(rs1);
}
// Inverse AES ShiftRows for the RV64 scalar-crypto form; companion of
// aes_rv64_shiftrows_fwd with the byte selection reversed. Byte positions mirror the
// Sail reference model — TODO(review): confirm against the Zkn specification.
uint64_t aes_rv64_shiftrows_inv(uint64_t rs2, uint64_t rs1) {
    return ((uint64_t)bit_sub<24, 31 - 24 + 1>(rs2) << 56) | ((uint64_t)bit_sub<48, 55 - 48 + 1>(rs2) << 48) |
           ((uint64_t)bit_sub<8, 15 - 8 + 1>(rs1) << 40) | ((uint64_t)bit_sub<32, 39 - 32 + 1>(rs1) << 32) |
           ((uint64_t)bit_sub<56, 63 - 56 + 1>(rs1) << 24) | ((uint64_t)bit_sub<16, 23 - 16 + 1>(rs2) << 16) |
           ((uint64_t)bit_sub<40, 47 - 40 + 1>(rs2) << 8) | bit_sub<0, 7 - 0 + 1>(rs1);
}
// Forward AES ShiftRows on a full 128-bit state: output column c takes its row-r byte
// from input column (c + r) mod 4 (row r cyclically shifted left by r).
uint128_t aes_shift_rows_fwd(uint128_t x) {
    uint128_t result = 0;
    for(unsigned c = 0; c < 4; ++c) {
        uint32_t col = 0;
        for(unsigned r = 0; r < 4; ++r) {
            const uint32_t src_col = static_cast<uint32_t>(x >> (32 * ((c + r) % 4)));
            col |= ((src_col >> (8 * r)) & 0xFFu) << (8 * r);
        }
        result |= static_cast<uint128_t>(col) << (32 * c);
    }
    return result;
}
// Inverse AES ShiftRows on a full 128-bit state: output column c takes its row-r byte
// from input column (c - r) mod 4 (row r cyclically shifted right by r).
uint128_t aes_shift_rows_inv(uint128_t x) {
    uint128_t result = 0;
    for(unsigned c = 0; c < 4; ++c) {
        uint32_t col = 0;
        for(unsigned r = 0; r < 4; ++r) {
            const uint32_t src_col = static_cast<uint32_t>(x >> (32 * ((c + 4 - r) % 4)));
            col |= ((src_col >> (8 * r)) & 0xFFu) << (8 * r);
        }
        result |= static_cast<uint128_t>(col) << (32 * c);
    }
    return result;
}
// Forward SubBytes over a 128-bit state, processed one 32-bit column at a time.
uint128_t aes_subbytes_fwd(uint128_t x) {
    uint128_t result = 0;
    for(unsigned c = 0; c < 4; ++c)
        result |= static_cast<uint128_t>(aes_subword_fwd(aes_get_column(x, c))) << (32 * c);
    return result;
}
// Inverse SubBytes over a 128-bit state, processed one 32-bit column at a time.
uint128_t aes_subbytes_inv(uint128_t x) {
    uint128_t result = 0;
    for(unsigned c = 0; c < 4; ++c)
        result |= static_cast<uint128_t>(aes_subword_inv(aes_get_column(x, c))) << (32 * c);
    return result;
}
// Forward MixColumns over a 128-bit state, processed one 32-bit column at a time.
uint128_t aes_mixcolumns_fwd(uint128_t x) {
    uint128_t result = 0;
    for(unsigned c = 0; c < 4; ++c)
        result |= static_cast<uint128_t>(aes_mixcolumn_fwd(aes_get_column(x, c))) << (32 * c);
    return result;
}
// Inverse MixColumns over a 128-bit state, processed one 32-bit column at a time.
uint128_t aes_mixcolumns_inv(uint128_t x) {
    uint128_t result = 0;
    for(unsigned c = 0; c < 4; ++c)
        result |= static_cast<uint128_t>(aes_mixcolumn_inv(aes_get_column(x, c))) << (32 * c);
    return result;
}
// AES key-schedule RotWord: cyclic byte rotation {a3,a2,a1,a0} -> {a0,a3,a2,a1},
// i.e. a 32-bit rotate right by 8.
uint32_t aes_rotword(uint32_t x) {
    return (x >> 8) | (x << 24);
}
// Selects the 128-bit element-group operation for the Zvk vector-crypto encodings.
// For VAES.VV/VAES.VS (funct6 0b101000/0b101001) the vs1 field picks the sub-operation.
// The returned functor takes (vd, vs2, vs1-or-immediate) element-group values; operands a
// given instruction does not use are ignored by the corresponding lambda.
// Fix: removed the unused `extract_word` helper lambda that was declared (and never
// called) inside the VAESKF1 case.
// NOTE(review): `throw new std::runtime_error` raises a pointer, matching the convention
// used throughout this file — confirm the catch sites expect a pointer.
std::function<uint128_t(uint128_t, uint128_t, uint128_t)> get_crypto_funct(unsigned funct6, unsigned vs1) {
    switch(funct6) {
    case 0b101000: // VAES.VV
    case 0b101001: // VAES.VS
        switch(vs1) {
        case 0b00000: // VAESDM: AES decryption middle round (with InvMixColumns)
            return [](uint128_t state, uint128_t rkey, uint128_t) {
                uint128_t sr = aes_shift_rows_inv(state);
                uint128_t sb = aes_subbytes_inv(sr);
                uint128_t ark = sb ^ rkey;
                uint128_t mix = aes_mixcolumns_inv(ark);
                return mix;
            };
        case 0b00001: // VAESDF: AES decryption final round (no InvMixColumns)
            return [](uint128_t state, uint128_t rkey, uint128_t) {
                uint128_t sr = aes_shift_rows_inv(state);
                uint128_t sb = aes_subbytes_inv(sr);
                uint128_t ark = sb ^ rkey;
                return ark;
            };
        case 0b00010: // VAESEM: AES encryption middle round (with MixColumns)
            return [](uint128_t state, uint128_t rkey, uint128_t) {
                uint128_t sb = aes_subbytes_fwd(state);
                uint128_t sr = aes_shift_rows_fwd(sb);
                uint128_t mix = aes_mixcolumns_fwd(sr);
                uint128_t ark = mix ^ rkey;
                return ark;
            };
        case 0b00011: // VAESEF: AES encryption final round (no MixColumns)
            return [](uint128_t state, uint128_t rkey, uint128_t) {
                uint128_t sb = aes_subbytes_fwd(state);
                uint128_t sr = aes_shift_rows_fwd(sb);
                uint128_t ark = sr ^ rkey;
                return ark;
            };
        case 0b00111: // VAESZ: AddRoundKey only
            return [](uint128_t state, uint128_t rkey, uint128_t) {
                uint128_t ark = state ^ rkey;
                return ark;
            };
        case 0b10000: // VSM4R: not implemented
            throw new std::runtime_error("Unsupported operation in get_crypto_funct");
        case 0b10001: // VGMUL: GF(2^128) multiply over bit-reversed operands (GHASH-style)
            return [](uint128_t vd, uint128_t vs2, uint128_t) {
                uint128_t Y = brev8<uint128_t>(vd);
                uint128_t H = brev8<uint128_t>(vs2);
                uint128_t Z = 0;
                for(size_t bit = 0; bit < 128; bit++) {
                    if((Y >> bit) & 1)
                        Z ^= H;
                    bool reduce = (H >> 127) & 1;
                    H = H << 1;
                    if(reduce)
                        H ^= 0x87; // GHASH reduction constant in this bit order
                }
                uint128_t result = brev8<uint128_t>(Z);
                return result;
            };
        default:
            throw new std::runtime_error("Unsupported operation in get_crypto_funct");
        }
    case 0b100000: // VSM3ME: not implemented
    case 0b100001: // VSM4K: not implemented
        throw new std::runtime_error("Unsupported operation in get_crypto_funct");
    case 0b100010: // VAESKF1: AES-128 forward key-schedule round; r carries the round number
        return [](uint128_t vd, uint128_t vs2, uint128_t r) {
            // NOTE(review): the Zvkned spec legalizes out-of-range round numbers
            // (r == 0 or r > 10); no legalization is visible here — confirm the
            // decoder performs it upstream.
            uint32_t k0 = (vs2 >> 32 * 0) & std::numeric_limits<uint32_t>::max();
            uint32_t k1 = (vs2 >> 32 * 1) & std::numeric_limits<uint32_t>::max();
            uint32_t k2 = (vs2 >> 32 * 2) & std::numeric_limits<uint32_t>::max();
            uint32_t k3 = (vs2 >> 32 * 3) & std::numeric_limits<uint32_t>::max();
            uint32_t w0 = aes_subword_fwd(aes_rotword(k3)) ^ aes_decode_rcon(r) ^ k0;
            uint32_t w1 = w0 ^ k1;
            uint32_t w2 = w1 ^ k2;
            uint32_t w3 = w2 ^ k3;
            uint128_t result = (uint128_t(w3) << 96) | (uint128_t(w2) << 64) | (uint128_t(w1) << 32) | (uint128_t(w0));
            return result;
        };
    case 0b101010: // VAESKF2: AES-256 forward key-schedule round; vd holds round-key part
        return [](uint128_t vd, uint128_t vs2, uint128_t r) {
            uint32_t k0 = (vs2 >> 32 * 0) & std::numeric_limits<uint32_t>::max();
            uint32_t k1 = (vs2 >> 32 * 1) & std::numeric_limits<uint32_t>::max();
            uint32_t k2 = (vs2 >> 32 * 2) & std::numeric_limits<uint32_t>::max();
            uint32_t k3 = (vs2 >> 32 * 3) & std::numeric_limits<uint32_t>::max();
            uint32_t rkb0 = (vd >> 32 * 0) & std::numeric_limits<uint32_t>::max();
            uint32_t rkb1 = (vd >> 32 * 1) & std::numeric_limits<uint32_t>::max();
            uint32_t rkb2 = (vd >> 32 * 2) & std::numeric_limits<uint32_t>::max();
            uint32_t rkb3 = (vd >> 32 * 3) & std::numeric_limits<uint32_t>::max();
            // odd rounds use SubWord only, even rounds use the full RotWord/rcon path
            uint32_t w0 = r & 1 ? aes_subword_fwd(k3) ^ rkb0 : aes_subword_fwd(aes_rotword(k3)) ^ aes_decode_rcon((r >> 1) - 1) ^ rkb0;
            uint32_t w1 = w0 ^ rkb1;
            uint32_t w2 = w1 ^ rkb2;
            uint32_t w3 = w2 ^ rkb3;
            uint128_t result = (uint128_t(w3) << 96) | (uint128_t(w2) << 64) | (uint128_t(w1) << 32) | (uint128_t(w0));
            return result;
        };
    case 0b101011: // VSM3C: not implemented
        throw new std::runtime_error("Unsupported operation in get_crypto_funct");
    case 0b101100: // VGHSH: GHASH add-multiply step, Z = (Y ^ X) * H in GF(2^128)
        return [](uint128_t Y, uint128_t vs2, uint128_t X) {
            auto H = brev8<uint128_t>(vs2);
            uint128_t Z = 0;
            uint128_t S = brev8<uint128_t>(Y ^ X);
            for(size_t bit = 0; bit < 128; bit++) {
                if((S >> bit) & 1)
                    Z ^= H;
                bool reduce = (H >> 127) & 1;
                H = H << 1;
                if(reduce)
                    H ^= 0x87; // GHASH reduction constant in this bit order
            }
            uint128_t result = brev8<uint128_t>(Z);
            return result;
        };
    case 0b101101: // VSHA2MS: handled by the word-granular overload, not here
    case 0b101110: // VSHA2CH
    case 0b101111: // VSHA2CL
    default:
        throw new std::runtime_error("Unknown funct6 in get_crypto_funct");
    }
}
} // namespace softvector

View File

@ -41,6 +41,10 @@
#include <functional>
#include <stdint.h>
namespace softvector {
#ifndef _MSC_VER
using int128_t = __int128;
using uint128_t = unsigned __int128;
#endif
const unsigned RFS = 32;
struct vtype_t {
@ -70,9 +74,14 @@ struct vmask_view {
};
vmask_view read_vmask(uint8_t* V, uint16_t VLEN, uint16_t elem_count, uint8_t reg_idx = 0);
template <unsigned VLEN> vmask_view read_vmask(uint8_t* V, uint16_t elem_count, uint8_t reg_idx = 0);
std::function<uint128_t(uint128_t, uint128_t, uint128_t)> get_crypto_funct(unsigned funct6, unsigned vs1);
template <typename dest_elem_t, typename src_elem_t = dest_elem_t> dest_elem_t brev(src_elem_t vs2);
template <typename dest_elem_t, typename src_elem_t = dest_elem_t> dest_elem_t brev8(src_elem_t vs2);
bool softvec_read(void* core, uint64_t addr, uint64_t length, uint8_t* data);
bool softvec_write(void* core, uint64_t addr, uint64_t length, uint8_t* data);
template <unsigned VLEN, typename eew_t>
uint64_t vector_load_store(void* core, std::function<bool(void*, uint64_t, uint64_t, uint8_t*)> load_store_fn, uint8_t* V, uint64_t vl,
uint64_t vstart, vtype_t vtype, bool vm, uint8_t vd, uint64_t rs1, uint8_t segment_size, int64_t stride = 0,
@ -167,6 +176,17 @@ void mask_fp_vector_vector_op(uint8_t* V, unsigned funct6, uint64_t vl, uint64_t
template <unsigned VLEN, typename elem_t>
void mask_fp_vector_imm_op(uint8_t* V, unsigned funct6, uint64_t vl, uint64_t vstart, vtype_t vtype, bool vm, unsigned vd, unsigned vs2,
elem_t imm, uint8_t rm);
template <unsigned VLEN, unsigned EGS>
void vector_vector_crypto(uint8_t* V, unsigned funct6, uint64_t eg_len, uint64_t eg_start, vtype_t vtype, unsigned vd, unsigned vs2,
unsigned vs1);
template <unsigned VLEN, unsigned EGS>
void vector_scalar_crypto(uint8_t* V, unsigned funct6, uint64_t eg_len, uint64_t eg_start, vtype_t vtype, unsigned vd, unsigned vs2,
unsigned vs1);
template <unsigned VLEN, unsigned EGS>
void vector_imm_crypto(uint8_t* V, unsigned funct6, uint64_t eg_len, uint64_t eg_start, vtype_t vtype, unsigned vd, unsigned vs2,
uint8_t imm);
template <unsigned VLEN, unsigned EGS, typename elem_type_t>
void vector_crypto(uint8_t* V, unsigned funct6, uint64_t eg_len, uint64_t eg_start, vtype_t vtype, unsigned vd, unsigned vs2, unsigned vs1);
} // namespace softvector
#include "vm/vector_functions.hpp"
#endif /* _VM_VECTOR_FUNCTIONS_H_ */

View File

@ -32,6 +32,7 @@
// alex@minres.com - initial API and implementation
////////////////////////////////////////////////////////////////////////////////
#pragma once
#include "iss/interp/vm_base.h"
extern "C" {
#include <softfloat.h>
}
@ -55,6 +56,14 @@ extern "C" {
#ifdef __SIZEOF_INT128__
template <> struct std::make_signed<__uint128_t> { using type = __int128_t; };
template <> struct std::make_signed<__int128_t> { using type = __int128_t; };
// Helper struct that makes twice<T> legal to *name* for 128-bit element types while making
// every runtime use a hard error: generic widening code can instantiate
// twice<__uint128_t>::type, but actually constructing or converting the value throws.
struct poison128_t {
    // default construction is poisoned
    poison128_t() { throw std::runtime_error("Attempt to use twice<__uint128_t>::type at runtime"); }
    // copying is poisoned
    poison128_t(const poison128_t&) { throw std::runtime_error("Copy of poison128_t is not allowed"); }
    // catch-all converting constructor: construction from any other type is poisoned too
    template <typename U> poison128_t(U) { throw std::runtime_error("Conversion to poison128_t is not allowed"); }
    // conversion back to __uint128_t is poisoned
    operator __uint128_t() const { throw std::runtime_error("Use of poison128_t as __uint128_t is not allowed"); }
};
// make_signed must exist so generic code forming std::make_signed_t<twice_t<T>> still compiles
template <> struct std::make_signed<poison128_t> { using type = poison128_t; };
#endif
namespace softvector {
@ -108,6 +117,10 @@ template <> struct twice<uint32_t> { using type = uint64_t; };
#ifdef __SIZEOF_INT128__
template <> struct twice<int64_t> { using type = __int128_t; };
template <> struct twice<uint64_t> { using type = __uint128_t; };
template <> struct twice<__uint128_t> { using type = poison128_t; };
template <> struct twice<__int128_t> { using type = poison128_t; };
#endif
template <class T> using twice_t = typename twice<T>::type; // for convenience
template <typename TO, typename FROM> constexpr TO sext(FROM val) {
@ -180,6 +193,8 @@ std::function<dest_elem_t(dest_elem_t, src2_elem_t, src1_elem_t)> get_funct(unsi
switch(funct6) {
case 0b000000: // VADD
return [](dest_elem_t vd, src2_elem_t vs2, src1_elem_t vs1) { return vs2 + vs1; };
case 0b000001: // VANDN
return [](dest_elem_t vd, src2_elem_t vs2, src1_elem_t vs1) { return vs2 & ~vs1; };
case 0b000010: // VSUB
return [](dest_elem_t vd, src2_elem_t vs2, src1_elem_t vs1) { return vs2 - vs1; };
case 0b000011: // VRSUB
@ -205,6 +220,22 @@ std::function<dest_elem_t(dest_elem_t, src2_elem_t, src1_elem_t)> get_funct(unsi
return static_cast<std::make_signed_t<dest_elem_t>>(static_cast<std::make_signed_t<src2_elem_t>>(vs2) -
static_cast<std::make_signed_t<src1_elem_t>>(vs1));
};
case 0b010100: // VROR
return [](dest_elem_t vd, src2_elem_t vs2, src1_elem_t vs1) {
constexpr dest_elem_t bits = sizeof(src2_elem_t) * 8;
auto shamt = vs1 & shift_mask<src1_elem_t>();
return (vs2 >> shamt) | (vs2 << (bits - shamt));
};
case 0b010101: { // VROL
if(funct3 == OPIVI)
return get_funct<dest_elem_t>(0b010100, funct3);
else
return [](dest_elem_t vd, src2_elem_t vs2, src1_elem_t vs1) {
constexpr dest_elem_t bits = sizeof(src2_elem_t) * 8;
auto shamt = vs1 & shift_mask<src1_elem_t>();
return (vs2 << shamt) | (vs2 >> (bits - shamt));
};
}
case 0b100101: // VSLL
return [](dest_elem_t vd, src2_elem_t vs2, src1_elem_t vs1) { return vs2 << (vs1 & shift_mask<src2_elem_t>()); };
case 0b101000: // VSRL
@ -219,6 +250,10 @@ std::function<dest_elem_t(dest_elem_t, src2_elem_t, src1_elem_t)> get_funct(unsi
return [](dest_elem_t vd, src2_elem_t vs2, src1_elem_t vs1) {
return static_cast<std::make_signed_t<src2_elem_t>>(vs2) >> (vs1 & shift_mask<src2_elem_t>());
};
case 0b110101: // VWSLL
return [](dest_elem_t vd, src2_elem_t vs2, src1_elem_t vs1) {
return static_cast<dest_elem_t>(vs2) << (vs1 & (shift_mask<dest_elem_t>()));
};
default:
throw new std::runtime_error("Unknown funct6 in get_funct");
}
@ -328,6 +363,24 @@ std::function<dest_elem_t(dest_elem_t, src2_elem_t, src1_elem_t)> get_funct(unsi
return [](dest_elem_t vd, src2_elem_t vs2, src1_elem_t vs1) {
return sext<dest_elem_t>(vs1) * static_cast<dest_elem_t>(vs2) + vd;
};
case 0b001100: // VCLMUL
return [](dest_elem_t vd, src2_elem_t vs2, src1_elem_t vs1) {
dest_elem_t output = 0;
for(size_t i = 0; i <= sizeof(dest_elem_t) * 8 - 1; i++) {
if((vs2 >> i) & 1)
output = output ^ (vs1 << i);
}
return output;
};
case 0b001101: // VCLMULH
return [](dest_elem_t vd, src2_elem_t vs2, src1_elem_t vs1) {
dest_elem_t output = 0;
for(size_t i = 1; i < sizeof(dest_elem_t) * 8; i++) {
if((vs2 >> i) & 1)
output = output ^ (vs1 >> (sizeof(dest_elem_t) * 8 - i));
}
return output;
};
default:
throw new std::runtime_error("Unknown funct6 in get_funct");
}
@ -364,11 +417,10 @@ void vector_imm_op(uint8_t* V, unsigned funct6, unsigned funct3, uint64_t vl, ui
auto fn = get_funct<dest_elem_t, src2_elem_t, src1_elem_t>(funct6, funct3);
for(size_t idx = vstart; idx < vl; idx++) {
bool mask_active = vm ? 1 : mask_reg[idx];
if(mask_active) {
if(mask_active)
vd_view[idx] = fn(vd_view[idx], vs2_view[idx], imm);
} else {
vd_view[idx] = vtype.vma() ? vd_view[idx] : vd_view[idx];
}
else if(vtype.vma())
vd_view[idx] = agnostic_behavior(vd_view[idx]);
}
if(vtype.vta())
for(size_t idx = vl; idx < vlmax; idx++)
@ -535,6 +587,79 @@ std::function<dest_elem_t(src2_elem_t)> get_unary_fn(unsigned unary_op) {
case 0b00100: // VZEXT.VF4
case 0b00010: // VZEXT.VF8
return [](src2_elem_t vs2) { return vs2; };
case 0b01000: // VBREV8
return [](src2_elem_t vs2) { return brev8<dest_elem_t>(vs2); };
case 0b01001: // VREV8
return [](src2_elem_t vs2) {
constexpr unsigned byte_count = sizeof(src2_elem_t);
dest_elem_t result = 0;
for(size_t i = 0; i < byte_count; ++i) {
result <<= 8;
result |= (vs2 & 0xFF);
vs2 >>= 8;
}
return result;
};
case 0b01010: // VBREV
return [](src2_elem_t vs2) { return brev<dest_elem_t>(vs2); };
case 0b01100: // VCLZ
return [](src2_elem_t vs2) {
if(std::is_same_v<src2_elem_t, unsigned int>)
return static_cast<dest_elem_t>(__builtin_clz(vs2));
else if(std::is_same_v<src2_elem_t, unsigned long>)
return static_cast<dest_elem_t>(__builtin_clzl(vs2));
else if(std::is_same_v<src2_elem_t, unsigned long long>)
return static_cast<dest_elem_t>(__builtin_clzll(vs2));
else {
constexpr dest_elem_t bits = sizeof(src2_elem_t) * 8;
if(vs2 == 0)
return bits;
dest_elem_t count = 0;
for(size_t i = bits - 1; i >= 0; --i) {
if((vs2 >> i) & 1)
break;
++count;
}
return count;
}
};
case 0b01101: // VCTZ
return [](src2_elem_t vs2) {
if(std::is_same_v<src2_elem_t, unsigned int>)
return static_cast<dest_elem_t>(__builtin_ctz(vs2));
else if(std::is_same_v<src2_elem_t, unsigned long>)
return static_cast<dest_elem_t>(__builtin_ctzl(vs2));
else if(std::is_same_v<src2_elem_t, unsigned long long>)
return static_cast<dest_elem_t>(__builtin_ctzll(vs2));
else {
constexpr dest_elem_t bits = sizeof(src2_elem_t) * 8;
if(vs2 == 0)
return bits;
dest_elem_t count = 0;
while((vs2 & 1) == 0) {
++count;
vs2 >>= 1;
}
return count;
}
};
case 0b01110: // VCPOP
return [](src2_elem_t vs2) {
if(std::is_same_v<src2_elem_t, unsigned int>)
return static_cast<dest_elem_t>(__builtin_popcount(vs2));
else if(std::is_same_v<src2_elem_t, unsigned long>)
return static_cast<dest_elem_t>(__builtin_popcountl(vs2));
else if(std::is_same_v<src2_elem_t, unsigned long long>)
return static_cast<dest_elem_t>(__builtin_popcountll(vs2));
else {
dest_elem_t count = 0;
while(vs2) {
count += vs2 & 1;
vs2 >>= 1;
}
return count;
}
};
default:
throw new std::runtime_error("Unknown funct in get_unary_fn");
}
@ -550,8 +675,8 @@ void vector_unary_op(uint8_t* V, unsigned unary_op, uint64_t vl, uint64_t vstart
bool mask_active = vm ? 1 : mask_reg[idx];
if(mask_active)
vd_view[idx] = fn(vs2_view[idx]);
else
vd_view[idx] = vtype.vma() ? vd_view[idx] : vd_view[idx];
else if(vtype.vma())
vd_view[idx] = agnostic_behavior(vd_view[idx]);
}
if(vtype.vta())
for(size_t idx = vl; idx < vlmax; idx++)
@ -774,11 +899,10 @@ bool sat_vector_vector_op(uint8_t* V, unsigned funct6, unsigned funct3, uint64_t
auto fn = get_sat_funct<dest_elem_t, src2_elem_t, src1_elem_t>(funct6, funct3);
for(size_t idx = vstart; idx < vl; idx++) {
bool mask_active = vm ? 1 : mask_reg[idx];
if(mask_active) {
if(mask_active)
saturated |= fn(vxrm, vtype, vd_view[idx], vs2_view[idx], vs1_view[idx]);
} else {
vd_view[idx] = vtype.vma() ? vd_view[idx] : vd_view[idx];
}
else if(vtype.vma())
vd_view[idx] = agnostic_behavior(vd_view[idx]);
}
if(vtype.vta())
for(size_t idx = vl; idx < vlmax; idx++) {
@ -797,11 +921,10 @@ bool sat_vector_imm_op(uint8_t* V, unsigned funct6, unsigned funct3, uint64_t vl
auto fn = get_sat_funct<dest_elem_t, src2_elem_t, src1_elem_t>(funct6, funct3);
for(size_t idx = vstart; idx < vl; idx++) {
bool mask_active = vm ? 1 : mask_reg[idx];
if(mask_active) {
if(mask_active)
saturated |= fn(vxrm, vtype, vd_view[idx], vs2_view[idx], imm);
} else {
vd_view[idx] = vtype.vma() ? vd_view[idx] : vd_view[idx];
}
else if(vtype.vma())
vd_view[idx] = agnostic_behavior(vd_view[idx]);
}
if(vtype.vta())
for(size_t idx = vl; idx < vlmax; idx++) {
@ -1939,4 +2062,221 @@ template <unsigned VLEN> void vector_whole_move(uint8_t* V, unsigned vd, unsigne
memcpy(vd_view.start, vs2_view.start, VLEN / 8 * count);
}
// Vector-vector form of the 128-bit element-group crypto ops: applies the
// funct6/vs1-selected functor to each element group in [eg_start, eg_len), reading one
// group each from vd, vs2 and vs1. Tail groups are rewritten via agnostic_behavior when
// vtype.vta() is set.
template <unsigned VLEN, unsigned EGS>
void vector_vector_crypto(uint8_t* V, unsigned funct6, uint64_t eg_len, uint64_t eg_start, vtype_t vtype, unsigned vd, unsigned vs2,
                          unsigned vs1) {
    // total element groups in the register group: VLEN*LMUL / (SEW*EGS)
    uint64_t vlmax = VLEN * vtype.lmul() / (vtype.sew() * EGS);
    auto vs1_view = get_vreg<VLEN, uint128_t>(V, vs1, vlmax);
    auto vs2_view = get_vreg<VLEN, uint128_t>(V, vs2, vlmax);
    auto vd_view = get_vreg<VLEN, uint128_t>(V, vd, vlmax);
    auto fn = get_crypto_funct(funct6, vs1);
    for(size_t idx = eg_start; idx < eg_len; idx++) {
        vd_view[idx] = fn(vd_view[idx], vs2_view[idx], vs1_view[idx]);
    }
    if(vtype.vta())
        for(size_t idx = eg_len; idx < vlmax; idx++)
            vd_view[idx] = agnostic_behavior(vd_view[idx]);
}
// Vector-scalar (.VS) form: element group 0 of vs2 is broadcast as the second operand to
// every destination group. The third functor argument is passed as -1 (all-ones filler);
// the VAES/VGMUL functors ignore it.
template <unsigned VLEN, unsigned EGS>
void vector_scalar_crypto(uint8_t* V, unsigned funct6, uint64_t eg_len, uint64_t eg_start, vtype_t vtype, unsigned vd, unsigned vs2,
                          unsigned vs1) {
    // total element groups in the register group: VLEN*LMUL / (SEW*EGS)
    uint64_t vlmax = VLEN * vtype.lmul() / (vtype.sew() * EGS);
    auto vs2_val = get_vreg<VLEN, uint128_t>(V, vs2, vlmax)[0]; // the broadcast scalar group
    auto vd_view = get_vreg<VLEN, uint128_t>(V, vd, vlmax);
    auto fn = get_crypto_funct(funct6, vs1);
    for(size_t idx = eg_start; idx < eg_len; idx++) {
        vd_view[idx] = fn(vd_view[idx], vs2_val, -1);
    }
    if(vtype.vta())
        for(size_t idx = eg_len; idx < vlmax; idx++)
            vd_view[idx] = agnostic_behavior(vd_view[idx]);
}
// Immediate form (VAESKF1/VAESKF2): the 5-bit immediate (round number) is passed as the
// third functor argument. The vs1 selector is irrelevant for these funct6 encodings, so
// -1 is handed to get_crypto_funct.
template <unsigned VLEN, unsigned EGS>
void vector_imm_crypto(uint8_t* V, unsigned funct6, uint64_t eg_len, uint64_t eg_start, vtype_t vtype, unsigned vd, unsigned vs2,
                       uint8_t imm) {
    // total element groups in the register group: VLEN*LMUL / (SEW*EGS)
    uint64_t vlmax = VLEN * vtype.lmul() / (vtype.sew() * EGS);
    auto vs2_view = get_vreg<VLEN, uint128_t>(V, vs2, vlmax);
    auto vd_view = get_vreg<VLEN, uint128_t>(V, vd, vlmax);
    auto fn = get_crypto_funct(funct6, -1);
    for(size_t idx = eg_start; idx < eg_len; idx++) {
        vd_view[idx] = fn(vd_view[idx], vs2_view[idx], imm);
    }
    if(vtype.vta())
        for(size_t idx = eg_len; idx < vlmax; idx++)
            vd_view[idx] = agnostic_behavior(vd_view[idx]);
}
// Rotate x right by n bit positions; n must be smaller than the bit width of T.
// Fix: the previous implementation evaluated `x << (width - n)` unconditionally, which
// for n == 0 shifts by the full type width — undefined behavior in C++. n == 0 is now
// returned directly. Behavior for n in [1, width) is unchanged.
template <typename T> T rotr(T x, unsigned n) {
    constexpr unsigned bits = sizeof(T) * 8;
    assert(n < bits);
    if(n == 0)
        return x;
    return static_cast<T>((x >> n) | (x << (bits - n)));
}
// Logical right shift by `amount` bits; `amount` must be smaller than the bit width of T.
template <typename T> T shr(T value, unsigned amount) {
    assert(amount < sizeof(T) * 8);
    return value >> amount;
}
// SHA-2 primitive functions (FIPS 180-4, section 4.1). The uint32_t specializations are
// the SHA-256 variants (Sigma/sigma with 32-bit rotation distances), the uint64_t ones
// the SHA-512 variants. Rotation/shift amounts below match the standard.
template <typename T> T sum0(T); // Σ0
template <> inline uint32_t sum0(uint32_t x) { return rotr(x, 2) ^ rotr(x, 13) ^ rotr(x, 22); }
template <> inline uint64_t sum0(uint64_t x) { return rotr(x, 28) ^ rotr(x, 34) ^ rotr(x, 39); }
template <typename T> T sum1(T); // Σ1
template <> inline uint32_t sum1(uint32_t x) { return rotr(x, 6) ^ rotr(x, 11) ^ rotr(x, 25); }
template <> inline uint64_t sum1(uint64_t x) { return rotr(x, 14) ^ rotr(x, 18) ^ rotr(x, 41); }
// Ch(x,y,z): bitwise choose — y where x is set, z where x is clear
template <typename T> T ch(T x, T y, T z) { return ((x & y) ^ ((~x) & z)); }
// Maj(x,y,z): bitwise majority vote
template <typename T> T maj(T x, T y, T z) { return ((x & y) ^ (x & z) ^ (y & z)); }
template <typename T> T sig0(T); // σ0 (message schedule)
template <> inline uint32_t sig0(uint32_t x) { return rotr(x, 7) ^ rotr(x, 18) ^ shr(x, 3); }
template <> inline uint64_t sig0(uint64_t x) { return rotr(x, 1) ^ rotr(x, 8) ^ shr(x, 7); }
template <typename T> T sig1(T); // σ1 (message schedule)
template <> inline uint32_t sig1(uint32_t x) { return rotr(x, 17) ^ rotr(x, 19) ^ shr(x, 10); }
template <> inline uint64_t sig1(uint64_t x) { return rotr(x, 19) ^ rotr(x, 61) ^ shr(x, 6); }
// SHA-2 element-group operations (EGS = 4 words per group): each vreg_view holds four
// working words of one element group, index 0 = least significant. Instantiated with
// uint32_t for SHA-256 and uint64_t for SHA-512 (Zvknh).
template <typename T> std::function<void(vreg_view<T>&, vreg_view<T>&, vreg_view<T>&)> get_crypto_funct(unsigned int funct6) {
    switch(funct6) {
    case 0b101110: // VSHA2CH: two compression rounds using the HIGH message words (vs1[2], vs1[3])
        return [](vreg_view<T>& vd_view, vreg_view<T>& vs2_view, vreg_view<T>& vs1_view) {
            // operand layout: vs2 = {f,e,b,a} (idx 0..3), vd = {h,g,d,c}
            T a = vs2_view[3];
            T b = vs2_view[2];
            T c = vd_view[3];
            T d = vd_view[2];
            T e = vs2_view[1];
            T f = vs2_view[0];
            T g = vd_view[1];
            T h = vd_view[0];
            T W0 = vs1_view[2];
            T W1 = vs1_view[3];
            // first round with message word W0
            T T1 = h + sum1(e) + ch(e, f, g) + W0;
            T T2 = sum0(a) + maj(a, b, c);
            h = g;
            g = f;
            f = e;
            e = d + T1;
            d = c;
            c = b;
            b = a;
            a = T1 + T2;
            // second round with message word W1
            T1 = h + sum1(e) + ch(e, f, g) + W1;
            T2 = sum0(a) + maj(a, b, c);
            h = g;
            g = f;
            f = e;
            e = d + T1;
            d = c;
            c = b;
            b = a;
            a = T1 + T2;
            // write back the updated {f,e,b,a} half of the state
            vd_view[0] = f;
            vd_view[1] = e;
            vd_view[2] = b;
            vd_view[3] = a;
        };
    case 0b101111: // VSHA2CL: identical to VSHA2CH but using the LOW message words (vs1[0], vs1[1])
        return [](vreg_view<T>& vd_view, vreg_view<T>& vs2_view, vreg_view<T>& vs1_view) {
            // operand layout: vs2 = {f,e,b,a} (idx 0..3), vd = {h,g,d,c}
            T a = vs2_view[3];
            T b = vs2_view[2];
            T c = vd_view[3];
            T d = vd_view[2];
            T e = vs2_view[1];
            T f = vs2_view[0];
            T g = vd_view[1];
            T h = vd_view[0];
            T W0 = vs1_view[0];
            T W1 = vs1_view[1];
            // first round with message word W0
            T T1 = h + sum1(e) + ch(e, f, g) + W0;
            T T2 = sum0(a) + maj(a, b, c);
            h = g;
            g = f;
            f = e;
            e = d + T1;
            d = c;
            c = b;
            b = a;
            a = T1 + T2;
            // second round with message word W1
            T1 = h + sum1(e) + ch(e, f, g) + W1;
            T2 = sum0(a) + maj(a, b, c);
            h = g;
            g = f;
            f = e;
            e = d + T1;
            d = c;
            c = b;
            b = a;
            a = T1 + T2;
            // write back the updated {f,e,b,a} half of the state
            vd_view[0] = f;
            vd_view[1] = e;
            vd_view[2] = b;
            vd_view[3] = a;
        };
    case 0b101101: // VSHA2MS: message-schedule expansion, produces W16..W19 from W0..W15
        return [](vreg_view<T>& vd_view, vreg_view<T>& vs2_view, vreg_view<T>& vs1_view) {
            T W0 = vd_view[0];
            T W1 = vd_view[1];
            T W2 = vd_view[2];
            T W3 = vd_view[3];
            T W4 = vs2_view[0];
            T W9 = vs2_view[1];
            T W10 = vs2_view[2];
            T W11 = vs2_view[3];
            T W12 = vs1_view[0];
            T W13 = vs1_view[1];
            T W14 = vs1_view[2];
            T W15 = vs1_view[3];
            // W[t] = sig1(W[t-2]) + W[t-7] + sig0(W[t-15]) + W[t-16]  (FIPS 180-4, 6.2.2/6.4.2)
            T W16 = sig1(W14) + W9 + sig0(W1) + W0;
            T W17 = sig1(W15) + W10 + sig0(W2) + W1;
            T W18 = sig1(W16) + W11 + sig0(W3) + W2;
            T W19 = sig1(W17) + W12 + sig0(W4) + W3;
            vd_view[0] = W16;
            vd_view[1] = W17;
            vd_view[2] = W18;
            vd_view[3] = W19;
        };
    default:
        throw new std::runtime_error("Unsupported operation in get_crypto_funct");
    }
}
// Applies a multi-word element-group operation (SHA-2 family, EGS words of elem_type_t
// per group) to element groups [eg_start, eg_len). Views cannot address the middle of a
// register group directly, so group offsets are applied by advancing the views' start
// pointers. Tail elements are rewritten via agnostic_behavior when vtype.vta() is set.
// Fix: the views previously always began at element group 0 while the loop counted from
// eg_start, so a non-zero eg_start (vstart) applied the operation to the wrong groups;
// the initial offset below aligns the views with eg_start (no-op when eg_start == 0).
template <unsigned VLEN, unsigned EGS, typename elem_type_t>
void vector_crypto(uint8_t* V, unsigned funct6, uint64_t eg_len, uint64_t eg_start, vtype_t vtype, unsigned vd, unsigned vs2,
                   unsigned vs1) {
    auto fn = get_crypto_funct<elem_type_t>(funct6);
    auto vd_view = get_vreg<VLEN, elem_type_t>(V, vd, EGS);
    auto vs2_view = get_vreg<VLEN, elem_type_t>(V, vs2, EGS);
    auto vs1_view = get_vreg<VLEN, elem_type_t>(V, vs1, EGS);
    // skip the element groups below eg_start
    vd_view.start += eg_start * EGS * sizeof(elem_type_t);
    vs2_view.start += eg_start * EGS * sizeof(elem_type_t);
    vs1_view.start += eg_start * EGS * sizeof(elem_type_t);
    for(size_t idx = eg_start; idx < eg_len; idx++) {
        fn(vd_view, vs2_view, vs1_view);
        // We cannot use views in case EGW < VLEN, as views can only address the start of a register
        vd_view.start += EGS * sizeof(elem_type_t);
        vs2_view.start += EGS * sizeof(elem_type_t);
        vs1_view.start += EGS * sizeof(elem_type_t);
    }
    if(vtype.vta()) {
        // tail handling operates on single elements, not element groups
        uint64_t vlmax = VLEN * vtype.lmul() / (vtype.sew());
        auto vd_view = get_vreg<VLEN, elem_type_t>(V, vd, vlmax);
        for(size_t idx = eg_len * EGS; idx < vlmax; idx++)
            vd_view[idx] = agnostic_behavior(vd_view[idx]);
    }
}
// Reverses the full bit order of vs2 (bit 0 becomes the top bit of the source width).
template <typename dest_elem_t, typename src_elem_t> dest_elem_t brev(src_elem_t vs2) {
    constexpr size_t bit_count = sizeof(src_elem_t) * 8;
    dest_elem_t reversed = 0;
    for(size_t remaining = bit_count; remaining != 0; --remaining) {
        reversed = static_cast<dest_elem_t>((reversed << 1) | (vs2 & 1));
        vs2 = static_cast<src_elem_t>(vs2 >> 1);
    }
    return reversed;
}
// Reverses the bit order within each byte of vs2; byte positions are unchanged.
template <typename dest_elem_t, typename src_elem_t> dest_elem_t brev8(src_elem_t vs2) {
    dest_elem_t result = 0;
    for(size_t byte_idx = 0; byte_idx < sizeof(src_elem_t); ++byte_idx) {
        const unsigned shift = byte_idx * 8;
        const dest_elem_t cur = (vs2 >> shift) & 0xFF;
        dest_elem_t reversed = 0;
        for(unsigned bit = 0; bit < 8; ++bit)
            if((cur >> bit) & 1)
                reversed |= dest_elem_t{1} << (7 - bit);
        result |= reversed << shift;
    }
    return result;
}
} // namespace softvector