From edba497fa16ae0f56a5a9e043af62d09484c0e33 Mon Sep 17 00:00:00 2001 From: Eyck Jentzsch Date: Wed, 19 Jul 2023 08:19:38 +0200 Subject: [PATCH 1/4] fixes linker issue using whole-archive --- CMakeLists.txt | 8 +++++--- cmake/flink.cmake | 35 +++++++++++++++++++++++++++++++++++ 2 files changed, 40 insertions(+), 3 deletions(-) create mode 100644 cmake/flink.cmake diff --git a/CMakeLists.txt b/CMakeLists.txt index 5ad2123..8559904 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,10 +1,12 @@ cmake_minimum_required(VERSION 3.12) +list(APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake) ############################################################################### # ############################################################################### project(dbt-rise-tgc VERSION 1.0.0) include(GNUInstallDirs) +include(flink) find_package(elfio QUIET) find_package(Boost COMPONENTS coroutine) @@ -82,8 +84,8 @@ if(TARGET jsoncpp::jsoncpp) else() target_link_libraries(${PROJECT_NAME} PUBLIC jsoncpp) endif() -if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU" AND BUILD_SHARED_LIBS) - target_link_libraries(${PROJECT_NAME} PUBLIC -Wl,--whole-archive dbt-rise-core -Wl,--no-whole-archive) +if(BUILD_SHARED_LIBS) + target_force_link_libraries(${PROJECT_NAME} PUBLIC dbt-rise-core) else() target_link_libraries(${PROJECT_NAME} PUBLIC dbt-rise-core) endif() @@ -154,7 +156,7 @@ if(WITH_TCC) target_compile_definitions(${PROJECT_NAME} PRIVATE WITH_TCC) endif() # Links the target exe against the libraries -target_link_libraries(${PROJECT_NAME} PUBLIC dbt-rise-tgc) +target_force_link_libraries(${PROJECT_NAME} PUBLIC dbt-rise-tgc) if(TARGET Boost::program_options) target_link_libraries(${PROJECT_NAME} PUBLIC Boost::program_options) else() diff --git a/cmake/flink.cmake b/cmake/flink.cmake new file mode 100644 index 0000000..69b5c9f --- /dev/null +++ b/cmake/flink.cmake @@ -0,0 +1,35 @@ +# according to https://github.com/horance-liu/flink.cmake/tree/master +# SPDX-License-Identifier: 
Apache-2.0 + +include(CMakeParseArguments) + +function(target_do_force_link_libraries target visibility lib) + if(MSVC) + target_link_libraries(${target} ${visibility} "/WHOLEARCHIVE:${lib}") + elseif(APPLE) + target_link_libraries(${target} ${visibility} -Wl,-force_load ${lib}) + else() + target_link_libraries(${target} ${visibility} -Wl,--whole-archive ${lib} -Wl,--no-whole-archive) + endif() +endfunction() + +function(target_force_link_libraries target) + cmake_parse_arguments(FLINK + "" + "" + "PUBLIC;INTERFACE;PRIVATE" + ${ARGN} + ) + + foreach(lib IN LISTS FLINK_PUBLIC) + target_do_force_link_libraries(${target} PUBLIC ${lib}) + endforeach() + + foreach(lib IN LISTS FLINK_INTERFACE) + target_do_force_link_libraries(${target} INTERFACE ${lib}) + endforeach() + + foreach(lib IN LISTS FLINK_PRIVATE) + target_do_force_link_libraries(${target} PRIVATE ${lib}) + endforeach() +endfunction() \ No newline at end of file From c78026b720d2fa548c502a8b77d5fe9a6534e382 Mon Sep 17 00:00:00 2001 From: Eyck-Alexander Jentzsch Date: Sun, 23 Jul 2023 08:05:15 +0200 Subject: [PATCH 2/4] adds faster instruction decoding --- src/vm/interp/vm_tgc_c.cpp | 93 +++++++++++++++++++++++++++----------- 1 file changed, 66 insertions(+), 27 deletions(-) diff --git a/src/vm/interp/vm_tgc_c.cpp b/src/vm/interp/vm_tgc_c.cpp index c94b716..2525a37 100644 --- a/src/vm/interp/vm_tgc_c.cpp +++ b/src/vm/interp/vm_tgc_c.cpp @@ -153,14 +153,22 @@ private: /**************************************************************************** * start opcode definitions ****************************************************************************/ - struct InstructionDesriptor { + struct instruction_descriptor { size_t length; uint32_t value; uint32_t mask; typename arch::traits::opcode_e op; }; - - const std::array instr_descr = {{ + struct decoding_tree_node{ + std::vector instrs; + std::vector children; + uint32_t submask = std::numeric_limits::max(); + uint32_t value; + decoding_tree_node(uint32_t value) : 
value(value){} + }; + + decoding_tree_node* root {nullptr}; + const std::array instr_descr = {{ /* entries are: size, valid value, valid mask, function ptr */ {32, 0b00000000000000000000000000110111, 0b00000000000000000000000001111111, arch::traits::opcode_e::LUI}, {32, 0b00000000000000000000000000010111, 0b00000000000000000000000001111111, arch::traits::opcode_e::AUIPC}, @@ -251,18 +259,61 @@ private: {16, 0b0000000000000000, 0b1111111111111111, arch::traits::opcode_e::DII}, }}; - //static constexpr typename traits::addr_t upper_bits = ~traits::PGMASK; iss::status fetch_ins(virt_addr_t pc, uint8_t * data){ auto phys_pc = this->core.v2p(pc); - //if ((pc.val & upper_bits) != ((pc.val + 2) & upper_bits)) { // we may cross a page boundary - // if (this->core.read(phys_pc, 2, data) != iss::Ok) return iss::Err; - // if ((data[0] & 0x3) == 0x3) // this is a 32bit instruction - // if (this->core.read(this->core.v2p(pc + 2), 2, data + 2) != iss::Ok) return iss::Err; - //} else { if (this->core.read(phys_pc, 4, data) != iss::Ok) return iss::Err; - //} return iss::Ok; } + void populate_decoding_tree(decoding_tree_node* root){ + //create submask + for(auto instr: root->instrs){ + root->submask &= instr.mask; + } + //put each instr according to submask&encoding into children + for(auto instr: root->instrs){ + bool foundMatch = false; + for(auto child: root->children){ + //use value as identifying trait + if(child->value == (instr.value&root->submask)){ + child->instrs.push_back(instr); + foundMatch = true; + } + } + if(!foundMatch){ + decoding_tree_node* child = new decoding_tree_node(instr.value&root->submask); + child->instrs.push_back(instr); + root->children.push_back(child); + } + } + root->instrs.clear(); + //call populate_decoding_tree for all children + if(root->children.size() >1) + for(auto child: root->children){ + populate_decoding_tree(child); + } + else{ + //sort instrs by value of the mask, this works bc we want to have the least restrictive one last + 
std::sort(root->children[0]->instrs.begin(), root->children[0]->instrs.end(), [](const instruction_descriptor& instr1, const instruction_descriptor& instr2) { + return instr1.mask > instr2.mask; + }); + } + } + typename arch::traits::opcode_e decodeInstr(decoding_tree_node* node, code_word_t word){ + if(!node->children.size()){ + if(node->instrs.size() == 1) return node->instrs[0].op; + for(auto instr : node->instrs){ + if((instr.mask&word) == instr.value) return instr.op; + } + } + else{ + for(auto child : node->children){ + if (child->value == (node->submask&word)){ + return decodeInstr(child, word); + } + } + } + return arch::traits::opcode_e::MAX_OPCODE; + } }; template void debug_fn(CODE_WORD insn) { @@ -290,15 +341,11 @@ template vm_impl::vm_impl(ARCH &core, unsigned core_id, unsigned cluster_id) : vm_base(core, core_id, cluster_id) { unsigned id=0; - for (auto instr : instr_descr) { - auto quadrant = instr.value & 0x3; - qlut[quadrant].push_back(instruction_pattern{instr.value, instr.mask, instr.op}); - } - for(auto& lut: qlut){ - std::sort(std::begin(lut), std::end(lut), [](instruction_pattern const& a, instruction_pattern const& b){ - return bit_count(a.mask) > bit_count(b.mask); - }); + root = new decoding_tree_node(std::numeric_limits::max()); + for(auto instr:instr_descr){ + root->instrs.push_back(instr); } + populate_decoding_tree(root); } inline bool is_count_limit_enabled(finish_cond_e cond){ @@ -309,14 +356,6 @@ inline bool is_jump_to_self_enabled(finish_cond_e cond){ return (cond & finish_cond_e::JUMP_TO_SELF) == finish_cond_e::JUMP_TO_SELF; } -template -typename arch::traits::opcode_e vm_impl::decode_inst_id(code_word_t instr){ - for(auto& e: qlut[instr&0x3]){ - if(!((instr&e.mask) ^ e.value )) return e.id; - } - return arch::traits::opcode_e::MAX_OPCODE; -} - template typename vm_base::virt_addr_t vm_impl::execute_inst(finish_cond_e cond, virt_addr_t start, uint64_t icount_limit){ auto pc=start; @@ -338,7 +377,7 @@ typename vm_base::virt_addr_t 
vm_impl::execute_inst(finish_cond_e co } else { if (is_jump_to_self_enabled(cond) && (instr == 0x0000006f || (instr&0xffff)==0xa001)) throw simulation_stopped(0); // 'J 0' or 'C.J 0' - auto inst_id = decode_inst_id(instr); + auto inst_id = decodeInstr(root, instr); // pre execution stuff this->core.reg.last_branch = 0; if(this->sync_exec && PRE_SYNC) this->do_sync(PRE_SYNC, static_cast(inst_id)); From bd0d15f3a2e8c751d78d61acd18f869ddec3b2d5 Mon Sep 17 00:00:00 2001 From: Eyck-Alexander Jentzsch Date: Sun, 23 Jul 2023 08:10:57 +0200 Subject: [PATCH 3/4] updates template for faster instruction decoding --- gen_input/templates/interp/CORENAME.cpp.gtl | 91 +++++++++++++++------ src/vm/interp/vm_tgc_c.cpp | 2 +- 2 files changed, 66 insertions(+), 27 deletions(-) diff --git a/gen_input/templates/interp/CORENAME.cpp.gtl b/gen_input/templates/interp/CORENAME.cpp.gtl index acf7afd..7a7707e 100644 --- a/gen_input/templates/interp/CORENAME.cpp.gtl +++ b/gen_input/templates/interp/CORENAME.cpp.gtl @@ -159,30 +159,81 @@ private: /**************************************************************************** * start opcode definitions ****************************************************************************/ - struct InstructionDesriptor { + struct instruction_descriptor { size_t length; uint32_t value; uint32_t mask; typename arch::traits::opcode_e op; }; + struct decoding_tree_node{ + std::vector instrs; + std::vector children; + uint32_t submask = std::numeric_limits::max(); + uint32_t value; + decoding_tree_node(uint32_t value) : value(value){} + }; - const std::array instr_descr = {{ + decoding_tree_node* root {nullptr}; + const std::array instr_descr = {{ /* entries are: size, valid value, valid mask, function ptr */<%instructions.each{instr -> %> {${instr.length}, ${instr.encoding}, ${instr.mask}, arch::traits::opcode_e::${instr.instruction.name}},<%}%> }}; - //static constexpr typename traits::addr_t upper_bits = ~traits::PGMASK; iss::status fetch_ins(virt_addr_t pc, 
uint8_t * data){ auto phys_pc = this->core.v2p(pc); - //if ((pc.val & upper_bits) != ((pc.val + 2) & upper_bits)) { // we may cross a page boundary - // if (this->core.read(phys_pc, 2, data) != iss::Ok) return iss::Err; - // if ((data[0] & 0x3) == 0x3) // this is a 32bit instruction - // if (this->core.read(this->core.v2p(pc + 2), 2, data + 2) != iss::Ok) return iss::Err; - //} else { if (this->core.read(phys_pc, 4, data) != iss::Ok) return iss::Err; - //} return iss::Ok; } + void populate_decoding_tree(decoding_tree_node* root){ + //create submask + for(auto instr: root->instrs){ + root->submask &= instr.mask; + } + //put each instr according to submask&encoding into children + for(auto instr: root->instrs){ + bool foundMatch = false; + for(auto child: root->children){ + //use value as identifying trait + if(child->value == (instr.value&root->submask)){ + child->instrs.push_back(instr); + foundMatch = true; + } + } + if(!foundMatch){ + decoding_tree_node* child = new decoding_tree_node(instr.value&root->submask); + child->instrs.push_back(instr); + root->children.push_back(child); + } + } + root->instrs.clear(); + //call populate_decoding_tree for all children + if(root->children.size() >1) + for(auto child: root->children){ + populate_decoding_tree(child); + } + else{ + //sort instrs by value of the mask, this works bc we want to have the least restrictive one last + std::sort(root->children[0]->instrs.begin(), root->children[0]->instrs.end(), [](const instruction_descriptor& instr1, const instruction_descriptor& instr2) { + return instr1.mask > instr2.mask; + }); + } + } + typename arch::traits::opcode_e decodeInstr(decoding_tree_node* node, code_word_t word){ + if(!node->children.size()){ + if(node->instrs.size() == 1) return node->instrs[0].op; + for(auto instr : node->instrs){ + if((instr.mask&word) == instr.value) return instr.op; + } + } + else{ + for(auto child : node->children){ + if (child->value == (node->submask&word)){ + return decodeInstr(child, 
word); + } + } + } + return arch::traits::opcode_e::MAX_OPCODE; + } }; template void debug_fn(CODE_WORD insn) { @@ -210,15 +261,11 @@ template vm_impl::vm_impl(ARCH &core, unsigned core_id, unsigned cluster_id) : vm_base(core, core_id, cluster_id) { unsigned id=0; - for (auto instr : instr_descr) { - auto quadrant = instr.value & 0x3; - qlut[quadrant].push_back(instruction_pattern{instr.value, instr.mask, instr.op}); - } - for(auto& lut: qlut){ - std::sort(std::begin(lut), std::end(lut), [](instruction_pattern const& a, instruction_pattern const& b){ - return bit_count(a.mask) > bit_count(b.mask); - }); + root = new decoding_tree_node(std::numeric_limits::max()); + for(auto instr:instr_descr){ + root->instrs.push_back(instr); } + populate_decoding_tree(root); } inline bool is_count_limit_enabled(finish_cond_e cond){ @@ -229,14 +276,6 @@ inline bool is_jump_to_self_enabled(finish_cond_e cond){ return (cond & finish_cond_e::JUMP_TO_SELF) == finish_cond_e::JUMP_TO_SELF; } -template -typename arch::traits::opcode_e vm_impl::decode_inst_id(code_word_t instr){ - for(auto& e: qlut[instr&0x3]){ - if(!((instr&e.mask) ^ e.value )) return e.id; - } - return arch::traits::opcode_e::MAX_OPCODE; -} - template typename vm_base::virt_addr_t vm_impl::execute_inst(finish_cond_e cond, virt_addr_t start, uint64_t icount_limit){ auto pc=start; @@ -258,7 +297,7 @@ typename vm_base::virt_addr_t vm_impl::execute_inst(finish_cond_e co } else { if (is_jump_to_self_enabled(cond) && (instr == 0x0000006f || (instr&0xffff)==0xa001)) throw simulation_stopped(0); // 'J 0' or 'C.J 0' - auto inst_id = decode_inst_id(instr); + auto inst_id = decodeInstr(root, instr); // pre execution stuff this->core.reg.last_branch = 0; if(this->sync_exec && PRE_SYNC) this->do_sync(PRE_SYNC, static_cast(inst_id)); diff --git a/src/vm/interp/vm_tgc_c.cpp b/src/vm/interp/vm_tgc_c.cpp index 2525a37..d5817ed 100644 --- a/src/vm/interp/vm_tgc_c.cpp +++ b/src/vm/interp/vm_tgc_c.cpp @@ -166,7 +166,7 @@ private: uint32_t 
value; decoding_tree_node(uint32_t value) : value(value){} }; - + decoding_tree_node* root {nullptr}; const std::array instr_descr = {{ /* entries are: size, valid value, valid mask, function ptr */ From 6e52af168bb7984c28e4ed0208aa38720875dde7 Mon Sep 17 00:00:00 2001 From: Eyck-Alexander Jentzsch Date: Sat, 29 Jul 2023 11:42:46 +0200 Subject: [PATCH 4/4] adds faster decoding to tcc and cleans up others --- gen_input/templates/interp/CORENAME.cpp.gtl | 7 +- gen_input/templates/tcc/CORENAME.cpp.gtl | 158 ++++++++-------- src/vm/interp/vm_tgc_c.cpp | 7 +- src/vm/tcc/vm_tgc_c.cpp | 195 ++++++++++---------- 4 files changed, 180 insertions(+), 187 deletions(-) diff --git a/gen_input/templates/interp/CORENAME.cpp.gtl b/gen_input/templates/interp/CORENAME.cpp.gtl index 7a7707e..9585627 100644 --- a/gen_input/templates/interp/CORENAME.cpp.gtl +++ b/gen_input/templates/interp/CORENAME.cpp.gtl @@ -218,7 +218,7 @@ private: }); } } - typename arch::traits::opcode_e decodeInstr(decoding_tree_node* node, code_word_t word){ + typename arch::traits::opcode_e decode_instr(decoding_tree_node* node, code_word_t word){ if(!node->children.size()){ if(node->instrs.size() == 1) return node->instrs[0].op; for(auto instr : node->instrs){ @@ -228,7 +228,7 @@ private: else{ for(auto child : node->children){ if (child->value == (node->submask&word)){ - return decodeInstr(child, word); + return decode_instr(child, word); } } } @@ -260,7 +260,6 @@ constexpr size_t bit_count(uint32_t u) { template vm_impl::vm_impl(ARCH &core, unsigned core_id, unsigned cluster_id) : vm_base(core, core_id, cluster_id) { - unsigned id=0; root = new decoding_tree_node(std::numeric_limits::max()); for(auto instr:instr_descr){ root->instrs.push_back(instr); @@ -297,7 +296,7 @@ typename vm_base::virt_addr_t vm_impl::execute_inst(finish_cond_e co } else { if (is_jump_to_self_enabled(cond) && (instr == 0x0000006f || (instr&0xffff)==0xa001)) throw simulation_stopped(0); // 'J 0' or 'C.J 0' - auto inst_id = 
decodeInstr(root, instr); + auto inst_id = decode_instr(root, instr); // pre execution stuff this->core.reg.last_branch = 0; if(this->sync_exec && PRE_SYNC) this->do_sync(PRE_SYNC, static_cast(inst_id)); diff --git a/gen_input/templates/tcc/CORENAME.cpp.gtl b/gen_input/templates/tcc/CORENAME.cpp.gtl index b7cec56..809ae00 100644 --- a/gen_input/templates/tcc/CORENAME.cpp.gtl +++ b/gen_input/templates/tcc/CORENAME.cpp.gtl @@ -121,57 +121,7 @@ protected: } } - // some compile time constants - // enum { MASK16 = 0b1111110001100011, MASK32 = 0b11111111111100000111000001111111 }; - enum { MASK16 = 0b1111111111111111, MASK32 = 0b11111111111100000111000001111111 }; - enum { EXTR_MASK16 = MASK16 >> 2, EXTR_MASK32 = MASK32 >> 2 }; - enum { LUT_SIZE = 1 << util::bit_count(static_cast(EXTR_MASK32)), LUT_SIZE_C = 1 << util::bit_count(static_cast(EXTR_MASK16)) }; - - std::array lut; - - std::array lut_00, lut_01, lut_10; - std::array lut_11; - - std::array qlut; - - std::array lutmasks = {{EXTR_MASK16, EXTR_MASK16, EXTR_MASK16, EXTR_MASK32}}; - - void expand_bit_mask(int pos, uint32_t mask, uint32_t value, uint32_t valid, uint32_t idx, compile_func lut[], - compile_func f) { - if (pos < 0) { - lut[idx] = f; - } else { - auto bitmask = 1UL << pos; - if ((mask & bitmask) == 0) { - expand_bit_mask(pos - 1, mask, value, valid, idx, lut, f); - } else { - if ((valid & bitmask) == 0) { - expand_bit_mask(pos - 1, mask, value, valid, (idx << 1), lut, f); - expand_bit_mask(pos - 1, mask, value, valid, (idx << 1) + 1, lut, f); - } else { - auto new_val = idx << 1; - if ((value & bitmask) != 0) new_val++; - expand_bit_mask(pos - 1, mask, value, valid, new_val, lut, f); - } - } - } - } - - inline uint32_t extract_fields(uint32_t val) { return extract_fields(29, val >> 2, lutmasks[val & 0x3], 0); } - - uint32_t extract_fields(int pos, uint32_t val, uint32_t mask, uint32_t lut_val) { - if (pos >= 0) { - auto bitmask = 1UL << pos; - if ((mask & bitmask) == 0) { - lut_val = extract_fields(pos - 
1, val, mask, lut_val); - } else { - auto new_val = lut_val << 1; - if ((val & bitmask) != 0) new_val++; - lut_val = extract_fields(pos - 1, val, mask, new_val); - } - } - return lut_val; - } + template::type> inline S sext(U from) { auto mask = (1ULL< instrs; + std::vector children; + uint32_t submask = std::numeric_limits::max(); + uint32_t value; + decoding_tree_node(uint32_t value) : value(value){} + }; - const std::array instr_descr = {{ + decoding_tree_node* root {nullptr}; + + const std::array instr_descr = {{ /* entries are: size, valid value, valid mask, function ptr */<%instructions.each{instr -> %> /* instruction ${instr.instruction.name}, encoding '${instr.encoding}' */ {${instr.length}, ${instr.encoding}, ${instr.mask}, &this_class::__${generator.functionName(instr.name)}},<%}%> @@ -228,11 +187,64 @@ private: vm_impl::gen_trap_check(tu); return BRANCH; } + + //decoding functionality + + void populate_decoding_tree(decoding_tree_node* root){ + //create submask + for(auto instr: root->instrs){ + root->submask &= instr.mask; + } + //put each instr according to submask&encoding into children + for(auto instr: root->instrs){ + bool foundMatch = false; + for(auto child: root->children){ + //use value as identifying trait + if(child->value == (instr.value&root->submask)){ + child->instrs.push_back(instr); + foundMatch = true; + } + } + if(!foundMatch){ + decoding_tree_node* child = new decoding_tree_node(instr.value&root->submask); + child->instrs.push_back(instr); + root->children.push_back(child); + } + } + root->instrs.clear(); + //call populate_decoding_tree for all children + if(root->children.size() >1) + for(auto child: root->children){ + populate_decoding_tree(child); + } + else{ + //sort instrs by value of the mask, this works bc we want to have the least restrictive one last + std::sort(root->children[0]->instrs.begin(), root->children[0]->instrs.end(), [](const instruction_descriptor& instr1, const instruction_descriptor& instr2) { + return 
instr1.mask > instr2.mask; + }); + } + } + compile_func decode_instr(decoding_tree_node* node, code_word_t word){ + if(!node->children.size()){ + if(node->instrs.size() == 1) return node->instrs[0].op; + for(auto instr : node->instrs){ + if((instr.mask&word) == instr.value) return instr.op; + } + } + else{ + for(auto child : node->children){ + if (child->value == (node->submask&word)){ + return decode_instr(child, word); + } + } + } + return nullptr; + } }; -template void debug_fn(CODE_WORD insn) { - volatile CODE_WORD x = insn; - insn = 2 * x; +template void debug_fn(CODE_WORD instr) { + volatile CODE_WORD x = instr; + instr = 2 * x; } template vm_impl::vm_impl() { this(new ARCH()); } @@ -240,14 +252,11 @@ template vm_impl::vm_impl() { this(new ARCH()); } template vm_impl::vm_impl(ARCH &core, unsigned core_id, unsigned cluster_id) : vm_base(core, core_id, cluster_id) { - qlut[0] = lut_00.data(); - qlut[1] = lut_01.data(); - qlut[2] = lut_10.data(); - qlut[3] = lut_11.data(); - for (auto instr : instr_descr) { - auto quantrant = instr.value & 0x3; - expand_bit_mask(29, lutmasks[quantrant], instr.value >> 2, instr.mask >> 2, 0, qlut[quantrant], instr.op); + root = new decoding_tree_node(std::numeric_limits::max()); + for(auto instr:instr_descr){ + root->instrs.push_back(instr); } + populate_decoding_tree(root); } template @@ -255,30 +264,19 @@ std::tuple vm_impl::gen_single_inst_behavior(virt_addr_t &pc, unsigned int &inst_cnt, tu_builder& tu) { // we fetch at max 4 byte, alignment is 2 enum {TRAP_ID=1<<16}; - code_word_t insn = 0; - // const typename traits::addr_t upper_bits = ~traits::PGMASK; + code_word_t instr = 0; phys_addr_t paddr(pc); - auto *const data = (uint8_t *)&insn; paddr = this->core.v2p(pc); -// if ((pc.val & upper_bits) != ((pc.val + 2) & upper_bits)) { // we may cross a page boundary -// auto res = this->core.read(paddr, 2, data); -// if (res != iss::Ok) throw trap_access(TRAP_ID, pc.val); -// if ((insn & 0x3) == 0x3) { // this is a 32bit 
instruction -// res = this->core.read(this->core.v2p(pc + 2), 2, data + 2); -// } -// } else { - auto res = this->core.read(paddr, 4, data); - if (res != iss::Ok) throw trap_access(TRAP_ID, pc.val); -// } - if (insn == 0x0000006f || (insn&0xffff)==0xa001) throw simulation_stopped(0); // 'J 0' or 'C.J 0' + auto res = this->core.read(paddr, 4, reinterpret_cast(&instr)); + if (res != iss::Ok) throw trap_access(TRAP_ID, pc.val); + if (instr == 0x0000006f || (instr&0xffff)==0xa001) throw simulation_stopped(0); // 'J 0' or 'C.J 0' // curr pc on stack ++inst_cnt; - auto lut_val = extract_fields(insn); - auto f = qlut[insn & 0x3][lut_val]; + auto f = decode_instr(root, instr); if (f == nullptr) { f = &this_class::illegal_intruction; } - return (this->*f)(pc, insn, tu); + return (this->*f)(pc, instr, tu); } template void vm_impl::gen_raise_trap(tu_builder& tu, uint16_t trap_id, uint16_t cause) { diff --git a/src/vm/interp/vm_tgc_c.cpp b/src/vm/interp/vm_tgc_c.cpp index d5817ed..55c75d5 100644 --- a/src/vm/interp/vm_tgc_c.cpp +++ b/src/vm/interp/vm_tgc_c.cpp @@ -298,7 +298,7 @@ private: }); } } - typename arch::traits::opcode_e decodeInstr(decoding_tree_node* node, code_word_t word){ + typename arch::traits::opcode_e decode_instr(decoding_tree_node* node, code_word_t word){ if(!node->children.size()){ if(node->instrs.size() == 1) return node->instrs[0].op; for(auto instr : node->instrs){ @@ -308,7 +308,7 @@ private: else{ for(auto child : node->children){ if (child->value == (node->submask&word)){ - return decodeInstr(child, word); + return decode_instr(child, word); } } } @@ -340,7 +340,6 @@ constexpr size_t bit_count(uint32_t u) { template vm_impl::vm_impl(ARCH &core, unsigned core_id, unsigned cluster_id) : vm_base(core, core_id, cluster_id) { - unsigned id=0; root = new decoding_tree_node(std::numeric_limits::max()); for(auto instr:instr_descr){ root->instrs.push_back(instr); @@ -377,7 +376,7 @@ typename vm_base::virt_addr_t vm_impl::execute_inst(finish_cond_e co } else 
{ if (is_jump_to_self_enabled(cond) && (instr == 0x0000006f || (instr&0xffff)==0xa001)) throw simulation_stopped(0); // 'J 0' or 'C.J 0' - auto inst_id = decodeInstr(root, instr); + auto inst_id = decode_instr(root, instr); // pre execution stuff this->core.reg.last_branch = 0; if(this->sync_exec && PRE_SYNC) this->do_sync(PRE_SYNC, static_cast(inst_id)); diff --git a/src/vm/tcc/vm_tgc_c.cpp b/src/vm/tcc/vm_tgc_c.cpp index 495d97e..fde3ec3 100644 --- a/src/vm/tcc/vm_tgc_c.cpp +++ b/src/vm/tcc/vm_tgc_c.cpp @@ -121,57 +121,7 @@ protected: } } - // some compile time constants - // enum { MASK16 = 0b1111110001100011, MASK32 = 0b11111111111100000111000001111111 }; - enum { MASK16 = 0b1111111111111111, MASK32 = 0b11111111111100000111000001111111 }; - enum { EXTR_MASK16 = MASK16 >> 2, EXTR_MASK32 = MASK32 >> 2 }; - enum { LUT_SIZE = 1 << util::bit_count(static_cast(EXTR_MASK32)), LUT_SIZE_C = 1 << util::bit_count(static_cast(EXTR_MASK16)) }; - - std::array lut; - - std::array lut_00, lut_01, lut_10; - std::array lut_11; - - std::array qlut; - - std::array lutmasks = {{EXTR_MASK16, EXTR_MASK16, EXTR_MASK16, EXTR_MASK32}}; - - void expand_bit_mask(int pos, uint32_t mask, uint32_t value, uint32_t valid, uint32_t idx, compile_func lut[], - compile_func f) { - if (pos < 0) { - lut[idx] = f; - } else { - auto bitmask = 1UL << pos; - if ((mask & bitmask) == 0) { - expand_bit_mask(pos - 1, mask, value, valid, idx, lut, f); - } else { - if ((valid & bitmask) == 0) { - expand_bit_mask(pos - 1, mask, value, valid, (idx << 1), lut, f); - expand_bit_mask(pos - 1, mask, value, valid, (idx << 1) + 1, lut, f); - } else { - auto new_val = idx << 1; - if ((value & bitmask) != 0) new_val++; - expand_bit_mask(pos - 1, mask, value, valid, new_val, lut, f); - } - } - } - } - - inline uint32_t extract_fields(uint32_t val) { return extract_fields(29, val >> 2, lutmasks[val & 0x3], 0); } - - uint32_t extract_fields(int pos, uint32_t val, uint32_t mask, uint32_t lut_val) { - if (pos >= 0) { - auto 
bitmask = 1UL << pos; - if ((mask & bitmask) == 0) { - lut_val = extract_fields(pos - 1, val, mask, lut_val); - } else { - auto new_val = lut_val << 1; - if ((val & bitmask) != 0) new_val++; - lut_val = extract_fields(pos - 1, val, mask, new_val); - } - } - return lut_val; - } + template::type> inline S sext(U from) { auto mask = (1ULL< instrs; + std::vector children; + uint32_t submask = std::numeric_limits::max(); + uint32_t value; + decoding_tree_node(uint32_t value) : value(value){} + }; - const std::array instr_descr = {{ + decoding_tree_node* root {nullptr}; + + const std::array instr_descr = {{ /* entries are: size, valid value, valid mask, function ptr */ /* instruction LUI, encoding '0b00000000000000000000000000110111' */ {32, 0b00000000000000000000000000110111, 0b00000000000000000000000001111111, &this_class::__lui}, @@ -494,14 +453,14 @@ private: this->gen_raise_trap(tu, 0, 2); } else{ - auto new_pc = tu.assignment(tu.ext((tu.bitwise_and((tu.add(tu.load(rs1+ traits::X0, 0),tu.constant((int16_t)sext<12>(imm),16))),tu.constant(~ 0x1,8))),32,true),32); + auto new_pc = tu.assignment(tu.ext((tu.bitwise_and((tu.add(tu.load(rs1+ traits::X0, 0),tu.constant((int16_t)sext<12>(imm),16))),tu.constant(~0x1,8))),32,true),32); tu.open_if(tu.srem(new_pc,tu.constant(static_cast(traits:: INSTR_ALIGNMENT),32))); this->gen_raise_trap(tu, 0, 0); tu.open_else(); if(rd!= 0) { tu.store(rd + traits::X0,tu.ext((tu.add(tu.ext(cur_pc_val,32,false),tu.constant( 4,8))),32,true)); } - auto PC_val_v = tu.assignment("PC_val", tu.bitwise_and(new_pc,tu.constant(~ 0x1,8)),32); + auto PC_val_v = tu.assignment("PC_val", tu.bitwise_and(new_pc,tu.constant(~0x1,8)),32); tu.store(traits::NEXT_PC, PC_val_v); tu.store(traits::LAST_BRANCH, tu.constant(2U, 2)); tu.close_scope(); @@ -1963,7 +1922,7 @@ private: else{ auto xrd = tu.assignment(tu.read_mem(traits::CSR, csr, 32),32); if(zimm!= 0) { - tu.write_mem(traits::CSR, csr, tu.bitwise_and(xrd,tu.constant(~ ((uint32_t)zimm),32))); + 
tu.write_mem(traits::CSR, csr, tu.bitwise_and(xrd,tu.constant(~((uint32_t)zimm),32))); } if(rd!= 0) { tu.store(rd + traits::X0,xrd); @@ -2024,7 +1983,7 @@ private: this->gen_raise_trap(tu, 0, 2); } else{ - auto res = tu.assignment(tu.mul(tu.ext(tu.load(rs1+ traits::X0, 0),32,false),tu.ext(tu.load(rs2+ traits::X0, 0),32,false)),64); + auto res = tu.assignment(tu.ext((tu.mul(tu.ext(tu.ext(tu.load(rs1+ traits::X0, 0),32,true),64,false),tu.ext(tu.ext(tu.load(rs2+ traits::X0, 0),32,true),64,false))),64,false),64); if(rd!=0) { tu.store(rd + traits::X0,tu.ext(res,32,true)); } @@ -2058,7 +2017,7 @@ private: this->gen_raise_trap(tu, 0, 2); } else{ - auto res = tu.assignment(tu.mul(tu.ext(tu.load(rs1+ traits::X0, 0),32,false),tu.ext(tu.load(rs2+ traits::X0, 0),32,false)),64); + auto res = tu.assignment(tu.ext((tu.mul(tu.ext(tu.ext(tu.load(rs1+ traits::X0, 0),32,true),64,false),tu.ext(tu.ext(tu.load(rs2+ traits::X0, 0),32,true),64,false))),64,false),64); if(rd!=0) { tu.store(rd + traits::X0,tu.ext((tu.lshr(res,tu.constant(static_cast(traits:: XLEN),32))),32,true)); } @@ -2092,7 +2051,7 @@ private: this->gen_raise_trap(tu, 0, 2); } else{ - auto res = tu.assignment(tu.mul(tu.ext(tu.load(rs1+ traits::X0, 0),32,false),tu.load(rs2+ traits::X0, 0)),64); + auto res = tu.assignment(tu.ext((tu.mul(tu.ext(tu.ext(tu.load(rs1+ traits::X0, 0),32,true),64,false),tu.ext(tu.load(rs2+ traits::X0, 0),64,true))),64,false),64); if(rd!=0) { tu.store(rd + traits::X0,tu.ext((tu.lshr(res,tu.constant(static_cast(traits:: XLEN),32))),32,true)); } @@ -2126,7 +2085,7 @@ private: this->gen_raise_trap(tu, 0, 2); } else{ - auto res = tu.assignment(tu.mul(tu.load(rs1+ traits::X0, 0),tu.load(rs2+ traits::X0, 0)),64); + auto res = tu.assignment(tu.ext((tu.mul(tu.ext(tu.load(rs1+ traits::X0, 0),64,true),tu.ext(tu.load(rs2+ traits::X0, 0),64,true))),64,true),64); if(rd!=0) { tu.store(rd + traits::X0,tu.ext((tu.lshr(res,tu.constant(static_cast(traits:: XLEN),32))),32,true)); } @@ -2164,13 +2123,13 @@ private: 
auto divisor = tu.assignment(tu.ext(tu.load(rs2+ traits::X0, 0),32,false),32); if(rd!= 0){ tu.open_if(tu.icmp(ICmpInst::ICMP_NE,divisor,tu.constant( 0,8))); auto MMIN = tu.assignment(tu.constant(((uint32_t)1)<<(static_cast(traits:: XLEN)-1),32),32); - tu.open_if(tu.logical_and(tu.icmp(ICmpInst::ICMP_EQ,tu.load(rs1+ traits::X0, 0),MMIN),tu.icmp(ICmpInst::ICMP_EQ,divisor,tu.constant(- 1,8)))); + tu.open_if(tu.logical_and(tu.icmp(ICmpInst::ICMP_EQ,tu.load(rs1+ traits::X0, 0),MMIN),tu.icmp(ICmpInst::ICMP_EQ,divisor,tu.constant(-1,8)))); tu.store(rd + traits::X0,MMIN); tu.open_else(); tu.store(rd + traits::X0,tu.ext((tu.sdiv(dividend,divisor)),32,true)); tu.close_scope(); tu.open_else(); - tu.store(rd + traits::X0,tu.constant((uint32_t)- 1,32)); + tu.store(rd + traits::X0,tu.constant((uint32_t)-1,32)); tu.close_scope(); } } @@ -2209,7 +2168,7 @@ private: } tu.open_else(); if(rd!=0) { - tu.store(rd + traits::X0,tu.constant((uint32_t)- 1,32)); + tu.store(rd + traits::X0,tu.constant((uint32_t)-1,32)); } tu.close_scope(); } @@ -2244,7 +2203,7 @@ private: else{ tu.open_if(tu.icmp(ICmpInst::ICMP_NE,tu.load(rs2+ traits::X0, 0),tu.constant( 0,8))); auto MMIN = tu.assignment(tu.constant( 1<<(static_cast(traits:: XLEN)-1),8),32); - tu.open_if(tu.logical_and(tu.icmp(ICmpInst::ICMP_EQ,tu.load(rs1+ traits::X0, 0),MMIN),tu.icmp(ICmpInst::ICMP_EQ,tu.ext(tu.load(rs2+ traits::X0, 0),32,false),tu.constant(- 1,8)))); + tu.open_if(tu.logical_and(tu.icmp(ICmpInst::ICMP_EQ,tu.load(rs1+ traits::X0, 0),MMIN),tu.icmp(ICmpInst::ICMP_EQ,tu.ext(tu.load(rs2+ traits::X0, 0),32,false),tu.constant(-1,8)))); if(rd!=0) { tu.store(rd + traits::X0,tu.constant( 0,8)); } @@ -2353,8 +2312,8 @@ private: pc=pc+ 2; gen_set_pc(tu, pc, traits::NEXT_PC); tu.open_scope(); - auto load_address = tu.assignment(tu.ext((tu.add(tu.load(rs1+ 8+ traits::X0, 0),tu.constant(uimm,8))),32,true),32); - tu.store(rd+ 8 + traits::X0,tu.ext(tu.ext(tu.read_mem(traits::MEM, load_address, 32),32,false),32,true)); + auto offs = 
tu.assignment(tu.ext((tu.add(tu.load(rs1+ 8+ traits::X0, 0),tu.constant(uimm,8))),32,true),32); + tu.store(rd+ 8 + traits::X0,tu.ext(tu.ext(tu.read_mem(traits::MEM, offs, 32),32,false),32,true)); auto returnValue = std::make_tuple(CONT); tu.close_scope(); vm_base::gen_sync(tu, POST_SYNC,58); @@ -2380,8 +2339,8 @@ private: pc=pc+ 2; gen_set_pc(tu, pc, traits::NEXT_PC); tu.open_scope(); - auto load_address = tu.assignment(tu.ext((tu.add(tu.load(rs1+ 8+ traits::X0, 0),tu.constant(uimm,8))),32,true),32); - tu.write_mem(traits::MEM, load_address, tu.ext(tu.load(rs2+ 8+ traits::X0, 0),32,true)); + auto offs = tu.assignment(tu.ext((tu.add(tu.load(rs1+ 8+ traits::X0, 0),tu.constant(uimm,8))),32,true),32); + tu.write_mem(traits::MEM, offs, tu.ext(tu.load(rs2+ 8+ traits::X0, 0),32,true)); auto returnValue = std::make_tuple(CONT); tu.close_scope(); vm_base::gen_sync(tu, POST_SYNC,59); @@ -2898,8 +2857,7 @@ private: } else{ auto offs = tu.assignment(tu.ext((tu.add(tu.load(2+ traits::X0, 0),tu.constant(uimm,8))),32,true),32); - auto res = tu.assignment(tu.ext(tu.read_mem(traits::MEM, offs, 32),32,false),32); - tu.store(rd + traits::X0,tu.ext(res,32,true)); + tu.store(rd + traits::X0,tu.ext(tu.ext(tu.read_mem(traits::MEM, offs, 32),32,false),32,true)); } auto returnValue = std::make_tuple(CONT); tu.close_scope(); @@ -2957,7 +2915,7 @@ private: gen_set_pc(tu, pc, traits::NEXT_PC); tu.open_scope(); if(rs1&&rs1(traits:: RFS)) { - auto PC_val_v = tu.assignment("PC_val", tu.bitwise_and(tu.load(rs1%static_cast(traits:: RFS)+ traits::X0, 0),tu.constant(~ 0x1,8)),32); + auto PC_val_v = tu.assignment("PC_val", tu.bitwise_and(tu.load(rs1%static_cast(traits:: RFS)+ traits::X0, 0),tu.constant(~0x1,8)),32); tu.store(traits::NEXT_PC, PC_val_v); tu.store(traits::LAST_BRANCH, tu.constant(2U, 2)); } @@ -3045,7 +3003,7 @@ private: else{ auto new_pc = tu.assignment(tu.load(rs1+ traits::X0, 0),32); tu.store(1 + traits::X0,tu.ext((tu.add(tu.ext(cur_pc_val,32,false),tu.constant( 2,8))),32,true)); - 
auto PC_val_v = tu.assignment("PC_val", tu.bitwise_and(new_pc,tu.constant(~ 0x1,8)),32); + auto PC_val_v = tu.assignment("PC_val", tu.bitwise_and(new_pc,tu.constant(~0x1,8)),32); tu.store(traits::NEXT_PC, PC_val_v); tu.store(traits::LAST_BRANCH, tu.constant(2U, 2)); } @@ -3138,11 +3096,64 @@ private: vm_impl::gen_trap_check(tu); return BRANCH; } + + //decoding functionality + + void populate_decoding_tree(decoding_tree_node* root){ + //create submask + for(auto instr: root->instrs){ + root->submask &= instr.mask; + } + //put each instr according to submask&encoding into children + for(auto instr: root->instrs){ + bool foundMatch = false; + for(auto child: root->children){ + //use value as identifying trait + if(child->value == (instr.value&root->submask)){ + child->instrs.push_back(instr); + foundMatch = true; + } + } + if(!foundMatch){ + decoding_tree_node* child = new decoding_tree_node(instr.value&root->submask); + child->instrs.push_back(instr); + root->children.push_back(child); + } + } + root->instrs.clear(); + //call populate_decoding_tree for all children + if(root->children.size() >1) + for(auto child: root->children){ + populate_decoding_tree(child); + } + else{ + //sort instrs by value of the mask, this works bc we want to have the least restrictive one last + std::sort(root->children[0]->instrs.begin(), root->children[0]->instrs.end(), [](const instruction_descriptor& instr1, const instruction_descriptor& instr2) { + return instr1.mask > instr2.mask; + }); + } + } + compile_func decode_instr(decoding_tree_node* node, code_word_t word){ + if(!node->children.size()){ + if(node->instrs.size() == 1) return node->instrs[0].op; + for(auto instr : node->instrs){ + if((instr.mask&word) == instr.value) return instr.op; + } + } + else{ + for(auto child : node->children){ + if (child->value == (node->submask&word)){ + return decode_instr(child, word); + } + } + } + return nullptr; + } }; -template void debug_fn(CODE_WORD insn) { - volatile CODE_WORD x = insn; - 
insn = 2 * x; +template void debug_fn(CODE_WORD instr) { + volatile CODE_WORD x = instr; + instr = 2 * x; } template vm_impl::vm_impl() { this(new ARCH()); } @@ -3150,14 +3161,11 @@ template vm_impl::vm_impl() { this(new ARCH()); } template vm_impl::vm_impl(ARCH &core, unsigned core_id, unsigned cluster_id) : vm_base(core, core_id, cluster_id) { - qlut[0] = lut_00.data(); - qlut[1] = lut_01.data(); - qlut[2] = lut_10.data(); - qlut[3] = lut_11.data(); - for (auto instr : instr_descr) { - auto quantrant = instr.value & 0x3; - expand_bit_mask(29, lutmasks[quantrant], instr.value >> 2, instr.mask >> 2, 0, qlut[quantrant], instr.op); + root = new decoding_tree_node(std::numeric_limits::max()); + for(auto instr:instr_descr){ + root->instrs.push_back(instr); } + populate_decoding_tree(root); } template @@ -3165,30 +3173,19 @@ std::tuple vm_impl::gen_single_inst_behavior(virt_addr_t &pc, unsigned int &inst_cnt, tu_builder& tu) { // we fetch at max 4 byte, alignment is 2 enum {TRAP_ID=1<<16}; - code_word_t insn = 0; - // const typename traits::addr_t upper_bits = ~traits::PGMASK; + code_word_t instr = 0; phys_addr_t paddr(pc); - auto *const data = (uint8_t *)&insn; paddr = this->core.v2p(pc); -// if ((pc.val & upper_bits) != ((pc.val + 2) & upper_bits)) { // we may cross a page boundary -// auto res = this->core.read(paddr, 2, data); -// if (res != iss::Ok) throw trap_access(TRAP_ID, pc.val); -// if ((insn & 0x3) == 0x3) { // this is a 32bit instruction -// res = this->core.read(this->core.v2p(pc + 2), 2, data + 2); -// } -// } else { - auto res = this->core.read(paddr, 4, data); - if (res != iss::Ok) throw trap_access(TRAP_ID, pc.val); -// } - if (insn == 0x0000006f || (insn&0xffff)==0xa001) throw simulation_stopped(0); // 'J 0' or 'C.J 0' + auto res = this->core.read(paddr, 4, reinterpret_cast(&instr)); + if (res != iss::Ok) throw trap_access(TRAP_ID, pc.val); + if (instr == 0x0000006f || (instr&0xffff)==0xa001) throw simulation_stopped(0); // 'J 0' or 'C.J 0' // curr pc 
on stack ++inst_cnt; - auto lut_val = extract_fields(insn); - auto f = qlut[insn & 0x3][lut_val]; + auto f = decode_instr(root, instr); if (f == nullptr) { f = &this_class::illegal_intruction; } - return (this->*f)(pc, insn, tu); + return (this->*f)(pc, instr, tu); } template void vm_impl::gen_raise_trap(tu_builder& tu, uint16_t trap_id, uint16_t cause) {