Skip to content

Commit

Permalink
Merge pull request #157 from fwsGonzo/shared_execute_segments
Browse files Browse the repository at this point in the history
Make execute segments thread-safe shareable, reference counted
  • Loading branch information
fwsGonzo committed Jun 16, 2024
2 parents 0139c0f + e8e7cb7 commit 34c0c40
Show file tree
Hide file tree
Showing 12 changed files with 122 additions and 37 deletions.
2 changes: 1 addition & 1 deletion lib/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -185,7 +185,7 @@ endif()

if (RISCV_LIBTCC)
if(CMAKE_VERSION VERSION_LESS "3.14.0" OR RISCV_LIBTCC_DISTRO_PACKAGE)
target_link_libraries(riscv PUBLIC libtcc.a)
target_link_libraries(riscv PUBLIC -l:libtcc.a)
set_source_files_properties(libriscv/tr_tcc.cpp PROPERTIES
COMPILE_DEFINITIONS RISCV_LIBTCC_PACKAGE=1)
set(LIBTCC_FROM_PACKAGE TRUE)
Expand Down
8 changes: 4 additions & 4 deletions lib/libriscv/cpu.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,10 @@ namespace riscv
// A default empty execute segment used to enforce that the
// current CPU execute segment is never null.
template <int W>
static DecodedExecuteSegment<W> empty(0, 0, 0, 0);
static std::shared_ptr<DecodedExecuteSegment<W>> empty_shared = std::make_shared<DecodedExecuteSegment<W>>(0, 0, 0, 0);
template <int W>
DecodedExecuteSegment<W>& CPU<W>::empty_execute_segment() {
return empty<W>;
std::shared_ptr<DecodedExecuteSegment<W>>& CPU<W>::empty_execute_segment() {
return empty_shared<W>;
}

// Instructions may be unaligned with C-extension
Expand Down Expand Up @@ -108,7 +108,7 @@ namespace riscv
}

// Find previously decoded execute segment
this->m_exec = &machine().memory.exec_segment_for(pc);
this->m_exec = machine().memory.exec_segment_for(pc).get();
if (LIKELY(!this->m_exec->empty())) {
return {this->m_exec, pc};
}
Expand Down
4 changes: 2 additions & 2 deletions lib/libriscv/cpu.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -128,7 +128,7 @@ namespace riscv
address_t pc;
};
NextExecuteReturn next_execute_segment(address_t pc);
static DecodedExecuteSegment<W>& empty_execute_segment();
static std::shared_ptr<DecodedExecuteSegment<W>>& empty_execute_segment();
bool is_executable(address_t addr) const noexcept;

// Override the function that gets called when the CPU
Expand Down Expand Up @@ -171,7 +171,7 @@ namespace riscv

// The default execute override returns no new execute segment
override_execute_segment_t m_override_exec = [] (auto&) -> DecodedExecuteSegment<W>& {
return empty_execute_segment();
return *empty_execute_segment();
};

#ifdef RISCV_BINARY_TRANSLATION
Expand Down
2 changes: 1 addition & 1 deletion lib/libriscv/cpu_inline.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ const Memory<W>& CPU<W>::memory() const noexcept { return machine().memory; }

template <int W>
inline CPU<W>::CPU(Machine<W>& machine, unsigned cpu_id)
: m_machine { machine }, m_exec(&empty_execute_segment()), m_cpuid { cpu_id }
: m_machine { machine }, m_exec(empty_execute_segment().get()), m_cpuid { cpu_id }
{
}
template <int W>
Expand Down
4 changes: 4 additions & 0 deletions lib/libriscv/decoded_exec_segment.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,9 @@ namespace riscv

size_t threaded_rewrite(size_t bytecode, address_t pc, rv32i_instruction& instr);

// Returns the stored CRC32-C of the execute segment.
// The value is 0 until set_crc32c_hash() has been called (the member's default).
uint32_t crc32c_hash() const noexcept { return m_crc32c_hash; }
// Stores the CRC32-C of the execute segment. This is a plain assignment
// that cannot throw, hence noexcept like the getter above.
void set_crc32c_hash(uint32_t hash) noexcept { m_crc32c_hash = hash; }

#ifdef RISCV_BINARY_TRANSLATION
bool is_binary_translated() const noexcept { return m_bintr_dl != nullptr; }
void* binary_translation_so() const { return m_bintr_dl; }
Expand Down Expand Up @@ -93,6 +96,7 @@ namespace riscv
mutable void* m_bintr_dl = nullptr;
uint32_t m_bintr_hash = 0x0; // CRC32-C of the execute segment + compiler options
#endif
uint32_t m_crc32c_hash = 0x0; // CRC32-C of the execute segment
};

template <int W>
Expand Down
101 changes: 90 additions & 11 deletions lib/libriscv/decoder_cache.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,60 @@
#include "safe_instr_loader.hpp"
#include "threaded_rewriter.cpp"
#include "threaded_bytecodes.hpp"
#include "util/crc32.hpp"
#include <mutex>

namespace riscv
{
static constexpr bool VERBOSE_DECODER = false;

template <int W>
struct SharedExecuteSegments {
SharedExecuteSegments() = default;
SharedExecuteSegments(const SharedExecuteSegments&) = delete;
SharedExecuteSegments& operator=(const SharedExecuteSegments&) = delete;

struct Segment {
std::shared_ptr<DecodedExecuteSegment<W>> segment;
std::mutex mutex;

std::shared_ptr<DecodedExecuteSegment<W>> get() {
std::lock_guard<std::mutex> lock(mutex);
return segment;
}

void unlocked_set(std::shared_ptr<DecodedExecuteSegment<W>> segment) {
this->segment = std::move(segment);
}
};

// Remove a segment if it is the last reference
void remove_if_unique(uint32_t hash) {
std::lock_guard<std::mutex> lock(mutex);
// We are not able to remove the Segment itself, as the mutex
// may be locked by another thread. We can, however, lock the
// Segments mutex and set the segment to nullptr.
auto it = m_segments.find(hash);
if (it != m_segments.end()) {
std::scoped_lock lock(it->second.mutex);
if (it->second.segment.use_count() == 1)
it->second.segment = nullptr;
}
}

auto& get_segment(const uint32_t hash) {
std::scoped_lock lock(mutex);
auto& entry = m_segments[hash];
return entry;
}

private:
std::unordered_map<uint32_t, Segment> m_segments;
std::mutex mutex;
};
// File-local registry of shareable execute segments, one instance per
// instantiated W. Used when creating and evicting execute segments.
template <int W>
static SharedExecuteSegments<W> shared_execute_segments;

template <int W>
static bool is_regular_compressed(uint16_t instr) {
const rv32c_instruction ci { instr };
Expand Down Expand Up @@ -420,24 +469,48 @@ namespace riscv
}

// Create the whole executable memory range
auto& current_exec = this->next_execute_segment();
new (&current_exec) DecodedExecuteSegment<W>(pbase, plen, vaddr, exlen);
auto current_exec = std::make_shared<DecodedExecuteSegment<W>>(pbase, plen, vaddr, exlen);

auto* exec_data = current_exec.exec_data(pbase);
auto* exec_data = current_exec->exec_data(pbase);
// This is a zeroed prologue in order to be able to use whole pages
std::memset(&exec_data[0], 0, prelen);
// This is the actual instruction bytes
std::memcpy(&exec_data[prelen], vdata, exlen);
// This memset() operation will end up zeroing the extra 4 bytes
std::memset(&exec_data[prelen + exlen], 0, postlen);

this->generate_decoder_cache(options, current_exec);
// Create CRC32-C hash of the execute segment
const uint32_t hash = crc32c(exec_data, current_exec->exec_end() - current_exec->exec_begin());

// Get a free slot to reference the execute segment
auto& free_slot = this->next_execute_segment();

// In order to prevent others from creating the same execute segment
// we need to lock the shared execute segments mutex.
auto& segment = shared_execute_segments<W>.get_segment(hash);
std::scoped_lock lock(segment.mutex);

if (segment.segment != nullptr) {
free_slot = segment.segment;
return *free_slot;
}

// We need to create a new execute segment, as there is no shared
// execute segment with the same hash.
free_slot = std::move(current_exec);
// Store the hash in the decoder cache
free_slot->set_crc32c_hash(hash);

this->generate_decoder_cache(options, *free_slot);

// Share the execute segment in the shared execute segments
segment.unlocked_set(free_slot);

return current_exec;
return *free_slot;
}

template <int W>
DecodedExecuteSegment<W>& Memory<W>::next_execute_segment()
std::shared_ptr<DecodedExecuteSegment<W>>& Memory<W>::next_execute_segment()
{
if (LIKELY(m_exec_segs < MAX_EXECUTE_SEGS)) {
auto& result = this->m_exec.at(m_exec_segs);
Expand All @@ -448,17 +521,17 @@ namespace riscv
}

template <int W>
DecodedExecuteSegment<W>& Memory<W>::exec_segment_for(address_t vaddr)
std::shared_ptr<DecodedExecuteSegment<W>>& Memory<W>::exec_segment_for(address_t vaddr)
{
for (size_t i = 0; i < m_exec_segs; i++) {
auto& segment = m_exec[i];
if (segment.is_within(vaddr)) return segment;
if (segment->is_within(vaddr)) return segment;
}
return CPU<W>::empty_execute_segment();
}

template <int W>
const DecodedExecuteSegment<W>& Memory<W>::exec_segment_for(address_t vaddr) const
const std::shared_ptr<DecodedExecuteSegment<W>>& Memory<W>::exec_segment_for(address_t vaddr) const
{
return const_cast<Memory<W>*>(this)->exec_segment_for(vaddr);
}
Expand All @@ -470,11 +543,17 @@ namespace riscv
return;

// destructor could throw, so let's invalidate early
machine().cpu.set_execute_segment(CPU<W>::empty_execute_segment());
machine().cpu.set_execute_segment(*CPU<W>::empty_execute_segment());

while (m_exec_segs > remaining_size) {
m_exec_segs--;
m_exec.at(m_exec_segs).~DecodedExecuteSegment<W>();

auto& segment = m_exec.at(m_exec_segs);
if (segment) {
const uint32_t hash = segment->crc32c_hash();
segment = nullptr;
shared_execute_segments<W>.remove_if_unique(hash);
}
}
}

Expand Down
2 changes: 2 additions & 0 deletions lib/libriscv/memory.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -137,6 +137,8 @@ namespace riscv
Memory<W>::~Memory()
{
this->clear_all_pages();
// remove execute segments
this->evict_execute_segments(0);
// only the original machine owns arena
if (this->m_arena.data != nullptr && !is_forked()) {
#ifdef __linux__
Expand Down
9 changes: 4 additions & 5 deletions lib/libriscv/memory.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -174,9 +174,8 @@ namespace riscv
address_t dst, void* src, size_t size, PageAttributes = {});

// Custom execute segment, returns page base, final size and execute segment pointer
DecodedExecuteSegment<W>& exec_segment_for(address_t vaddr);
const DecodedExecuteSegment<W>& exec_segment_for(address_t vaddr) const;
const DecodedExecuteSegment<W>& main_execute_segment() const { return m_exec.at(0); }
std::shared_ptr<DecodedExecuteSegment<W>>& exec_segment_for(address_t vaddr);
const std::shared_ptr<DecodedExecuteSegment<W>>& exec_segment_for(address_t vaddr) const;
DecodedExecuteSegment<W>& create_execute_segment(const MachineOptions<W>&, const void* data, address_t addr, size_t len);
size_t cached_execute_segments() const noexcept { return m_exec_segs; }
// Evict newest execute segments until only remaining left
Expand Down Expand Up @@ -277,9 +276,9 @@ namespace riscv
#endif

// Execute segments
std::array<DecodedExecuteSegment<W>, MAX_EXECUTE_SEGS> m_exec;
std::array<std::shared_ptr<DecodedExecuteSegment<W>>, MAX_EXECUTE_SEGS> m_exec;
size_t m_exec_segs = 0;
DecodedExecuteSegment<W>& next_execute_segment();
std::shared_ptr<DecodedExecuteSegment<W>>& next_execute_segment();

// Linear arena at start of memory (mmap-backed)
struct alignas(16) {
Expand Down
3 changes: 2 additions & 1 deletion lib/libriscv/memory_rw.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -323,7 +323,8 @@ namespace riscv
}

for (const auto& exec : m_exec) {
total += exec.size_bytes();
if (exec)
total += exec->size_bytes();
}

return total;
Expand Down
7 changes: 1 addition & 6 deletions lib/libriscv/tr_api.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -165,13 +165,8 @@ static struct CallbackTable {
#define ARENA_WRITABLE(x) ((x) - RISCV_ARENA_ROEND < ARENA_WRITE_BOUNDARY)
INTERNAL static char* arena_ptr;
#define ARENA_AT(cpu, x) arena_ptr[x]
//#ifdef __TINYC__
//#define ARENA_AT(cpu, x) arena_ptr[x]
//#else
//#define ARENA_AT(cpu, x) (*(char **)((uintptr_t)cpu + RISCV_ARENA_OFF))[x]
//#endif
#define ARENA_AT(cpu, x) (*(char **)((uintptr_t)cpu + RISCV_ARENA_OFF))[x]
static inline int do_syscall(CPU* cpu, uint64_t counter, uint64_t max_counter, addr_t sysno)
{
Expand Down
9 changes: 7 additions & 2 deletions lib/libriscv/tr_emit.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,11 @@
#endif

namespace riscv {
// libtcc direct arena pointer access:
// This is a performance optimization for libtcc that allows direct access to the
// memory arena. However, with it enabled, execute segments can no longer be shared
// between different machines. So it is a good optimization for a simple CLI tool,
// but not for a system of multiple machines.
static constexpr bool libtcc_direct_pointer_enabled = false;
static const std::string LOOP_EXPRESSION = "LIKELY(counter < max_counter)";
static const std::string SIGNEXTW = "(saddr_t) (int32_t)";
static constexpr int ALIGN_MASK = (compressed_enabled) ? 0x1 : 0x3;
Expand Down Expand Up @@ -188,7 +193,7 @@ struct Emitter
auto& get_gpr_exists() const noexcept { return this->gpr_exists; }

std::string arena_at(const std::string& address) {
if constexpr (libtcc_enabled) {
if constexpr (libtcc_direct_pointer_enabled && libtcc_enabled) {
if (cpu.machine().memory.uses_32bit_encompassing_arena()) {
return "(*(char*)(" + std::to_string(tinfo.arena_ptr) + " + (uint32_t)(" + address + ")))";
} else {
Expand All @@ -202,7 +207,7 @@ struct Emitter
}

std::string arena_at_fixed(address_t address) {
if constexpr (libtcc_enabled) {
if constexpr (libtcc_direct_pointer_enabled && libtcc_enabled) {
if (cpu.machine().memory.uses_32bit_encompassing_arena()) {
return "(*(char*)" + std::to_string(tinfo.arena_ptr + uint32_t(address)) + "ul)";
} else {
Expand Down
8 changes: 4 additions & 4 deletions lib/libriscv/tr_translate.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -141,14 +141,14 @@ int CPU<W>::load_translation(const MachineOptions<W>& options,
throw MachineException(ILLEGAL_OPERATION, "Execute segment already binary translated");
}

auto* exec_data = exec.exec_data(exec.exec_begin());

// Checksum the execute segment + compiler flags
TIME_POINT(t5);
const std::string cflags = defines_to_string(create_defines_for(machine(), options));
extern std::string compile_command(int arch, const std::string& cflags);
uint32_t checksum =
crc32c(exec_data, exec.exec_end() - exec.exec_begin());
uint32_t checksum = exec.crc32c_hash();
if (UNLIKELY(checksum == 0)) {
throw MachineException(INVALID_PROGRAM, "Invalid execute segment hash for translation");
}
// Also add the compiler flags to the checksum
checksum = ~crc32c(~checksum, cflags.c_str(), cflags.size());
exec.set_translation_hash(checksum);
Expand Down

0 comments on commit 34c0c40

Please sign in to comment.