Skip to content

Commit

Permalink
bintr: Create patched execute segment during background compilation
Browse files Browse the repository at this point in the history
Avoid race conditions entirely by swapping in a patched execute segment
  • Loading branch information
fwsGonzo committed Jul 4, 2024
1 parent b2c9522 commit 1bbb0c7
Show file tree
Hide file tree
Showing 9 changed files with 107 additions and 58 deletions.
14 changes: 14 additions & 0 deletions lib/libriscv/bytecode_impl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -583,6 +583,20 @@ INSTRUCTION(RV32I_BC_FUNCBLOCK, execute_function_block) {
NEXT_BLOCK(instr.length(), true);
}

#ifdef RISCV_BINARY_TRANSLATION
INSTRUCTION(RV32I_BC_LIVEPATCH, execute_livepatch) {
// Live-patch trampoline: moves execution from the current decoder entry
// over to the matching entry in the execute segment's patched decoder cache.
// The handler consults only block_bytes() of the entry it was dispatched
// from; it does not interpret that entry's instruction bits. The original
// author notes that this is what makes it possible to install this
// bytecode from live-patching.
// 1. Wind back PC to the current decoder position.
//    NOTE(review): this presumably undoes the `pc += block_bytes()` that the
//    dispatch macros (e.g. NEXT_SEGMENT) perform before executing - confirm.
pc = pc - DECODER().block_bytes();
// 2. Find the correct decoder pointer in the patched decoder cache,
//    which is indexed by address / DecoderCache<W>::DIVISOR.
auto* patched = &exec->patched_decoder_cache()[pc / DecoderCache<W>::DIVISOR];
decoder = patched;
// 3. Execute the instruction. `decoder` now refers to the patched entry,
//    so EXECUTE_INSTR() presumably dispatches on the patched bytecode.
EXECUTE_INSTR();
}
#endif

INSTRUCTION(RV32I_BC_JALR, rv32i_jalr) {
VIEW_INSTR_AS(fi, FasterItype);
Expand Down
5 changes: 5 additions & 0 deletions lib/libriscv/cpu_dispatch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,11 @@ namespace riscv
counter.increment_counter(decoder->instruction_count()); \
EXECUTE_INSTR();

// Step past one instruction using only the supplied length: pc advances by
// `len` bytes, the decoder pointer by the corresponding number of decoder
// entries, and exactly one instruction is added to the counter.
// `len` is parenthesized at every use so that any expression (for example
// a ternary or a bitwise-and) expands with the intended precedence.
#define SAFE_INSTR_NEXT(len) \
	pc += (len); \
	decoder += (len) >> DecoderCache<W>::SHIFT; \
	counter.increment_counter(1);

#define NEXT_SEGMENT() \
decoder = &exec_decoder[pc >> DecoderCache<W>::SHIFT]; \
pc += decoder->block_bytes(); \
Expand Down
5 changes: 5 additions & 0 deletions lib/libriscv/cpu_inaccurate_dispatch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ namespace riscv
#undef NEXT_INSTR
#undef NEXT_C_INSTR
#undef NEXT_BLOCK
#undef SAFE_INSTR_NEXT
#undef NEXT_SEGMENT
#undef PERFORM_BRANCH
#undef PERFORM_FORWARD_BRANCH
Expand Down Expand Up @@ -51,6 +52,10 @@ namespace riscv
pc += decoder->block_bytes(); \
EXECUTE_INSTR();

// Step past one instruction using only the supplied length: pc advances by
// `len` bytes and the decoder pointer by the corresponding number of decoder
// entries. This variant does not touch any instruction counter.
// `len` is parenthesized at every use so that any expression (for example
// a ternary or a bitwise-and) expands with the intended precedence.
#define SAFE_INSTR_NEXT(len) \
	pc += (len); \
	decoder += (len) >> DecoderCache<W>::SHIFT;

#define NEXT_SEGMENT() \
decoder = &exec_decoder[pc >> DecoderCache<W>::SHIFT]; \
pc += decoder->block_bytes(); \
Expand Down
15 changes: 12 additions & 3 deletions lib/libriscv/decoded_exec_segment.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
#pragma once
#include <memory>
#include <unordered_map>
#include "types.hpp"

namespace riscv
Expand Down Expand Up @@ -35,6 +36,8 @@ namespace riscv

/// Decoder pointer used for execution (whatever was last passed to
/// set_decoder()). Callers index it with `address / DecoderCache<W>::DIVISOR`,
/// so it is not necessarily the start of the owned allocation.
auto* decoder_cache() noexcept { return m_exec_decoder; }
/// Const overload of decoder_cache(); returns the same stored pointer.
auto* decoder_cache() const noexcept { return m_exec_decoder; }
/// Start of the DecoderCache<W> array owned by this segment (unbiased),
/// suitable for copying the whole cache.
auto* decoder_cache_base() const noexcept { return m_decoder_cache.get(); }
/// Stored decoder cache size. Callers use it as the number of
/// DecoderCache<W> entries when allocating and copying a cache of the same
/// shape - NOTE(review): confirm create_decoder_cache() stores a count.
size_t decoder_cache_size() const noexcept { return m_decoder_cache_size; }

auto* create_decoder_cache(DecoderCache<W>* cache, size_t size) {
m_decoder_cache.reset(cache);
Expand All @@ -44,7 +47,7 @@ namespace riscv
void set_decoder(DecoderData<W>* dec) { m_exec_decoder = dec; }

size_t size_bytes() const noexcept {
return sizeof(*this) + m_exec_pagedata_size + m_decoder_cache_size;
return sizeof(*this) + m_exec_pagedata_size + m_decoder_cache_size; // * sizeof(DecoderCache<W>);
}
bool empty() const noexcept { return m_exec_pagedata_size == 0; }

Expand All @@ -70,6 +73,9 @@ namespace riscv
bintr_block_func<W> mapping_at(unsigned i) const { return m_translator_mappings.at(i); }
bintr_block_func<W> unchecked_mapping_at(unsigned i) const { return m_translator_mappings[i]; }
size_t translator_mappings() const noexcept { return m_translator_mappings.size(); }
auto* patched_decoder_cache() noexcept { return m_patched_exec_decoder; }
void set_patched_decoder_cache(std::unique_ptr<DecoderCache<W>[]> cache, DecoderData<W>* dec)
{ m_patched_decoder_cache = std::move(cache); m_patched_exec_decoder = dec; }
#else
bool is_binary_translated() const noexcept { return false; }
#endif
Expand All @@ -90,13 +96,14 @@ namespace riscv
address_t m_exec_pagedata_base = 0;
std::unique_ptr<uint8_t[]> m_exec_pagedata = nullptr;

// Decoder cache is used to run simulation at a
// high speed, without resorting to JIT
// Decoder cache is used to run bytecode simulation at a high speed
size_t m_decoder_cache_size = 0;
std::unique_ptr<DecoderCache<W>[]> m_decoder_cache = nullptr;

#ifdef RISCV_BINARY_TRANSLATION
std::vector<bintr_block_func<W>> m_translator_mappings;
std::unique_ptr<DecoderCache<W>[]> m_patched_decoder_cache = nullptr;
DecoderData<W>* m_patched_exec_decoder = nullptr;
mutable void* m_bintr_dl = nullptr;
uint32_t m_bintr_hash = 0x0; // CRC32-C of the execute segment + compiler options
#endif
Expand Down Expand Up @@ -137,6 +144,8 @@ namespace riscv
m_translator_mappings = std::move(other.m_translator_mappings);
m_bintr_dl = other.m_bintr_dl;
other.m_bintr_dl = nullptr;
m_bintr_hash = other.m_bintr_hash;
m_is_libtcc = other.m_is_libtcc;
#endif
}

Expand Down
3 changes: 1 addition & 2 deletions lib/libriscv/decoder_cache.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -293,8 +293,7 @@ namespace riscv
}
// Here we allocate the decoder cache which is page-sized
auto* decoder_cache = exec.create_decoder_cache(
new DecoderCache<W> [n_pages],
n_pages * sizeof(DecoderCache<W>));
new DecoderCache<W> [n_pages], n_pages);
auto* exec_decoder =
decoder_cache[0].get_base() - pbase / DecoderCache<W>::DIVISOR;
exec.set_decoder(exec_decoder);
Expand Down
1 change: 1 addition & 0 deletions lib/libriscv/tailcall_dispatch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -319,6 +319,7 @@ namespace riscv
[RV32I_BC_FUNCBLOCK] = execute_function_block,
#ifdef RISCV_BINARY_TRANSLATION
[RV32I_BC_TRANSLATOR] = translated_function,
[RV32I_BC_LIVEPATCH] = execute_livepatch,
#endif
[RV32I_BC_SYSTEM] = rv32i_system,
};
Expand Down
3 changes: 2 additions & 1 deletion lib/libriscv/threaded_bytecode_array.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -118,10 +118,11 @@ static constexpr void *computed_opcode[] = {
[RV32V_BC_VFADD_VV] = &&rv32v_vfadd_vv,
[RV32V_BC_VFMUL_VF] = &&rv32v_vfmul_vf,
#endif
[RV32I_BC_FUNCTION] = &&execute_decoded_function,
[RV32I_BC_FUNCTION] = &&execute_decoded_function,
[RV32I_BC_FUNCBLOCK] = &&execute_function_block,
#ifdef RISCV_BINARY_TRANSLATION
[RV32I_BC_TRANSLATOR] = &&translated_function,
[RV32I_BC_LIVEPATCH] = &&execute_livepatch,
#endif
[RV32I_BC_SYSTEM] = &&rv32i_system,
};
1 change: 1 addition & 0 deletions lib/libriscv/threaded_bytecodes.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,7 @@ namespace riscv
RV32I_BC_FUNCBLOCK,
#ifdef RISCV_BINARY_TRANSLATION
RV32I_BC_TRANSLATOR,
RV32I_BC_LIVEPATCH,
#endif
RV32I_BC_SYSTEM,
BYTECODES_MAX
Expand Down
118 changes: 66 additions & 52 deletions lib/libriscv/tr_translate.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -114,8 +114,8 @@ inline uint32_t opcode(const TransInstr<W>& ti) {
}

template <int W>
inline DecoderData<W>& decoder_entry_at(const DecodedExecuteSegment<W>& exec, address_type<W> addr) {
return exec.decoder_cache()[addr / DecoderCache<W>::DIVISOR];
inline DecoderData<W>& decoder_entry_at(DecoderData<W>* cache, address_type<W> addr) {
return cache[addr / DecoderCache<W>::DIVISOR];
}

template <int W>
Expand Down Expand Up @@ -225,7 +225,7 @@ int CPU<W>::load_translation(const MachineOptions<W>& options,
for (unsigned i = 0; i < translation.nmappings; i++) {
const auto& mapping = translation.mappings[i];
exec.set_mapping(i, mapping.handler);
auto& entry = decoder_entry_at(exec, mapping.addr);
auto& entry = decoder_entry_at(exec.decoder_cache(), mapping.addr);
if (mapping.handler != nullptr) {
entry.instr = i;
entry.set_bytecode(CPU<W>::computed_index_for(RV32_INSTR_BLOCK_END));
Expand Down Expand Up @@ -754,8 +754,16 @@ void CPU<W>::activate_dylib(const MachineOptions<W>& options, DecodedExecuteSegm

// Helper to rebuild decoder blocks
unsigned livepatch_counter = 0;
unsigned livepatch_counter_block_end = 0;
unsigned livepatch_counter_hotpatch = 0;
std::unique_ptr<DecoderCache<W>[]> patched_decoder_cache = nullptr;
DecoderData<W>* patched_decoder = nullptr;
if (live_patch) {
patched_decoder_cache = std::make_unique<DecoderCache<W>[]>(exec.decoder_cache_size());
// A horrible calculation to find the patched decoder
patched_decoder = patched_decoder_cache[0].get_base() - exec.pagedata_base() / DecoderCache<W>::DIVISOR;
// Copy the decoder cache to the patched decoder cache
std::copy(exec.decoder_cache_base(), exec.decoder_cache_base() + exec.decoder_cache_size(), patched_decoder_cache.get());
}
std::vector<DecoderData<W>*> livepatch_bintr;

// Apply mappings to decoder cache
const auto nmappings = *no_mappings;
Expand All @@ -764,55 +772,49 @@ void CPU<W>::activate_dylib(const MachineOptions<W>& options, DecodedExecuteSegm
const auto addr = mappings[i].addr;
if (exec.is_within(addr)) {
exec.set_mapping(i, mappings[i].handler);
auto& entry = decoder_entry_at(exec, addr);
auto& entry = decoder_entry_at(exec.decoder_cache(), addr);
if (mappings[i].handler != nullptr) {
if (live_patch) {
livepatch_counter++;
// When live-patching we can only insert translations where blocks end.
if (entry.block_bytes() == 0) {
auto dd = entry;
dd.instr = i;
dd.set_bytecode(RV32I_BC_TRANSLATOR);

// Atomic overwrite of the 64-bit entry with the new one
entry.atomic_overwrite(dd);
livepatch_counter_block_end++;
} else if (true) {
// Here it gets complicated. We can't insert translations without the
// block ending, so we have to live-patch changes to slowly make the
// block end at the right place.
// 1. The last instruction will be the current entry
// 2. Later instructions will work as normal
// 3. Look back to find the beginning of the block
auto* last = &entry;
auto* current = &entry;
while ((current-1)->block_bytes() != 0) {
current--;
}
auto addr_patch = addr + entry.block_bytes() - current->block_bytes();
// 4. Erase each instruction in the block, by replacing each
// with a block-ending instruction, lowering performance temporarily.
for (auto* dd = current; dd < last; dd++) {
auto instr = read_instruction(exec.exec_data(), addr_patch, exec.exec_end());
auto d = *dd;
d.set_bytecode(RV32I_BC_FUNCBLOCK);
d.idxend = 0; // 0(+1) instructions to next block
#ifdef RISCV_EXT_C
d.icount = 0; // 0 + 1 - 0 == 1 instruction
#endif
d.instr = instr.whole;
dd->atomic_overwrite(d);
addr_patch += instr.length();
}
// 5. Insert the new translation at the end of the block
auto dd = entry;
dd.set_bytecode(RV32I_BC_TRANSLATOR);
dd.idxend = 0;
dd.instr = i;
entry.atomic_overwrite(dd);

livepatch_counter_hotpatch++;
// Here it gets complicated. We can't insert translations without the
// block ending, so we have to live-patch changes to slowly make the
// block end at the right place.
// 1. The last instruction will be the current entry
// 2. Later instructions will work as normal
// 3. Look back to find the beginning of the block
auto* last = &entry;
auto* current = &entry;
while ((current-1)->block_bytes() != 0) {
current--;
}
auto patched_addr = addr + entry.block_bytes() - current->block_bytes();
// 4. Erase each instruction in the block, by replacing each
// with a safe instruction, lowering performance temporarily.
for (auto* dd = current; dd < last; dd++) {
// Get the patched decoder entry
auto* p = patched_decoder + (dd - exec.decoder_cache());
// Create a single-instruction bytecode to replace the original instruction
p->set_bytecode(RV32I_BC_FUNCBLOCK);
p->idxend = 0;
#ifdef RISCV_EXT_C
p->icount = 0;
#endif
auto instr = read_instruction(exec.exec_data(), patched_addr, exec.exec_end());
p->instr = instr.whole;
patched_addr += instr.length();

livepatch_bintr.push_back(dd);
}
// 5. The last instruction will be replaced with a binary translation
// function, which will be the last instruction in the block.
auto* p = patched_decoder + (last - exec.decoder_cache());
p->set_bytecode(RV32I_BC_TRANSLATOR);
p->instr = i;
p->idxend = 0;
#ifdef RISCV_EXT_C
p->icount = 0;
#endif
livepatch_bintr.push_back(last);
} else {
// Normal block-end hint that will be transformed into a translation
// bytecode if it passes a few more checks, later.
Expand All @@ -837,11 +839,23 @@ void CPU<W>::activate_dylib(const MachineOptions<W>& options, DecodedExecuteSegm
}
}

if (live_patch) {
// Move the patched decoder cache to the execute segment
exec.set_patched_decoder_cache(std::move(patched_decoder_cache), patched_decoder);
// Set regular decoder cache to the patched decoder cache
exec.set_decoder(patched_decoder);

// Atomically set a livepatch bytecode for each instruction that is patched
// It will swap out the current decoder with the patched one, and then continue.
for (auto* entry : livepatch_bintr) {
entry->set_bytecode(RV32I_BC_LIVEPATCH);
}
}

if (options.verbose_loader) {
printf("libriscv: Activated binary translation with %u mappings\n", nmappings);
if (live_patch) {
printf("libriscv: Live-patching enabled, %u block-ends, %u hot-patches (%u total)\n",
livepatch_counter_block_end, livepatch_counter_hotpatch, livepatch_counter);
printf("libriscv: Live-patching enabled, %u total\n", livepatch_counter);
}
}

Expand Down

0 comments on commit 1bbb0c7

Please sign in to comment.