Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

bintr: Create patched execute segment during background compilation #184

Merged
merged 1 commit into from
Jul 4, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions lib/libriscv/bytecode_impl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -583,6 +583,20 @@ INSTRUCTION(RV32I_BC_FUNCBLOCK, execute_function_block) {
NEXT_BLOCK(instr.length(), true);
}

#ifdef RISCV_BINARY_TRANSLATION
// Trampoline bytecode used by live-patching: it is written over entries of the
// original decoder cache, and redirects execution to the entry for the same
// address in the patched decoder cache of the current execute segment.
INSTRUCTION(RV32I_BC_LIVEPATCH, execute_livepatch) {
// Special bytecode that does not read any decoder data
// except the function handler (which never changes),
// which makes it possible to set it from live-patching
// NOTE(review): block_bytes() of the current entry is still read below; this
// presumably relies on live-patching only replacing the bytecode of the entry.
// 1. Wind back PC to the current decoder position
// (undoes the block_bytes() advance that the dispatch macros, e.g. NEXT_SEGMENT,
// apply to pc before executing an entry)
pc = pc - DECODER().block_bytes();
// 2. Find the correct decoder pointer in the patched decoder cache
// (indexed by address / DIVISOR, the same way decoder entries are looked up elsewhere)
auto* patched = &exec->patched_decoder_cache()[pc / DecoderCache<W>::DIVISOR];
decoder = patched;
// 3. Execute the instruction
// (dispatch continues from the patched entry now held in `decoder`)
EXECUTE_INSTR();
}
#endif

INSTRUCTION(RV32I_BC_JALR, rv32i_jalr) {
VIEW_INSTR_AS(fi, FasterItype);
Expand Down
5 changes: 5 additions & 0 deletions lib/libriscv/cpu_dispatch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,11 @@ namespace riscv
counter.increment_counter(decoder->instruction_count()); \
EXECUTE_INSTR();

// Step over one instruction of `len` bytes without dispatching the next one:
// advances pc by `len`, advances the decoder pointer by the corresponding
// number of decoder entries, and counts exactly one executed instruction.
// `len` is parenthesized so expression arguments (e.g. a conditional) keep
// their meaning next to `+=` and `>>`. It is evaluated twice, so the
// argument must not have side effects.
#define SAFE_INSTR_NEXT(len) \
	pc += (len); \
	decoder += (len) >> DecoderCache<W>::SHIFT; \
	counter.increment_counter(1);

#define NEXT_SEGMENT() \
decoder = &exec_decoder[pc >> DecoderCache<W>::SHIFT]; \
pc += decoder->block_bytes(); \
Expand Down
5 changes: 5 additions & 0 deletions lib/libriscv/cpu_inaccurate_dispatch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ namespace riscv
#undef NEXT_INSTR
#undef NEXT_C_INSTR
#undef NEXT_BLOCK
#undef SAFE_INSTR_NEXT
#undef NEXT_SEGMENT
#undef PERFORM_BRANCH
#undef PERFORM_FORWARD_BRANCH
Expand Down Expand Up @@ -51,6 +52,10 @@ namespace riscv
pc += decoder->block_bytes(); \
EXECUTE_INSTR();

// Step over one instruction of `len` bytes without dispatching the next one:
// advances pc by `len` and the decoder pointer by the corresponding number
// of decoder entries. This variant does not touch the instruction counter.
// `len` is parenthesized so expression arguments (e.g. a conditional) keep
// their meaning next to `+=` and `>>`. It is evaluated twice, so the
// argument must not have side effects.
#define SAFE_INSTR_NEXT(len) \
	pc += (len); \
	decoder += (len) >> DecoderCache<W>::SHIFT;

#define NEXT_SEGMENT() \
decoder = &exec_decoder[pc >> DecoderCache<W>::SHIFT]; \
pc += decoder->block_bytes(); \
Expand Down
15 changes: 12 additions & 3 deletions lib/libriscv/decoded_exec_segment.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
#pragma once
#include <memory>
#include <unordered_map>
#include "types.hpp"

namespace riscv
Expand Down Expand Up @@ -35,6 +36,8 @@ namespace riscv

auto* decoder_cache() noexcept { return m_exec_decoder; }
auto* decoder_cache() const noexcept { return m_exec_decoder; }
// Returns the start of the owned DecoderCache<W> array itself, as opposed to
// decoder_cache(), which returns the separately stored decoder-entry pointer.
auto* decoder_cache_base() const noexcept { return m_decoder_cache.get(); }
// Returns the stored decoder cache size.
// NOTE(review): callers appear to treat this as a number of DecoderCache<W>
// elements rather than a byte count — confirm against create_decoder_cache().
size_t decoder_cache_size() const noexcept { return m_decoder_cache_size; }

auto* create_decoder_cache(DecoderCache<W>* cache, size_t size) {
m_decoder_cache.reset(cache);
Expand All @@ -44,7 +47,7 @@ namespace riscv
void set_decoder(DecoderData<W>* dec) { m_exec_decoder = dec; }

size_t size_bytes() const noexcept {
return sizeof(*this) + m_exec_pagedata_size + m_decoder_cache_size;
return sizeof(*this) + m_exec_pagedata_size + m_decoder_cache_size; // * sizeof(DecoderCache<W>);
}
bool empty() const noexcept { return m_exec_pagedata_size == 0; }

Expand All @@ -70,6 +73,9 @@ namespace riscv
bintr_block_func<W> mapping_at(unsigned i) const { return m_translator_mappings.at(i); }
bintr_block_func<W> unchecked_mapping_at(unsigned i) const { return m_translator_mappings[i]; }
size_t translator_mappings() const noexcept { return m_translator_mappings.size(); }
// Returns the decoder-entry pointer for the patched decoder cache, or nullptr
// if set_patched_decoder_cache() has not been called.
auto* patched_decoder_cache() noexcept { return m_patched_exec_decoder; }
// Takes ownership of `cache` and records `dec` as the pointer returned by
// patched_decoder_cache(). Any previously held patched cache is released.
// NOTE(review): `dec` is presumably derived from `cache` so that it stays
// valid for as long as this segment owns the array — confirm at call sites.
void set_patched_decoder_cache(std::unique_ptr<DecoderCache<W>[]> cache, DecoderData<W>* dec)
{ m_patched_decoder_cache = std::move(cache); m_patched_exec_decoder = dec; }
#else
bool is_binary_translated() const noexcept { return false; }
#endif
Expand All @@ -90,13 +96,14 @@ namespace riscv
address_t m_exec_pagedata_base = 0;
std::unique_ptr<uint8_t[]> m_exec_pagedata = nullptr;

// Decoder cache is used to run simulation at a
// high speed, without resorting to JIT
// Decoder cache is used to run bytecode simulation at a high speed
size_t m_decoder_cache_size = 0;
std::unique_ptr<DecoderCache<W>[]> m_decoder_cache = nullptr;

#ifdef RISCV_BINARY_TRANSLATION
std::vector<bintr_block_func<W>> m_translator_mappings;
std::unique_ptr<DecoderCache<W>[]> m_patched_decoder_cache = nullptr;
DecoderData<W>* m_patched_exec_decoder = nullptr;
mutable void* m_bintr_dl = nullptr;
uint32_t m_bintr_hash = 0x0; // CRC32-C of the execute segment + compiler options
#endif
Expand Down Expand Up @@ -137,6 +144,8 @@ namespace riscv
m_translator_mappings = std::move(other.m_translator_mappings);
m_bintr_dl = other.m_bintr_dl;
other.m_bintr_dl = nullptr;
m_bintr_hash = other.m_bintr_hash;
m_is_libtcc = other.m_is_libtcc;
#endif
}

Expand Down
3 changes: 1 addition & 2 deletions lib/libriscv/decoder_cache.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -293,8 +293,7 @@ namespace riscv
}
// Here we allocate the decoder cache which is page-sized
auto* decoder_cache = exec.create_decoder_cache(
new DecoderCache<W> [n_pages],
n_pages * sizeof(DecoderCache<W>));
new DecoderCache<W> [n_pages], n_pages);
auto* exec_decoder =
decoder_cache[0].get_base() - pbase / DecoderCache<W>::DIVISOR;
exec.set_decoder(exec_decoder);
Expand Down
1 change: 1 addition & 0 deletions lib/libriscv/tailcall_dispatch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -319,6 +319,7 @@ namespace riscv
[RV32I_BC_FUNCBLOCK] = execute_function_block,
#ifdef RISCV_BINARY_TRANSLATION
[RV32I_BC_TRANSLATOR] = translated_function,
[RV32I_BC_LIVEPATCH] = execute_livepatch,
#endif
[RV32I_BC_SYSTEM] = rv32i_system,
};
Expand Down
3 changes: 2 additions & 1 deletion lib/libriscv/threaded_bytecode_array.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -118,10 +118,11 @@ static constexpr void *computed_opcode[] = {
[RV32V_BC_VFADD_VV] = &&rv32v_vfadd_vv,
[RV32V_BC_VFMUL_VF] = &&rv32v_vfmul_vf,
#endif
[RV32I_BC_FUNCTION] = &&execute_decoded_function,
[RV32I_BC_FUNCTION] = &&execute_decoded_function,
[RV32I_BC_FUNCBLOCK] = &&execute_function_block,
#ifdef RISCV_BINARY_TRANSLATION
[RV32I_BC_TRANSLATOR] = &&translated_function,
[RV32I_BC_LIVEPATCH] = &&execute_livepatch,
#endif
[RV32I_BC_SYSTEM] = &&rv32i_system,
};
1 change: 1 addition & 0 deletions lib/libriscv/threaded_bytecodes.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,7 @@ namespace riscv
RV32I_BC_FUNCBLOCK,
#ifdef RISCV_BINARY_TRANSLATION
RV32I_BC_TRANSLATOR,
RV32I_BC_LIVEPATCH,
#endif
RV32I_BC_SYSTEM,
BYTECODES_MAX
Expand Down
118 changes: 66 additions & 52 deletions lib/libriscv/tr_translate.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -114,8 +114,8 @@ inline uint32_t opcode(const TransInstr<W>& ti) {
}

template <int W>
inline DecoderData<W>& decoder_entry_at(const DecodedExecuteSegment<W>& exec, address_type<W> addr) {
return exec.decoder_cache()[addr / DecoderCache<W>::DIVISOR];
inline DecoderData<W>& decoder_entry_at(DecoderData<W>* cache, address_type<W> addr) {
return cache[addr / DecoderCache<W>::DIVISOR];
}

template <int W>
Expand Down Expand Up @@ -225,7 +225,7 @@ int CPU<W>::load_translation(const MachineOptions<W>& options,
for (unsigned i = 0; i < translation.nmappings; i++) {
const auto& mapping = translation.mappings[i];
exec.set_mapping(i, mapping.handler);
auto& entry = decoder_entry_at(exec, mapping.addr);
auto& entry = decoder_entry_at(exec.decoder_cache(), mapping.addr);
if (mapping.handler != nullptr) {
entry.instr = i;
entry.set_bytecode(CPU<W>::computed_index_for(RV32_INSTR_BLOCK_END));
Expand Down Expand Up @@ -754,8 +754,16 @@ void CPU<W>::activate_dylib(const MachineOptions<W>& options, DecodedExecuteSegm

// Helper to rebuild decoder blocks
unsigned livepatch_counter = 0;
unsigned livepatch_counter_block_end = 0;
unsigned livepatch_counter_hotpatch = 0;
std::unique_ptr<DecoderCache<W>[]> patched_decoder_cache = nullptr;
DecoderData<W>* patched_decoder = nullptr;
if (live_patch) {
patched_decoder_cache = std::make_unique<DecoderCache<W>[]>(exec.decoder_cache_size());
// Bias the base pointer by the segment's base address so that the patched
// decoder can be indexed directly with (address / DIVISOR)
patched_decoder = patched_decoder_cache[0].get_base() - exec.pagedata_base() / DecoderCache<W>::DIVISOR;
// Copy the decoder cache to the patched decoder cache
std::copy(exec.decoder_cache_base(), exec.decoder_cache_base() + exec.decoder_cache_size(), patched_decoder_cache.get());
}
std::vector<DecoderData<W>*> livepatch_bintr;

// Apply mappings to decoder cache
const auto nmappings = *no_mappings;
Expand All @@ -764,55 +772,49 @@ void CPU<W>::activate_dylib(const MachineOptions<W>& options, DecodedExecuteSegm
const auto addr = mappings[i].addr;
if (exec.is_within(addr)) {
exec.set_mapping(i, mappings[i].handler);
auto& entry = decoder_entry_at(exec, addr);
auto& entry = decoder_entry_at(exec.decoder_cache(), addr);
if (mappings[i].handler != nullptr) {
if (live_patch) {
livepatch_counter++;
// When live-patching we can only insert translations where blocks end.
if (entry.block_bytes() == 0) {
auto dd = entry;
dd.instr = i;
dd.set_bytecode(RV32I_BC_TRANSLATOR);

// Atomic overwrite of the 64-bit entry with the new one
entry.atomic_overwrite(dd);
livepatch_counter_block_end++;
} else if (true) {
// Here it gets complicated. We can't insert translations without the
// block ending, so we have to live-patch changes to slowly make the
// block end at the right place.
// 1. The last instruction will be the current entry
// 2. Later instructions will work as normal
// 3. Look back to find the beginning of the block
auto* last = &entry;
auto* current = &entry;
while ((current-1)->block_bytes() != 0) {
current--;
}
auto addr_patch = addr + entry.block_bytes() - current->block_bytes();
// 4. Erase each instruction in the block, by replacing each
// with a block-ending instruction, lowering performance temporarily.
for (auto* dd = current; dd < last; dd++) {
auto instr = read_instruction(exec.exec_data(), addr_patch, exec.exec_end());
auto d = *dd;
d.set_bytecode(RV32I_BC_FUNCBLOCK);
d.idxend = 0; // 0(+1) instructions to next block
#ifdef RISCV_EXT_C
d.icount = 0; // 0 + 1 - 0 == 1 instruction
#endif
d.instr = instr.whole;
dd->atomic_overwrite(d);
addr_patch += instr.length();
}
// 5. Insert the new translation at the end of the block
auto dd = entry;
dd.set_bytecode(RV32I_BC_TRANSLATOR);
dd.idxend = 0;
dd.instr = i;
entry.atomic_overwrite(dd);

livepatch_counter_hotpatch++;
// Here it gets complicated. We can't insert translations without the
// block ending, so we have to live-patch changes to slowly make the
// block end at the right place.
// 1. The last instruction will be the current entry
// 2. Later instructions will work as normal
// 3. Look back to find the beginning of the block
auto* last = &entry;
auto* current = &entry;
while ((current-1)->block_bytes() != 0) {
current--;
}
auto patched_addr = addr + entry.block_bytes() - current->block_bytes();
// 4. Erase each instruction in the block, by replacing each
// with a safe instruction, lowering performance temporarily.
for (auto* dd = current; dd < last; dd++) {
// Get the patched decoder entry
auto* p = patched_decoder + (dd - exec.decoder_cache());
// Create a single-instruction bytecode to replace the original instruction
p->set_bytecode(RV32I_BC_FUNCBLOCK);
p->idxend = 0;
#ifdef RISCV_EXT_C
p->icount = 0;
#endif
auto instr = read_instruction(exec.exec_data(), patched_addr, exec.exec_end());
p->instr = instr.whole;
patched_addr += instr.length();

livepatch_bintr.push_back(dd);
}
// 5. The last instruction will be replaced with a binary translation
// function, which will be the last instruction in the block.
auto* p = patched_decoder + (last - exec.decoder_cache());
p->set_bytecode(RV32I_BC_TRANSLATOR);
p->instr = i;
p->idxend = 0;
#ifdef RISCV_EXT_C
p->icount = 0;
#endif
livepatch_bintr.push_back(last);
} else {
// Normal block-end hint that will be transformed into a translation
// bytecode if it passes a few more checks, later.
Expand All @@ -837,11 +839,23 @@ void CPU<W>::activate_dylib(const MachineOptions<W>& options, DecodedExecuteSegm
}
}

if (live_patch) {
// Move the patched decoder cache to the execute segment
exec.set_patched_decoder_cache(std::move(patched_decoder_cache), patched_decoder);
// Set regular decoder cache to the patched decoder cache
exec.set_decoder(patched_decoder);

// Atomically set a livepatch bytecode for each instruction that is patched
// It will swap out the current decoder with the patched one, and then continue.
for (auto* entry : livepatch_bintr) {
entry->set_bytecode(RV32I_BC_LIVEPATCH);
}
}

if (options.verbose_loader) {
printf("libriscv: Activated binary translation with %u mappings\n", nmappings);
if (live_patch) {
printf("libriscv: Live-patching enabled, %u block-ends, %u hot-patches (%u total)\n",
livepatch_counter_block_end, livepatch_counter_hotpatch, livepatch_counter);
printf("libriscv: Live-patching enabled, %u total\n", livepatch_counter);
}
}

Expand Down
Loading