Skip to content

Commit

Permalink
wasmtime: Annotate emit-clif output with source line numbers
Browse files Browse the repository at this point in the history
When we're compiling a WebAssembly module that contains a DWARF
`.debug_lines` section, this commit adds comments to the output of
`wasmtime compile --emit-clif` indicating which file/line/column each
block of CLIF instructions originated from. The DWARF info is currently
only parsed if the `WASMTIME_BACKTRACE_DETAILS=1` environment variable
is set or the equivalent `Config::wasm_backtrace_details` method has
been used to enable it.

This is useful when trying to understand why we're generating the code
we do when there's a lot of WebAssembly in a single function. That can
happen either because there's a lot of source code in that function, or
because the toolchain (e.g. LLVM) inlined a lot of other functions into
it before generating WebAssembly.
  • Loading branch information
jameysharp committed Jun 12, 2024
1 parent e852bce commit 52bed62
Show file tree
Hide file tree
Showing 2 changed files with 270 additions and 7 deletions.
55 changes: 48 additions & 7 deletions crates/cranelift/src/compiler.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,8 @@ use cranelift_codegen::isa::{
OwnedTargetIsa, TargetIsa,
};
use cranelift_codegen::print_errors::pretty_error;
use cranelift_codegen::Context;
use cranelift_codegen::{CompiledCode, MachStackMap};
use cranelift_codegen::write::{decorate_function, FuncWriter, PlainWriter};
use cranelift_codegen::{CompiledCode, Context, MachStackMap};
use cranelift_entity::{EntityRef, PrimaryMap};
use cranelift_frontend::FunctionBuilder;
use cranelift_wasm::{
Expand All @@ -28,7 +28,7 @@ use std::sync::{Arc, Mutex};
use wasmparser::{FuncValidatorAllocations, FunctionBody};
use wasmtime_environ::{
AddressMapSection, BuiltinFunctionIndex, CacheStore, CompileError, FlagValue, FunctionBodyData,
FunctionLoc, ModuleTranslation, ModuleTypesBuilder, PtrSize, RelocationTarget,
FunctionLoc, LineContext, ModuleTranslation, ModuleTypesBuilder, PtrSize, RelocationTarget,
StackMapInformation, TrapEncodingBuilder, Tunables, VMOffsets, WasmFunctionInfo,
};

Expand Down Expand Up @@ -224,14 +224,21 @@ impl wasmtime_environ::Compiler for Compiler {
)?;

if let Some(path) = &self.clif_dir {
use std::io::Write;

let mut path = path.to_path_buf();
path.push(format!("wasm_func_{}", func_index.as_u32()));
path.set_extension("clif");

let mut output = std::fs::File::create(path).unwrap();
write!(output, "{}", context.func.display()).unwrap();
let mut contents = String::new();
decorate_function(
&mut LineNumberWriter {
context: LineContext::new(&translation.debuginfo),
base: PlainWriter,
},
&mut contents,
&context.func,
)
.unwrap();
std::fs::write(path, contents).unwrap();
}

let (info, func) = compiler.finish_with_info(Some((&body, &self.tunables)))?;
Expand Down Expand Up @@ -919,6 +926,40 @@ impl FunctionCompiler<'_> {
}
}

struct LineNumberWriter<'a> {
context: LineContext<'a>,
base: PlainWriter,
}

impl FuncWriter for LineNumberWriter<'_> {
fn write_block_header(
&mut self,
w: &mut dyn std::fmt::Write,
func: &ir::Function,
block: ir::Block,
indent: usize,
) -> core::fmt::Result {
self.base.write_block_header(w, func, block, indent)
}

fn write_instruction(
&mut self,
w: &mut dyn std::fmt::Write,
func: &ir::Function,
aliases: &cranelift_entity::SecondaryMap<Value, Vec<Value>>,
inst: ir::Inst,
indent: usize,
) -> core::fmt::Result {
let srcloc = func.srcloc(inst);
if !srcloc.is_default() {
if let Some(line) = self.context.lookup(srcloc.bits().into()) {
writeln!(w, ";; {line}")?;
}
}
self.base.write_instruction(w, func, aliases, inst, indent)
}
}

fn mach_stack_maps_to_stack_maps(mach_stack_maps: &[MachStackMap]) -> Vec<StackMapInformation> {
// This is converting from Cranelift's representation of a stack map to
// Wasmtime's representation. They happen to align today but that may
Expand Down
222 changes: 222 additions & 0 deletions crates/environ/src/compile/module_environ.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,10 @@ use crate::{
WasmResult, WasmValType, WasmparserTypeConverter,
};
use anyhow::{bail, Result};
use core::cmp::Ordering;
use core::mem::replace;
use core::num::NonZeroU64;
use core::ops::Range;
use cranelift_entity::packed_option::ReservedValue;
use cranelift_entity::EntityRef;
use std::borrow::Cow;
Expand Down Expand Up @@ -124,6 +128,8 @@ pub struct FunctionBodyData<'a> {
#[allow(missing_docs)]
pub struct DebugInfoData<'a> {
pub dwarf: Dwarf<'a>,
line_units: Vec<LineUnit<'a>>,
line_sequences: Vec<LineSequence<'a>>,
pub name_section: NameSection<'a>,
pub wasm_file: WasmFileInfo,
pub debug_loc: gimli::DebugLoc<Reader<'a>>,
Expand All @@ -139,6 +145,17 @@ pub type Dwarf<'input> = gimli::Dwarf<Reader<'input>>;

type Reader<'input> = gimli::EndianSlice<'input, gimli::LittleEndian>;

#[derive(Debug)]
struct LineUnit<'input> {
pub line_program: gimli::CompleteLineProgram<Reader<'input>, usize>,
}

#[derive(Debug)]
struct LineSequence<'input> {
pub sequence: gimli::LineSequence<Reader<'input>>,
pub unit: usize,
}

#[derive(Debug, Default)]
#[allow(missing_docs)]
pub struct NameSection<'a> {
Expand All @@ -163,6 +180,160 @@ pub struct FunctionMetadata {
pub locals: Box<[(u32, WasmValType)]>,
}

/// A cache for quickly looking up DWARF file/line information given a source
/// location. In general, `addr2line`-style tools need to do linear searches
/// through the `.debug_lines` section to find a given address. However, if
/// you need to look up many addresses and they're usually in roughly ascending
/// order, then this structure can remember what you last looked up and scan
/// forward from there.
pub struct LineContext<'a> {
/// Reference to all the DWARF info we've extracted.
debuginfo: &'a DebugInfoData<'a>,
/// Address of the last row we looked up in the line program. Subsequent
/// queries in this range would produce identical file/line information so
/// we discard them to avoid clutter.
current: u64,
/// Range of addresses remaining in the last sequence we looked up. A
/// sequence contains some number of rows for addresses in ascending order,
/// and each row provides a filename, line number, and column number that
/// produced the instructions starting at the given address.
sequence: Range<u64>,
/// The next filename to use after `current`.
next_file: u64,
/// The next line number to use after `current`.
next_line: Option<NonZeroU64>,
/// The next column number to use after `current`.
next_column: gimli::ColumnType,
/// The remaining rows in the current sequence, or `None` if we haven't
/// looked up any sequence yet.
upcoming: Option<
gimli::LineRows<Reader<'a>, &'a gimli::CompleteLineProgram<Reader<'a>, usize>, usize>,
>,
}

impl<'a> LineContext<'a> {
/// Construct a new line-number cache that can look up addresses in the
/// given debug info.
pub fn new(debuginfo: &'a DebugInfoData<'a>) -> Self {
LineContext {
debuginfo,
current: 0,
sequence: 0..0,
next_file: 0,
next_line: None,
next_column: gimli::ColumnType::LeftEdge,
upcoming: None,
}
}

/// Reset the cache so that the next call to `lookup` will return something
/// even if it's the same result as the last call to `lookup`.
pub fn reset(&mut self) {
self.current = 0;
self.sequence = 0..0;
self.upcoming = None;
}

/// Given a WebAssembly bytecode source location, try to construct a string
/// describing the file and line in the original source code which produced
/// that instruction. Returns `None` if the address has the same file and
/// line number as the last time that `lookup` was called.
pub fn lookup(&mut self, address: u64) -> Option<String> {
// Wasmtime uses addresses relative to the start of the file, but DWARF
// for WebAssembly uses addresses relative to the start of the code
// section.
let address = address - self.debuginfo.wasm_file.code_section_offset;

// Skip this address if it's going to map to the same row as the last
// lookup.
if (self.current..self.sequence.start).contains(&address) {
return None;
}

// If we're still in the same sequence as the last lookup, we can skip
// the binary search to find the right sequence.
if !self.sequence.contains(&address) {
// Find a sequence whose range contains this address. Sequence
// ranges include their start address and exclude their end address.
// We previously sorted the `debuginfo.line_sequences` array and it
// should be true that there are no overlapping ranges.
let sequence_idx = match self.debuginfo.line_sequences.binary_search_by(|v| {
if address < v.sequence.start {
Ordering::Greater
} else if v.sequence.end <= address {
Ordering::Less
} else {
Ordering::Equal
}
}) {
Ok(i) => i,
// If we don't find any matching range, check whether we're
// following some previous successful `lookup` call; if so,
// report that we're no longer in that range and we don't know
// where we are any more.
Err(_) => {
return self.upcoming.is_some().then(|| {
self.reset();
"<unknown location>".to_string()
})
}
};

let sequence = &self.debuginfo.line_sequences[sequence_idx];
let unit = &self.debuginfo.line_units[sequence.unit];
self.upcoming = Some(unit.line_program.resume_from(&sequence.sequence));
self.sequence = sequence.sequence.start..sequence.sequence.end;
debug_assert!(
self.sequence.contains(&address),
"{:?} should contain {}",
self.sequence,
address
);
}

// Either we just set `self.upcoming` to `Some` above, or we're still in
// the same sequence as a previous call which must have set it.
let rows = self.upcoming.as_mut().unwrap();

// Walk the line program forward until we've gone past the new address,
// then take the last row we saw before that. The invariant for this
// loop is that self.next_{file,line,column} correspond to the row
// for the address in self.sequence.start, except on entry to the
// first iteration after we've started a new sequence, when those
// fields are all garbage. In that case, self.sequence.start equals
// next_row.address() so the loop is guaranteed to run a second time.
let mut file = 0;
let mut line = None;
let mut column = gimli::ColumnType::LeftEdge;
while address >= self.sequence.start {
let (_header, next_row) = rows.next_row().unwrap().unwrap();
self.current = replace(&mut self.sequence.start, next_row.address());
file = replace(&mut self.next_file, next_row.file_index());
line = replace(&mut self.next_line, next_row.line());
column = replace(&mut self.next_column, next_row.column());
}

let line = line.map_or(0, NonZeroU64::get);
let column = match column {
gimli::ColumnType::LeftEdge => 0,
gimli::ColumnType::Column(c) => c.get(),
};

if let Some(file) = rows.header().file(file) {
let mut path = PathBuf::new();
if let Some(gimli::AttributeValue::String(dir)) = file.directory(rows.header()) {
path.push(dir.to_string_lossy().as_ref());
}
if let gimli::AttributeValue::String(file) = file.path_name() {
path.push(file.to_string_lossy().as_ref());
}
Some(format!("{}:{line}:{column}", path.display()))
} else {
Some(format!("<unknown>:{line}:{column}"))
}
}
}

impl<'a, 'data> ModuleEnvironment<'a, 'data> {
/// Allocates the environment data structures.
pub fn new(
Expand Down Expand Up @@ -197,9 +368,60 @@ impl<'a, 'data> ModuleEnvironment<'a, 'data> {
self.translate_payload(payload?)?;
}

if self.tunables.parse_wasm_debuginfo {
self.extract_line_program()?;
}

Ok(self.result)
}

fn extract_line_program(&mut self) -> Result<()> {
// Line programs are grouped by compilation unit, and then may be split
// into sequences. In principle we could determine which compilation
// unit an address falls into and only search the line program in that
// unit, but DWARF producers don't always get those ranges correct, so
// we use the boundaries of the sequences themselves instead and look
// them up ignoring unit boundaries. However there is some per-unit data
// we need to hang on to in order to interpret each sequence, and that
// data is pretty large, so we indirect to it in a separate `line_units`
// vector.
let dwarf = &self.result.debuginfo.dwarf;
let mut line_units = Vec::new();
let mut all_sequences = Vec::new();
let mut iter = dwarf.units();
while let Some(header) = iter.next()? {
let unit = dwarf.unit(header)?;
if let Some(program) = unit.line_program.clone() {
let (line_program, sequences) = program.sequences()?;
let unit = line_units.len();
line_units.push(LineUnit { line_program });
all_sequences.extend(
sequences
.into_iter()
// FIXME: make gimli stop returning nonsense sequences
.filter(|sequence| sequence.start != 0)
.map(|sequence| LineSequence { sequence, unit }),
);
}
}

// Within each sequence DWARF specifies that the instruction
// addresses are in ascending order, but there are no guarantees
// across different sequences, which could even be interleaved
// across different compilation units. We sort the sequences so
// we can look them up easily as needed.
all_sequences.sort_unstable_by_key(|v| v.sequence.start);

// We expect that no sequence overlaps with any other.
debug_assert!(all_sequences
.windows(2)
.all(|w| w[0].sequence.end <= w[1].sequence.start));

self.result.debuginfo.line_units = line_units;
self.result.debuginfo.line_sequences = all_sequences;
Ok(())
}

fn translate_payload(&mut self, payload: Payload<'data>) -> Result<()> {
match payload {
Payload::Version {
Expand Down

0 comments on commit 52bed62

Please sign in to comment.