From 52bed628f6d885002c53b7ca644da675b6a5ba2d Mon Sep 17 00:00:00 2001 From: Jamey Sharp Date: Fri, 7 Jun 2024 09:44:10 -0700 Subject: [PATCH] wasmtime: Annotate emit-clif output with source line numbers When we're compiling a WebAssembly module that contains a DWARF `.debug_lines` section, this commit adds comments to the output of `wasmtime compile --emit-clif` indicating which file/line/column each block of CLIF instructions originated from. The DWARF info is currently only parsed if the `WASMTIME_BACKTRACE_DETAILS=1` environment variable is set or the equivalent `Config::wasm_backtrace_details` method has been used to enable it. This is useful when trying to understand why we're generating the code we do when there's a lot of WebAssembly in a single function. That can happen either because there's a lot of source code in that function, or because the toolchain (e.g. LLVM) inlined a lot of other functions into it before generating WebAssembly. --- crates/cranelift/src/compiler.rs | 55 ++++- crates/environ/src/compile/module_environ.rs | 222 +++++++++++++++++++ 2 files changed, 270 insertions(+), 7 deletions(-) diff --git a/crates/cranelift/src/compiler.rs b/crates/cranelift/src/compiler.rs index 4262f5f8730a..3d33a58d923d 100644 --- a/crates/cranelift/src/compiler.rs +++ b/crates/cranelift/src/compiler.rs @@ -10,8 +10,8 @@ use cranelift_codegen::isa::{ OwnedTargetIsa, TargetIsa, }; use cranelift_codegen::print_errors::pretty_error; -use cranelift_codegen::Context; -use cranelift_codegen::{CompiledCode, MachStackMap}; +use cranelift_codegen::write::{decorate_function, FuncWriter, PlainWriter}; +use cranelift_codegen::{CompiledCode, Context, MachStackMap}; use cranelift_entity::{EntityRef, PrimaryMap}; use cranelift_frontend::FunctionBuilder; use cranelift_wasm::{ @@ -28,7 +28,7 @@ use std::sync::{Arc, Mutex}; use wasmparser::{FuncValidatorAllocations, FunctionBody}; use wasmtime_environ::{ AddressMapSection, BuiltinFunctionIndex, CacheStore, CompileError, FlagValue, FunctionBodyData, - FunctionLoc, ModuleTranslation, ModuleTypesBuilder, PtrSize, RelocationTarget, + FunctionLoc, LineContext, ModuleTranslation, ModuleTypesBuilder, PtrSize, RelocationTarget, StackMapInformation, TrapEncodingBuilder, Tunables, VMOffsets, WasmFunctionInfo, }; @@ -224,14 +224,21 @@ impl wasmtime_environ::Compiler for Compiler { )?; if let Some(path) = &self.clif_dir { - use std::io::Write; - let mut path = path.to_path_buf(); path.push(format!("wasm_func_{}", func_index.as_u32())); path.set_extension("clif"); - let mut output = std::fs::File::create(path).unwrap(); - write!(output, "{}", context.func.display()).unwrap(); + let mut contents = String::new(); + decorate_function( + &mut LineNumberWriter { + context: LineContext::new(&translation.debuginfo), + base: PlainWriter, + }, + &mut contents, + &context.func, + ) + .unwrap(); + std::fs::write(path, contents).unwrap(); } let (info, func) = compiler.finish_with_info(Some((&body, &self.tunables)))?; @@ -919,6 +926,40 @@ impl FunctionCompiler<'_> { } } +struct LineNumberWriter<'a> { + context: LineContext<'a>, + base: PlainWriter, +} + +impl FuncWriter for LineNumberWriter<'_> { + fn write_block_header( + &mut self, + w: &mut dyn std::fmt::Write, + func: &ir::Function, + block: ir::Block, + indent: usize, + ) -> core::fmt::Result { + self.base.write_block_header(w, func, block, indent) + } + + fn write_instruction( + &mut self, + w: &mut dyn std::fmt::Write, + func: &ir::Function, + aliases: &cranelift_entity::SecondaryMap>, + inst: ir::Inst, + indent: usize, + ) -> core::fmt::Result { + let srcloc = func.srcloc(inst); + if !srcloc.is_default() { + if let Some(line) = self.context.lookup(srcloc.bits().into()) { + writeln!(w, ";; {line}")?; + } + } + self.base.write_instruction(w, func, aliases, inst, indent) + } +} + fn mach_stack_maps_to_stack_maps(mach_stack_maps: &[MachStackMap]) -> Vec { // This is converting from Cranelift's representation of a stack map to // Wasmtime's representation. They happen to align today but that may diff --git a/crates/environ/src/compile/module_environ.rs b/crates/environ/src/compile/module_environ.rs index 111e7f094970..8c5debebe20e 100644 --- a/crates/environ/src/compile/module_environ.rs +++ b/crates/environ/src/compile/module_environ.rs @@ -10,6 +10,10 @@ use crate::{ WasmResult, WasmValType, WasmparserTypeConverter, }; use anyhow::{bail, Result}; +use core::cmp::Ordering; +use core::mem::replace; +use core::num::NonZeroU64; +use core::ops::Range; use cranelift_entity::packed_option::ReservedValue; use cranelift_entity::EntityRef; use std::borrow::Cow; @@ -124,6 +128,8 @@ pub struct FunctionBodyData<'a> { #[allow(missing_docs)] pub struct DebugInfoData<'a> { pub dwarf: Dwarf<'a>, + line_units: Vec>, + line_sequences: Vec>, pub name_section: NameSection<'a>, pub wasm_file: WasmFileInfo, pub debug_loc: gimli::DebugLoc>, @@ -139,6 +145,17 @@ pub type Dwarf<'input> = gimli::Dwarf>; type Reader<'input> = gimli::EndianSlice<'input, gimli::LittleEndian>; +#[derive(Debug)] +struct LineUnit<'input> { + pub line_program: gimli::CompleteLineProgram, usize>, +} + +#[derive(Debug)] +struct LineSequence<'input> { + pub sequence: gimli::LineSequence>, + pub unit: usize, +} + #[derive(Debug, Default)] #[allow(missing_docs)] pub struct NameSection<'a> { @@ -163,6 +180,160 @@ pub struct FunctionMetadata { pub locals: Box<[(u32, WasmValType)]>, } +/// A cache for quickly looking up DWARF file/line information given a source +/// location. In general, `addr2line`-style tools need to do linear searches +/// through the `.debug_lines` section to find a given address. However, if +/// you need to look up many addresses and they're usually in roughly ascending +/// order, then this structure can remember what you last looked up and scan +/// forward from there. +pub struct LineContext<'a> { + /// Reference to all the DWARF info we've extracted. + debuginfo: &'a DebugInfoData<'a>, + /// Address of the last row we looked up in the line program. Subsequent + /// queries in this range would produce identical file/line information so + /// we discard them to avoid clutter. + current: u64, + /// Range of addresses remaining in the last sequence we looked up. A + /// sequence contains some number of rows for addresses in ascending order, + /// and each row provides a filename, line number, and column number that + /// produced the instructions starting at the given address. + sequence: Range, + /// The next filename to use after `current`. + next_file: u64, + /// The next line number to use after `current`. + next_line: Option, + /// The next column number to use after `current`. + next_column: gimli::ColumnType, + /// The remaining rows in the current sequence, or `None` if we haven't + /// looked up any sequence yet. + upcoming: Option< + gimli::LineRows, &'a gimli::CompleteLineProgram, usize>, usize>, + >, +} + +impl<'a> LineContext<'a> { + /// Construct a new line-number cache that can look up addresses in the + /// given debug info. + pub fn new(debuginfo: &'a DebugInfoData<'a>) -> Self { + LineContext { + debuginfo, + current: 0, + sequence: 0..0, + next_file: 0, + next_line: None, + next_column: gimli::ColumnType::LeftEdge, + upcoming: None, + } + } + + /// Reset the cache so that the next call to `lookup` will return something + /// even if it's the same result as the last call to `lookup`. + pub fn reset(&mut self) { + self.current = 0; + self.sequence = 0..0; + self.upcoming = None; + } + + /// Given a WebAssembly bytecode source location, try to construct a string + /// describing the file and line in the original source code which produced + /// that instruction. Returns `None` if the address has the same file and + /// line number as the last time that `lookup` was called. + pub fn lookup(&mut self, address: u64) -> Option { + // Wasmtime uses addresses relative to the start of the file, but DWARF + // for WebAssembly uses addresses relative to the start of the code + // section. + let address = address - self.debuginfo.wasm_file.code_section_offset; + + // Skip this address if it's going to map to the same row as the last + // lookup. + if (self.current..self.sequence.start).contains(&address) { + return None; + } + + // If we're still in the same sequence as the last lookup, we can skip + // the binary search to find the right sequence. + if !self.sequence.contains(&address) { + // Find a sequence whose range contains this address. Sequence + // ranges include their start address and exclude their end address. + // We previously sorted the `debuginfo.line_sequences` array and it + // should be true that there are no overlapping ranges. + let sequence_idx = match self.debuginfo.line_sequences.binary_search_by(|v| { + if address < v.sequence.start { + Ordering::Greater + } else if v.sequence.end <= address { + Ordering::Less + } else { + Ordering::Equal + } + }) { + Ok(i) => i, + // If we don't find any matching range, check whether we're + // following some previous successful `lookup` call; if so, + // report that we're no longer in that range and we don't know + // where we are any more. + Err(_) => { + return self.upcoming.is_some().then(|| { + self.reset(); + "".to_string() + }) + } + }; + + let sequence = &self.debuginfo.line_sequences[sequence_idx]; + let unit = &self.debuginfo.line_units[sequence.unit]; + self.upcoming = Some(unit.line_program.resume_from(&sequence.sequence)); + self.sequence = sequence.sequence.start..sequence.sequence.end; + debug_assert!( + self.sequence.contains(&address), + "{:?} should contain {}", + self.sequence, + address + ); + } + + // Either we just set `self.upcoming` to `Some` above, or we're still in + // the same sequence as a previous call which must have set it. + let rows = self.upcoming.as_mut().unwrap(); + + // Walk the line program forward until we've gone past the new address, + // then take the last row we saw before that. The invariant for this + // loop is that self.next_{file,line,column} correspond to the row + // for the address in self.sequence.start, except on entry to the + // first iteration after we've started a new sequence, when those + // fields are all garbage. In that case, self.sequence.start equals + // next_row.address() so the loop is guaranteed to run a second time. + let mut file = 0; + let mut line = None; + let mut column = gimli::ColumnType::LeftEdge; + while address >= self.sequence.start { + let (_header, next_row) = rows.next_row().unwrap().unwrap(); + self.current = replace(&mut self.sequence.start, next_row.address()); + file = replace(&mut self.next_file, next_row.file_index()); + line = replace(&mut self.next_line, next_row.line()); + column = replace(&mut self.next_column, next_row.column()); + } + + let line = line.map_or(0, NonZeroU64::get); + let column = match column { + gimli::ColumnType::LeftEdge => 0, + gimli::ColumnType::Column(c) => c.get(), + }; + + if let Some(file) = rows.header().file(file) { + let mut path = PathBuf::new(); + if let Some(gimli::AttributeValue::String(dir)) = file.directory(rows.header()) { + path.push(dir.to_string_lossy().as_ref()); + } + if let gimli::AttributeValue::String(file) = file.path_name() { + path.push(file.to_string_lossy().as_ref()); + } + Some(format!("{}:{line}:{column}", path.display())) + } else { + Some(format!(":{line}:{column}")) + } + } +} + impl<'a, 'data> ModuleEnvironment<'a, 'data> { /// Allocates the environment data structures. pub fn new( @@ -197,9 +368,60 @@ impl<'a, 'data> ModuleEnvironment<'a, 'data> { self.translate_payload(payload?)?; } + if self.tunables.parse_wasm_debuginfo { + self.extract_line_program()?; + } + Ok(self.result) } + fn extract_line_program(&mut self) -> Result<()> { + // Line programs are grouped by compilation unit, and then may be split + // into sequences. In principle we could determine which compilation + // unit an address falls into and only search the line program in that + // unit, but DWARF producers don't always get those ranges correct, so + // we use the boundaries of the sequences themselves instead and look + // them up ignoring unit boundaries. However there is some per-unit data + // we need to hang on to in order to interpret each sequence, and that + // data is pretty large, so we indirect to it in a separate `line_units` + // vector. + let dwarf = &self.result.debuginfo.dwarf; + let mut line_units = Vec::new(); + let mut all_sequences = Vec::new(); + let mut iter = dwarf.units(); + while let Some(header) = iter.next()? { + let unit = dwarf.unit(header)?; + if let Some(program) = unit.line_program.clone() { + let (line_program, sequences) = program.sequences()?; + let unit = line_units.len(); + line_units.push(LineUnit { line_program }); + all_sequences.extend( + sequences + .into_iter() + // FIXME: make gimli stop returning nonsense sequences + .filter(|sequence| sequence.start != 0) + .map(|sequence| LineSequence { sequence, unit }), + ); + } + } + + // Within each sequence DWARF specifies that the instruction + // addresses are in ascending order, but there are no guarantees + // across different sequences, which could even be interleaved + // across different compilation units. We sort the sequences so + // we can look them up easily as needed. + all_sequences.sort_unstable_by_key(|v| v.sequence.start); + + // We expect that no sequence overlaps with any other. + debug_assert!(all_sequences + .windows(2) + .all(|w| w[0].sequence.end <= w[1].sequence.start)); + + self.result.debuginfo.line_units = line_units; + self.result.debuginfo.line_sequences = all_sequences; + Ok(()) + } + fn translate_payload(&mut self, payload: Payload<'data>) -> Result<()> { match payload { Payload::Version {