diff --git a/src/PDFonts.jl b/src/PDFonts.jl index 94399df..043590b 100644 --- a/src/PDFonts.jl +++ b/src/PDFonts.jl @@ -22,6 +22,14 @@ const endbfrange = b"endbfrange" const begincodespacerange = b"begincodespacerange" const endcodespacerange = b"endcodespacerange" +# EXTRACTION_MODE determines how text extraction of textboxes is handled +# - `:spaces` (default) +# all white spaces are handled as a single space character +# - `:tabs` +# non-space white spaces are handled as tab characters +# - `:boxes` +# text is split into several textboxes with respective coordinates +const EXTRACTION_MODE = Ref(:spaces) # :spaces, :tabs, :boxes mutable struct CMap code_space::IntervalTree{UInt8, @@ -571,13 +579,26 @@ function get_TextBox(ss::Vector{Union{CosXString, CosLiteralString, totalw = 0f0 tj = 0f0 text = "" + offset = 0f0 + params = Tuple{String, Float32, Float32, Float32}[] for s in ss if s isa CosXString || s isa CosLiteralString prev_char = INIT_CODE(pdfont.widths) t = String(get_encoded_string(s, pdfont)) - if (-tj) > 180 && length(t) > 0 && t[1] != ' ' && - length(text) > 0 && text[end] != ' ' - text *= " " + if (-tj) > 180 + if EXTRACTION_MODE[] == :spaces + if length(t) > 0 && t[1] != ' ' && length(text) > 0 && text[end] != ' ' + text *= " " + end + elseif EXTRACTION_MODE[] == :tabs + text *= "\t" + elseif EXTRACTION_MODE[] == :boxes + push!(params, (text, totalw * th, tfs, offset * th)) + offset += totalw - tj * tfs / 1000f0 + text = "" + tj = 0f0 + totalw = 0f0 + end end text *= t barr = Vector{UInt8}(s) @@ -588,8 +609,8 @@ function get_TextBox(ss::Vector{Union{CosXString, CosLiteralString, tj = s |> get |> Float32 end end - totalw *= th - return text, totalw, tfs + push!(params, (text, totalw * th, tfs, offset * th)) + return params end function get_character_width(cid::UInt16, w::CIDWidth) diff --git a/src/PDPageElement.jl b/src/PDPageElement.jl index d042345..ca9d5f6 100644 --- a/src/PDPageElement.jl +++ b/src/PDPageElement.jl @@ -693,20 +693,27 @@ end fontname, font = eval_unicode_mapping(tr, state) heap = get(state, :text_layout, Vector{TextLayout}) - text, w, h = get_TextBox(tr.ss, font, tfs, tc, tw, th) + boxparams = get_TextBox(tr.ss, font, tfs, tc, tw, th) + h = boxparams[1][3] d = get(state, :h_profile, Dict{Int, Int}) ih = round(Int, h*10) - d[ih] = get(d, ih, 0) + length(text) - - tb = [0f0 0f0 1f0; w 0f0 1f0; w h 1f0; 0f0 h 1f0]*trm + if !get(state, :in_artifact, false) - tl = TextLayout(tb[1,1], tb[1,2], tb[2,1], tb[2,2], - tb[3,1], tb[3,2], tb[4,1], tb[4,2], - text, fontname, font.flags) - push!(heap, tl) + for params in boxparams + text = params[1] + w = params[2] + offset = params[4] + d[ih] = get(d, ih, 0) + length(text) + tb = [offset 0f0 1f0; offset + w 0f0 1f0; offset + w h 1f0; offset h 1f0]*trm + tl = TextLayout(tb[1,1], tb[1,2], tb[2,1], tb[2,2], + tb[3,1], tb[3,2], tb[4,1], tb[4,2], + text, fontname, font.flags) + push!(heap, tl) + end end - offset_text_pos!(w, 0f0, state) + totalw = boxparams[end][4] + boxparams[end][2] + offset_text_pos!(totalw, 0f0, state) return state end