From 1287eb1bf40a0876e1407aae329d1411cf13568a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Vouillon?= Date: Tue, 26 May 2026 23:35:23 +0200 Subject: [PATCH 1/6] Wasm: route each Binaryen pass directly to its output file Restructure [link_and_optimize] as a [link] producer followed by a list of transformer passes ([dce], [optimize]), threaded through an explicit [step_io] (wasm file + optional sourcemap). A small runner sends every pass but the last to a fresh temp file, and the last straight to [output_file] / [opt_sourcemap_file]. Pure refactoring: the same passes run in the same order for every profile, so the output is unchanged. The point is that whichever pass ends the pipeline now writes the final file itself, so a later change that drops [optimize] at --opt 1 needs no copy to land the link's or DCE's output at the right path. --- compiler/bin-wasm_of_ocaml/compile.ml | 149 ++++++++++++++++---------- 1 file changed, 90 insertions(+), 59 deletions(-) diff --git a/compiler/bin-wasm_of_ocaml/compile.ml b/compiler/bin-wasm_of_ocaml/compile.ml index 73d6df565b..e8e0111dae 100644 --- a/compiler/bin-wasm_of_ocaml/compile.ml +++ b/compiler/bin-wasm_of_ocaml/compile.ml @@ -133,6 +133,13 @@ let build_runtime ~runtime_file = ~inputs ~output_file:runtime_file +(* The output of one Binaryen pipeline step: the wasm file and its + optional sourcemap. A step reads one [step_io] and writes another. *) +type step_io = + { file : string + ; opt_sm : string option + } + let link_and_optimize ~profile ~sourcemap_root @@ -150,77 +157,101 @@ let link_and_optimize | Some _ | None -> opt_sourcemap in let enable_source_maps = Option.is_some opt_sourcemap_file in + (* Allocate a fresh temp [step_io] (wasm file + sourcemap) for an + intermediate step, scoped to the continuation. *) + let with_temp_step base k = + Fs.with_intermediate_file (Filename.temp_file base ".wasm") + @@ fun file -> + opt_with + Fs.with_intermediate_file + (if enable_source_maps then Some (Filename.temp_file base ".wasm.map") else None) + @@ fun opt_sm -> k { file; opt_sm } + in Fs.with_intermediate_file (Filename.temp_file "runtime" ".wasm") @@ fun runtime_file -> build_runtime ~runtime_file; - Fs.with_intermediate_file (Filename.temp_file "wasm-merged" ".wasm") - @@ fun temp_file -> - opt_with - Fs.with_intermediate_file - (if enable_source_maps - then Some (Filename.temp_file "wasm-merged" ".wasm.map") - else None) - @@ fun opt_temp_sourcemap -> - (with_runtime_files ~runtime_wasm_files - @@ fun runtime_inputs -> - let t = Timer.make ~get_time:Unix.time () in - Binaryen.link - ~inputs: - ({ Binaryen.module_name = "env"; file = runtime_file; source_map_file = None } - :: runtime_inputs - @ List.map - ~f:(fun (file, source_map_file) -> - { Binaryen.module_name = "OCaml"; file; source_map_file }) - wat_files) - ~opt_output_sourcemap:opt_temp_sourcemap - ~output_file:temp_file - (); - if binaryen_times () then Format.eprintf " binaryen link: %a@." Timer.print t); - - let optimize_and_finish ~opt_input_sourcemap ~input_file primitives = + (* The producer: merge the runtime and the compiled units. *) + let link ~(output : step_io) = + with_runtime_files ~runtime_wasm_files + @@ fun runtime_inputs -> let t = Timer.make ~get_time:Unix.time () in - Binaryen.optimize - ~profile - ~opt_input_sourcemap - ~opt_output_sourcemap:opt_sourcemap - ~input_file - ~output_file + Binaryen.link + ~inputs: + ({ Binaryen.module_name = "env"; file = runtime_file; source_map_file = None } + :: runtime_inputs + @ List.map + ~f:(fun (file, source_map_file) -> + { Binaryen.module_name = "OCaml"; file; source_map_file }) + wat_files) + ~opt_output_sourcemap:output.opt_sm + ~output_file:output.file (); - if binaryen_times () then Format.eprintf " binaryen opt: %a@." Timer.print t; - Option.iter - ~f:(update_sourcemap ~sourcemap_root ~sourcemap_don't_inline_content) - opt_sourcemap_file; - primitives + if binaryen_times () then Format.eprintf " binaryen link: %a@." Timer.print t; + Linker.list_all () in - if dynlink - then - optimize_and_finish - ~opt_input_sourcemap:opt_temp_sourcemap - ~input_file:temp_file - (Linker.list_all ()) - else - Fs.with_intermediate_file (Filename.temp_file "wasm-dce" ".wasm") - @@ fun temp_file' -> - opt_with - Fs.with_intermediate_file - (if enable_source_maps - then Some (Filename.temp_file "wasm-dce" ".wasm.map") - else None) - @@ fun opt_temp_sourcemap' -> + (* A transformer reads the previous step's output and writes the next; + it returns the primitive list known after it runs. [dce] computes a + pruned list, [optimize] passes the incoming one through. *) + let dce ~(input : step_io) ~(output : step_io) ~primitives:_ = let t = Timer.make ~get_time:Unix.time () in let primitives = Binaryen.dead_code_elimination ~dependencies:Runtime_files.dependencies - ~opt_input_sourcemap:opt_temp_sourcemap - ~opt_output_sourcemap:opt_temp_sourcemap' - ~input_file:temp_file - ~output_file:temp_file' + ~opt_input_sourcemap:input.opt_sm + ~opt_output_sourcemap:output.opt_sm + ~input_file:input.file + ~output_file:output.file in if binaryen_times () then Format.eprintf " binaryen dce: %a@." Timer.print t; - optimize_and_finish - ~opt_input_sourcemap:opt_temp_sourcemap' - ~input_file:temp_file' - primitives + primitives + in + let optimize ~(input : step_io) ~(output : step_io) ~primitives = + let t = Timer.make ~get_time:Unix.time () in + Binaryen.optimize + ~profile + ~opt_input_sourcemap:input.opt_sm + ~opt_output_sourcemap:output.opt_sm + ~input_file:input.file + ~output_file:output.file + (); + if binaryen_times () then Format.eprintf " binaryen opt: %a@." Timer.print t; + primitives + in + (* The pipeline shape, decided in one place: link, then DCE unless we are + building a dynamically-linkable unit, then wasm-opt. *) + let transformers = + List.filter_map + ~f:(fun x -> x) + [ (if dynlink then None else Some ("wasm-dce", dce)) + ; Some ("wasm-opt", optimize) + ] + in + let final = { file = output_file; opt_sm = opt_sourcemap_file } in + (* Run the transformers in order: each but the last writes to a temp + file; the last writes straight to [final], so nothing is ever copied + through memory just to land at the output path. *) + let rec run_transformers ~input ~primitives = function + | [] -> primitives + | [ (_, last) ] -> last ~input ~output:final ~primitives + | (name, step) :: rest -> + with_temp_step name + @@ fun output -> + let primitives = step ~input ~output ~primitives in + run_transformers ~input:output ~primitives rest + in + let primitives = + match transformers with + | [] -> link ~output:final + | _ -> + with_temp_step "wasm-merged" + @@ fun linked -> + let primitives = link ~output:linked in + run_transformers ~input:linked ~primitives transformers + in + Option.iter + ~f:(update_sourcemap ~sourcemap_root ~sourcemap_don't_inline_content) + opt_sourcemap_file; + primitives let link_runtime ~profile runtime_wasm_files output_file = if List.is_empty runtime_wasm_files From b1235914821270c1cbe9d079c62dff3d3cfeb7a8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Vouillon?= Date: Tue, 26 May 2026 23:36:17 +0200 Subject: [PATCH 2/6] Wasm: skip wasm-opt at --opt 1 At --opt 1, drop the [optimize] (wasm-opt) pass from the pipeline, both in whole-program linking (remove it from the transformer list) and in separate compilation (write the unit's wasm directly instead of routing it through [Binaryen.optimize]). Our own passes handle what wasm-opt would have done; at --opt 2/3 wasm-opt still runs. --- compiler/bin-wasm_of_ocaml/compile.ml | 73 +++++++++++++++++---------- 1 file changed, 45 insertions(+), 28 deletions(-) diff --git a/compiler/bin-wasm_of_ocaml/compile.ml b/compiler/bin-wasm_of_ocaml/compile.ml index e8e0111dae..59449464f7 100644 --- a/compiler/bin-wasm_of_ocaml/compile.ml +++ b/compiler/bin-wasm_of_ocaml/compile.ml @@ -217,13 +217,16 @@ let link_and_optimize if binaryen_times () then Format.eprintf " binaryen opt: %a@." Timer.print t; primitives in - (* The pipeline shape, decided in one place: link, then DCE unless we are - building a dynamically-linkable unit, then wasm-opt. *) + (* The pipeline shape, decided in one place: DCE unless we are building a + dynamically-linkable unit; wasm-opt only above --opt 1 (at --opt 1 we + rely on our own passes instead). *) let transformers = List.filter_map ~f:(fun x -> x) [ (if dynlink then None else Some ("wasm-dce", dce)) - ; Some ("wasm-opt", optimize) + ; (match (profile : Profile.t) with + | O1 -> None + | O2 | O3 -> Some ("wasm-opt", optimize)) ] in let final = { file = output_file; opt_sm = opt_sourcemap_file } in @@ -610,31 +613,45 @@ let run else None) @@ fun opt_tmp_map_file -> let unit_data, shapes = - Fs.with_intermediate_file (Filename.temp_file unit_name ".wasm") - @@ fun input_file -> - opt_with - Fs.with_intermediate_file - (if enable_source_maps - then Some (Filename.temp_file unit_name ".wasm.map") - else None) - @@ fun opt_input_sourcemap -> - let fragments, shapes = - output - code - ~wat_file: - (Filename.concat (Filename.dirname output_file) (unit_name ^ ".wat")) - ~unit_name:(Some unit_name) - ~file:input_file - ~opt_source_map_file:opt_input_sourcemap - in - Binaryen.optimize - ~profile - ~opt_input_sourcemap - ~opt_output_sourcemap:opt_tmp_map_file - ~input_file - ~output_file:tmp_wasm_file - (); - { Link.unit_name; unit_info; fragments }, shapes + match profile with + | Profile.O1 -> + (* At O1, skip Binaryen.optimize — write directly *) + let fragments, shapes = + output + code + ~wat_file: + (Filename.concat (Filename.dirname output_file) (unit_name ^ ".wat")) + ~unit_name:(Some unit_name) + ~file:tmp_wasm_file + ~opt_source_map_file:opt_tmp_map_file + in + { Link.unit_name; unit_info; fragments }, shapes + | O2 | O3 -> + Fs.with_intermediate_file (Filename.temp_file unit_name ".wasm") + @@ fun input_file -> + opt_with + Fs.with_intermediate_file + (if enable_source_maps + then Some (Filename.temp_file unit_name ".wasm.map") + else None) + @@ fun opt_input_sourcemap -> + let fragments, shapes = + output + code + ~wat_file: + (Filename.concat (Filename.dirname output_file) (unit_name ^ ".wat")) + ~unit_name:(Some unit_name) + ~file:input_file + ~opt_source_map_file:opt_input_sourcemap + in + Binaryen.optimize + ~profile + ~opt_input_sourcemap + ~opt_output_sourcemap:opt_tmp_map_file + ~input_file + ~output_file:tmp_wasm_file + (); + { Link.unit_name; unit_info; fragments }, shapes in cont unit_data unit_name tmp_wasm_file opt_tmp_map_file shapes cmi_files in From ca36b66d7461da85f3c51cfc5b10b4701930c220 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Vouillon?= Date: Tue, 21 Apr 2026 14:06:58 +0200 Subject: [PATCH 3/6] Wasm: coalesce locals Add a [Var_coalescing] pass: a linear-scan variable coalescing pass for the Wasm backend, mirroring [Js_variable_coalescing]. When --opt 1 skips Binaryen we need our own pass to reuse locals; at --opt 2/3 it is off and Binaryen still handles coalescing. Runs in [post_process_function_body]; gated on [O1] and the new [wasm-var-coalescing] flag. --- compiler/bin-wasm_of_ocaml/compile.ml | 2 + compiler/lib-wasm/gc_target.ml | 9 +- compiler/lib-wasm/generate.ml | 83 ++- compiler/lib-wasm/generate.mli | 3 +- compiler/lib-wasm/target_sig.ml | 4 +- compiler/lib-wasm/var_coalescing.ml | 941 ++++++++++++++++++++++++++ compiler/lib-wasm/var_coalescing.mli | 36 + compiler/lib/config.ml | 2 + compiler/lib/config.mli | 2 + 9 files changed, 1057 insertions(+), 25 deletions(-) create mode 100644 compiler/lib-wasm/var_coalescing.ml create mode 100644 compiler/lib-wasm/var_coalescing.mli diff --git a/compiler/bin-wasm_of_ocaml/compile.ml b/compiler/bin-wasm_of_ocaml/compile.ml index 59449464f7..b4b3772301 100644 --- a/compiler/bin-wasm_of_ocaml/compile.ml +++ b/compiler/bin-wasm_of_ocaml/compile.ml @@ -309,6 +309,7 @@ let generate_prelude ~out_file = let context = Generate.start () in let _ = Generate.f + ~profile ~context ~unit_name:(Some "prelude") ~live_vars:variable_uses @@ -522,6 +523,7 @@ let run let context = Generate.start () in let toplevel_name, generated_js = Generate.f + ~profile ~context ~unit_name ~live_vars:variable_uses diff --git a/compiler/lib-wasm/gc_target.ml b/compiler/lib-wasm/gc_target.ml index 7b8af02e43..5bd35587d1 100644 --- a/compiler/lib-wasm/gc_target.ml +++ b/compiler/lib-wasm/gc_target.ml @@ -2056,7 +2056,14 @@ let handle_exceptions ~result_typ ~fall_through ~context body x exn_handler = let* () = no_event in exn_handler ~result_typ ~fall_through ~context) -let post_process_function_body = Initialize_locals.f +let post_process_function_body ~profile ~param_names ~param_types ~locals body = + let locals, body = + match (profile : Profile.t) with + | O1 when Config.Flag.wasm_var_coalescing () -> + Var_coalescing.f ~param_names ~param_types ~locals body + | O1 | O2 | O3 -> locals, body + in + Initialize_locals.f ~param_names ~locals body let entry_point ~toplevel_fun = let code = diff --git a/compiler/lib-wasm/generate.ml b/compiler/lib-wasm/generate.ml index 024ce84c0c..9a007b6ccb 100644 --- a/compiler/lib-wasm/generate.ml +++ b/compiler/lib-wasm/generate.ml @@ -2131,6 +2131,7 @@ module Generate (Target : Target_sig.S) = struct ~context let translate_function + ~profile p ctx name_opt @@ -2336,7 +2337,32 @@ module Generate (Target : Target_sig.S) = struct | Some loc -> event loc | None -> return ()) in - let locals, body = post_process_function_body ~param_names ~locals body in + let signature = + match name_opt with + | None -> Type.primitive_type param_count + | Some f -> + if Typing.can_unbox_parameters ctx.fun_info f + then + { W.params = + List.map + ~f:(fun x -> + Option.value + ~default:Type.value + (unboxed_type (Typing.var_type ctx.types x))) + params + @ [ Type.value ] + ; result = [ Option.value ~default:Type.value (unboxed_type return_type) ] + } + else Type.func_type (param_count - 1) + in + let locals, body = + post_process_function_body + ~profile + ~param_names + ~param_types:signature.params + ~locals + body + in W.Function { name = (match name_opt with @@ -2347,23 +2373,7 @@ module Generate (Target : Target_sig.S) = struct | None -> Option.map ~f:(fun name -> name ^ ".init") unit_name | Some _ -> None) ; typ = None - ; signature = - (match name_opt with - | None -> Type.primitive_type param_count - | Some f -> - if Typing.can_unbox_parameters ctx.fun_info f - then - { W.params = - List.map - ~f:(fun x -> - Option.value - ~default:Type.value - (unboxed_type (Typing.var_type ctx.types x))) - params - @ [ Type.value ] - ; result = [ Option.value ~default:Type.value (unboxed_type return_type) ] - } - else Type.func_type (param_count - 1)) + ; signature ; param_names ; locals ; body @@ -2427,6 +2437,7 @@ module Generate (Target : Target_sig.S) = struct add_start_function ~context (init_function ~context ~to_link) let f + ~profile ~context:global_context ~unit_name (p : Code.program) @@ -2458,7 +2469,16 @@ module Generate (Target : Target_sig.S) = struct Code.fold_closures_outermost_first p (fun name_opt params cont cloc -> - translate_function p ctx name_opt ~toplevel_name ~unit_name params cont cloc) + translate_function + ~profile + p + ctx + name_opt + ~toplevel_name + ~unit_name + params + cont + cloc) [] in let functions = @@ -2574,7 +2594,15 @@ let init = G.init let start () = make_context ~value_type:Gc_target.Type.value -let f ~context ~unit_name p ~live_vars ~in_cps ~deadcode_sentinel ~global_flow_data = +let f + ~profile + ~context + ~unit_name + p + ~live_vars + ~in_cps + ~deadcode_sentinel + ~global_flow_data = let global_flow_state, global_flow_info = global_flow_data in let fun_info = Call_graph_analysis.f p global_flow_info in let types = @@ -2584,7 +2612,16 @@ let f ~context ~unit_name p ~live_vars ~in_cps ~deadcode_sentinel ~global_flow_d let p = Structure.norm p in let p = fix_switch_branches p in let res = - G.f ~context ~unit_name ~live_vars ~in_cps ~global_flow_info ~fun_info ~types p + G.f + ~profile + ~context + ~unit_name + ~live_vars + ~in_cps + ~global_flow_info + ~fun_info + ~types + p in if times () then Format.eprintf " code gen.: %a@." Timer.print t; res @@ -2608,12 +2645,14 @@ let wasm_output ch ~opt_source_map_file ~context = if times () then Format.eprintf " output: %a@." Timer.print t let compile ~unit_name code = + let profile = Profile.O1 in let Driver.{ program; variable_uses; in_cps; deadcode_sentinel; _ }, global_flow_data = - Driver.optimize_for_wasm ~shapes:false ~profile:O1 code + Driver.optimize_for_wasm ~shapes:false ~profile code in let context = start () in let toplevel_name, fragments = f + ~profile ~context ~unit_name ~live_vars:variable_uses diff --git a/compiler/lib-wasm/generate.mli b/compiler/lib-wasm/generate.mli index 47cfb17095..1ac8ce29b3 100644 --- a/compiler/lib-wasm/generate.mli +++ b/compiler/lib-wasm/generate.mli @@ -21,7 +21,8 @@ val init : unit -> unit val start : unit -> Code_generation.context val f : - context:Code_generation.context + profile:Profile.t + -> context:Code_generation.context -> unit_name:string option -> Code.program -> live_vars:int array diff --git a/compiler/lib-wasm/target_sig.ml b/compiler/lib-wasm/target_sig.ml index f13e4aac5c..360dd4645e 100644 --- a/compiler/lib-wasm/target_sig.ml +++ b/compiler/lib-wasm/target_sig.ml @@ -297,7 +297,9 @@ module type S = sig -> unit Code_generation.t val post_process_function_body : - param_names:Wasm_ast.var list + profile:Profile.t + -> param_names:Wasm_ast.var list + -> param_types:Wasm_ast.value_type list -> locals:(Wasm_ast.var * Wasm_ast.value_type) list -> Wasm_ast.instruction list -> (Wasm_ast.var * Wasm_ast.value_type) list * Wasm_ast.instruction list diff --git a/compiler/lib-wasm/var_coalescing.ml b/compiler/lib-wasm/var_coalescing.ml new file mode 100644 index 0000000000..72628726ee --- /dev/null +++ b/compiler/lib-wasm/var_coalescing.ml @@ -0,0 +1,941 @@ +(* Wasm_of_ocaml compiler + * http://www.ocsigen.org/js_of_ocaml/ + * Copyright (C) 2026 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation, with linking exception; + * either version 2.1 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + *) + +(* + Linear-scan variable coalescing for the Wasm backend. + + The algorithm mirrors [Js_variable_coalescing]: + 1. Build a CFG of the function body. + 2. Compute backward liveness to a fixed point via [Dgraph.Solver]. + 3. Derive per-variable live ranges (intervals with possible holes). + 4. Run linear-scan allocation, bucketed by Wasm [value_type], with + copy hints from [local.set x (local.get y)] patterns. + 5. Rewrite [LocalGet]/[LocalSet]/[LocalTee] through the substitution + and drop [LocalSet(x, LocalGet x)] copies introduced by merging. + 6. Rebuild the [locals] list with only representatives. + + Parameters participate as candidates — they are all live at entry + (enforced by a synthetic Def-of-all-params entry node) so two + parameters never merge, but a non-parameter whose range does not + interfere with a parameter can be rewritten to reuse the parameter's + slot. + + [Try] expressions are handled by creating a synthetic [catch_entry] + node whose successors are the resolved catch targets (branch labels + in the enclosing block stack). Because an exception can fire from + anywhere inside the body, every variable live at [catch_entry] is + considered live across the entire try body — enforced by extending + active live ranges back to the body entry when [catch_entry] is + reached during live-range construction. +*) + +open! Stdlib +module Var = Code.Var +module W = Wasm_ast + +let times = Debug.find "times" + +let debug = Debug.find "wasm-var-coalescing" + +let stats = Debug.find "stats" + +(* --------------------------------------------------------------------- *) +(* CFG construction *) +(* --------------------------------------------------------------------- *) + +module Node = struct + type t = int +end + +module NodeSet = struct + type t = BitSet.t + + type elt = int + + let iter f t = BitSet.iter ~f t + + let mem = BitSet.mem + + let add = BitSet.set + + let remove = BitSet.unset + + let copy = BitSet.copy +end + +module NodeTbl = struct + type 'a t = 'a array + + type key = int + + type size = int + + let get t k = t.(k) + + let set t k v = t.(k) <- v + + let make = Array.make +end + +module G = Dgraph.Make_Imperative (Node) (NodeSet) (NodeTbl) + +module Domain = struct + type t = Var.Set.t + + let equal = Var.Set.equal + + let bot = Var.Set.empty +end + +module Solver = G.Solver (Domain) + +type action = + | Use of Var.Set.t + | Def of Var.Set.t + | Nop + +let defs_of_action = function + | Def d -> d + | Use _ | Nop -> Var.Set.empty + +let uses_of_action = function + | Use u -> u + | Def _ | Nop -> Var.Set.empty + +type node_id = int + +type stmt_graph = + { entry : node_id + ; size : int + ; actions : action array + ; succs : node_id list array + ; coalescing_hints : Var.t Var.Hashtbl.t + ; try_blocks : node_id Int.Hashtbl.t + (** [catch_entry -> try_body_entry]. When the live-range pass reaches + [catch_entry], every variable currently live is extended back to + the start of the try body, modeling the fact that an exception + can fire anywhere inside the body. *) + } + +type graph_builder = + { mutable nodes : (node_id * action * node_id list) list + ; hints : Var.t Var.Hashtbl.t + ; tries : node_id Int.Hashtbl.t + } + +(* Block stack frame: the target node id reached by [br n] with n equal + to this frame's depth. For a [Block], that target is the node + following the block (the block's exit). For a [Loop], that target is + the loop entry (back-edge). *) + +let nth_opt l n = + let rec loop l n = + match l, n with + | [], _ -> None + | x :: _, 0 -> Some x + | _ :: r, n -> loop r (n - 1) + in + loop l n + +(* The CFG is built with each sub-expression producing its own node + chain, laid out in Wasm evaluation order (left-to-right for + operands, args-before-funcref for [call_ref]). Each [local.get], + [local.tee], and [local.set] becomes a single node with a [Use] or + [Def] action; structural operators contribute no node by themselves. + + [build_expr block_stack exit_node e] appends the CFG for [e] to the + graph; control flows in forward execution order from the returned + entry node through to [exit_node]. *) + +let build_cfg ~candidates ~param_vars instrs = + let builder = + { nodes = []; hints = Var.Hashtbl.create 16; tries = Int.Hashtbl.create 8 } + in + let next_id = ref 0 in + let reserve_id () = + let id = !next_id in + incr next_id; + id + in + let set_node id action succs = builder.nodes <- (id, action, succs) :: builder.nodes in + let add_node action succs = + let id = reserve_id () in + set_node id action succs; + id + in + let add_hint x y = + if Var.Set.mem x candidates && Var.Set.mem y candidates + then Var.Hashtbl.replace builder.hints x y + in + let target_of_br block_stack n = + match nth_opt block_stack n with + | Some t -> t + | None -> + (* [br] out of the function. Conservatively model as a dead end. *) + add_node Nop [] + in + let use_node x exit_node = + if Var.Set.mem x candidates + then add_node (Use (Var.Set.singleton x)) [ exit_node ] + else exit_node + in + let def_node x exit_node = + if Var.Set.mem x candidates + then add_node (Def (Var.Set.singleton x)) [ exit_node ] + else exit_node + in + let rec build_expr block_stack exit_node (e : W.expression) = + match e with + | Const _ | GlobalGet _ | Pop _ | RefFunc _ | RefNull _ -> exit_node + | LocalGet x -> use_node x exit_node + | LocalTee (x, e') -> + let tee = def_node x exit_node in + build_expr block_stack tee e' + | UnOp (_, e') + | I32WrapI64 e' + | I64ExtendI32 (_, e') + | F32DemoteF64 e' + | F64PromoteF32 e' + | RefI31 e' + | I31Get (_, e') + | ArrayLen e' + | StructGet (_, _, _, e') + | RefCast (_, e') + | RefTest (_, e') + | ExternConvertAny e' + | AnyConvertExtern e' -> build_expr block_stack exit_node e' + | Br_on_cast (depth, _, _, e') + | Br_on_cast_fail (depth, _, _, e') + | Br_on_null (depth, e') -> + (* Conditional branch: on a successful type test (or null test), + branch to the label [depth] with the value on the stack; + otherwise continue with the value on the stack. *) + let target = target_of_br block_stack depth in + let branch = add_node Nop [ target; exit_node ] in + build_expr block_stack branch e' + | BinOp (_, e1, e2) + | ArrayNew (_, e1, e2) + | ArrayNewData (_, _, e1, e2) + | ArrayGet (_, _, e1, e2) + | RefEq (e1, e2) -> + (* [e1] evaluated first, then [e2]. *) + let after_e1 = build_expr block_stack exit_node e2 in + build_expr block_stack after_e1 e1 + | Call (_, l) | ArrayNewFixed (_, l) | StructNew (_, l) -> + build_exprs_left_to_right block_stack exit_node l + | Call_ref (_, f, l) -> + (* Wasm [call_ref] evaluates args first, then the funcref. *) + let after_f = build_expr block_stack exit_node f in + build_exprs_left_to_right block_stack after_f l + | IfExpr (_, cond, e1, e2) -> + (* Evaluate [cond], then branch to [e1] or [e2]. [br 0] inside + either branch targets the continuation after the IfExpr. *) + let block_stack' = exit_node :: block_stack in + let then_entry = build_expr block_stack' exit_node e1 in + let else_entry = build_expr block_stack' exit_node e2 in + let branch = add_node Nop [ then_entry; else_entry ] in + build_expr block_stack branch cond + | BlockExpr (_, l) -> + (* [BlockExpr] is a block targetable by [br] at its own depth. *) + let block_stack' = exit_node :: block_stack in + build_instrs block_stack' exit_node l + | Seq (l, e') -> + let after_l = build_expr block_stack exit_node e' in + build_instrs block_stack after_l l + | Try (_, body, catches) -> + (* [Try] is a block: [br 0] inside the body targets the normal + continuation after the [Try]. Catch labels refer to the + enclosing block stack (not counting the [Try] itself), so they + are resolved against [block_stack]. *) + let block_stack' = exit_node :: block_stack in + let inner_body_entry = build_instrs block_stack' exit_node body in + (* Wrap the body entry in a fresh [Nop] node whose only + predecessor is the [Try] wrapper we are about to create. This + ensures that in the DFS used to compute the RPO layout, + [body_entry] is numbered strictly after [catch_entry], which + is visited first from the wrapper. Without this shim the body + entry can be reached via the catch targets (through + convergent paths in the enclosing code), putting it inside + [catch_entry]'s DFS subtree and breaking the invariant that + [body_order < catch_order]. *) + let body_entry = add_node Nop [ inner_body_entry ] in + (* Synthetic catch entry whose [live_in] is the union of the + live_in sets of the catch targets. Making it a successor of + the Try wrapper ensures it is reachable and thus has its + liveness computed by the dataflow solver. *) + let catch_targets = + List.map catches ~f:(fun (_, depth, _) -> target_of_br block_stack depth) + in + let catch_entry = add_node Nop catch_targets in + Int.Hashtbl.replace builder.tries catch_entry body_entry; + add_node Nop [ body_entry; catch_entry ] + and build_exprs_left_to_right block_stack exit_node l = + List.fold_right l ~init:exit_node ~f:(fun e next -> build_expr block_stack next e) + and build_instrs block_stack exit_node instrs = + List.fold_right instrs ~init:exit_node ~f:(fun instr next -> + build_instr block_stack next instr) + and build_instr block_stack exit_node instr = + match (instr : W.instruction) with + | Nop | Event _ -> exit_node + | Unreachable | Rethrow _ -> add_node Nop [] + | Drop e | Push e -> build_expr block_stack exit_node e + | LocalSet (x, e) -> + (match e with + | LocalGet y -> add_hint x y + | _ -> ()); + let set = def_node x exit_node in + build_expr block_stack set e + | GlobalSet (_, e) -> build_expr block_stack exit_node e + | StructSet (_, _, e1, e2) -> + let after_e1 = build_expr block_stack exit_node e2 in + build_expr block_stack after_e1 e1 + | ArraySet (_, e1, e2, e3) -> + let after_e2 = build_expr block_stack exit_node e3 in + let after_e1 = build_expr block_stack after_e2 e2 in + build_expr block_stack after_e1 e1 + | CallInstr (_, l) -> build_exprs_left_to_right block_stack exit_node l + | Return None -> add_node Nop [] + | Return (Some e) -> + let dead = add_node Nop [] in + build_expr block_stack dead e + | Throw (_, e) -> + let dead = add_node Nop [] in + build_expr block_stack dead e + | Return_call (_, l) -> + let dead = add_node Nop [] in + build_exprs_left_to_right block_stack dead l + | Return_call_ref (_, f, l) -> + let dead = add_node Nop [] in + let after_f = build_expr block_stack dead f in + build_exprs_left_to_right block_stack after_f l + | Br (n, None) -> target_of_br block_stack n + | Br (n, Some e) -> + let target = target_of_br block_stack n in + build_expr block_stack target e + | Br_if (n, cond) -> + let target = target_of_br block_stack n in + let branch = add_node Nop [ target; exit_node ] in + build_expr block_stack branch cond + | Br_table (e, ns, default) -> + let all_targets = + target_of_br block_stack default :: List.map ~f:(target_of_br block_stack) ns + in + let branch = add_node Nop all_targets in + build_expr block_stack branch e + | If (_, cond, l1, l2) -> + (* [If] is a Wasm block: [br 0] inside a branch targets the + continuation after the if. Push [exit_node] onto [block_stack]. *) + let block_stack' = exit_node :: block_stack in + let then_entry = build_instrs block_stack' exit_node l1 in + let else_entry = build_instrs block_stack' exit_node l2 in + let branch = add_node Nop [ then_entry; else_entry ] in + build_expr block_stack branch cond + | Block (_, l) -> + let block_stack' = exit_node :: block_stack in + build_instrs block_stack' exit_node l + | Loop (_, l) -> + let loop_entry = reserve_id () in + let block_stack' = loop_entry :: block_stack in + let body_entry = build_instrs block_stack' exit_node l in + set_node loop_entry Nop [ body_entry ]; + loop_entry + in + let end_node = add_node Nop [] in + let body_entry = build_instrs [] end_node instrs in + let entry = + if Var.Set.is_empty param_vars + then body_entry + else add_node (Def param_vars) [ body_entry ] + in + let size = !next_id in + let actions = Array.make size Nop in + let succs = Array.make size [] in + List.iter builder.nodes ~f:(fun (id, action, s) -> + actions.(id) <- action; + succs.(id) <- s); + { entry + ; size + ; actions + ; succs + ; coalescing_hints = builder.hints + ; try_blocks = builder.tries + } + +(* --------------------------------------------------------------------- *) +(* Liveness (backward dataflow) and live range extraction *) +(* --------------------------------------------------------------------- *) + +let compute_liveness g = + let domain_set = BitSet.create' g.size in + for i = 0 to g.size - 1 do + BitSet.set domain_set i + done; + let preds = Array.make (Array.length g.succs) [] in + Array.iteri g.succs ~f:(fun i l -> + List.iter l ~f:(fun j -> preds.(j) <- i :: preds.(j))); + let inv_graph = + { G.domain = domain_set; iter_children = (fun f i -> List.iter ~f preds.(i)) } + in + let transfer state node_id = + let live_out = + List.fold_left g.succs.(node_id) ~init:Var.Set.empty ~f:(fun acc id -> + Var.Set.union acc state.(id)) + in + let def = defs_of_action g.actions.(node_id) in + let use = uses_of_action g.actions.(node_id) in + Var.Set.union use (Var.Set.diff live_out def) + in + Solver.f g.size inv_graph transfer + +module Live_range = struct + type interval = + { start_pos : int + ; end_pos : int + } + + type t = + { id : Var.t + ; mutable ranges : interval list (* sorted by start_pos *) + ; mutable assigned : bool + } + + let create v = { id = v; ranges = []; assigned = false } + + let add_range t start_pos end_pos = + let rec loop s e acc = function + | [] -> List.rev ({ start_pos = s; end_pos = e } :: acc) + | r :: rest -> + if e < r.start_pos - 1 + then List.rev_append acc ({ start_pos = s; end_pos = e } :: r :: rest) + else if s > r.end_pos + 1 + then loop s e (r :: acc) rest + else + let new_start = min s r.start_pos in + let new_end = max e r.end_pos in + loop new_start new_end acc rest + in + t.ranges <- + (match t.ranges with + | [] -> [ { start_pos; end_pos } ] + | r :: _ when r.start_pos > end_pos + 1 -> { start_pos; end_pos } :: t.ranges + | _ -> loop start_pos end_pos [] t.ranges) + + let add_ranges t other_ranges = + match t.ranges, other_ranges with + | [], l | l, [] -> t.ranges <- l + | l1, l2 -> + let rec loop acc l1 l2 = + match l1, l2 with + | [], l | l, [] -> List.rev_append acc l + | h1 :: t1, h2 :: t2 -> + if h1.start_pos < h2.start_pos then step acc h1 t1 l2 else step acc h2 t2 l1 + and step acc current rest other = + match acc with + | prev :: acc_rest when prev.end_pos + 1 >= current.start_pos -> + let merged = { prev with end_pos = max prev.end_pos current.end_pos } in + loop (merged :: acc_rest) rest other + | _ -> loop (current :: acc) rest other + in + t.ranges <- loop [] l1 l2 + + let get_start_pos t = + match t.ranges with + | [] -> max_int + | r :: _ -> r.start_pos + + let get_first_hole t = + match t.ranges with + | [] -> 0 + | r :: _ -> r.end_pos + 1 + + let rec advance t position = + match t.ranges with + | [] -> `Dead + | r :: rem -> + if r.end_pos < position + then ( + t.ranges <- rem; + advance t position) + else if r.start_pos > position + then `Inactive + else `Active + + let intersects t1 t2 = + let rec loop l1 l2 = + match l1, l2 with + | [], _ | _, [] -> false + | r1 :: rest1, r2 :: rest2 -> + if r1.end_pos < r2.start_pos + then loop rest1 l2 + else if r2.end_pos < r1.start_pos + then loop l1 rest2 + else true + in + loop t1.ranges t2.ranges +end + +(* Compute live ranges. Uses the same 2x position granularity as + [Js_variable_coalescing]: each CFG node at linear position i maps to + positions 2*i (pre, for uses) and 2*i+1 (post, for defs). *) +let compute_live_ranges g live_in_map candidates param_vars = + let visited = Array.make g.size false in + let layout = Array.make g.size 0 in + let i = ref g.size in + let rec list_rev_iter ~f l = + match l with + | [] -> () + | x :: r -> + list_rev_iter ~f r; + f x + in + let rec dfs n = + if not visited.(n) + then ( + visited.(n) <- true; + list_rev_iter g.succs.(n) ~f:dfs; + decr i; + layout.(!i) <- n) + in + dfs g.entry; + let num_reachable = g.size - !i in + let layout = if !i = 0 then layout else Array.sub layout ~pos:!i ~len:num_reachable in + let node_order = Array.make g.size (-1) in + Array.iteri layout ~f:(fun i n -> node_order.(n) <- i); + let ranges = Var.Hashtbl.create (Var.Set.cardinal candidates) in + Var.Set.iter (fun v -> Var.Hashtbl.add ranges v (Live_range.create v)) candidates; + let active_ranges = Var.Hashtbl.create 64 in + let commit_range v start_pos end_pos = + let r = Var.Hashtbl.find ranges v in + Live_range.add_range r start_pos end_pos + in + for order = num_reachable - 1 downto 0 do + let node_id = layout.(order) in + let start_idx = 2 * order in + let end_idx = (2 * order) + 1 in + let succs = g.succs.(node_id) in + let is_fallthrough = + match succs with + | [ s ] -> order < num_reachable - 1 && layout.(order + 1) = s + | _ -> false + in + if not is_fallthrough + then ( + let live_out = + List.fold_left succs ~init:Var.Set.empty ~f:(fun acc sid -> + Var.Set.union acc live_in_map.(sid)) + in + let to_remove = ref [] in + Var.Hashtbl.iter + (fun v high -> + if not (Var.Set.mem v live_out) + then ( + commit_range v (end_idx + 1) high; + to_remove := v :: !to_remove)) + active_ranges; + List.iter !to_remove ~f:(Var.Hashtbl.remove active_ranges); + Var.Set.iter + (fun v -> + if not (Var.Hashtbl.mem active_ranges v) + then Var.Hashtbl.add active_ranges v end_idx) + live_out); + let defs = defs_of_action g.actions.(node_id) in + Var.Set.iter + (fun v -> + match Var.Hashtbl.find_opt active_ranges v with + | Some high -> + commit_range v end_idx high; + Var.Hashtbl.remove active_ranges v + | None -> commit_range v end_idx end_idx) + defs; + let uses = uses_of_action g.actions.(node_id) in + Var.Set.iter + (fun v -> + if not (Var.Hashtbl.mem active_ranges v) + then Var.Hashtbl.add active_ranges v start_idx) + uses; + (* Try-catch range extension: reaching a [catch_entry] in RPO means we + have processed the catch targets. Whatever is live here must be + considered live throughout the try body, because an exception can + fire anywhere in the body and jump to a catch target. Extend every + currently-active range back to the start of the body, then clear + the active set so the body's own liveness is computed without + double-counting the exception path. + + [body_entry] is a dummy [Nop] shim whose only predecessor is the + Try wrapper, so its RPO order is guaranteed to be strictly less + than [catch_entry]'s. *) + match Int.Hashtbl.find_opt g.try_blocks node_id with + | None -> () + | Some body_entry_id -> + let body_order = node_order.(body_entry_id) in + assert (body_order < order); + let body_start_idx = 2 * body_order in + Var.Hashtbl.iter (fun v high -> commit_range v body_start_idx high) active_ranges; + Var.Hashtbl.clear active_ranges + done; + Var.Hashtbl.iter (fun v high -> commit_range v 0 high) active_ranges; + Var.Set.iter (fun v -> commit_range v 0 0) param_vars; + ranges + +(* --------------------------------------------------------------------- *) +(* Type compatibility *) +(* --------------------------------------------------------------------- *) + +let heap_type_equal (a : W.heap_type) (b : W.heap_type) = + match a, b with + | W.Func, W.Func + | Extern, Extern + | Any, Any + | Eq, Eq + | Struct, Struct + | Array, Array + | I31, I31 + | None_, None_ -> true + | Type x, Type y -> Var.equal x y + | ( (Func | Extern | Any | Eq | Struct | Array | I31 | None_ | Type _) + , (Func | Extern | Any | Eq | Struct | Array | I31 | None_ | Type _) ) -> false + +let ref_type_equal (a : W.ref_type) (b : W.ref_type) = + Bool.equal a.nullable b.nullable && heap_type_equal a.typ b.typ + +let value_type_equal (a : W.value_type) (b : W.value_type) = + match a, b with + | W.I32, W.I32 | I64, I64 | F32, F32 | F64, F64 -> true + | Ref ra, Ref rb -> ref_type_equal ra rb + | (I32 | I64 | F32 | F64 | Ref _), (I32 | I64 | F32 | F64 | Ref _) -> false + +(* --------------------------------------------------------------------- *) +(* Linear scan register allocation *) +(* --------------------------------------------------------------------- *) + +module Active_pqueue = Pqueue.Make (struct + type t = Live_range.t + + let compare r r' = compare (Live_range.get_first_hole r) (Live_range.get_first_hole r') +end) + +module Inactive_pqueue = Pqueue.Make (struct + type t = int * Var.t + + let compare (p, _) (p', _) = compare (p : int) p' +end) + +let allocate_registers subst types ranges hints = + let hint_count = ref 0 in + let opportunistic_count = ref 0 in + let sorted_intervals = + let intervals = Var.Hashtbl.fold (fun _ r acc -> r :: acc) ranges [] in + List.sort + ~cmp:(fun a b -> + Int.compare (Live_range.get_start_pos a) (Live_range.get_start_pos b)) + intervals + in + let active = ref Active_pqueue.empty in + let inactive_queue = ref Inactive_pqueue.empty in + let inactive = Var.Hashtbl.create 128 in + let free_pool = ref [] in + let rec update_active_queue position = + match Active_pqueue.find_min !active with + | exception Not_found -> () + | r -> ( + if Live_range.get_first_hole r <= position + then + let active' = Active_pqueue.remove_min !active in + match Live_range.advance r position with + | `Dead -> + free_pool := r :: !free_pool; + active := active'; + update_active_queue position + | `Inactive -> + inactive_queue := + Inactive_pqueue.add (Live_range.get_start_pos r, r.id) !inactive_queue; + Var.Hashtbl.replace inactive r.id r; + active := active'; + update_active_queue position + | `Active -> + active := Active_pqueue.add r active'; + update_active_queue position) + in + let rec update_inactive_queue position = + match Inactive_pqueue.find_min !inactive_queue with + | exception Not_found -> () + | p, v -> ( + if p <= position + then + let inactive' = Inactive_pqueue.remove_min !inactive_queue in + match Var.Hashtbl.find_opt inactive v with + | None -> + inactive_queue := inactive'; + update_inactive_queue position + | Some r -> ( + match Live_range.advance r position with + | `Dead -> + free_pool := r :: !free_pool; + inactive_queue := inactive'; + Var.Hashtbl.remove inactive v; + update_inactive_queue position + | `Inactive -> + inactive_queue := + Inactive_pqueue.add (Live_range.get_start_pos r, r.id) inactive'; + update_inactive_queue position + | `Active -> + active := Active_pqueue.add r !active; + inactive_queue := inactive'; + Var.Hashtbl.remove inactive v; + update_inactive_queue position)) + in + let type_of v = Var.Hashtbl.find types v in + let compatible a b = value_type_equal (type_of a) (type_of b) in + let get_free_matching wanted = + let rec loop kept pool = + match pool with + | [] -> + free_pool := List.rev kept; + None + | r :: rs -> + if r.Live_range.assigned + then loop kept rs + else if compatible r.Live_range.id wanted + then ( + free_pool := List.rev_append kept rs; + Some r) + else loop (r :: kept) rs + in + loop [] !free_pool + in + List.iter sorted_intervals ~f:(fun current -> + let position = Live_range.get_start_pos current in + update_active_queue position; + update_inactive_queue position; + let hint_repr = + match Var.Hashtbl.find_opt hints current.Live_range.id with + | None -> None + | Some src -> ( + match Var.Hashtbl.find_opt subst src with + | None -> None + | Some var -> ( + if not (compatible var current.id) + then None + else + let r = Var.Hashtbl.find ranges var in + match Live_range.advance r position with + | `Dead -> + r.assigned <- true; + Some r + | `Inactive -> + if Live_range.intersects r current + then None + else ( + Var.Hashtbl.remove inactive r.id; + Some r) + | `Active -> None)) + in + let repr = + match hint_repr with + | Some r -> + incr hint_count; + r + | None -> ( + let candidate = + let rec loop q count = + if count >= 50 || Inactive_pqueue.is_empty q + then None + else + let _, v = Inactive_pqueue.find_min q in + let q' = Inactive_pqueue.remove_min q in + match Var.Hashtbl.find_opt inactive v with + | None -> loop q' count + | Some iv -> + if + compatible iv.id current.id + && not (Live_range.intersects iv current) + then Some iv + else loop q' (count + 1) + in + loop !inactive_queue 0 + in + match candidate with + | Some r -> + incr opportunistic_count; + Var.Hashtbl.remove inactive r.id; + r + | None -> ( + match get_free_matching current.id with + | Some r -> + incr opportunistic_count; + r + | None -> current)) + in + if not (Var.equal current.id repr.id) + then ( + Var.forget_generated_name current.id; + Var.forget_generated_name repr.id; + Live_range.add_ranges repr current.Live_range.ranges; + Var.Hashtbl.replace subst current.id repr.id); + active := Active_pqueue.add repr !active); + !hint_count, !opportunistic_count + +(* --------------------------------------------------------------------- *) +(* Body rewriting *) +(* --------------------------------------------------------------------- *) + +let subst_var subst x = + match Var.Hashtbl.find_opt subst x with + | None -> x + | Some y -> y + +let rec rewrite_expr subst (e : W.expression) : W.expression = + match e with + | Const _ | GlobalGet _ | Pop _ | RefFunc _ | RefNull _ -> e + | LocalGet x -> LocalGet (subst_var subst x) + | LocalTee (x, e') -> LocalTee (subst_var subst x, rewrite_expr subst e') + | UnOp (op, e') -> UnOp (op, rewrite_expr subst e') + | I32WrapI64 e' -> I32WrapI64 (rewrite_expr subst e') + | I64ExtendI32 (s, e') -> I64ExtendI32 (s, rewrite_expr subst e') + | F32DemoteF64 e' -> F32DemoteF64 (rewrite_expr subst e') + | F64PromoteF32 e' -> F64PromoteF32 (rewrite_expr subst e') + | RefI31 e' -> RefI31 (rewrite_expr subst e') + | I31Get (s, e') -> I31Get (s, rewrite_expr subst e') + | ArrayLen e' -> ArrayLen (rewrite_expr subst e') + | StructGet (s, ty, i, e') -> StructGet (s, ty, i, rewrite_expr subst e') + | RefCast (ty, e') -> RefCast (ty, rewrite_expr subst e') + | RefTest (ty, e') -> RefTest (ty, rewrite_expr subst e') + | Br_on_cast (i, a, b, e') -> Br_on_cast (i, a, b, rewrite_expr subst e') + | Br_on_cast_fail (i, a, b, e') -> Br_on_cast_fail (i, a, b, rewrite_expr subst e') + | Br_on_null (i, e') -> Br_on_null (i, rewrite_expr subst e') + | ExternConvertAny e' -> ExternConvertAny (rewrite_expr subst e') + | AnyConvertExtern e' -> AnyConvertExtern (rewrite_expr subst e') + | BinOp (op, e1, e2) -> BinOp (op, rewrite_expr subst e1, rewrite_expr subst e2) + | ArrayNew (ty, e1, e2) -> ArrayNew (ty, rewrite_expr subst e1, rewrite_expr subst e2) + | ArrayNewData (ty, d, e1, e2) -> + ArrayNewData (ty, d, rewrite_expr subst e1, rewrite_expr subst e2) + | ArrayGet (s, ty, e1, e2) -> + ArrayGet (s, ty, rewrite_expr subst e1, rewrite_expr subst e2) + | RefEq (e1, e2) -> RefEq (rewrite_expr subst e1, rewrite_expr subst e2) + | Call (f, l) -> Call (f, List.map l ~f:(rewrite_expr subst)) + | ArrayNewFixed (ty, l) -> ArrayNewFixed (ty, List.map l ~f:(rewrite_expr subst)) + | StructNew (ty, l) -> StructNew (ty, List.map l ~f:(rewrite_expr subst)) + | Call_ref (f, g, l) -> + Call_ref (f, rewrite_expr subst g, List.map l ~f:(rewrite_expr subst)) + | IfExpr (ty, cond, e1, e2) -> + IfExpr (ty, rewrite_expr subst cond, rewrite_expr subst e1, rewrite_expr subst e2) + | BlockExpr (ty, l) -> BlockExpr (ty, rewrite_instrs subst l) + | Seq (l, e') -> Seq (rewrite_instrs subst l, rewrite_expr subst e') + | Try (ty, body, catches) -> Try (ty, rewrite_instrs subst body, catches) + +and rewrite_instr subst (i : W.instruction) : W.instruction option = + match i with + | Nop | Event _ | Unreachable | Rethrow _ | Br (_, None) | Return None -> Some i + | Drop e -> Some (Drop (rewrite_expr subst e)) + | Push e -> Some (Push (rewrite_expr subst e)) + | LocalSet (x, e) -> ( + let x' = subst_var subst x in + let e' = rewrite_expr subst e in + match e' with + | LocalGet y when Var.equal x' y -> None + | _ -> Some (LocalSet (x', e'))) + | GlobalSet (x, e) -> Some (GlobalSet (x, rewrite_expr subst e)) + | StructSet (ty, i, e1, e2) -> + Some (StructSet (ty, i, rewrite_expr subst e1, rewrite_expr subst e2)) + | ArraySet (ty, e1, e2, e3) -> + Some + (ArraySet (ty, rewrite_expr subst e1, rewrite_expr subst e2, rewrite_expr subst e3)) + | CallInstr (f, l) -> Some (CallInstr (f, List.map l ~f:(rewrite_expr subst))) + | Return_call (f, l) -> Some (Return_call (f, List.map l ~f:(rewrite_expr subst))) + | Return_call_ref (f, g, l) -> + Some (Return_call_ref (f, rewrite_expr subst g, List.map l ~f:(rewrite_expr subst))) + | Return (Some e) -> Some (Return (Some (rewrite_expr subst e))) + | Throw (t, e) -> Some (Throw (t, rewrite_expr subst e)) + | Br (i, Some e) -> Some (Br (i, Some (rewrite_expr subst e))) + | Br_if (i, e) -> Some (Br_if (i, rewrite_expr subst e)) + | Br_table (e, l, d) -> Some (Br_table (rewrite_expr subst e, l, d)) + | Loop (ty, l) -> Some (Loop (ty, rewrite_instrs subst l)) + | Block (ty, l) -> Some (Block (ty, rewrite_instrs subst l)) + | If (ty, cond, l1, l2) -> + Some + (If (ty, rewrite_expr subst cond, rewrite_instrs subst l1, rewrite_instrs subst l2)) + +and rewrite_instrs subst l = List.filter_map l ~f:(rewrite_instr subst) + +(* --------------------------------------------------------------------- *) +(* Entry point *) +(* --------------------------------------------------------------------- *) + +let f ~param_names ~param_types ~locals instrs = + let t = Timer.make () in + let num_params = List.length param_names in + assert (List.length param_types = num_params); + (* In pretty mode, only coalesce compiler-generated variables so + user-given OCaml names survive in the Wasm name section (and are + visible in browser devtools). *) + let eligible v = (not (Config.Flag.pretty ())) || Var.generated_name v in + let types = Var.Hashtbl.create (num_params + List.length locals) in + let param_vars = + List.fold_left2 param_names param_types ~init:Var.Set.empty ~f:(fun acc v t -> + Var.Hashtbl.replace types v t; + if eligible v then Var.Set.add v acc else acc) + in + let candidates = + List.fold_left locals ~init:param_vars ~f:(fun acc (v, t) -> + Var.Hashtbl.replace types v t; + if eligible v then Var.Set.add v acc else acc) + in + let num_candidates = Var.Set.cardinal candidates in + if num_candidates <= 1 + then locals, instrs + else + let g = build_cfg ~candidates ~param_vars instrs in + let live_in_map = compute_liveness g in + let ranges = compute_live_ranges g live_in_map candidates param_vars in + let subst = Var.Hashtbl.create num_candidates in + let hint_count, opportunistic_count = + allocate_registers subst types ranges g.coalescing_hints + in + if debug () + then + Format.eprintf + "wasm var-coalescing: %d candidates, %d hint + %d opportunistic@." + num_candidates + hint_count + opportunistic_count; + (* Only candidates that merged into a different representative are + recorded in [subst]; its size is therefore the count of changes. *) + if Var.Hashtbl.length subst = 0 + then ( + if times () then Format.eprintf " wasm var coalescing: %a@." Timer.print t; + locals, instrs) + else + let instrs = rewrite_instrs subst instrs in + (* Rebuild locals: keep only those whose representative is themselves, + i.e. those that weren't merged away. Preserve original ordering to + keep the Wasm output deterministic. *) + let kept_locals = + List.filter locals ~f:(fun (v, _) -> not (Var.Hashtbl.mem subst v)) + in + if times () then Format.eprintf " wasm var coalescing: %a@." Timer.print t; + if stats () + then + Format.eprintf + "Stats - wasm var coalescing: %d candidates, %d coalesced (%d hint, %d \ + opportunistic)@." + num_candidates + (hint_count + opportunistic_count) + hint_count + opportunistic_count; + kept_locals, instrs diff --git a/compiler/lib-wasm/var_coalescing.mli b/compiler/lib-wasm/var_coalescing.mli new file mode 100644 index 0000000000..df3ffb6147 --- /dev/null +++ b/compiler/lib-wasm/var_coalescing.mli @@ -0,0 +1,36 @@ +(* Wasm_of_ocaml compiler + * http://www.ocsigen.org/js_of_ocaml/ + * Copyright (C) 2026 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation, with linking exception; + * either version 2.1 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + *) + +(** Liveness analysis and variable coalescing for the Wasm backend. + + Merges Wasm locals (including parameters) with disjoint live ranges so + that a single local is reused. This shrinks the function's [locals] + declaration and eliminates redundant [local.set]/[local.get] copies. + + Parameters are valid coalescing targets: a local whose live range does + not overlap a parameter's can be rewritten to use the parameter's + index. Two parameters never merge because they are all live at + function entry. *) + +val f : + param_names:Code.Var.t list + -> param_types:Wasm_ast.value_type list + -> locals:(Code.Var.t * Wasm_ast.value_type) list + -> Wasm_ast.instruction list + -> (Code.Var.t * Wasm_ast.value_type) list * Wasm_ast.instruction list diff --git a/compiler/lib/config.ml b/compiler/lib/config.ml index 69db92887d..48787daa3a 100644 --- a/compiler/lib/config.ml +++ b/compiler/lib/config.ml @@ -103,6 +103,8 @@ module Flag = struct let var_coalescing = o ~name:"var-coalescing" ~default:true + let wasm_var_coalescing = o ~name:"wasm-var-coalescing" ~default:true + let header = o ~name:"header" ~default:true let auto_link = o ~name:"auto-link" ~default:true diff --git a/compiler/lib/config.mli b/compiler/lib/config.mli index 2a540f68bc..54b4118690 100644 --- a/compiler/lib/config.mli +++ b/compiler/lib/config.mli @@ -52,6 +52,8 @@ module Flag : sig val var_coalescing : unit -> bool + val wasm_var_coalescing : unit -> bool + val debugger : unit -> bool val pretty : unit -> bool From be715c37d1c227e9022e34c40096dae4d9714c26 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Vouillon?= Date: Mon, 11 May 2026 16:55:05 +0200 Subject: [PATCH 4/6] Wasm: sink local.set toward the first use MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a [Local_sink] pass that rewrites local.set x e ... local.get x into ... local.tee x e when the sink is sound: we must not cross another write to [x], and moving [e] past intermediate code must not reorder observable effects. Because [Local_sink] always introduces [local.tee] (never inlines [e] itself), some of the tees it produces are immediately dead: the variable is never read again. Extend [Var_coalescing] to detect such dead tees using the liveness it already computes — a [LocalTee x _] Def node whose successors do not list [x] in their [live_in] is a dead store. Replace it with its inner expression, and drop the local from the function's [locals] list when its last reference was the tee we just erased. Runs first in [post_process_function_body], before [Var_coalescing]; gated on [O1] and the new [wasm-local-sink] flag. --- compiler/lib-wasm/gc_target.ml | 9 + compiler/lib-wasm/generate.ml | 2 + compiler/lib-wasm/local_sink.ml | 723 +++++++++++++++++++++++++++ compiler/lib-wasm/local_sink.mli | 39 ++ compiler/lib-wasm/var_coalescing.ml | 437 ++++++++++++---- compiler/lib-wasm/var_coalescing.mli | 4 + compiler/lib/config.ml | 2 + compiler/lib/config.mli | 2 + 8 files changed, 1128 insertions(+), 90 deletions(-) create mode 100644 compiler/lib-wasm/local_sink.ml create mode 100644 compiler/lib-wasm/local_sink.mli diff --git a/compiler/lib-wasm/gc_target.ml b/compiler/lib-wasm/gc_target.ml index 5bd35587d1..3b960c95d2 100644 --- a/compiler/lib-wasm/gc_target.ml +++ b/compiler/lib-wasm/gc_target.ml @@ -2057,6 +2057,15 @@ let handle_exceptions ~result_typ ~fall_through ~context body x exn_handler = exn_handler ~result_typ ~fall_through ~context) let post_process_function_body ~profile ~param_names ~param_types ~locals body = + (* At [--opt 1] we skip [wasm-opt] entirely (both for .cmo/.cma and + for executables), so our own passes are the only ones tightening + the body. [Local_sink] runs first: shortening live ranges and + dropping [local.set]s simplifies the input to [Var_coalescing]. *) + let body = + match (profile : Profile.t) with + | O1 when Config.Flag.wasm_local_sink () -> Local_sink.f body + | O1 | O2 | O3 -> body + in let locals, body = match (profile : Profile.t) with | O1 when Config.Flag.wasm_var_coalescing () -> diff --git a/compiler/lib-wasm/generate.ml b/compiler/lib-wasm/generate.ml index 9a007b6ccb..9b4779ad80 100644 --- a/compiler/lib-wasm/generate.ml +++ b/compiler/lib-wasm/generate.ml @@ -2623,6 +2623,8 @@ let f ~types p in + Local_sink.report_stats (); + Var_coalescing.report_stats (); if times () then Format.eprintf " code gen.: %a@." Timer.print t; res diff --git a/compiler/lib-wasm/local_sink.ml b/compiler/lib-wasm/local_sink.ml new file mode 100644 index 0000000000..fc960d0088 --- /dev/null +++ b/compiler/lib-wasm/local_sink.ml @@ -0,0 +1,723 @@ +(* Wasm_of_ocaml compiler + * http://www.ocsigen.org/js_of_ocaml/ + * Copyright (C) 2026 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation, with linking exception; + * either version 2.1 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + *) + +(* + Sink [local.set x e] to the first reachable [local.get x], turning the + pair into a single [local.tee x e]. We search forward in evaluation + order, stop at any control-flow boundary or compound construct, and + bail if we meet another write to [x] (a [local.set x] or [local.tee + x]) before a read. + + Effect ordering: moving [e] past other code is sound iff + + - [e] itself has no observable side effects (no writes, no calls, + no traps beyond the ones [effect_free] already tolerates); + - every sub-expression we cross is [effect_free]; and + - no instruction we cross writes any local that [e] reads — if it + did, [e]'s reads would see a different value at the new position. + + The last condition is what makes an "[effect_free] implies safe" + rule unsound: [effect_free] expressions can still read mutable + locals, and if the intermediate code writes those locals the move + would change [e]'s result. + + Traps: [effect_free] deliberately treats potentially-trapping reads + (e.g. [StructGet]/[ArrayGet]/[RefCast]/[I31Get] on a null or + out-of-bounds operand) as having no observable effect. We assume + traps never happen — the same assumption Binaryen runs under + (--traps-never-happen) — so reordering such a read relative to other + code, or across an [Event], cannot change observable behaviour: there + is no trap whose occurrence or source-map attribution we could move. +*) + +open! Stdlib +module W = Wasm_ast +module Var = Code.Var + +(* Hard upper bound on how far a single sink attempt may search forward. + The walker is O(N) per candidate with O(N) candidates per function, so + without a cap the pass is O(N²). Most profitable sinks have the target + within a few steps; this bound just clips the long tail. *) +let max_walk_distance = 32 + +let times = Debug.find "times" + +let stats = Debug.find "stats" + +(* Aggregated statistics across all calls to [f]. The pass runs once per + Wasm function; per-function logs are noisy, so we accumulate here and + emit a single summary via [report_stats]. *) +let total_time = ref 0. + +let total_calls = ref 0 + +let total_candidates = ref 0 + +let total_sunk = ref 0 + +let total_budget_exhausted = ref 0 + +let total_sink_distance = ref 0 + +let report_stats () = + if !total_calls > 0 + then ( + if times () then Format.eprintf " wasm local sink: %.2f@." !total_time; + if stats () + then ( + let avg_sink_distance = + if !total_sunk = 0 + then 0. + else float_of_int !total_sink_distance /. float_of_int !total_sunk + in + Format.eprintf + "Stats - wasm local sink: %d functions, %d candidates, %d sunk, %d \ + budget-limited, avg distance %.1f@." + !total_calls + !total_candidates + !total_sunk + !total_budget_exhausted + avg_sink_distance; + total_time := 0.; + total_calls := 0; + total_candidates := 0; + total_sunk := 0; + total_budget_exhausted := 0; + total_sink_distance := 0)) + +(* Same as [Gc_target.effect_free]. Copied here (it is a small and + self-contained helper) to avoid exposing it as a shared interface. *) +let rec effect_free (e : W.expression) = + match e with + | Const _ | LocalGet _ | GlobalGet _ | RefFunc _ | RefNull _ -> true + | UnOp (_, e') + | I32WrapI64 e' + | I64ExtendI32 (_, e') + | F32DemoteF64 e' + | F64PromoteF32 e' + | RefI31 e' + | I31Get (_, e') + | ArrayLen e' + | StructGet (_, _, _, e') + | RefCast (_, e') + | RefTest (_, e') + | ExternConvertAny e' + | AnyConvertExtern e' -> effect_free e' + | BinOp (_, e1, e2) + | ArrayNew (_, e1, e2) + | ArrayNewData (_, _, e1, e2) + | ArrayGet (_, _, e1, e2) + | RefEq (e1, e2) -> effect_free e1 && effect_free e2 + | LocalTee _ + | BlockExpr _ + | Call _ + | Seq _ + | Pop _ + | Call_ref _ + | Br_on_cast _ + | Br_on_cast_fail _ + | Br_on_null _ + | Try _ -> false + | IfExpr (_, e1, e2, e3) -> effect_free e1 && effect_free e2 && effect_free e3 + | ArrayNewFixed (_, l) | StructNew (_, l) -> List.for_all ~f:effect_free l + +(* Locals read and written by an expression. A [LocalTee y e'] both reads + [e']'s locals and writes [y]. *) +type rw = + { mutable reads : Var.Set.t + ; mutable writes : Var.Set.t + } + +let rec collect_rw rw (e : W.expression) = + match e with + (* [Pop] reads the implicit Wasm value stack, not a local, so it + contributes nothing to [reads]/[writes]. This means the + stack-ordering dependency between a [Push] and a later [Pop] is not + modelled here: sinking a [Pop]-bearing [e] across a [Push] would let + the [Pop] consume the just-pushed value instead of the original + stack top. This is safe only because codegen consumes every [Pop] + immediately (e.g. try/catch results are [Push]ed and popped right + away), so a [local.set]/[local.get] pair never straddles a [Push] of + an unrelated value. *) + | Const _ | GlobalGet _ | RefFunc _ | RefNull _ | Pop _ -> () + | LocalGet x -> rw.reads <- Var.Set.add x rw.reads + | LocalTee (y, e') -> + rw.writes <- Var.Set.add y rw.writes; + collect_rw rw e' + | UnOp (_, e') + | I32WrapI64 e' + | I64ExtendI32 (_, e') + | F32DemoteF64 e' + | F64PromoteF32 e' + | RefI31 e' + | I31Get (_, e') + | ArrayLen e' + | StructGet (_, _, _, e') + | RefCast (_, e') + | RefTest (_, e') + | ExternConvertAny e' + | AnyConvertExtern e' + | Br_on_cast (_, _, _, e') + | Br_on_cast_fail (_, _, _, e') + | Br_on_null (_, e') -> collect_rw rw e' + | BinOp (_, e1, e2) + | ArrayNew (_, e1, e2) + | ArrayNewData (_, _, e1, e2) + | ArrayGet (_, _, e1, e2) + | RefEq (e1, e2) -> + collect_rw rw e1; + collect_rw rw e2 + | Call (_, l) | ArrayNewFixed (_, l) | StructNew (_, l) -> + List.iter l ~f:(collect_rw rw) + | Call_ref (_, f, l) -> + collect_rw rw f; + List.iter l ~f:(collect_rw rw) + | IfExpr (_, c, t, el) -> + collect_rw rw c; + collect_rw rw t; + collect_rw rw el + | BlockExpr (_, l) -> List.iter l ~f:(collect_rw_instr rw) + | Seq (l, e') -> + List.iter l ~f:(collect_rw_instr rw); + collect_rw rw e' + | Try (_, body, _) -> List.iter body ~f:(collect_rw_instr rw) + +and collect_rw_instr rw (i : W.instruction) = + match i with + | Nop | Event _ | Br (_, None) | Return None | Rethrow _ | Unreachable -> () + | Drop e + | Push e + | GlobalSet (_, e) + | Br (_, Some e) + | Br_if (_, e) + | Br_table (e, _, _) + | Throw (_, e) + | Return (Some e) -> collect_rw rw e + | LocalSet (y, e) -> + rw.writes <- Var.Set.add y rw.writes; + collect_rw rw e + | StructSet (_, _, e1, e2) -> + collect_rw rw e1; + collect_rw rw e2 + | ArraySet (_, e1, e2, e3) -> + collect_rw rw e1; + collect_rw rw e2; + collect_rw rw e3 + | CallInstr (_, l) | Return_call (_, l) -> List.iter l ~f:(collect_rw rw) + | Return_call_ref (_, f, l) -> + collect_rw rw f; + List.iter l ~f:(collect_rw rw) + | Loop (_, l) | Block (_, l) -> List.iter l ~f:(collect_rw_instr rw) + | If (_, c, t, el) -> + collect_rw rw c; + List.iter t ~f:(collect_rw_instr rw); + List.iter el ~f:(collect_rw_instr rw) + +let rw_of_expr e = + let rw = { reads = Var.Set.empty; writes = Var.Set.empty } in + collect_rw rw e; + rw.reads, rw.writes + +(* Reads of a sub-expression, used at [may_cross_sibling] check points. *) +let reads_of_expr e = + let rw = { reads = Var.Set.empty; writes = Var.Set.empty } in + collect_rw rw e; + rw.reads + +(* Walker result for a single expression. [Clean] means "no occurrence + of x in this expression; the caller may continue past, using + [effect_free] to gate the path-clean state". *) +type walk_result = + | Found of W.expression + | Bail + | Clean + +(* [ctx] bundles the parameters that don't change during a sink attempt: + the target variable, the expression to sink, the locals it reads and + writes, and whether the expression itself is [effect_free]. + - [reads]: crossing a write to any of these would change [e]'s result. + - [writes]: crossing a read of any of these would make the reader see + the pre-sink value instead of the one [e] would have stored. + When [e_effect_free] is [false] we may not cross any *evaluated* + sub-expression or instruction even if it is itself [effect_free] — + the path could read heap/global state that [e]'s side effects would + change. *) +type ctx = + { x : Var.t + ; e_to_sink : W.expression + ; reads : Var.Set.t + ; writes : Var.Set.t + ; e_effect_free : bool + ; mutable budget : int + } + +(* Spend one unit of walk budget. Returns [false] when the budget is + exhausted, in which case callers must Bail rather than continue. We + only tick at the points where the walker steps forward in evaluation + order (instruction-to-instruction or sibling-to-sibling); descending + into a unary sub-expression is free. Every caller bails immediately on + a [false] result, so the budget-exhausted branch is reached at most + once per sink attempt and the counter is incremented exactly once. *) +let tick ctx = + if ctx.budget <= 0 + then ( + incr total_budget_exhausted; + false) + else ( + ctx.budget <- ctx.budget - 1; + true) + +(* True iff [vs] (a set of *reads* by intermediate code) does not + intersect [ctx.writes] — i.e. no variable that [e] writes is + observed by the intermediate code. *) +let reads_disjoint_from_e_writes ctx vs = + let exception Found in + try + Var.Set.iter (fun x -> if Var.Set.mem x ctx.writes then raise Found) vs; + true + with Found -> false + +(* Purely-local expression — no heap/global reads, no calls, no traps. + Stricter than [effect_free]: a [GlobalGet] or [ArrayGet] is + [effect_free] but not [trivially_pure], because crossing it with an + effectful [e] would reorder a read against [e]'s writes. *) +let rec trivially_pure (e : W.expression) = + match e with + | W.Const _ | RefFunc _ | RefNull _ | LocalGet _ | Pop _ -> true + | UnOp (_, e') + | I32WrapI64 e' + | I64ExtendI32 (_, e') + | F32DemoteF64 e' + | F64PromoteF32 e' + | RefI31 e' + | I31Get (_, e') + | ExternConvertAny e' + | AnyConvertExtern e' -> trivially_pure e' + | BinOp (_, e1, e2) | RefEq (e1, e2) -> trivially_pure e1 && trivially_pure e2 + | LocalTee _ + | GlobalGet _ + | ArrayLen _ + | StructGet _ + | ArrayGet _ + | ArrayNew _ + | ArrayNewData _ + | ArrayNewFixed _ + | StructNew _ + | RefCast _ + | RefTest _ + | Br_on_cast _ + | Br_on_cast_fail _ + | Br_on_null _ + | Call _ + | Call_ref _ + | IfExpr _ + | BlockExpr _ + | Seq _ + | Try _ -> false + +(* A sibling sub-expression was walked [Clean] (no x) and we're about + to continue to the next sibling. This is the reorder point: [e] will + evaluate *after* [sibling] in the sunk version, whereas originally + [e] ran first. Three conditions must all hold: + - [sibling] has no observable side effects; + - either [e] is effect-free or [sibling] is trivially pure (no + heap/global reads, no calls); + - [sibling]'s reads are disjoint from [e]'s writes — otherwise + [sibling] would read the pre-sink value of a local that [e] + overwrites. (The converse — [sibling] writing something [e] + reads — is caught by the walker returning [Bail] at intermediate + [LocalSet]/[LocalTee] and by the instruction-level continue + checks in [try_sink_in_instr].) *) +let may_cross_sibling ctx sibling = + effect_free sibling + && (ctx.e_effect_free || trivially_pure sibling) + && reads_disjoint_from_e_writes ctx (reads_of_expr sibling) + +let rec walk_expr ctx (e : W.expression) = + match e with + | W.Const _ | GlobalGet _ | RefFunc _ | RefNull _ | Pop _ -> Clean + | LocalGet y -> + if Var.equal y ctx.x then Found (W.LocalTee (ctx.x, ctx.e_to_sink)) else Clean + | LocalTee (y, _) when Var.equal y ctx.x -> + (* Another write to [x] — bail. *) + Bail + | LocalTee (y, e') -> ( + (* Reading [e'] first, then this tee writes [y]. Crossing this + point means [e] would run after the tee. Bail if: + - [y] is read by [e] (we'd read the tee's value instead of the + pre-tee one), or + - [y] is written by [e] (the final value of [y] would change). + + This [y] check is only a partial guard; it does NOT account for + [e']'s own effects or for [e'] reading a local that [e] writes. + The full safety comes from elsewhere: a [Clean] result here is + only ever crossed by a caller that gates continuation on + [may_cross_sibling], i.e. on [effect_free] of the enclosing + expression — and [effect_free] is [false] for anything + containing a [LocalTee]. So an expression with a tee can never + actually be stepped over; the [y] check is belt-and-braces. *) + match walk_expr ctx e' with + | Found e'' -> Found (W.LocalTee (y, e'')) + | Bail -> Bail + | Clean -> + if Var.Set.mem y ctx.reads || Var.Set.mem y ctx.writes then Bail else Clean) + | UnOp (op, e') -> wrap_unary (fun e -> W.UnOp (op, e)) ctx e' + | I32WrapI64 e' -> wrap_unary (fun e -> W.I32WrapI64 e) ctx e' + | I64ExtendI32 (s, e') -> wrap_unary (fun e -> W.I64ExtendI32 (s, e)) ctx e' + | F32DemoteF64 e' -> wrap_unary (fun e -> W.F32DemoteF64 e) ctx e' + | F64PromoteF32 e' -> wrap_unary (fun e -> W.F64PromoteF32 e) ctx e' + | RefI31 e' -> wrap_unary (fun e -> W.RefI31 e) ctx e' + | I31Get (s, e') -> wrap_unary (fun e -> W.I31Get (s, e)) ctx e' + | ArrayLen e' -> wrap_unary (fun e -> W.ArrayLen e) ctx e' + | StructGet (s, ty, i, e') -> wrap_unary (fun e -> W.StructGet (s, ty, i, e)) ctx e' + | RefCast (ty, e') -> wrap_unary (fun e -> W.RefCast (ty, e)) ctx e' + | RefTest (ty, e') -> wrap_unary (fun e -> W.RefTest (ty, e)) ctx e' + | ExternConvertAny e' -> wrap_unary (fun e -> W.ExternConvertAny e) ctx e' + | AnyConvertExtern e' -> wrap_unary (fun e -> W.AnyConvertExtern e) ctx e' + | BinOp (op, e1, e2) -> wrap_binary (fun a b -> W.BinOp (op, a, b)) ctx e1 e2 + | ArrayNew (ty, e1, e2) -> wrap_binary (fun a b -> W.ArrayNew (ty, a, b)) ctx e1 e2 + | ArrayNewData (ty, d, e1, e2) -> + wrap_binary (fun a b -> W.ArrayNewData (ty, d, a, b)) ctx e1 e2 + | ArrayGet (s, ty, e1, e2) -> + wrap_binary (fun a b -> W.ArrayGet (s, ty, a, b)) ctx e1 e2 + | RefEq (e1, e2) -> wrap_binary (fun a b -> W.RefEq (a, b)) ctx e1 e2 + | Call (f, args) -> wrap_list (fun args' -> W.Call (f, args')) ctx args + | ArrayNewFixed (ty, args) -> + wrap_list (fun args' -> W.ArrayNewFixed (ty, args')) ctx args + | StructNew (ty, args) -> wrap_list (fun args' -> W.StructNew (ty, args')) ctx args + | Call_ref (ty, f, args) -> ( + (* Wasm evaluates args before the funcref. *) + match wrap_list_intermediate ctx args with + | `Found args' -> Found (W.Call_ref (ty, f, args')) + | `Bail -> Bail + | `Clean -> ( + (* Between args and f, we've crossed every arg — allowed + only if each is [may_cross_sibling]-safe. That check was + already made inside [wrap_list_intermediate]. *) + match walk_expr ctx f with + | Found f' -> Found (W.Call_ref (ty, f', args)) + | Bail -> Bail + | Clean -> Clean)) + | IfExpr _ + | BlockExpr _ + | Seq _ + | Try _ + | Br_on_cast _ + | Br_on_cast_fail _ + | Br_on_null _ -> + (* Compound / control-flow-like expressions; we don't descend past + these for sinking purposes. *) + Bail + +and wrap_unary make ctx e' = + match walk_expr ctx e' with + | Found e'' -> Found (make e'') + | Bail -> Bail + | Clean -> Clean + +and wrap_binary make ctx e1 e2 = + match walk_expr ctx e1 with + | Found e1' -> Found (make e1' e2) + | Bail -> Bail + | Clean -> ( + if not (may_cross_sibling ctx e1 && tick ctx) + then Bail + else + match walk_expr ctx e2 with + | Found e2' -> Found (make e1 e2') + | Bail -> Bail + | Clean -> Clean) + +and wrap_list make ctx args = + match wrap_list_intermediate ctx args with + | `Found args' -> Found (make args') + | `Bail -> Bail + | `Clean -> Clean + +and wrap_list_intermediate ctx args = + let rec loop acc = function + | [] -> `Clean + | a :: rest -> ( + match walk_expr ctx a with + | Found a' -> `Found (List.rev_append acc (a' :: rest)) + | Bail -> `Bail + | Clean -> + if not (may_cross_sibling ctx a && tick ctx) + then `Bail + else loop (a :: acc) rest) + in + loop [] args + +(* Walk a single instruction looking for a sink target for [ctx.x]. *) +type instr_result = + | IFound of W.instruction + | IBail + | IClean + +let try_sink_in_instr ctx instr : instr_result = + let wrap_one make e = + match walk_expr ctx e with + | Found e' -> IFound (make e') + | Bail -> IBail + | Clean -> IClean + in + match (instr : W.instruction) with + | Nop | Event _ -> IClean + | Drop e -> wrap_one (fun e -> W.Drop e) e + | Push e -> wrap_one (fun e -> W.Push e) e + | LocalSet (y, e) when Var.equal y ctx.x -> ( + (* x may still appear inside [e] (evaluated before the set). If we + find and rewrite it there, we stop (the [local.set x] after + would be a shadowing write). If [e] is [Clean], this is a + shadowing write without a sink target → bail. *) + match walk_expr ctx e with + | Found e' -> IFound (W.LocalSet (y, e')) + | Bail -> IBail + | Clean -> IBail) + | LocalSet (y, e) -> wrap_one (fun e -> W.LocalSet (y, e)) e + | GlobalSet (g, e) -> wrap_one (fun e -> W.GlobalSet (g, e)) e + | StructSet (ty, i, e1, e2) -> ( + match walk_expr ctx e1 with + | Found e1' -> IFound (W.StructSet (ty, i, e1', e2)) + | Bail -> IBail + | Clean -> ( + if not (may_cross_sibling ctx e1 && tick ctx) + then IBail + else + match walk_expr ctx e2 with + | Found e2' -> IFound (W.StructSet (ty, i, e1, e2')) + | Bail -> IBail + | Clean -> IClean)) + | ArraySet (ty, e1, e2, e3) -> ( + match walk_expr ctx e1 with + | Found e1' -> IFound (W.ArraySet (ty, e1', e2, e3)) + | Bail -> IBail + | Clean -> ( + if not (may_cross_sibling ctx e1 && tick ctx) + then IBail + else + match walk_expr ctx e2 with + | Found e2' -> IFound (W.ArraySet (ty, e1, e2', e3)) + | Bail -> IBail + | Clean -> ( + if not (may_cross_sibling ctx e2 && tick ctx) + then IBail + else + match walk_expr ctx e3 with + | Found e3' -> IFound (W.ArraySet (ty, e1, e2, e3')) + | Bail -> IBail + | Clean -> IClean))) + | CallInstr (f, args) -> ( + match wrap_list_intermediate ctx args with + | `Found args' -> IFound (W.CallInstr (f, args')) + | `Bail -> IBail + | `Clean -> IClean) + (* Control-flow-terminal instructions with sub-expressions: we can still + rewrite within the expression, but cannot continue past on a Clean. *) + | Return (Some e) -> ( + match walk_expr ctx e with + | Found e' -> IFound (W.Return (Some e')) + | Bail -> IBail + | Clean -> IBail) + | Throw (t, e) -> ( + match walk_expr ctx e with + | Found e' -> IFound (W.Throw (t, e')) + | Bail -> IBail + | Clean -> IBail) + | Br (n, Some e) -> ( + match walk_expr ctx e with + | Found e' -> IFound (W.Br (n, Some e')) + | Bail -> IBail + | Clean -> IBail) + | Br_if (n, e) -> ( + match walk_expr ctx e with + | Found e' -> IFound (W.Br_if (n, e')) + | Bail -> IBail + | Clean -> IBail) + | Br_table (e, tl, d) -> ( + match walk_expr ctx e with + | Found e' -> IFound (W.Br_table (e', tl, d)) + | Bail -> IBail + | Clean -> IBail) + | Return_call (f, args) -> ( + match wrap_list_intermediate ctx args with + | `Found args' -> IFound (W.Return_call (f, args')) + | `Bail -> IBail + | `Clean -> IBail) + | Return_call_ref (ty, f, args) -> ( + match wrap_list_intermediate ctx args with + | `Found args' -> IFound (W.Return_call_ref (ty, f, args')) + | `Bail -> IBail + | `Clean -> ( + match walk_expr ctx f with + | Found f' -> IFound (W.Return_call_ref (ty, f', args)) + | Bail -> IBail + | Clean -> IBail)) + | Return None | Br (_, None) | Rethrow _ | Unreachable -> IBail + | If (ty, cond, l1, l2) -> ( + (* The condition is evaluated unconditionally before branching — + we can still sink into it. If no [x] is found there, we bail + because we can't continue past the branch. *) + match walk_expr ctx cond with + | Found cond' -> IFound (W.If (ty, cond', l1, l2)) + | Bail -> IBail + | Clean -> IBail) + | Loop _ | Block _ -> + (* No expression to walk at this level; do not sink into the body. *) + IBail + +(* Can we cross this instruction without [x] being found? Must be safe + with respect to [ctx.e_to_sink]'s potential side effects. An [Event] + carries a source-map location for everything that follows it: moving + an effectful [e_to_sink] past an event would re-attribute its call to + the wrong source position, so we only cross events when [e] is itself + effect-free. (A potentially-trapping but otherwise [effect_free] [e] + may still cross an event: see the traps assumption in the header — we + assume the trap never fires, so its attribution does not matter.) *) +let can_cross_instr ctx (instr : W.instruction) = + match instr with + | Nop -> true + | Event _ -> ctx.e_effect_free + | Drop e | Push e -> may_cross_sibling ctx e + | LocalSet (y, e) -> + may_cross_sibling ctx e + && (not (Var.Set.mem y ctx.reads)) + && not (Var.Set.mem y ctx.writes) + | _ -> false + +let try_sink_in_list ctx instrs = + let rec loop acc = function + | [] -> None + | instr :: rest -> ( + match try_sink_in_instr ctx instr with + | IFound instr' -> Some (List.rev_append acc (instr' :: rest)) + | IBail -> None + | IClean -> + if can_cross_instr ctx instr && tick ctx + then loop (instr :: acc) rest + else None) + in + loop [] instrs + +(* Bottom-up transformation: recurse first, then try to sink each + [local.set] into the (already-transformed) tail. *) +let rec transform_instrs instrs = + match instrs with + | [] -> [] + | W.LocalSet (x, e) :: rest -> ( + let e = transform_expr e in + let rest = transform_instrs rest in + let reads, writes = rw_of_expr e in + let ctx = + { x + ; e_to_sink = e + ; reads + ; writes + ; e_effect_free = effect_free e + ; budget = max_walk_distance + } + in + incr total_candidates; + match try_sink_in_list ctx rest with + | Some new_rest -> + incr total_sunk; + total_sink_distance := !total_sink_distance + (max_walk_distance - ctx.budget); + new_rest + | None -> W.LocalSet (x, e) :: rest) + | (W.Event _ as ev) :: rest -> ( + (* Sinking a [local.set] that previously separated two events + leaves them adjacent. Drop the earlier one so the closer + (later) event wins, matching the policy used in + [parse_bytecode], [code_generation], [deadcode], etc. *) + let rest = transform_instrs rest in + match rest with + | W.Event _ :: _ -> rest + | _ -> ev :: rest) + | instr :: rest -> + let instr = transform_instr instr in + let rest = transform_instrs rest in + instr :: rest + +and transform_instr (instr : W.instruction) : W.instruction = + match instr with + | Nop | Event _ | Br (_, None) | Return None | Rethrow _ | Unreachable -> instr + | Drop e -> Drop (transform_expr e) + | Push e -> Push (transform_expr e) + | LocalSet (x, e) -> LocalSet (x, transform_expr e) + | GlobalSet (g, e) -> GlobalSet (g, transform_expr e) + | StructSet (ty, i, e1, e2) -> StructSet (ty, i, transform_expr e1, transform_expr e2) + | ArraySet (ty, e1, e2, e3) -> + ArraySet (ty, transform_expr e1, transform_expr e2, transform_expr e3) + | CallInstr (f, args) -> CallInstr (f, List.map args ~f:transform_expr) + | Return (Some e) -> Return (Some (transform_expr e)) + | Throw (t, e) -> Throw (t, transform_expr e) + | Br (n, Some e) -> Br (n, Some (transform_expr e)) + | Br_if (n, e) -> Br_if (n, transform_expr e) + | Br_table (e, tl, d) -> Br_table (transform_expr e, tl, d) + | Return_call (f, args) -> Return_call (f, List.map args ~f:transform_expr) + | Return_call_ref (ty, f, args) -> + Return_call_ref (ty, transform_expr f, List.map args ~f:transform_expr) + | Loop (ty, l) -> Loop (ty, transform_instrs l) + | Block (ty, l) -> Block (ty, transform_instrs l) + | If (ty, cond, l1, l2) -> + If (ty, transform_expr cond, transform_instrs l1, transform_instrs l2) + +and transform_expr (e : W.expression) : W.expression = + match e with + | Const _ | LocalGet _ | GlobalGet _ | Pop _ | RefFunc _ | RefNull _ -> e + | UnOp (op, e') -> UnOp (op, transform_expr e') + | I32WrapI64 e' -> I32WrapI64 (transform_expr e') + | I64ExtendI32 (s, e') -> I64ExtendI32 (s, transform_expr e') + | F32DemoteF64 e' -> F32DemoteF64 (transform_expr e') + | F64PromoteF32 e' -> F64PromoteF32 (transform_expr e') + | RefI31 e' -> RefI31 (transform_expr e') + | I31Get (s, e') -> I31Get (s, transform_expr e') + | ArrayLen e' -> ArrayLen (transform_expr e') + | StructGet (s, ty, i, e') -> StructGet (s, ty, i, transform_expr e') + | RefCast (ty, e') -> RefCast (ty, transform_expr e') + | RefTest (ty, e') -> RefTest (ty, transform_expr e') + | ExternConvertAny e' -> ExternConvertAny (transform_expr e') + | AnyConvertExtern e' -> AnyConvertExtern (transform_expr e') + | Br_on_cast (i, ty, ty', e') -> Br_on_cast (i, ty, ty', transform_expr e') + | Br_on_cast_fail (i, ty, ty', e') -> Br_on_cast_fail (i, ty, ty', transform_expr e') + | Br_on_null (i, e') -> Br_on_null (i, transform_expr e') + | LocalTee (x, e') -> LocalTee (x, transform_expr e') + | BinOp (op, e1, e2) -> BinOp (op, transform_expr e1, transform_expr e2) + | ArrayNew (ty, e1, e2) -> ArrayNew (ty, transform_expr e1, transform_expr e2) + | ArrayNewData (ty, d, e1, e2) -> + ArrayNewData (ty, d, transform_expr e1, transform_expr e2) + | ArrayGet (s, ty, e1, e2) -> ArrayGet (s, ty, transform_expr e1, transform_expr e2) + | RefEq (e1, e2) -> RefEq (transform_expr e1, transform_expr e2) + | Call (f, args) -> Call (f, List.map args ~f:transform_expr) + | ArrayNewFixed (ty, args) -> ArrayNewFixed (ty, List.map args ~f:transform_expr) + | StructNew (ty, args) -> StructNew (ty, List.map args ~f:transform_expr) + | Call_ref (ty, f, args) -> + Call_ref (ty, transform_expr f, List.map args ~f:transform_expr) + | IfExpr (ty, cond, t, el) -> + IfExpr (ty, transform_expr cond, transform_expr t, transform_expr el) + | BlockExpr (ty, l) -> BlockExpr (ty, transform_instrs l) + | Seq (l, e') -> Seq (transform_instrs l, transform_expr e') + | Try (ty, body, catches) -> Try (ty, transform_instrs body, catches) + +let f instrs = + let t = Timer.make () in + incr total_calls; + let instrs = transform_instrs instrs in + total_time := !total_time +. Timer.get t; + instrs diff --git a/compiler/lib-wasm/local_sink.mli b/compiler/lib-wasm/local_sink.mli new file mode 100644 index 0000000000..a8d08fe719 --- /dev/null +++ b/compiler/lib-wasm/local_sink.mli @@ -0,0 +1,39 @@ +(* Wasm_of_ocaml compiler + * http://www.ocsigen.org/js_of_ocaml/ + * Copyright (C) 2026 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation, with linking exception; + * either version 2.1 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + *) + +(** Local.set sinking for the Wasm backend. + + For each [local.set x e] in a function, the pass tries to push the + write forward to the first subsequent [local.get x], turning the + [set]/[get] pair into a single [local.tee]. The sink is applied only + when it does not cross another write to [x] (no intervening + [local.set x] / [local.tee x]), does not cross a control-flow + boundary, and does not reorder effects unsafely (conservatively, we + require [e] or the intervening code to be effect-free). + + The pass does not touch the [locals] list — it only deletes some + [local.set]s and turns some [local.get x]s into [local.tee x ...]. + Variables that become dead as a result are cleaned up later by + [Var_coalescing]. *) + +val f : Wasm_ast.instruction list -> Wasm_ast.instruction list + +val report_stats : unit -> unit +(** Emit aggregated time/stats logs accumulated across all [f] calls and + reset the counters. Honours the [times] and [stats] debug flags. *) diff --git a/compiler/lib-wasm/var_coalescing.ml b/compiler/lib-wasm/var_coalescing.ml index 72628726ee..f064d91e61 100644 --- a/compiler/lib-wasm/var_coalescing.ml +++ b/compiler/lib-wasm/var_coalescing.ml @@ -51,10 +51,45 @@ module W = Wasm_ast let times = Debug.find "times" -let debug = Debug.find "wasm-var-coalescing" - let stats = Debug.find "stats" +(* Aggregated statistics across all calls to [f]. The pass runs once per + Wasm function; per-function logs are noisy, so we accumulate here and + emit a single summary via [report_stats]. *) +let total_time = ref 0. + +let total_calls = ref 0 + +let total_candidates = ref 0 + +let total_hint_count = ref 0 + +let total_opportunistic_count = ref 0 + +let total_dead_tee_count = ref 0 + +let report_stats () = + if !total_calls > 0 + then ( + if times () then Format.eprintf " wasm var coalescing: %.2f@." !total_time; + if stats () + then + Format.eprintf + "Stats - wasm var coalescing: %d functions, %d candidates, %d coalesced (%d \ + hint, %d opportunistic), %d dead tees@." + !total_calls + !total_candidates + (!total_hint_count + !total_opportunistic_count) + !total_hint_count + !total_opportunistic_count + !total_dead_tee_count; + total_time := 0.; + total_calls := 0; + total_candidates := 0; + total_hint_count := 0; + total_opportunistic_count := 0; + total_dead_tee_count := 0) + (* --------------------------------------------------------------------- *) (* CFG construction *) (* --------------------------------------------------------------------- *) @@ -131,12 +166,18 @@ type stmt_graph = [catch_entry], every variable currently live is extended back to the start of the try body, modeling the fact that an exception can fire anywhere inside the body. *) + ; tee_nodes : node_id list + (** IDs of the [Def] nodes created for every [LocalTee] in AST + traversal order. Used by the dead-tee sweep after liveness + to decide which tees store a value no subsequent instruction + will read. *) } type graph_builder = { mutable nodes : (node_id * action * node_id list) list ; hints : Var.t Var.Hashtbl.t ; tries : node_id Int.Hashtbl.t + ; mutable tees : node_id list (* accumulated in reverse AST order *) } (* Block stack frame: the target node id reached by [br n] with n equal @@ -165,7 +206,7 @@ let nth_opt l n = let build_cfg ~candidates ~param_vars instrs = let builder = - { nodes = []; hints = Var.Hashtbl.create 16; tries = Int.Hashtbl.create 8 } + { nodes = []; hints = Var.Hashtbl.create 16; tries = Int.Hashtbl.create 8; tees = [] } in let next_id = ref 0 in let reserve_id () = @@ -200,12 +241,30 @@ let build_cfg ~candidates ~param_vars instrs = then add_node (Def (Var.Set.singleton x)) [ exit_node ] else exit_node in + (* For [LocalTee] we additionally remember the Def node we just + created so that the dead-tee sweep can later consult it. When [x] + is not a candidate, no Def node is created (we return [exit_node] + unchanged) and no tee-sweep is needed for that position either, + so we don't record anything. *) + let tee_node x exit_node = + if Var.Set.mem x candidates + then ( + let id = add_node (Def (Var.Set.singleton x)) [ exit_node ] in + builder.tees <- id :: builder.tees; + id) + else ( + (* Record [-1] as a placeholder so the rewriter can still pop one + entry per [LocalTee] it encounters in the AST, keeping the + traversal orders synchronised. *) + builder.tees <- -1 :: builder.tees; + exit_node) + in let rec build_expr block_stack exit_node (e : W.expression) = match e with | Const _ | GlobalGet _ | Pop _ | RefFunc _ | RefNull _ -> exit_node | LocalGet x -> use_node x exit_node | LocalTee (x, e') -> - let tee = def_node x exit_node in + let tee = tee_node x exit_node in build_expr block_stack tee e' | UnOp (_, e') | I32WrapI64 e' @@ -375,6 +434,7 @@ let build_cfg ~candidates ~param_vars instrs = ; succs ; coalescing_hints = builder.hints ; try_blocks = builder.tries + ; tee_nodes = List.rev builder.tees } (* --------------------------------------------------------------------- *) @@ -800,78 +860,156 @@ let subst_var subst x = | None -> x | Some y -> y -let rec rewrite_expr subst (e : W.expression) : W.expression = +(* State threaded through the rewrite: the substitution, a mutable queue + of [tee_nodes] popped as we encounter each [LocalTee] in the AST, the + set of dead-tee node ids (as classified before rename), and an + accumulator of variables that had their only write eliminated by a + dead-tee replacement. *) +type rewriter = + { subst : Var.t Var.Hashtbl.t + ; mutable tee_queue : node_id list + ; dead_tees : unit Int.Hashtbl.t + ; erased : unit Var.Hashtbl.t + } + +let pop_tee r = + match r.tee_queue with + | [] -> + (* Every [LocalTee] encountered by the rewriter corresponds to one + entry recorded by [build_cfg]. If the queue is empty here, the + two traversals have drifted out of sync. *) + assert false + | id :: rest -> + r.tee_queue <- rest; + id + +(* A [rev_map]-style helper that applies [f] to list elements in + right-to-left order, matching [List.fold_right]'s visit order used + by [build_cfg] — the tee queue must be consumed in the same order it + was produced. *) +let map_right_to_left f l = List.fold_right l ~init:[] ~f:(fun x acc -> f x :: acc) + +let rec rewrite_expr r (e : W.expression) : W.expression = match e with | Const _ | GlobalGet _ | Pop _ | RefFunc _ | RefNull _ -> e - | LocalGet x -> LocalGet (subst_var subst x) - | LocalTee (x, e') -> LocalTee (subst_var subst x, rewrite_expr subst e') - | UnOp (op, e') -> UnOp (op, rewrite_expr subst e') - | I32WrapI64 e' -> I32WrapI64 (rewrite_expr subst e') - | I64ExtendI32 (s, e') -> I64ExtendI32 (s, rewrite_expr subst e') - | F32DemoteF64 e' -> F32DemoteF64 (rewrite_expr subst e') - | F64PromoteF32 e' -> F64PromoteF32 (rewrite_expr subst e') - | RefI31 e' -> RefI31 (rewrite_expr subst e') - | I31Get (s, e') -> I31Get (s, rewrite_expr subst e') - | ArrayLen e' -> ArrayLen (rewrite_expr subst e') - | StructGet (s, ty, i, e') -> StructGet (s, ty, i, rewrite_expr subst e') - | RefCast (ty, e') -> RefCast (ty, rewrite_expr subst e') - | RefTest (ty, e') -> RefTest (ty, rewrite_expr subst e') - | Br_on_cast (i, a, b, e') -> Br_on_cast (i, a, b, rewrite_expr subst e') - | Br_on_cast_fail (i, a, b, e') -> Br_on_cast_fail (i, a, b, rewrite_expr subst e') - | Br_on_null (i, e') -> Br_on_null (i, rewrite_expr subst e') - | ExternConvertAny e' -> ExternConvertAny (rewrite_expr subst e') - | AnyConvertExtern e' -> AnyConvertExtern (rewrite_expr subst e') - | BinOp (op, e1, e2) -> BinOp (op, rewrite_expr subst e1, rewrite_expr subst e2) - | ArrayNew (ty, e1, e2) -> ArrayNew (ty, rewrite_expr subst e1, rewrite_expr subst e2) + | LocalGet x -> LocalGet (subst_var r.subst x) + | LocalTee (x, e') -> + (* [build_cfg] records the tee's Def-node id BEFORE recursing into + [e']; mirror that here. *) + let id = pop_tee r in + let dead = id <> -1 && Int.Hashtbl.mem r.dead_tees id in + let e'' = rewrite_expr r e' in + if dead + then ( + Var.Hashtbl.replace r.erased x (); + e'') + else LocalTee (subst_var r.subst x, e'') + | UnOp (op, e') -> UnOp (op, rewrite_expr r e') + | I32WrapI64 e' -> I32WrapI64 (rewrite_expr r e') + | I64ExtendI32 (s, e') -> I64ExtendI32 (s, rewrite_expr r e') + | F32DemoteF64 e' -> F32DemoteF64 (rewrite_expr r e') + | F64PromoteF32 e' -> F64PromoteF32 (rewrite_expr r e') + | RefI31 e' -> RefI31 (rewrite_expr r e') + | I31Get (s, e') -> I31Get (s, rewrite_expr r e') + | ArrayLen e' -> ArrayLen (rewrite_expr r e') + | StructGet (s, ty, i, e') -> StructGet (s, ty, i, rewrite_expr r e') + | RefCast (ty, e') -> RefCast (ty, rewrite_expr r e') + | RefTest (ty, e') -> RefTest (ty, rewrite_expr r e') + | Br_on_cast (i, a, b, e') -> Br_on_cast (i, a, b, rewrite_expr r e') + | Br_on_cast_fail (i, a, b, e') -> Br_on_cast_fail (i, a, b, rewrite_expr r e') + | Br_on_null (i, e') -> Br_on_null (i, rewrite_expr r e') + | ExternConvertAny e' -> ExternConvertAny (rewrite_expr r e') + | AnyConvertExtern e' -> AnyConvertExtern (rewrite_expr r e') + (* Binary cases: [build_cfg] processes [e2] first (fold-right style). *) + | BinOp (op, e1, e2) -> + let e2' = rewrite_expr r e2 in + let e1' = rewrite_expr r e1 in + BinOp (op, e1', e2') + | ArrayNew (ty, e1, e2) -> + let e2' = rewrite_expr r e2 in + let e1' = rewrite_expr r e1 in + ArrayNew (ty, e1', e2') | ArrayNewData (ty, d, e1, e2) -> - ArrayNewData (ty, d, rewrite_expr subst e1, rewrite_expr subst e2) + let e2' = rewrite_expr r e2 in + let e1' = rewrite_expr r e1 in + ArrayNewData (ty, d, e1', e2') | ArrayGet (s, ty, e1, e2) -> - ArrayGet (s, ty, rewrite_expr subst e1, rewrite_expr subst e2) - | RefEq (e1, e2) -> RefEq (rewrite_expr subst e1, rewrite_expr subst e2) - | Call (f, l) -> Call (f, List.map l ~f:(rewrite_expr subst)) - | ArrayNewFixed (ty, l) -> ArrayNewFixed (ty, List.map l ~f:(rewrite_expr subst)) - | StructNew (ty, l) -> StructNew (ty, List.map l ~f:(rewrite_expr subst)) - | Call_ref (f, g, l) -> - Call_ref (f, rewrite_expr subst g, List.map l ~f:(rewrite_expr subst)) + let e2' = rewrite_expr r e2 in + let e1' = rewrite_expr r e1 in + ArrayGet (s, ty, e1', e2') + | RefEq (e1, e2) -> + let e2' = rewrite_expr r e2 in + let e1' = rewrite_expr r e1 in + RefEq (e1', e2') + (* List cases: args processed right-to-left. *) + | Call (f, l) -> Call (f, map_right_to_left (rewrite_expr r) l) + | ArrayNewFixed (ty, l) -> ArrayNewFixed (ty, map_right_to_left (rewrite_expr r) l) + | StructNew (ty, l) -> StructNew (ty, map_right_to_left (rewrite_expr r) l) + (* [Call_ref]: funcref first, then args right-to-left. *) + | Call_ref (ty, f, l) -> + let f' = rewrite_expr r f in + let l' = map_right_to_left (rewrite_expr r) l in + Call_ref (ty, f', l') + (* [IfExpr]: [e1], then [e2], then [cond] — [build_cfg] order. *) | IfExpr (ty, cond, e1, e2) -> - IfExpr (ty, rewrite_expr subst cond, rewrite_expr subst e1, rewrite_expr subst e2) - | BlockExpr (ty, l) -> BlockExpr (ty, rewrite_instrs subst l) - | Seq (l, e') -> Seq (rewrite_instrs subst l, rewrite_expr subst e') - | Try (ty, body, catches) -> Try (ty, rewrite_instrs subst body, catches) - -and rewrite_instr subst (i : W.instruction) : W.instruction option = + let e1' = rewrite_expr r e1 in + let e2' = rewrite_expr r e2 in + let cond' = rewrite_expr r cond in + IfExpr (ty, cond', e1', e2') + | BlockExpr (ty, l) -> BlockExpr (ty, rewrite_instrs r l) + | Seq (l, e') -> + let e'' = rewrite_expr r e' in + let l' = rewrite_instrs r l in + Seq (l', e'') + | Try (ty, body, catches) -> Try (ty, rewrite_instrs r body, catches) + +and rewrite_instr r (i : W.instruction) : W.instruction option = match i with | Nop | Event _ | Unreachable | Rethrow _ | Br (_, None) | Return None -> Some i - | Drop e -> Some (Drop (rewrite_expr subst e)) - | Push e -> Some (Push (rewrite_expr subst e)) + | Drop e -> Some (Drop (rewrite_expr r e)) + | Push e -> Some (Push (rewrite_expr r e)) | LocalSet (x, e) -> ( - let x' = subst_var subst x in - let e' = rewrite_expr subst e in + let e' = rewrite_expr r e in + let x' = subst_var r.subst x in match e' with | LocalGet y when Var.equal x' y -> None | _ -> Some (LocalSet (x', e'))) - | GlobalSet (x, e) -> Some (GlobalSet (x, rewrite_expr subst e)) + | GlobalSet (x, e) -> Some (GlobalSet (x, rewrite_expr r e)) | StructSet (ty, i, e1, e2) -> - Some (StructSet (ty, i, rewrite_expr subst e1, rewrite_expr subst e2)) + let e2' = rewrite_expr r e2 in + let e1' = rewrite_expr r e1 in + Some (StructSet (ty, i, e1', e2')) | ArraySet (ty, e1, e2, e3) -> - Some - (ArraySet (ty, rewrite_expr subst e1, rewrite_expr subst e2, rewrite_expr subst e3)) - | CallInstr (f, l) -> Some (CallInstr (f, List.map l ~f:(rewrite_expr subst))) - | Return_call (f, l) -> Some (Return_call (f, List.map l ~f:(rewrite_expr subst))) - | Return_call_ref (f, g, l) -> - Some (Return_call_ref (f, rewrite_expr subst g, List.map l ~f:(rewrite_expr subst))) - | Return (Some e) -> Some (Return (Some (rewrite_expr subst e))) - | Throw (t, e) -> Some (Throw (t, rewrite_expr subst e)) - | Br (i, Some e) -> Some (Br (i, Some (rewrite_expr subst e))) - | Br_if (i, e) -> Some (Br_if (i, rewrite_expr subst e)) - | Br_table (e, l, d) -> Some (Br_table (rewrite_expr subst e, l, d)) - | Loop (ty, l) -> Some (Loop (ty, rewrite_instrs subst l)) - | Block (ty, l) -> Some (Block (ty, rewrite_instrs subst l)) + let e3' = rewrite_expr r e3 in + let e2' = rewrite_expr r e2 in + let e1' = rewrite_expr r e1 in + Some (ArraySet (ty, e1', e2', e3')) + | CallInstr (f, l) -> Some (CallInstr (f, map_right_to_left (rewrite_expr r) l)) + | Return_call (f, l) -> Some (Return_call (f, map_right_to_left (rewrite_expr r) l)) + | Return_call_ref (ty, f, l) -> + let f' = rewrite_expr r f in + let l' = map_right_to_left (rewrite_expr r) l in + Some (Return_call_ref (ty, f', l')) + | Return (Some e) -> Some (Return (Some (rewrite_expr r e))) + | Throw (t, e) -> Some (Throw (t, rewrite_expr r e)) + | Br (i, Some e) -> Some (Br (i, Some (rewrite_expr r e))) + | Br_if (i, e) -> Some (Br_if (i, rewrite_expr r e)) + | Br_table (e, l, d) -> Some (Br_table (rewrite_expr r e, l, d)) + | Loop (ty, l) -> Some (Loop (ty, rewrite_instrs r l)) + | Block (ty, l) -> Some (Block (ty, rewrite_instrs r l)) | If (ty, cond, l1, l2) -> - Some - (If (ty, rewrite_expr subst cond, rewrite_instrs subst l1, rewrite_instrs subst l2)) - -and rewrite_instrs subst l = List.filter_map l ~f:(rewrite_instr subst) + let l1' = rewrite_instrs r l1 in + let l2' = rewrite_instrs r l2 in + let cond' = rewrite_expr r cond in + Some (If (ty, cond', l1', l2')) + +(* [build_cfg] processes instruction lists right-to-left via [fold_right]. + Use the same order here so the tee queue is consumed consistently. *) +and rewrite_instrs r l = + List.fold_right l ~init:[] ~f:(fun i acc -> + match rewrite_instr r i with + | Some i' -> i' :: acc + | None -> acc) (* --------------------------------------------------------------------- *) (* Entry point *) @@ -899,43 +1037,162 @@ let f ~param_names ~param_types ~locals instrs = let num_candidates = Var.Set.cardinal candidates in if num_candidates <= 1 then locals, instrs - else + else ( + incr total_calls; + total_candidates := !total_candidates + num_candidates; let g = build_cfg ~candidates ~param_vars instrs in let live_in_map = compute_liveness g in + (* Dead-tee detection using the liveness already computed. A + [LocalTee x e] Def node is "dead" iff [x] is not in the node's + [live_out] — no subsequent instruction will read what the tee + stored. This must be done on the ORIGINAL variables, before + rename: coalescing only guarantees disjoint live ranges for + the vars it merges, not that the tee's write is unobserved by + the slot's other users. + + [live_out] here is the union over the node's NORMAL-flow + successors only; it does not include the exception-edge + liveness that [compute_live_ranges] adds via the [try_blocks] + back-extension. That is sound because the input is in (loop-) + SSA form: a local read by a catch handler is defined by a block + that dominates the handler, and since the handler is reachable + from anywhere in the protected body, no interior body point + dominates it. Hence the defining [LocalTee] always sits BEFORE + the [Try], never inside the body, and an interior tee can never + be the sole definition of a variable observed only on the + exceptional path. If that invariant were ever broken, an + interior tee whose value is read only by a handler would be + wrongly dropped here. *) + let dead_tees = Int.Hashtbl.create 16 in + List.iter g.tee_nodes ~f:(fun id -> + if id >= 0 + then + let live_out = + List.fold_left g.succs.(id) ~init:Var.Set.empty ~f:(fun acc s -> + Var.Set.union acc live_in_map.(s)) + in + match (g.actions.(id) : action) with + | Def d -> + let x = Var.Set.choose d in + if not (Var.Set.mem x live_out) then Int.Hashtbl.replace dead_tees id () + | Use _ | Nop -> ()); let ranges = compute_live_ranges g live_in_map candidates param_vars in let subst = Var.Hashtbl.create num_candidates in let hint_count, opportunistic_count = allocate_registers subst types ranges g.coalescing_hints in - if debug () - then - Format.eprintf - "wasm var-coalescing: %d candidates, %d hint + %d opportunistic@." - num_candidates - hint_count - opportunistic_count; - (* Only candidates that merged into a different representative are - recorded in [subst]; its size is therefore the count of changes. *) - if Var.Hashtbl.length subst = 0 + let dead_tee_count = Int.Hashtbl.length dead_tees in + total_hint_count := !total_hint_count + hint_count; + total_opportunistic_count := !total_opportunistic_count + opportunistic_count; + total_dead_tee_count := !total_dead_tee_count + dead_tee_count; + (* Short-circuit only when BOTH rewriting drivers would be no-ops. *) + if Var.Hashtbl.length subst = 0 && dead_tee_count = 0 then ( - if times () then Format.eprintf " wasm var coalescing: %a@." Timer.print t; + total_time := !total_time +. Timer.get t; locals, instrs) else - let instrs = rewrite_instrs subst instrs in - (* Rebuild locals: keep only those whose representative is themselves, - i.e. those that weren't merged away. Preserve original ordering to - keep the Wasm output deterministic. *) + let r = + { subst; tee_queue = g.tee_nodes; dead_tees; erased = Var.Hashtbl.create 16 } + in + let instrs = rewrite_instrs r instrs in + (* The tee queue must be fully consumed if [build_cfg] and the + rewriter saw the same AST. *) + assert (List.is_empty r.tee_queue); + (* Dropping dead tees may have left some locals with no + references at all. Scan the rewritten body once to find such + orphans among the vars we erased, and drop them from the + [locals] declaration. *) + let still_referenced = + if Var.Hashtbl.length r.erased = 0 + then r.erased (* unused; avoid a scan *) + else + let seen = Var.Hashtbl.create (Var.Hashtbl.length r.erased) in + let note v = if Var.Hashtbl.mem r.erased v then Var.Hashtbl.replace seen v () in + let rec scan_expr (e : W.expression) = + match e with + | Const _ | GlobalGet _ | Pop _ | RefFunc _ | RefNull _ -> () + | LocalGet x -> note x + | LocalTee (x, e') -> + note x; + scan_expr e' + | UnOp (_, e') + | I32WrapI64 e' + | I64ExtendI32 (_, e') + | F32DemoteF64 e' + | F64PromoteF32 e' + | RefI31 e' + | I31Get (_, e') + | ArrayLen e' + | StructGet (_, _, _, e') + | RefCast (_, e') + | RefTest (_, e') + | Br_on_cast (_, _, _, e') + | Br_on_cast_fail (_, _, _, e') + | Br_on_null (_, e') + | ExternConvertAny e' + | AnyConvertExtern e' -> scan_expr e' + | BinOp (_, e1, e2) + | ArrayNew (_, e1, e2) + | ArrayNewData (_, _, e1, e2) + | ArrayGet (_, _, e1, e2) + | RefEq (e1, e2) -> + scan_expr e1; + scan_expr e2 + | Call (_, l) | ArrayNewFixed (_, l) | StructNew (_, l) -> + List.iter l ~f:scan_expr + | Call_ref (_, f, l) -> + scan_expr f; + List.iter l ~f:scan_expr + | IfExpr (_, c, t, el) -> + scan_expr c; + scan_expr t; + scan_expr el + | BlockExpr (_, l) -> scan_instrs l + | Seq (l, e') -> + scan_instrs l; + scan_expr e' + | Try (_, body, _) -> scan_instrs body + and scan_instr (i : W.instruction) = + match i with + | Nop | Event _ | Br (_, None) | Return None | Rethrow _ | Unreachable -> () + | Drop e + | Push e + | GlobalSet (_, e) + | Br (_, Some e) + | Br_if (_, e) + | Br_table (e, _, _) + | Throw (_, e) + | Return (Some e) -> scan_expr e + | LocalSet (x, e) -> + note x; + scan_expr e + | StructSet (_, _, e1, e2) -> + scan_expr e1; + scan_expr e2 + | ArraySet (_, e1, e2, e3) -> + scan_expr e1; + scan_expr e2; + scan_expr e3 + | CallInstr (_, l) | Return_call (_, l) -> List.iter l ~f:scan_expr + | Return_call_ref (_, f, l) -> + scan_expr f; + List.iter l ~f:scan_expr + | Loop (_, l) | Block (_, l) -> scan_instrs l + | If (_, c, t, el) -> + scan_expr c; + scan_instrs t; + scan_instrs el + and scan_instrs l = List.iter l ~f:scan_instr in + scan_instrs instrs; + seen + in + (* Rebuild locals: drop vars merged away, or vars whose only + reference was an eliminated dead tee and who have no + surviving references in the rewritten body. *) let kept_locals = - List.filter locals ~f:(fun (v, _) -> not (Var.Hashtbl.mem subst v)) + List.filter locals ~f:(fun (v, _) -> + (not (Var.Hashtbl.mem subst v)) + && not (Var.Hashtbl.mem r.erased v && not (Var.Hashtbl.mem still_referenced v))) in - if times () then Format.eprintf " wasm var coalescing: %a@." Timer.print t; - if stats () - then - Format.eprintf - "Stats - wasm var coalescing: %d candidates, %d coalesced (%d hint, %d \ - opportunistic)@." - num_candidates - (hint_count + opportunistic_count) - hint_count - opportunistic_count; - kept_locals, instrs + total_time := !total_time +. Timer.get t; + kept_locals, instrs) diff --git a/compiler/lib-wasm/var_coalescing.mli b/compiler/lib-wasm/var_coalescing.mli index df3ffb6147..3a0a2c769a 100644 --- a/compiler/lib-wasm/var_coalescing.mli +++ b/compiler/lib-wasm/var_coalescing.mli @@ -34,3 +34,7 @@ val f : -> locals:(Code.Var.t * Wasm_ast.value_type) list -> Wasm_ast.instruction list -> (Code.Var.t * Wasm_ast.value_type) list * Wasm_ast.instruction list + +val report_stats : unit -> unit +(** Emit aggregated time/stats logs accumulated across all [f] calls and + reset the counters. Honours the [times] and [stats] debug flags. *) diff --git a/compiler/lib/config.ml b/compiler/lib/config.ml index 48787daa3a..cf474d838b 100644 --- a/compiler/lib/config.ml +++ b/compiler/lib/config.ml @@ -105,6 +105,8 @@ module Flag = struct let wasm_var_coalescing = o ~name:"wasm-var-coalescing" ~default:true + let wasm_local_sink = o ~name:"wasm-local-sink" ~default:true + let header = o ~name:"header" ~default:true let auto_link = o ~name:"auto-link" ~default:true diff --git a/compiler/lib/config.mli b/compiler/lib/config.mli index 54b4118690..0d00527384 100644 --- a/compiler/lib/config.mli +++ b/compiler/lib/config.mli @@ -54,6 +54,8 @@ module Flag : sig val wasm_var_coalescing : unit -> bool + val wasm_local_sink : unit -> bool + val debugger : unit -> bool val pretty : unit -> bool From 6a73e8f268831c7a226b28ca91021082317cbaf2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Vouillon?= Date: Mon, 11 May 2026 18:24:40 +0200 Subject: [PATCH 5/6] Wasm: reorder locals so hot ones get low indices Wasm encodes local indices as LEB128 u32 (1 byte for < 128, 2 above). With [wasm-opt] skipped at --opt 1, hot locals routinely cross the 128 boundary and pay an extra byte per access. Add a [Reorder_locals] pass that sorts non-parameter locals into a numeric block followed by a reference block, with same-type runs ordered by descending total use count and individual locals inside a run by descending per-local count. Indices are derived from list position, so reordering [locals] alone suffices. Runs at the end of [post_process_function_body], after [Initialize_locals.f]; gated on [O1] and the new [wasm-reorder-locals] flag. --- compiler/lib-wasm/gc_target.ml | 12 +- compiler/lib-wasm/reorder_locals.ml | 199 +++++++++++++++++++++++++++ compiler/lib-wasm/reorder_locals.mli | 34 +++++ compiler/lib/config.ml | 2 + compiler/lib/config.mli | 2 + 5 files changed, 248 insertions(+), 1 deletion(-) create mode 100644 compiler/lib-wasm/reorder_locals.ml create mode 100644 compiler/lib-wasm/reorder_locals.mli diff --git a/compiler/lib-wasm/gc_target.ml b/compiler/lib-wasm/gc_target.ml index 3b960c95d2..bd6da643f8 100644 --- a/compiler/lib-wasm/gc_target.ml +++ b/compiler/lib-wasm/gc_target.ml @@ -2072,7 +2072,17 @@ let post_process_function_body ~profile ~param_names ~param_types ~locals body = Var_coalescing.f ~param_names ~param_types ~locals body | O1 | O2 | O3 -> locals, body in - Initialize_locals.f ~param_names ~locals body + let locals, body = Initialize_locals.f ~param_names ~locals body in + (* Reorder locals so frequently-used ones get low Wasm indices + (one-byte LEB128 encoding for indices < 128). Gated on [O1] to + match the rest of this pipeline; at [O2]/[O3] [wasm-opt] handles + local reordering. *) + let locals = + match (profile : Profile.t) with + | O1 when Config.Flag.wasm_reorder_locals () -> Reorder_locals.f ~locals body + | O1 | O2 | O3 -> locals + in + locals, body let entry_point ~toplevel_fun = let code = diff --git a/compiler/lib-wasm/reorder_locals.ml b/compiler/lib-wasm/reorder_locals.ml new file mode 100644 index 0000000000..e5deefbff6 --- /dev/null +++ b/compiler/lib-wasm/reorder_locals.ml @@ -0,0 +1,199 @@ +(* Wasm_of_ocaml compiler + * http://www.ocsigen.org/js_of_ocaml/ + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation, with linking exception; + * either version 2.1 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + *) + +open! Stdlib +module W = Wasm_ast +module Var = Code.Var + +(* + Reorder a function's non-parameter locals. The resulting layout is + + [ numeric block | reference block ] + + Each block is a sequence of same-type runs (i32, i64, f32, f64 in + the numeric block; the various ref types in the reference block). + Within a block, runs are ordered by descending total use count, and + within each run the individual locals are ordered by descending use + count. Parameters are not part of this list and keep their + signature-fixed indices. + + Why low indices matter. [local.get]/[local.set]/[local.tee] encode + their local index as LEB128, so indices 0..127 take one byte and + 128..16383 take two. Placing high-use locals first shaves a byte + off each of their accesses in the emitted Wasm. + + Why numerics come before references. V8's Liftoff baseline expects + two contiguous regions: numerics first (zero-initialised via a fast + block-zeroing path) and references after (null-initialised by a + single follow-up pass). OCaml code is reference-heavy, so the + leading numeric block tends to be small and the hot ref locals + still get low indices. + + Why same-type runs. The binary format encodes consecutive locals of + the same type as a single [(count, type)] pair, and V8 Liftoff + initialises each type-run as one block, so contiguity makes the + function section smaller and warm-up faster. + + Why no body rewrite is needed. [local.get]/[local.set]/[local.tee] + reference locals by [Var.t], not by a baked-in numeric index; + [wasm_output] derives each numeric index from the variable's position + in [param_names @ locals] at emit time. So permuting the [locals] + list is the whole transformation — the body is read only to count + uses and returned untouched. (Parameters live in [param_names], which + we never touch, so their indices are fixed.) +*) + +(* --------------------------------------------------------------------- *) +(* Use counting *) +(* --------------------------------------------------------------------- *) + +let bump counts x = + match Var.Hashtbl.find_opt counts x with + | Some n -> Var.Hashtbl.replace counts x (n + 1) + | None -> () +(* Parameters are absent from [counts], so their reads go uncounted. *) + +let rec count_expr counts (e : W.expression) = + match e with + | Const _ | GlobalGet _ | RefFunc _ | RefNull _ | Pop _ -> () + | LocalGet x -> bump counts x + | LocalTee (x, e') -> + bump counts x; + count_expr counts e' + | UnOp (_, e') + | I32WrapI64 e' + | I64ExtendI32 (_, e') + | F32DemoteF64 e' + | F64PromoteF32 e' + | RefI31 e' + | I31Get (_, e') + | ArrayLen e' + | StructGet (_, _, _, e') + | RefCast (_, e') + | RefTest (_, e') + | Br_on_cast (_, _, _, e') + | Br_on_cast_fail (_, _, _, e') + | Br_on_null (_, e') + | ExternConvertAny e' + | AnyConvertExtern e' -> count_expr counts e' + | BinOp (_, e1, e2) + | ArrayNew (_, e1, e2) + | ArrayNewData (_, _, e1, e2) + | ArrayGet (_, _, e1, e2) + | RefEq (e1, e2) -> + count_expr counts e1; + count_expr counts e2 + | Call (_, l) | ArrayNewFixed (_, l) | StructNew (_, l) -> + List.iter l ~f:(count_expr counts) + | Call_ref (_, f, l) -> + count_expr counts f; + List.iter l ~f:(count_expr counts) + | IfExpr (_, c, t, el) -> + count_expr counts c; + count_expr counts t; + count_expr counts el + | BlockExpr (_, l) -> count_instrs counts l + | Seq (l, e') -> + count_instrs counts l; + count_expr counts e' + | Try (_, body, _) -> count_instrs counts body + +and count_instr counts (i : W.instruction) = + match i with + | Nop | Event _ | Br (_, None) | Return None | Rethrow _ | Unreachable -> () + | Drop e + | Push e + | GlobalSet (_, e) + | Br (_, Some e) + | Br_if (_, e) + | Br_table (e, _, _) + | Throw (_, e) + | Return (Some e) -> count_expr counts e + | LocalSet (x, e) -> + bump counts x; + count_expr counts e + | StructSet (_, _, e1, e2) -> + count_expr counts e1; + count_expr counts e2 + | ArraySet (_, e1, e2, e3) -> + count_expr counts e1; + count_expr counts e2; + count_expr counts e3 + | CallInstr (_, l) | Return_call (_, l) -> List.iter l ~f:(count_expr counts) + | Return_call_ref (_, f, l) -> + count_expr counts f; + List.iter l ~f:(count_expr counts) + | Loop (_, l) | Block (_, l) -> count_instrs counts l + | If (_, c, t, el) -> + count_expr counts c; + count_instrs counts t; + count_instrs counts el + +and count_instrs counts l = List.iter l ~f:(count_instr counts) + +(* --------------------------------------------------------------------- *) +(* Reordering *) +(* --------------------------------------------------------------------- *) + +let is_ref_type (t : W.value_type) = + match t with + | Ref _ -> true + | I32 | I64 | F32 | F64 -> false + +(* Bucket the locals by type — one bucket holding every local of a given + type — preserving the first-occurrence order of the types. Returns a + list of (type, locals-of-that-type). + Collecting all same-type locals into a single bucket is what lets the + final layout emit one [(count, type)] run per type. *) +let group_by_type locals = + let types = + List.rev + (List.fold_left locals ~init:[] ~f:(fun seen (_, t) -> + if List.mem ~eq:Poly.equal t seen then seen else t :: seen)) + in + List.map types ~f:(fun t -> t, List.filter locals ~f:(fun (_, t') -> Poly.equal t t')) + +(* Stable sort [l] by descending key [f x]. *) +let sort_desc ~key l = List.stable_sort l ~cmp:(fun a b -> compare (key b) (key a)) + +let f ~locals body = + let counts = Var.Hashtbl.create (List.length locals) in + List.iter locals ~f:(fun (x, _) -> Var.Hashtbl.add counts x 0); + count_instrs counts body; + let count_of x = Var.Hashtbl.find counts x in + (* Split into reference-typed and numeric-typed locals. [List.partition] + preserves relative order, so [group_by_type] sees the types in their + original first-occurrence order. *) + let refs, nums = List.partition locals ~f:(fun (_, t) -> is_ref_type t) in + let reorder_group group = + let runs = group_by_type group in + let runs = + List.map runs ~f:(fun (t, members) -> + let sorted = sort_desc members ~key:(fun (x, _) -> count_of x) in + let total = List.fold_left sorted ~init:0 ~f:(fun s (x, _) -> s + count_of x) in + t, sorted, total) + in + let runs = sort_desc runs ~key:(fun (_, _, total) -> total) in + List.concat_map runs ~f:(fun (_, sorted, _) -> sorted) + in + (* Numeric-typed locals are typically much rarer in OCaml code than + reference-typed ones, so placing them first keeps the high-use + reference locals in a contiguous run starting right after — none + of them pushed across the 128-index boundary by intervening + numerics. *) + reorder_group nums @ reorder_group refs diff --git a/compiler/lib-wasm/reorder_locals.mli b/compiler/lib-wasm/reorder_locals.mli new file mode 100644 index 0000000000..3e242abaf7 --- /dev/null +++ b/compiler/lib-wasm/reorder_locals.mli @@ -0,0 +1,34 @@ +(* Wasm_of_ocaml compiler + * http://www.ocsigen.org/js_of_ocaml/ + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published by + * the Free Software Foundation, with linking exception; + * either version 2.1 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + *) + +(** Reorder a function's [locals] list so frequently-used locals get + lower Wasm indices (one-byte LEB128 encoding for indices < 128). + + Parameters are untouched; only the non-parameter [locals] list is + reordered. The instruction body is unchanged — the Wasm local + index of a variable is determined by its position in + [param_names @ locals], so rearranging the list is enough. + + Reference-type locals and numeric-type locals are kept in separate + blocks so that the run-length encoding performed by + [Wasm_output.coalesce_locals] doesn't get fragmented. *) + +val f : + locals:(Wasm_ast.var * Wasm_ast.value_type) list + -> Wasm_ast.instruction list + -> (Wasm_ast.var * Wasm_ast.value_type) list diff --git a/compiler/lib/config.ml b/compiler/lib/config.ml index cf474d838b..29187b4385 100644 --- a/compiler/lib/config.ml +++ b/compiler/lib/config.ml @@ -107,6 +107,8 @@ module Flag = struct let wasm_local_sink = o ~name:"wasm-local-sink" ~default:true + let wasm_reorder_locals = o ~name:"wasm-reorder-locals" ~default:true + let header = o ~name:"header" ~default:true let auto_link = o ~name:"auto-link" ~default:true diff --git a/compiler/lib/config.mli b/compiler/lib/config.mli index 0d00527384..dc9a957a45 100644 --- a/compiler/lib/config.mli +++ b/compiler/lib/config.mli @@ -56,6 +56,8 @@ module Flag : sig val wasm_local_sink : unit -> bool + val wasm_reorder_locals : unit -> bool + val debugger : unit -> bool val pretty : unit -> bool From 63bc2a72b9731204398d90beb501f3466d1b0a08 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Vouillon?= Date: Tue, 26 May 2026 23:58:43 +0200 Subject: [PATCH 6/6] CHANGES: skip wasm-opt at --opt 1 (#2238) --- CHANGES.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGES.md b/CHANGES.md index 7d0fb12e3d..ebdb0092e1 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -10,6 +10,7 @@ * Compiler: improved shape computation (#2198) * Add the --build-config and --apply-build-config flags (#2177) * Runtime/wasm: optimized some bigstring primitives (#2144) +* Compiler/wasm: skip wasm-opt at --opt 1 (#2238) * Lib: many additional `Dom_html` bindings (#2221) * Lib: add `Performance` module (#2221) * Put more values into global variables (#2211)