From 96602d547b1c267132f7b4a6c428dea3e970ee61 Mon Sep 17 00:00:00 2001 From: ekatsym Date: Fri, 28 Jan 2022 10:50:14 +0900 Subject: [PATCH 01/13] add node_action --- src/syntax/ppx_sedlex.ml | 17 ++++++++++++----- src/syntax/sedlex.ml | 22 ++++++++++++++++++---- src/syntax/sedlex.mli | 7 ++++++- 3 files changed, 36 insertions(+), 10 deletions(-) diff --git a/src/syntax/ppx_sedlex.ml b/src/syntax/ppx_sedlex.ml index e184313d..5b8dc531 100644 --- a/src/syntax/ppx_sedlex.ml +++ b/src/syntax/ppx_sedlex.ml @@ -175,6 +175,8 @@ let partition (name, p) = [%e body] ] +(* Alias offset *) + (* Code generation for the automata *) let best_final final = @@ -196,8 +198,8 @@ let call_state lexbuf auto state = let gen_state lexbuf auto i (trans, final) = let loc = default_loc in - let partition = Array.map fst trans in - let cases = Array.mapi (fun i (_, j) -> case ~lhs:(pint ~loc i) ~guard:None ~rhs:(call_state lexbuf auto j)) trans in + let partition = Array.map (fun (c, _, _) -> c) trans in + let cases = Array.mapi (fun i (_, j, _acts) -> case ~lhs:(pint ~loc i) ~guard:None ~rhs:(call_state lexbuf auto j)) trans in let cases = Array.to_list cases in let body () = pexp_match ~loc @@ -217,7 +219,7 @@ let gen_recflag auto = Array.iter (fun (trans_i, _) -> Array.iter - (fun (_, j) -> + (fun (_, j, _) -> let (trans_j, _) = auto.(j) in if Array.length trans_j > 0 then raise Exit) trans_i) @@ -353,12 +355,17 @@ let regexp_of_pattern env = with Not_found -> err p.ppat_loc (Printf.sprintf "unbound regexp %s" x) end + | Ppat_alias (pat, {txt=label}) -> + let begin_offset_slot_var = "__" ^ label ^ "_begin_offset" in + let end_offset_slot_var = "__" ^ label ^"_end_offset" in + aux pat + |> Sedlex.set_post_action (`save_offset end_offset_slot_var) + |> Sedlex.set_pre_action (`save_offset begin_offset_slot_var) | _ -> - err p.ppat_loc "this pattern is not a valid regexp" + err p.ppat_loc "this pattern is not a valid regexp" in aux - let previous = ref [] let regexps = ref [] let should_set_cookies = ref false diff --git a/src/syntax/sedlex.ml b/src/syntax/sedlex.ml index aeb42d36..9fc54eb6 100644 --- a/src/syntax/sedlex.ml +++ b/src/syntax/sedlex.ml @@ -8,18 +8,29 @@ module Cset = Sedlex_cset type node = { id : int; + mutable action : node_action list; mutable eps : node list; mutable trans : (Cset.t * node) list; } +and node_action = [`save_offset of string] (* Compilation regexp -> NFA *) type regexp = node -> node +let set_pre_action act re succ = + let init = re succ in + init.action <- act :: init.action; + init + +let set_post_action act re succ = + succ.action <- act :: succ.action; + re succ + let cur_id = ref 0 let new_node () = incr cur_id; - { id = !cur_id; eps = []; trans = [] } + { id = !cur_id; action = []; eps = []; trans = [] } let seq r1 r2 succ = r1 (r2 succ) @@ -94,8 +105,8 @@ let transition (state : state) = (* Merge transition with the same target *) let rec norm = function | (c1, n1)::((c2, n2)::q as l) -> - if n1 == n2 then norm ((Cset.union c1 c2, n1)::q) - else (c1, n1)::(norm l) + if n1 == n2 then norm ((Cset.union c1 c2, n1)::q) + else (c1, n1)::(norm l) | l -> l in let t = List.concat (List.map (fun n -> n.trans) state) in let t = norm (List.sort (fun (_, n1) (_, n2) -> n1.id - n2.id) t) in @@ -133,7 +144,10 @@ let compile rs = incr counter; Hashtbl.add states state i; let trans = transition state in - let trans = Array.map (fun (p, t) -> (p, aux t)) trans in + let trans = + Array.map + (fun (p, t) -> (p, aux t, List.concat_map (fun n -> n.action) t)) + trans in let finals = Array.map (fun (_, f) -> List.memq f state) rs in Hashtbl.add states_def i (trans, finals); i diff --git a/src/syntax/sedlex.mli b/src/syntax/sedlex.mli index d0e03926..65cdccc1 100644 --- a/src/syntax/sedlex.mli +++ b/src/syntax/sedlex.mli @@ -4,6 +4,11 @@ type regexp +type node_action = [`save_offset of string] + +val set_pre_action: node_action -> regexp -> regexp +val set_post_action: node_action -> regexp -> regexp + val chars: Sedlex_cset.t -> regexp val seq: regexp -> regexp -> regexp val alt: regexp -> regexp -> regexp @@ -21,4 +26,4 @@ val intersection: regexp -> regexp -> regexp option (* If each argument is a single [chars] regexp, returns a regexp which matches the intersection set. Otherwise returns [None]. *) -val compile: regexp array -> ((Sedlex_cset.t * int) array * bool array) array +val compile: regexp array -> ((Sedlex_cset.t * int * node_action list) array * bool array) array From 92c738e50cee161a52bd3050ea5273796b367864 Mon Sep 17 00:00:00 2001 From: hongoh Date: Wed, 2 Feb 2022 18:44:04 +0900 Subject: [PATCH 02/13] WIP: gen_aliases. --- src/syntax/ppx_sedlex.ml | 26 ++++++++++++++++++++------ src/syntax/sedlex.ml | 2 +- src/syntax/sedlex.mli | 2 +- 3 files changed, 22 insertions(+), 8 deletions(-) mode change 100644 => 100755 src/syntax/ppx_sedlex.ml diff --git a/src/syntax/ppx_sedlex.ml b/src/syntax/ppx_sedlex.ml old mode 100644 new mode 100755 index 5b8dc531..147675f5 --- a/src/syntax/ppx_sedlex.ml +++ b/src/syntax/ppx_sedlex.ml @@ -199,7 +199,7 @@ let call_state lexbuf auto state = let gen_state lexbuf auto i (trans, final) = let loc = default_loc in let partition = Array.map (fun (c, _, _) -> c) trans in - let cases = Array.mapi (fun i (_, j, _acts) -> case ~lhs:(pint ~loc i) ~guard:None ~rhs:(call_state lexbuf auto j)) trans in + let cases = Array.mapi (fun i (_, j, _) -> case ~lhs:(pint ~loc i) ~guard:None ~rhs:(call_state lexbuf auto j)) trans in let cases = Array.to_list cases in let body () = pexp_match ~loc @@ -212,6 +212,12 @@ let gen_state lexbuf auto i (trans, final) = | Some _ when Array.length trans = 0 -> [] | Some i -> ret [%expr Sedlexing.mark [%e evar ~loc lexbuf] [%e eint ~loc i]; [%e body ()]] +let gen_alias lexbuf _auto _i _e = + let loc = default_loc in + [value_binding ~loc + ~pat:(ppat_tuple ~loc [ppat_any ~loc; pvar ~loc "dummy"]) + ~expr:[%expr Sedlexing.loc [%e evar ~loc lexbuf]]] + let gen_recflag auto = (* The generated function is not recursive if the transitions end in states with no further transitions. *) @@ -232,7 +238,15 @@ let gen_definition lexbuf l error = let loc = default_loc in let brs = Array.of_list l in let auto = Sedlex.compile (Array.map fst brs) in - let cases = Array.to_list (Array.mapi (fun i (_, e) -> case ~lhs:(pint ~loc i) ~guard:None ~rhs:e) brs) in + (* auto = [|([|(c11, i11, a11); ...; (c1m, i1m, a1m)|], + * [|b11; ...; b1n|]); + * ...; + * ([|(cm1, im1, am1); ...; (cmm, imm, amm)|], + * [|bm1; ...; bmn|])|] + * where ``n'' is the number of regexp and ``m'' is the number of compiled DFA states *) + let aliases : value_binding list array = Array.mapi (fun i br -> (gen_alias lexbuf auto i (snd br))) brs in + let _aliases : value_binding list = List.flatten (Array.to_list aliases) in + let cases = Array.to_list (Array.mapi (fun i (_, e) -> case ~lhs:(pint ~loc i) ~guard:None ~rhs:(e)) brs) in let states = Array.mapi (gen_state lexbuf auto) auto in let states = List.flatten (Array.to_list states) in pexp_let ~loc (gen_recflag auto) states @@ -356,11 +370,11 @@ let regexp_of_pattern env = err p.ppat_loc (Printf.sprintf "unbound regexp %s" x) end | Ppat_alias (pat, {txt=label}) -> - let begin_offset_slot_var = "__" ^ label ^ "_begin_offset" in - let end_offset_slot_var = "__" ^ label ^"_end_offset" in + let begin_offset_slot_var = "__"^label^"_begin_offset" in + let end_offset_slot_var = "__"^label^"_end_offset" in aux pat - |> Sedlex.set_post_action (`save_offset end_offset_slot_var) - |> Sedlex.set_pre_action (`save_offset begin_offset_slot_var) + |> Sedlex.set_post_action (label, `save_offset end_offset_slot_var) + |> Sedlex.set_pre_action (label, `save_offset begin_offset_slot_var) | _ -> err p.ppat_loc "this pattern is not a valid regexp" in diff --git a/src/syntax/sedlex.ml b/src/syntax/sedlex.ml index 9fc54eb6..848cc210 100644 --- a/src/syntax/sedlex.ml +++ b/src/syntax/sedlex.ml @@ -12,7 +12,7 @@ type node = { mutable eps : node list; mutable trans : (Cset.t * node) list; } -and node_action = [`save_offset of string] +and node_action = string * [`save_offset of string] (* Compilation regexp -> NFA *) diff --git a/src/syntax/sedlex.mli b/src/syntax/sedlex.mli index 65cdccc1..94eb023c 100644 --- a/src/syntax/sedlex.mli +++ b/src/syntax/sedlex.mli @@ -4,7 +4,7 @@ type regexp -type node_action = [`save_offset of string] +type node_action = string * [`save_offset of string] val set_pre_action: node_action -> regexp -> regexp val set_post_action: node_action -> regexp -> regexp From 146dd2a9757698e0e32d5bd68dc78b9c5ef0d205 Mon Sep 17 00:00:00 2001 From: hongoh Date: Fri, 4 Feb 2022 19:20:45 +0900 Subject: [PATCH 03/13] modify regexp and apply changes to related functions --- src/syntax/ppx_sedlex.ml | 31 ++++++++---- src/syntax/sedlex.ml | 101 +++++++++++++++++++++++++-------------- src/syntax/sedlex.mli | 11 ++++- 3 files changed, 97 insertions(+), 46 deletions(-) mode change 100644 => 100755 src/syntax/sedlex.ml mode change 100644 => 100755 src/syntax/sedlex.mli diff --git a/src/syntax/ppx_sedlex.ml b/src/syntax/ppx_sedlex.ml index 147675f5..7a60f3be 100755 --- a/src/syntax/ppx_sedlex.ml +++ b/src/syntax/ppx_sedlex.ml @@ -176,6 +176,7 @@ let partition (name, p) = ] (* Alias offset *) +let offset_counter = ref (-1) (* Code generation for the automata *) @@ -190,6 +191,10 @@ let state_fun state = Printf.sprintf "__sedlex_state_%i" state let call_state lexbuf auto state = let (trans, final) = auto.(state) in + (* + let (_, _, acts) = trans in + let acts = List.map (fun (label, `save_offset offset)) acts + *) if Array.length trans = 0 then match best_final final with | Some i -> eint ~loc:default_loc i @@ -212,11 +217,14 @@ let gen_state lexbuf auto i (trans, final) = | Some _ when Array.length trans = 0 -> [] | Some i -> ret [%expr Sedlexing.mark [%e evar ~loc lexbuf] [%e eint ~loc i]; [%e body ()]] -let gen_alias lexbuf _auto _i _e = +let gen_alias lexbuf _auto i = let loc = default_loc in [value_binding ~loc - ~pat:(ppat_tuple ~loc [ppat_any ~loc; pvar ~loc "dummy"]) - ~expr:[%expr Sedlexing.loc [%e evar ~loc lexbuf]]] + ~pat:(pvar ~loc ("branch_"^string_of_int i^"_dummy")) + ~expr:[%expr Sedlexing.sub_lexeme + (! [%e evar ~loc "dummy_start_offset"]) + (! [%e evar ~loc "dummy_end_offset"]) + [%e evar ~loc lexbuf]]] let gen_recflag auto = (* The generated function is not recursive if the transitions end @@ -244,8 +252,12 @@ let gen_definition lexbuf l error = * ([|(cm1, im1, am1); ...; (cmm, imm, amm)|], * [|bm1; ...; bmn|])|] * where ``n'' is the number of regexp and ``m'' is the number of compiled DFA states *) - let aliases : value_binding list array = Array.mapi (fun i br -> (gen_alias lexbuf auto i (snd br))) brs in + (* + let alias_slots = Array.map (fun (trans, _) -> List.concat_map (fun (_, _, a) -> a) (Array.to_list trans)) auto in + let alias_slots = List.flatten (Array.to_list alias_slots) in + let aliases : value_binding list array = Array.mapi (fun i _ -> (gen_alias lexbuf auto i)) brs in let _aliases : value_binding list = List.flatten (Array.to_list aliases) in + *) let cases = Array.to_list (Array.mapi (fun i (_, e) -> case ~lhs:(pint ~loc i) ~guard:None ~rhs:(e)) brs) in let states = Array.mapi (gen_state lexbuf auto) auto in let states = List.flatten (Array.to_list states) in @@ -369,12 +381,13 @@ let regexp_of_pattern env = with Not_found -> err p.ppat_loc (Printf.sprintf "unbound regexp %s" x) end - | Ppat_alias (pat, {txt=label}) -> - let begin_offset_slot_var = "__"^label^"_begin_offset" in - let end_offset_slot_var = "__"^label^"_end_offset" in + | Ppat_alias (pat, {txt=var}) -> + incr offset_counter; + let begin_offset_slot_var = string_of_int !offset_counter^"__"^var^"_begin_offset" in + let end_offset_slot_var = string_of_int !offset_counter^"__"^var^"_end_offset" in aux pat - |> Sedlex.set_post_action (label, `save_offset end_offset_slot_var) - |> Sedlex.set_pre_action (label, `save_offset begin_offset_slot_var) + |> Sedlex.set_post_action (`save_offset {orig=var; slot=end_offset_slot_var}) + |> Sedlex.set_pre_action (`save_offset {orig=var; slot=begin_offset_slot_var}) | _ -> err p.ppat_loc "this pattern is not a valid regexp" in diff --git a/src/syntax/sedlex.ml b/src/syntax/sedlex.ml old mode 100644 new mode 100755 index 848cc210..8f9644ad --- a/src/syntax/sedlex.ml +++ b/src/syntax/sedlex.ml @@ -4,6 +4,11 @@ module Cset = Sedlex_cset +module StringMap = Map.Make(struct + type t = string + let compare = compare +end) + (* NFA *) type node = { @@ -12,62 +17,86 @@ type node = { mutable eps : node list; mutable trans : (Cset.t * node) list; } -and node_action = string * [`save_offset of string] +and node_action = [`save_offset of save_offset_action] +and save_offset_action = { + orig : string; + slot : string; +} (* Compilation regexp -> NFA *) -type regexp = node -> node +type regexp = { + nfa : node -> node; + named_groups : named_slots StringMap.t; +} +and named_slots = { + begin_var : string; + end_var : string; +} -let set_pre_action act re succ = - let init = re succ in - init.action <- act :: init.action; - init +let get_slots name re = + let slots = StringMap.find name re.named_groups in + (slots.begin_var, slots.end_var) -let set_post_action act re succ = - succ.action <- act :: succ.action; - re succ +let set_pre_action act re = + {re with + nfa = (fun succ -> + let init = re.nfa succ in + init.action <- act :: init.action; + init)} + +let set_post_action act re = + {re with + nfa = (fun succ -> + succ.action <- act :: succ.action; + re.nfa succ)} let cur_id = ref 0 let new_node () = incr cur_id; { id = !cur_id; action = []; eps = []; trans = [] } -let seq r1 r2 succ = r1 (r2 succ) +let regexp_of_nfa nfa = + {nfa = nfa; + named_groups = StringMap.empty;} + +let seq r1 r2 = + regexp_of_nfa (fun succ -> r1.nfa (r2.nfa succ)) let is_chars final = function | {eps = []; trans = [c, f]} when f == final -> Some c | _ -> None -let chars c succ = - let n = new_node () in - n.trans <- [c,succ]; - n - -let alt r1 r2 succ = - let nr1 = r1 succ and nr2 = r2 succ in - match is_chars succ nr1, is_chars succ nr2 with - | Some c1, Some c2 -> chars (Cset.union c1 c2) succ - | _ -> +let chars c = regexp_of_nfa (fun succ -> let n = new_node () in - n.eps <- [r1 succ; r2 succ]; - n - -let rep r succ = - let n = new_node () in - n.eps <- [r n; succ]; - n + n.trans <- [c,succ]; + n) + +let alt r1 r2 = regexp_of_nfa (fun succ -> + let nr1 = r1.nfa succ and nr2 = r2.nfa succ in + match is_chars succ nr1, is_chars succ nr2 with + | Some c1, Some c2 -> (chars (Cset.union c1 c2)).nfa succ + | _ -> + let n = new_node () in + n.eps <- [r1.nfa succ; r2.nfa succ]; + n) + +let rep r = regexp_of_nfa (fun succ -> + let n = new_node () in + n.eps <- [r.nfa n; succ]; + n) -let plus r succ = - let n = new_node () in - let nr = r n in - n.eps <- [nr; succ]; - nr +let plus r = regexp_of_nfa (fun succ -> + let n = new_node () in + let nr = r.nfa n in + n.eps <- [nr; succ]; + nr) -let eps succ = succ (* eps for epsilon *) +let eps = regexp_of_nfa (fun succ -> succ) (* eps for epsilon *) let compl r = let n = new_node () in - match is_chars n (r n) with + match is_chars n (r.nfa n) with | Some c -> Some (chars (Cset.difference Cset.any c)) | _ -> @@ -75,7 +104,7 @@ let compl r = let pair_op f r0 r1 = (* Construct subtract or intersection *) let n = new_node () in - let to_chars r = is_chars n (r n) in + let to_chars r = is_chars n (r.nfa n) in match to_chars r0, to_chars r1 with | Some c0, Some c1 -> Some (chars (f c0 c1)) @@ -88,7 +117,7 @@ let intersection = pair_op Cset.intersection let compile_re re = let final = new_node () in - (re final, final) + (re.nfa final, final) (* Determinization *) diff --git a/src/syntax/sedlex.mli b/src/syntax/sedlex.mli old mode 100644 new mode 100755 index 94eb023c..9326350b --- a/src/syntax/sedlex.mli +++ b/src/syntax/sedlex.mli @@ -3,9 +3,18 @@ (* Copyright 2005, 2013 by Alain Frisch and LexiFi. *) type regexp +and named_slots = { + begin_var: string; + end_var: string; +} -type node_action = string * [`save_offset of string] +type node_action = [`save_offset of save_offset_action] +and save_offset_action = { + orig : string; + slot : string; +} +val get_slots: string -> regexp -> string * string val set_pre_action: node_action -> regexp -> regexp val set_post_action: node_action -> regexp -> regexp From b7fc02c0e67522a2f1f32a2d118218034afc30f8 Mon Sep 17 00:00:00 2001 From: hongoh Date: Tue, 8 Feb 2022 20:53:08 +0900 Subject: [PATCH 04/13] add an as-pattern test --- examples/as_pattern.ml | 12 ++++++++++++ 1 file changed, 12 insertions(+) create mode 100644 examples/as_pattern.ml diff --git a/examples/as_pattern.ml b/examples/as_pattern.ml new file mode 100644 index 00000000..9f732f6e --- /dev/null +++ b/examples/as_pattern.ml @@ -0,0 +1,12 @@ +let rec token buf = + match%sedlex buf with + | eof -> print_endline "\tEnd" + | (Star white_space), ((Star (Compl white_space)) as text), (Star white_space) -> + print_endline text; token buf + | any -> print_endline "other"; token buf + | _ -> failwith "Internal failure: Reached impossible place" + + +let () = + let lexbuf = Sedlexing.Utf8.from_string "It takes all the running you can do, to keep in the same place." in + token lexbuf From d910e117eec4fea781cfd0efcc0554c310e90f9f Mon Sep 17 00:00:00 2001 From: hongoh Date: Thu, 10 Feb 2022 18:48:24 +0900 Subject: [PATCH 05/13] update test --- examples/as_pattern.ml | 11 +++++++++-- examples/dune | 9 ++++++++- src/syntax/ppx_sedlex.ml | 0 src/syntax/sedlex.ml | 0 src/syntax/sedlex.mli | 0 5 files changed, 17 insertions(+), 3 deletions(-) mode change 100755 => 100644 src/syntax/ppx_sedlex.ml mode change 100755 => 100644 src/syntax/sedlex.ml mode change 100755 => 100644 src/syntax/sedlex.mli diff --git a/examples/as_pattern.ml b/examples/as_pattern.ml index 9f732f6e..65c9a6ea 100644 --- a/examples/as_pattern.ml +++ b/examples/as_pattern.ml @@ -1,8 +1,15 @@ let rec token buf = match%sedlex buf with | eof -> print_endline "\tEnd" - | (Star white_space), ((Star (Compl white_space)) as text), (Star white_space) -> - print_endline text; token buf + | white_space -> print_endline "\tWhitespace"; token buf + | ((Plus ('a' .. 'z' | 'A' .. 'Z')) as text, white_space) -> + print_string "as-pattern text:\t"; + print_endline (String.of_seq (Array.to_seq (Array.map Uchar.to_char text))); + token buf + | (',' | '.') as x -> + print_string "as-pattern x:\t"; + print_endline (String.of_seq (Array.to_seq (Array.map Uchar.to_char x))); + token buf | any -> print_endline "other"; token buf | _ -> failwith "Internal failure: Reached impossible place" diff --git a/examples/dune b/examples/dune index 2eb6eaaf..821017ab 100644 --- a/examples/dune +++ b/examples/dune @@ -1,5 +1,5 @@ (executables - (names tokenizer regressions complement subtraction repeat performance) + (names tokenizer regressions complement subtraction repeat as_pattern performance) (libraries sedlex sedlex_ppx) (preprocess (pps sedlex.ppx)) @@ -40,6 +40,13 @@ (action (run %{<}))) +(rule + (alias runtest) + (deps + (:< as_pattern.exe)) + (action + (run %{<}))) + (rule (alias runtest) (deps diff --git a/src/syntax/ppx_sedlex.ml b/src/syntax/ppx_sedlex.ml old mode 100755 new mode 100644 diff --git a/src/syntax/sedlex.ml b/src/syntax/sedlex.ml old mode 100755 new mode 100644 diff --git a/src/syntax/sedlex.mli b/src/syntax/sedlex.mli old mode 100755 new mode 100644 From fd3a40852c49ac587eb24a83e606b2a515739bcd Mon Sep 17 00:00:00 2001 From: hongoh Date: Thu, 10 Feb 2022 18:56:52 +0900 Subject: [PATCH 06/13] change node_action timing from before transition to after entering the node, add gen_aliase_slots and gen_aliases, and add alias binding in each branches. but it still does not work correctly --- src/syntax/ppx_sedlex.ml | 126 ++++++++++++++++++++++++--------------- src/syntax/sedlex.ml | 75 ++++++++++++++--------- src/syntax/sedlex.mli | 14 ++--- 3 files changed, 130 insertions(+), 85 deletions(-) diff --git a/src/syntax/ppx_sedlex.ml b/src/syntax/ppx_sedlex.ml index 7a60f3be..6edbeff7 100644 --- a/src/syntax/ppx_sedlex.ml +++ b/src/syntax/ppx_sedlex.ml @@ -187,29 +187,71 @@ let best_final final = done; !fin +let gen_alisas_slots auto = + let loc = default_loc in + let module S = Set.Make(struct type t = string let compare = compare end) in + let slots = ref [] in + let seen = ref S.empty in + Array.iter (fun (_, _, acts) -> List.iter (function + | (`save_offset slot) -> + if S.mem slot !seen then () + else slots := slot :: !slots; seen := S.add slot !seen + | _ -> assert false) acts) auto; + List.map (fun slot -> value_binding ~loc ~pat:(pvar ~loc slot) ~expr:[%expr ref (-1)]) !slots + +let gen_aliases lexbuf re = + let loc = default_loc in + List.map (fun name -> + let begin_slot, end_slot = Sedlex.get_slots name re in + value_binding ~loc + ~pat:(pvar ~loc name) + ~expr:( +(* + pexp_sequence ~loc + [%expr print_endline + ("Sedlexing.sub_lexeme lexbuf " + ^string_of_int (! [%e evar ~loc begin_slot])^" " + ^string_of_int ((! [%e evar ~loc end_slot]) - (! [%e evar ~loc begin_slot])))] +*) + [%expr Sedlexing.sub_lexeme + [%e evar ~loc lexbuf] + (! [%e evar ~loc begin_slot]) + ((! [%e evar ~loc end_slot]) - (! [%e evar ~loc begin_slot]))])) + (Sedlex.get_names re) + let state_fun state = Printf.sprintf "__sedlex_state_%i" state let call_state lexbuf auto state = - let (trans, final) = auto.(state) in - (* - let (_, _, acts) = trans in - let acts = List.map (fun (label, `save_offset offset)) acts - *) + let (trans, final, _) = auto.(state) in if Array.length trans = 0 then match best_final final with | Some i -> eint ~loc:default_loc i | None -> assert false else appfun (state_fun state) [evar ~loc:default_loc lexbuf] -let gen_state lexbuf auto i (trans, final) = +let gen_state lexbuf auto i (trans, final, actions) = let loc = default_loc in - let partition = Array.map (fun (c, _, _) -> c) trans in - let cases = Array.mapi (fun i (_, j, _) -> case ~lhs:(pint ~loc i) ~guard:None ~rhs:(call_state lexbuf auto j)) trans in + let partition = Array.map fst trans in + let actions = List.map (function + | `save_offset act -> +(* + pexp_sequence ~loc + [%expr + print_string [%e estring ~loc (act^" := ")]; + print_int (snd (Sedlexing.loc [%e evar ~loc lexbuf])); + print_newline ();] +*) + [%expr [%e evar ~loc act] := (snd (Sedlexing.loc [%e evar ~loc lexbuf]))] + | _ -> assert false) actions in + let cases = Array.mapi (fun i (_, j) -> + case ~lhs:(pint ~loc i) ~guard:None ~rhs:(call_state lexbuf auto j)) trans in let cases = Array.to_list cases in let body () = - pexp_match ~loc - (appfun (partition_name partition) [[%expr Sedlexing.__private__next_int [%e evar ~loc lexbuf]]]) - (cases @ [case ~lhs:[%pat? _] ~guard:None ~rhs:[%expr Sedlexing.backtrack [%e evar ~loc lexbuf]]]) + esequence ~loc + (actions @ + [pexp_match ~loc + (appfun (partition_name partition) [[%expr Sedlexing.__private__next_int [%e evar ~loc lexbuf]]]) + (cases @ [case ~lhs:[%pat? _] ~guard:None ~rhs:[%expr Sedlexing.backtrack [%e evar ~loc lexbuf]]])]) in let ret body = [ value_binding ~loc ~pat:(pvar ~loc (state_fun i)) ~expr:(pexp_function ~loc [case ~lhs:(pvar ~loc lexbuf) ~guard:None ~rhs:body]) ] in match best_final final with @@ -217,24 +259,15 @@ let gen_state lexbuf auto i (trans, final) = | Some _ when Array.length trans = 0 -> [] | Some i -> ret [%expr Sedlexing.mark [%e evar ~loc lexbuf] [%e eint ~loc i]; [%e body ()]] -let gen_alias lexbuf _auto i = - let loc = default_loc in - [value_binding ~loc - ~pat:(pvar ~loc ("branch_"^string_of_int i^"_dummy")) - ~expr:[%expr Sedlexing.sub_lexeme - (! [%e evar ~loc "dummy_start_offset"]) - (! [%e evar ~loc "dummy_end_offset"]) - [%e evar ~loc lexbuf]]] - let gen_recflag auto = (* The generated function is not recursive if the transitions end in states with no further transitions. *) try Array.iter - (fun (trans_i, _) -> + (fun (trans_i, _, _) -> Array.iter - (fun (_, j, _) -> - let (trans_j, _) = auto.(j) in + (fun (_, j) -> + let (trans_j, _, _) = auto.(j) in if Array.length trans_j > 0 then raise Exit) trans_i) auto; @@ -246,28 +279,26 @@ let gen_definition lexbuf l error = let loc = default_loc in let brs = Array.of_list l in let auto = Sedlex.compile (Array.map fst brs) in - (* auto = [|([|(c11, i11, a11); ...; (c1m, i1m, a1m)|], - * [|b11; ...; b1n|]); - * ...; - * ([|(cm1, im1, am1); ...; (cmm, imm, amm)|], - * [|bm1; ...; bmn|])|] - * where ``n'' is the number of regexp and ``m'' is the number of compiled DFA states *) - (* - let alias_slots = Array.map (fun (trans, _) -> List.concat_map (fun (_, _, a) -> a) (Array.to_list trans)) auto in - let alias_slots = List.flatten (Array.to_list alias_slots) in - let aliases : value_binding list array = Array.mapi (fun i _ -> (gen_alias lexbuf auto i)) brs in - let _aliases : value_binding list = List.flatten (Array.to_list aliases) in - *) - let cases = Array.to_list (Array.mapi (fun i (_, e) -> case ~lhs:(pint ~loc i) ~guard:None ~rhs:(e)) brs) in + let alias_slots = gen_alisas_slots auto in + let aliases = Array.map (fun (re, _) -> gen_aliases lexbuf re) brs in + let cases = Array.to_list (Array.mapi (fun i (_, e) -> + let expression = match aliases.(i) with + | [] -> e + | aliases -> pexp_let ~loc Nonrecursive aliases e in + case ~lhs:(pint ~loc i) ~guard:None ~rhs:expression) brs) in let states = Array.mapi (gen_state lexbuf auto) auto in let states = List.flatten (Array.to_list states) in - pexp_let ~loc (gen_recflag auto) states - (pexp_sequence ~loc - [%expr Sedlexing.start [%e evar ~loc lexbuf]] - (pexp_match ~loc (appfun (state_fun 0) [evar ~loc lexbuf]) - (cases @ [case ~lhs:(ppat_any ~loc) ~guard:None ~rhs:error]) - ) - ) + let body () = + pexp_let ~loc (gen_recflag auto) states + (pexp_sequence ~loc + [%expr Sedlexing.start [%e evar ~loc lexbuf]] + (pexp_match ~loc (appfun (state_fun 0) [evar ~loc lexbuf]) + (cases @ [case ~lhs:(ppat_any ~loc) ~guard:None ~rhs:error]) + ) + ) in + match alias_slots with + | [] -> body () + | _ -> pexp_let ~loc Nonrecursive alias_slots (body ()) (* Lexer specification parser *) @@ -383,11 +414,12 @@ let regexp_of_pattern env = end | Ppat_alias (pat, {txt=var}) -> incr offset_counter; - let begin_offset_slot_var = string_of_int !offset_counter^"__"^var^"_begin_offset" in - let end_offset_slot_var = string_of_int !offset_counter^"__"^var^"_end_offset" in + let begin_offset_slot_var = "__sedlex_"^var^"_begin_offset_"^(string_of_int !offset_counter) in + let end_offset_slot_var = "__sedlex_"^var^"_end_offset_"^(string_of_int !offset_counter) in aux pat - |> Sedlex.set_post_action (`save_offset {orig=var; slot=end_offset_slot_var}) - |> Sedlex.set_pre_action (`save_offset {orig=var; slot=begin_offset_slot_var}) + |> Sedlex.set_slots var (begin_offset_slot_var, end_offset_slot_var) + |> Sedlex.set_post_action (`save_offset end_offset_slot_var) + |> Sedlex.set_pre_action (`save_offset begin_offset_slot_var) | _ -> err p.ppat_loc "this pattern is not a valid regexp" in diff --git a/src/syntax/sedlex.ml b/src/syntax/sedlex.ml index 8f9644ad..7032bc28 100644 --- a/src/syntax/sedlex.ml +++ b/src/syntax/sedlex.ml @@ -17,11 +17,7 @@ type node = { mutable eps : node list; mutable trans : (Cset.t * node) list; } -and node_action = [`save_offset of save_offset_action] -and save_offset_action = { - orig : string; - slot : string; -} +and node_action = [`save_offset of string] (* Compilation regexp -> NFA *) @@ -34,10 +30,15 @@ and named_slots = { end_var : string; } +let get_names re = StringMap.fold (fun name _ acc -> name :: acc) re.named_groups [] + let get_slots name re = let slots = StringMap.find name re.named_groups in (slots.begin_var, slots.end_var) +let set_slots name (begin_slot, end_slot) re = + {re with named_groups = StringMap.add name {begin_var=begin_slot; end_var=end_slot;} re.named_groups} + let set_pre_action act re = {re with nfa = (fun succ -> @@ -60,8 +61,17 @@ let regexp_of_nfa nfa = {nfa = nfa; named_groups = StringMap.empty;} +let merge_named_groups g1 g2 = + StringMap.merge (fun _ s1_opt s2_opt -> + match s1_opt, s2_opt with + | Some s, None | None, Some s -> Some s + | None, None -> None + | Some _, Some _ -> failwith "duplicate named_slots with the same name.") + g1 g2 + let seq r1 r2 = - regexp_of_nfa (fun succ -> r1.nfa (r2.nfa succ)) + {nfa = (fun succ -> r1.nfa (r2.nfa succ)); + named_groups = merge_named_groups r1.named_groups r2.named_groups;} let is_chars final = function | {eps = []; trans = [c, f]} when f == final -> Some c @@ -72,25 +82,31 @@ let chars c = regexp_of_nfa (fun succ -> n.trans <- [c,succ]; n) -let alt r1 r2 = regexp_of_nfa (fun succ -> - let nr1 = r1.nfa succ and nr2 = r2.nfa succ in - match is_chars succ nr1, is_chars succ nr2 with - | Some c1, Some c2 -> (chars (Cset.union c1 c2)).nfa succ - | _ -> - let n = new_node () in - n.eps <- [r1.nfa succ; r2.nfa succ]; - n) - -let rep r = regexp_of_nfa (fun succ -> - let n = new_node () in - n.eps <- [r.nfa n; succ]; - n) +let alt r1 r2 = + {nfa = (fun succ -> + let nr1 = r1.nfa succ and nr2 = r2.nfa succ in + match is_chars succ nr1, is_chars succ nr2 with + | Some c1, Some c2 -> (chars (Cset.union c1 c2)).nfa succ + | _ -> + let n = new_node () in + n.eps <- [r1.nfa succ; r2.nfa succ]; + n); + named_groups = merge_named_groups r1.named_groups r2.named_groups;} + +let rep r = + {r with + nfa = (fun succ -> + let n = new_node () in + n.eps <- [r.nfa n; succ]; + n);} -let plus r = regexp_of_nfa (fun succ -> - let n = new_node () in - let nr = r.nfa n in - n.eps <- [nr; succ]; - nr) +let plus r = + {r with + nfa = (fun succ -> + let n = new_node () in + let nr = r.nfa n in + n.eps <- [nr; succ]; + nr);} let eps = regexp_of_nfa (fun succ -> succ) (* eps for epsilon *) @@ -98,7 +114,8 @@ let compl r = let n = new_node () in match is_chars n (r.nfa n) with | Some c -> - Some (chars (Cset.difference Cset.any c)) + Some {(chars (Cset.difference Cset.any c)) with + named_groups = r.named_groups;} | _ -> None @@ -107,7 +124,8 @@ let pair_op f r0 r1 = (* Construct subtract or intersection *) let to_chars r = is_chars n (r.nfa n) in match to_chars r0, to_chars r1 with | Some c0, Some c1 -> - Some (chars (f c0 c1)) + Some {(chars (f c0 c1)) with + named_groups = merge_named_groups r0.named_groups r1.named_groups;} | _ -> None @@ -175,10 +193,11 @@ let compile rs = let trans = transition state in let trans = Array.map - (fun (p, t) -> (p, aux t, List.concat_map (fun n -> n.action) t)) + (fun (p, t) -> (p, aux t)) trans in let finals = Array.map (fun (_, f) -> List.memq f state) rs in - Hashtbl.add states_def i (trans, finals); + let actions = List.concat_map (fun n -> n.action) state in + Hashtbl.add states_def i (trans, finals, actions); i in let init = ref [] in diff --git a/src/syntax/sedlex.mli b/src/syntax/sedlex.mli index 9326350b..685c64b2 100644 --- a/src/syntax/sedlex.mli +++ b/src/syntax/sedlex.mli @@ -3,18 +3,12 @@ (* Copyright 2005, 2013 by Alain Frisch and LexiFi. *) type regexp -and named_slots = { - begin_var: string; - end_var: string; -} -type node_action = [`save_offset of save_offset_action] -and save_offset_action = { - orig : string; - slot : string; -} +type node_action = [`save_offset of string] +val get_names: regexp -> string list val get_slots: string -> regexp -> string * string +val set_slots: string -> string * string -> regexp -> regexp val set_pre_action: node_action -> regexp -> regexp val set_post_action: node_action -> regexp -> regexp @@ -35,4 +29,4 @@ val intersection: regexp -> regexp -> regexp option (* If each argument is a single [chars] regexp, returns a regexp which matches the intersection set. Otherwise returns [None]. *) -val compile: regexp array -> ((Sedlex_cset.t * int * node_action list) array * bool array) array +val compile: regexp array -> ((Sedlex_cset.t * int) array * bool array * node_action list) array From c63f217fd551adc34728da4e4efac2bb0fc8750e Mon Sep 17 00:00:00 2001 From: hongoh Date: Wed, 16 Feb 2022 16:01:36 +0900 Subject: [PATCH 07/13] add last "end offset" --- examples/as_pattern.ml | 2 +- src/syntax/ppx_sedlex.ml | 16 ++++++++++++++-- 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/examples/as_pattern.ml b/examples/as_pattern.ml index 65c9a6ea..3f3eb011 100644 --- a/examples/as_pattern.ml +++ b/examples/as_pattern.ml @@ -2,7 +2,7 @@ let rec token buf = match%sedlex buf with | eof -> print_endline "\tEnd" | white_space -> print_endline "\tWhitespace"; token buf - | ((Plus ('a' .. 'z' | 'A' .. 'Z')) as text, white_space) -> + | ((Plus ('a' .. 'z' | 'A' .. 'Z')) as text, (Star (white_space | ',' | '.'))) -> print_string "as-pattern text:\t"; print_endline (String.of_seq (Array.to_seq (Array.map Uchar.to_char text))); token buf diff --git a/src/syntax/ppx_sedlex.ml b/src/syntax/ppx_sedlex.ml index 6edbeff7..b3de383c 100644 --- a/src/syntax/ppx_sedlex.ml +++ b/src/syntax/ppx_sedlex.ml @@ -222,10 +222,22 @@ let gen_aliases lexbuf re = let state_fun state = Printf.sprintf "__sedlex_state_%i" state let call_state lexbuf auto state = - let (trans, final, _) = auto.(state) in + let loc = default_loc in + let (trans, final, actions) = auto.(state) in + let actions = List.map (function + | `save_offset act -> +(* + pexp_sequence ~loc + [%expr + print_string [%e estring ~loc (act^" := ")]; + print_int (snd (Sedlexing.loc [%e evar ~loc lexbuf])); + print_newline ();] +*) + [%expr [%e evar ~loc act] := (snd (Sedlexing.loc [%e evar ~loc lexbuf]))] + | _ -> assert false) actions in if Array.length trans = 0 then match best_final final with - | Some i -> eint ~loc:default_loc i + | Some i -> esequence ~loc (actions @ [eint ~loc i]) | None -> assert false else appfun (state_fun state) [evar ~loc:default_loc lexbuf] From 57c58c8dfdae981a27a77ecc0cc469c05460cf60 Mon Sep 17 00:00:00 2001 From: hongoh Date: Wed, 16 Feb 2022 19:27:12 +0900 Subject: [PATCH 08/13] redefine node_action --- examples/dune | 0 src/syntax/ppx_sedlex.ml | 37 +++++++++++++++++++++++++------------ src/syntax/sedlex.ml | 10 +++++++++- src/syntax/sedlex.mli | 10 +++++++++- 4 files changed, 43 insertions(+), 14 deletions(-) mode change 100644 => 100755 examples/dune mode change 100644 => 100755 src/syntax/ppx_sedlex.ml mode change 100644 => 100755 src/syntax/sedlex.ml mode change 100644 => 100755 src/syntax/sedlex.mli diff --git a/examples/dune b/examples/dune old mode 100644 new mode 100755 diff --git a/src/syntax/ppx_sedlex.ml b/src/syntax/ppx_sedlex.ml old mode 100644 new mode 100755 index b3de383c..49064700 --- a/src/syntax/ppx_sedlex.ml +++ b/src/syntax/ppx_sedlex.ml @@ -189,7 +189,7 @@ let best_final final = let gen_alisas_slots auto = let loc = default_loc in - let module S = Set.Make(struct type t = string let compare = compare end) in + let module S = Set.Make(struct type t = Sedlex.save_offset_action let compare = compare end) in let slots = ref [] in let seen = ref S.empty in Array.iter (fun (_, _, acts) -> List.iter (function @@ -197,7 +197,11 @@ let gen_alisas_slots auto = if S.mem slot !seen then () else slots := slot :: !slots; seen := S.add slot !seen | _ -> assert false) acts) auto; - List.map (fun slot -> value_binding ~loc ~pat:(pvar ~loc slot) ~expr:[%expr ref (-1)]) !slots + List.map (function + | Sedlex.(Save_offset_assign {varname=var}) -> + value_binding ~loc ~pat:(pvar ~loc var) ~expr:[%expr ref None] + | Sedlex.(Save_offset_update {varname=var}) -> + value_binding ~loc ~pat:(pvar ~loc var) ~expr:[%expr ref None]) !slots let gen_aliases lexbuf re = let loc = default_loc in @@ -215,8 +219,8 @@ let gen_aliases lexbuf re = *) [%expr Sedlexing.sub_lexeme [%e evar ~loc lexbuf] - (! [%e evar ~loc begin_slot]) - ((! [%e evar ~loc end_slot]) - (! [%e evar ~loc begin_slot]))])) + (Option.get (! [%e evar ~loc begin_slot])) + ((Option.get (! [%e evar ~loc end_slot])) - (Option.get (! [%e evar ~loc begin_slot])))])) (Sedlex.get_names re) let state_fun state = Printf.sprintf "__sedlex_state_%i" state @@ -225,7 +229,7 @@ let call_state lexbuf auto state = let loc = default_loc in let (trans, final, actions) = auto.(state) in let actions = List.map (function - | `save_offset act -> + | `save_offset Sedlex.(Save_offset_assign {varname=var}) -> (* pexp_sequence ~loc [%expr @@ -233,19 +237,19 @@ let call_state lexbuf auto state = print_int (snd (Sedlexing.loc [%e evar ~loc lexbuf])); print_newline ();] *) - [%expr [%e evar ~loc act] := (snd (Sedlexing.loc [%e evar ~loc lexbuf]))] + [%expr [%e evar ~loc var] := (Some (snd (Sedlexing.loc [%e evar ~loc lexbuf])))] | _ -> assert false) actions in if Array.length trans = 0 then match best_final final with - | Some i -> esequence ~loc (actions @ [eint ~loc i]) + | Some i -> esequence ~loc (actions @ [eint ~loc:default_loc i]) | None -> assert false - else appfun (state_fun state) [evar ~loc:default_loc lexbuf] + else appfun (state_fun state) [evar ~loc lexbuf] let gen_state lexbuf auto i (trans, final, actions) = let loc = default_loc in let partition = Array.map fst trans in let actions = List.map (function - | `save_offset act -> + | `save_offset Sedlex.(Save_offset_assign {varname=var}) -> (* pexp_sequence ~loc [%expr @@ -253,7 +257,16 @@ let gen_state lexbuf auto i (trans, final, actions) = print_int (snd (Sedlexing.loc [%e evar ~loc lexbuf])); print_newline ();] *) - [%expr [%e evar ~loc act] := (snd (Sedlexing.loc [%e evar ~loc lexbuf]))] + [%expr [%e evar ~loc var] := (Some (snd (Sedlexing.loc [%e evar ~loc lexbuf])))] +(* + | `save_offset Sedlex.(Save_offset_update {varname=var; update_function=f}) -> + pexp_sequence ~loc + [%expr + print_string [%e estring ~loc (act^" := ")]; + print_int (snd (Sedlexing.loc [%e evar ~loc lexbuf])); + print_newline ();] + [%expr [%e evar ~loc var] := (f (! [%e evar ~loc var]) (snd (Sedlexing.loc [%e evar ~loc lexbuf])))] +*) | _ -> assert false) actions in let cases = Array.mapi (fun i (_, j) -> case ~lhs:(pint ~loc i) ~guard:None ~rhs:(call_state lexbuf auto j)) trans in @@ -430,8 +443,8 @@ let regexp_of_pattern env = let end_offset_slot_var = "__sedlex_"^var^"_end_offset_"^(string_of_int !offset_counter) in aux pat |> Sedlex.set_slots var (begin_offset_slot_var, end_offset_slot_var) - |> Sedlex.set_post_action (`save_offset end_offset_slot_var) - |> Sedlex.set_pre_action (`save_offset begin_offset_slot_var) + |> Sedlex.set_post_action (`save_offset Sedlex.(Save_offset_assign {varname=end_offset_slot_var})) + |> Sedlex.set_pre_action (`save_offset Sedlex.(Save_offset_assign {varname=begin_offset_slot_var})) | _ -> err p.ppat_loc "this pattern is not a valid regexp" in diff --git a/src/syntax/sedlex.ml b/src/syntax/sedlex.ml old mode 100644 new mode 100755 index 7032bc28..b6c4b6ba --- a/src/syntax/sedlex.ml +++ b/src/syntax/sedlex.ml @@ -17,7 +17,15 @@ type node = { mutable eps : node list; mutable trans : (Cset.t * node) list; } -and node_action = [`save_offset of string] +and node_action = [`save_offset of save_offset_action] +and save_offset_action = + | Save_offset_assign of { + varname : string; + } + | Save_offset_update of { + varname : string; + update_function : prev:(int option) -> curr:int -> int option + } (* Compilation regexp -> NFA *) diff --git a/src/syntax/sedlex.mli b/src/syntax/sedlex.mli old mode 100644 new mode 100755 index 685c64b2..73abc7e6 --- a/src/syntax/sedlex.mli +++ b/src/syntax/sedlex.mli @@ -4,7 +4,15 @@ type regexp -type node_action = [`save_offset of string] +type node_action = [`save_offset of save_offset_action] +and save_offset_action = + | Save_offset_assign of { + varname : string; + } + | Save_offset_update of { + varname : string; + update_function : prev:(int option) -> curr:int -> int option + } val get_names: regexp -> string list val get_slots: string -> regexp -> string * string From a7b040a7139127bf840c13a4850a45f75ae856af Mon Sep 17 00:00:00 2001 From: hongoh Date: Thu, 17 Feb 2022 19:26:17 +0900 Subject: [PATCH 09/13] modify save offset assignment action using max and min --- src/syntax/ppx_sedlex.ml | 91 +++++++++++++++++++--------------------- src/syntax/sedlex.ml | 9 +--- src/syntax/sedlex.mli | 9 +--- 3 files changed, 48 insertions(+), 61 deletions(-) mode change 100755 => 100644 src/syntax/ppx_sedlex.ml mode change 100755 => 100644 src/syntax/sedlex.ml mode change 100755 => 100644 src/syntax/sedlex.mli diff --git a/src/syntax/ppx_sedlex.ml b/src/syntax/ppx_sedlex.ml old mode 100755 new mode 100644 index 49064700..2468f565 --- a/src/syntax/ppx_sedlex.ml +++ b/src/syntax/ppx_sedlex.ml @@ -175,8 +175,8 @@ let partition (name, p) = [%e body] ] -(* Alias offset *) -let offset_counter = ref (-1) +(* Alias offset slot counter *) +let alias_slot_counter = ref (-1) (* Code generation for the automata *) @@ -198,9 +198,9 @@ let gen_alisas_slots auto = else slots := slot :: !slots; seen := S.add slot !seen | _ -> assert false) acts) auto; List.map (function - | Sedlex.(Save_offset_assign {varname=var}) -> + | Sedlex.Save_begin_offset_assign var -> value_binding ~loc ~pat:(pvar ~loc var) ~expr:[%expr ref None] - | Sedlex.(Save_offset_update {varname=var}) -> + | Sedlex.Save_end_offset_assign var -> value_binding ~loc ~pat:(pvar ~loc var) ~expr:[%expr ref None]) !slots let gen_aliases lexbuf re = @@ -213,61 +213,58 @@ let gen_aliases lexbuf re = (* pexp_sequence ~loc [%expr print_endline - ("Sedlexing.sub_lexeme lexbuf " - ^string_of_int (! [%e evar ~loc begin_slot])^" " - ^string_of_int ((! [%e evar ~loc end_slot]) - (! [%e evar ~loc begin_slot])))] + ("Sedlexing.sub_lexeme "^[%e estring ~loc lexbuf]^" " + ^"("^string_of_int (Option.get (! [%e evar ~loc begin_slot]))^" - " + ^string_of_int (Option.get (! [%e evar ~loc begin_slot]))^") " + ^"("^string_of_int (Option.get (! [%e evar ~loc end_slot]))^" - " + ^string_of_int (Option.get (! [%e evar ~loc begin_slot]))^")")] *) - [%expr Sedlexing.sub_lexeme - [%e evar ~loc lexbuf] - (Option.get (! [%e evar ~loc begin_slot])) + [%expr Sedlexing.sub_lexeme [%e evar ~loc lexbuf] + ((Option.get (! [%e evar ~loc begin_slot])) - fst (Sedlexing.loc [%e evar ~loc lexbuf])) ((Option.get (! [%e evar ~loc end_slot])) - (Option.get (! [%e evar ~loc begin_slot])))])) (Sedlex.get_names re) let state_fun state = Printf.sprintf "__sedlex_state_%i" state +let eaction ~loc lexbuf = function + | `save_offset Sedlex.Save_begin_offset_assign var -> +(* + pexp_sequence ~loc + [%expr + print_endline ([%e estring ~loc var]^" := "^(match (! [%e evar ~loc var]) with + | None -> "Some "^string_of_int (snd (Sedlexing.loc [%e evar ~loc lexbuf])) + | Some _ -> "Some "^string_of_int (min (snd (Sedlexing.loc [%e evar ~loc lexbuf])) (Option.get (! [%e evar ~loc var])))))] +*) + [%expr [%e evar ~loc var] := match (! [%e evar ~loc var]) with + | None -> Some (snd (Sedlexing.loc [%e evar ~loc lexbuf])) + | Some _ -> Some (min (snd (Sedlexing.loc [%e evar ~loc lexbuf])) (Option.get (! [%e evar ~loc var])))] + | `save_offset Sedlex.Save_end_offset_assign var -> +(* + pexp_sequence ~loc + [%expr + print_endline ([%e estring ~loc var]^" := "^(match (! [%e evar ~loc var]) with + | None -> "Some "^string_of_int (snd (Sedlexing.loc [%e evar ~loc lexbuf])) + | Some _ -> "Some "^string_of_int (max (snd (Sedlexing.loc [%e evar ~loc lexbuf])) (Option.get (! [%e evar ~loc var])))))] +*) + [%expr [%e evar ~loc var] := match (! [%e evar ~loc var]) with + | None -> Some (snd (Sedlexing.loc [%e evar ~loc lexbuf])) + | Some _ -> Some (max (snd (Sedlexing.loc [%e evar ~loc lexbuf])) (Option.get (! [%e evar ~loc var]) ))] + | _ -> assert false + let call_state lexbuf auto state = let loc = default_loc in let (trans, final, actions) = auto.(state) in - let actions = List.map (function - | `save_offset Sedlex.(Save_offset_assign {varname=var}) -> -(* - pexp_sequence ~loc - [%expr - print_string [%e estring ~loc (act^" := ")]; - print_int (snd (Sedlexing.loc [%e evar ~loc lexbuf])); - print_newline ();] -*) - [%expr [%e evar ~loc var] := (Some (snd (Sedlexing.loc [%e evar ~loc lexbuf])))] - | _ -> assert false) actions in + let actions = List.map (eaction ~loc lexbuf) actions in if Array.length trans = 0 then match best_final final with - | Some i -> esequence ~loc (actions @ [eint ~loc:default_loc i]) + | Some i -> esequence ~loc (actions @ [eint ~loc i]) | None -> assert false else appfun (state_fun state) [evar ~loc lexbuf] let gen_state lexbuf auto i (trans, final, actions) = let loc = default_loc in let partition = Array.map fst trans in - let actions = List.map (function - | `save_offset Sedlex.(Save_offset_assign {varname=var}) -> -(* - pexp_sequence ~loc - [%expr - print_string [%e estring ~loc (act^" := ")]; - print_int (snd (Sedlexing.loc [%e evar ~loc lexbuf])); - print_newline ();] -*) - [%expr [%e evar ~loc var] := (Some (snd (Sedlexing.loc [%e evar ~loc lexbuf])))] -(* - | `save_offset Sedlex.(Save_offset_update {varname=var; update_function=f}) -> - pexp_sequence ~loc - [%expr - print_string [%e estring ~loc (act^" := ")]; - print_int (snd (Sedlexing.loc [%e evar ~loc lexbuf])); - print_newline ();] - [%expr [%e evar ~loc var] := (f (! [%e evar ~loc var]) (snd (Sedlexing.loc [%e evar ~loc lexbuf])))] -*) - | _ -> assert false) actions in + let actions = List.map (eaction ~loc lexbuf) actions in let cases = Array.mapi (fun i (_, j) -> case ~lhs:(pint ~loc i) ~guard:None ~rhs:(call_state lexbuf auto j)) trans in let cases = Array.to_list cases in @@ -438,13 +435,13 @@ let regexp_of_pattern env = err p.ppat_loc (Printf.sprintf "unbound regexp %s" x) end | Ppat_alias (pat, {txt=var}) -> - incr offset_counter; - let begin_offset_slot_var = "__sedlex_"^var^"_begin_offset_"^(string_of_int !offset_counter) in - let end_offset_slot_var = "__sedlex_"^var^"_end_offset_"^(string_of_int !offset_counter) in + incr alias_slot_counter; + let begin_offset_slot_var = "__sedlex_"^var^"_begin_offset_"^(string_of_int !alias_slot_counter) in + let end_offset_slot_var = "__sedlex_"^var^"_end_offset_"^(string_of_int !alias_slot_counter) in aux pat |> Sedlex.set_slots var (begin_offset_slot_var, end_offset_slot_var) - |> Sedlex.set_post_action (`save_offset Sedlex.(Save_offset_assign {varname=end_offset_slot_var})) - |> Sedlex.set_pre_action (`save_offset Sedlex.(Save_offset_assign {varname=begin_offset_slot_var})) + |> Sedlex.set_post_action (`save_offset (Sedlex.Save_end_offset_assign end_offset_slot_var)) + |> Sedlex.set_pre_action (`save_offset (Sedlex.Save_begin_offset_assign begin_offset_slot_var)) | _ -> err p.ppat_loc "this pattern is not a valid regexp" in diff --git a/src/syntax/sedlex.ml b/src/syntax/sedlex.ml old mode 100755 new mode 100644 index b6c4b6ba..45b67170 --- a/src/syntax/sedlex.ml +++ b/src/syntax/sedlex.ml @@ -19,13 +19,8 @@ type node = { } and node_action = [`save_offset of save_offset_action] and save_offset_action = - | Save_offset_assign of { - varname : string; - } - | Save_offset_update of { - varname : string; - update_function : prev:(int option) -> curr:int -> int option - } + | Save_begin_offset_assign of string + | Save_end_offset_assign of string (* Compilation regexp -> NFA *) diff --git a/src/syntax/sedlex.mli b/src/syntax/sedlex.mli old mode 100755 new mode 100644 index 73abc7e6..d486c088 --- a/src/syntax/sedlex.mli +++ b/src/syntax/sedlex.mli @@ -6,13 +6,8 @@ type regexp type node_action = [`save_offset of save_offset_action] and save_offset_action = - | Save_offset_assign of { - varname : string; - } - | Save_offset_update of { - varname : string; - update_function : prev:(int option) -> curr:int -> int option - } + | Save_begin_offset_assign of string + | Save_end_offset_assign of string val get_names: regexp -> string list val get_slots: string -> regexp -> string * string From f733f3254d090ccf1f2c83a1cf407cda35c70cbf Mon Sep 17 00:00:00 2001 From: hongoh Date: Fri, 18 Feb 2022 12:29:34 +0900 Subject: [PATCH 10/13] update as_pattern example --- examples/as_pattern.ml | 17 +++++++++++++++++ examples/dune | 0 2 files changed, 17 insertions(+) mode change 100755 => 100644 examples/dune diff --git a/examples/as_pattern.ml b/examples/as_pattern.ml index 3f3eb011..40fe798e 100644 --- a/examples/as_pattern.ml +++ b/examples/as_pattern.ml @@ -1,3 +1,4 @@ +(* let rec token buf = match%sedlex buf with | eof -> print_endline "\tEnd" @@ -17,3 +18,19 @@ let rec token buf = let () = let lexbuf = Sedlexing.Utf8.from_string "It takes all the running you can do, to keep in the same place." in token lexbuf +*) + + let string_of_uchars us = + String.of_seq (Array.to_seq (Array.map Uchar.to_char us)) + +let rec token buf = + match%sedlex buf with + | eof -> print_endline "\tEnd" + | white_space -> print_endline "\tWhitespace"; token buf + | (Star 'a' as a), ('b' as b) -> print_endline (string_of_uchars a^"\t"^string_of_uchars b); token buf + | any -> print_endline "Other"; token buf + | _ -> failwith "Internal failure: Reached impossible place" + +let () = + let lexbuf = Sedlexing.Utf8.from_string "b ab aab aaab aaaab" in + token lexbuf diff --git a/examples/dune b/examples/dune old mode 100755 new mode 100644 From 21022c2b654effa106f3d7521c5ee924fb0f5dcf Mon Sep 17 00:00:00 2001 From: Haochen Xie Date: Fri, 18 Feb 2022 17:35:26 +0900 Subject: [PATCH 11/13] wip: as-pattern - add transition_action --- src/syntax/ppx_sedlex.ml | 25 +++++++++++++++++-------- src/syntax/sedlex.ml | 33 +++++++++++++++++---------------- src/syntax/sedlex.mli | 6 +++++- 3 files changed, 39 insertions(+), 25 deletions(-) diff --git a/src/syntax/ppx_sedlex.ml b/src/syntax/ppx_sedlex.ml index 2468f565..de9a2dad 100644 --- a/src/syntax/ppx_sedlex.ml +++ b/src/syntax/ppx_sedlex.ml @@ -251,22 +251,31 @@ let eaction ~loc lexbuf = function | Some _ -> Some (max (snd (Sedlexing.loc [%e evar ~loc lexbuf])) (Option.get (! [%e evar ~loc var]) ))] | _ -> assert false -let call_state lexbuf auto state = +let reorder_transition_actions : Sedlex.generic_action list -> Sedlex.generic_action list + = fun acts -> acts (* XXX *) + +let call_state lexbuf auto state trans_acts = let loc = default_loc in - let (trans, final, actions) = auto.(state) in - let actions = List.map (eaction ~loc lexbuf) actions in + let (trans, final, node_acts) = auto.(state) in + let acts : Sedlex.generic_action list = + (trans_acts :> Sedlex.generic_action list) + @ (node_acts :> Sedlex.generic_action list) in + let acts = + acts + |> reorder_transition_actions + |> List.map (eaction ~loc lexbuf) in if Array.length trans = 0 then match best_final final with - | Some i -> esequence ~loc (actions @ [eint ~loc i]) + | Some i -> esequence ~loc (acts @ [eint ~loc i]) | None -> assert false else appfun (state_fun state) [evar ~loc lexbuf] let gen_state lexbuf auto i (trans, final, actions) = let loc = default_loc in - let partition = Array.map fst trans in + let partition = Array.map (fun (f,_,_) -> f) trans in let actions = List.map (eaction ~loc lexbuf) actions in - let cases = Array.mapi (fun i (_, j) -> - case ~lhs:(pint ~loc i) ~guard:None ~rhs:(call_state lexbuf auto j)) trans in + let cases = Array.mapi (fun i (_, j, acts) -> + case ~lhs:(pint ~loc i) ~guard:None ~rhs:(call_state lexbuf auto j acts)) trans in let cases = Array.to_list cases in let body () = esequence ~loc @@ -288,7 +297,7 @@ let gen_recflag auto = Array.iter (fun (trans_i, _, _) -> Array.iter - (fun (_, j) -> + (fun (_, j, _) -> let (trans_j, _, _) = auto.(j) in if Array.length trans_j > 0 then raise Exit) trans_i) diff --git a/src/syntax/sedlex.ml b/src/syntax/sedlex.ml index 45b67170..3e71c408 100644 --- a/src/syntax/sedlex.ml +++ b/src/syntax/sedlex.ml @@ -15,12 +15,14 @@ type node = { id : int; mutable action : node_action list; mutable eps : node list; - mutable trans : (Cset.t * node) list; + mutable trans : (Cset.t * node * transition_action list) list; } and node_action = [`save_offset of save_offset_action] +and transition_action = [`step_capture_slot of string] and save_offset_action = | Save_begin_offset_assign of string | Save_end_offset_assign of string +type generic_action = [ transition_action | node_action ] (* Compilation regexp -> NFA *) @@ -77,12 +79,12 @@ let seq r1 r2 = named_groups = merge_named_groups r1.named_groups r2.named_groups;} let is_chars final = function - | {eps = []; trans = [c, f]} when f == final -> Some c + | {eps = []; trans = [c, f, _]} when f == final -> Some c | _ -> None let chars c = regexp_of_nfa (fun succ -> let n = new_node () in - n.trans <- [c,succ]; + n.trans <- [c,succ,[]]; n) let alt r1 r2 = @@ -150,36 +152,35 @@ let rec add_node state node = and add_nodes state nodes = List.fold_left add_node state nodes - let transition (state : state) = (* Merge transition with the same target *) let rec norm = function - | (c1, n1)::((c2, n2)::q as l) -> - if n1 == n2 then norm ((Cset.union c1 c2, n1)::q) - else (c1, n1)::(norm l) + | (c1, n1, acts1)::((c2, n2, acts2)::q as l) -> + if n1 == n2 then norm ((Cset.union c1 c2, n1, acts1@acts2)::q) + else (c1, n1, acts1)::(norm l) | l -> l in let t = List.concat (List.map (fun n -> n.trans) state) in - let t = norm (List.sort (fun (_, n1) (_, n2) -> n1.id - n2.id) t) in + let t = norm (List.sort (fun (_, n1, _) (_, n2, _) -> n1.id - n2.id) t) in (* Split char sets so as to make them disjoint *) - let split (all, t) (c0, n0) = + let split (all, t) (c0, n0, acts0) = let t = - (Cset.difference c0 all, [n0]) :: - List.map (fun (c, ns) -> (Cset.intersection c c0, n0::ns)) t @ - List.map (fun (c, ns) -> (Cset.difference c c0, ns)) t + (Cset.difference c0 all, [n0], acts0) :: + List.map (fun (c, ns, acts) -> (Cset.intersection c c0, n0::ns, acts0@acts)) t @ + List.map (fun (c, ns, acts) -> (Cset.difference c c0, ns, acts)) t in Cset.union all c0, - List.filter (fun (c, _) -> not (Cset.is_empty c)) t + List.filter (fun (c, _, _) -> not (Cset.is_empty c)) t in let (_,t) = List.fold_left split (Cset.empty,[]) t in (* Epsilon closure of targets *) - let t = List.map (fun (c, ns) -> (c, add_nodes [] ns)) t in + let t = List.map (fun (c, ns, acts) -> (c, add_nodes [] ns, acts)) t in (* Canonical ordering *) let t = Array.of_list t in - Array.sort (fun (c1, _) (c2, _) -> compare c1 c2) t; + Array.sort (fun (c1, _, _) (c2, _, _) -> compare c1 c2) t; t let compile rs = @@ -196,7 +197,7 @@ let compile rs = let trans = transition state in let trans = Array.map - (fun (p, t) -> (p, aux t)) + (fun (p, t, acts) -> (p, aux t, acts)) trans in let finals = Array.map (fun (_, f) -> List.memq f state) rs in let actions = List.concat_map (fun n -> n.action) state in diff --git a/src/syntax/sedlex.mli b/src/syntax/sedlex.mli index d486c088..2f814908 100644 --- a/src/syntax/sedlex.mli +++ b/src/syntax/sedlex.mli @@ -4,10 +4,12 @@ type regexp +type transition_action = [`step_capture_slot of string] type node_action = [`save_offset of save_offset_action] and save_offset_action = | Save_begin_offset_assign of string | Save_end_offset_assign of string +type generic_action = [ transition_action | node_action ] val get_names: regexp -> string list val get_slots: string -> regexp -> string * string @@ -32,4 +34,6 @@ val intersection: regexp -> regexp -> regexp option (* If each argument is a single [chars] regexp, returns a regexp which matches the intersection set. Otherwise returns [None]. *) -val compile: regexp array -> ((Sedlex_cset.t * int) array * bool array * node_action list) array +val compile: + regexp array -> + ((Sedlex_cset.t * int * transition_action list) array * bool array * node_action list) array From c761f596801a44029704229a2e508ef103140687 Mon Sep 17 00:00:00 2001 From: Haochen Xie Date: Fri, 18 Feb 2022 18:01:03 +0900 Subject: [PATCH 12/13] wip of wip --- examples/as_pattern.ml | 18 ++++++++++++++++++ src/syntax/ppx_sedlex.ml | 20 +++++++++++++++----- src/syntax/sedlex.ml | 36 +++++++++++++++++++++++++++++++++++- src/syntax/sedlex.mli | 7 ++++++- 4 files changed, 74 insertions(+), 7 deletions(-) diff --git a/examples/as_pattern.ml b/examples/as_pattern.ml index 3f3eb011..5b3bcbd2 100644 --- a/examples/as_pattern.ml +++ b/examples/as_pattern.ml @@ -17,3 +17,21 @@ let rec token buf = let () = let lexbuf = Sedlexing.Utf8.from_string "It takes all the running you can do, to keep in the same place." in token lexbuf + + +let string_of_uchars us = + String.of_seq (Array.to_seq (Array.map Uchar.to_char us)) + +let rec token buf = + match%sedlex buf with + | eof -> print_endline "\tEnd" + | white_space -> print_endline "\tWhitespace"; token buf + | (Star 'a' as a), ('b' as b) -> print_endline (string_of_uchars a^"\t"^string_of_uchars b); token buf + | any -> print_endline "Other"; token buf + | _ -> failwith "Internal failure: Reached impossible place" + +let () = + let lexbuf = Sedlexing.Utf8.from_string "b ab aab aaab aaaab" in + token lexbuf + + diff --git a/src/syntax/ppx_sedlex.ml b/src/syntax/ppx_sedlex.ml index de9a2dad..c9ea74d9 100644 --- a/src/syntax/ppx_sedlex.ml +++ b/src/syntax/ppx_sedlex.ml @@ -445,12 +445,22 @@ let regexp_of_pattern env = end | Ppat_alias (pat, {txt=var}) -> incr alias_slot_counter; - let begin_offset_slot_var = "__sedlex_"^var^"_begin_offset_"^(string_of_int !alias_slot_counter) in - let end_offset_slot_var = "__sedlex_"^var^"_end_offset_"^(string_of_int !alias_slot_counter) in + let capture_slot_var = "__sedlex_"^var^"_capture_slot"^(string_of_int !alias_slot_counter) in + (* let begin_offset_slot_var = "__sedlex_"^var^"_begin_offset_"^(string_of_int !alias_slot_counter) in *) + (* let end_offset_slot_var = "__sedlex_"^var^"_end_offset_"^(string_of_int !alias_slot_counter) in *) aux pat - |> Sedlex.set_slots var (begin_offset_slot_var, end_offset_slot_var) - |> Sedlex.set_post_action (`save_offset (Sedlex.Save_end_offset_assign end_offset_slot_var)) - |> Sedlex.set_pre_action (`save_offset (Sedlex.Save_begin_offset_assign begin_offset_slot_var)) + (* |> Sedlex.set_slots var (begin_offset_slot_var, end_offset_slot_var) *) + |> Sedlex.set_slots var (capture_slot_var) + |> Sedlex.add_transition_action_to_all_internal_transitions (`step_capture_slot capture_slot_var) + + (* XXX *) + |> Sedlex.set_pre_action (`may_init_capture_slot capture_slot_var) + |> Sedlex.set_post_action (`may_finish_capture_slot capture_slot_var) + + + (* |> Sedlex.set_post_action (`save_offset (Sedlex.Save_end_offset_assign end_offset_slot_var)) *) + (* |> Sedlex.set_pre_action (`save_offset (Sedlex.Save_begin_offset_assign begin_offset_slot_var)) *) + | _ -> err p.ppat_loc "this pattern is not a valid regexp" in diff --git a/src/syntax/sedlex.ml b/src/syntax/sedlex.ml index 3e71c408..7228f6af 100644 --- a/src/syntax/sedlex.ml +++ b/src/syntax/sedlex.ml @@ -17,7 +17,11 @@ type node = { mutable eps : node list; mutable trans : (Cset.t * node * transition_action list) list; } -and node_action = [`save_offset of save_offset_action] +and node_action = [ + | `save_offset of save_offset_action + | `may_init_capture_slot of string + | `may_finish_capture_slot of string + ] and transition_action = [`step_capture_slot of string] and save_offset_action = | Save_begin_offset_assign of string @@ -44,6 +48,7 @@ let get_slots name re = let set_slots name (begin_slot, end_slot) re = {re with named_groups = StringMap.add name {begin_var=begin_slot; end_var=end_slot;} re.named_groups} +(* XXX : consider name changing - set_ => add_ *) let set_pre_action act re = {re with nfa = (fun succ -> @@ -57,6 +62,35 @@ let set_post_action act re = succ.action <- act :: succ.action; re.nfa succ)} +module NodeSet = Set.Make(struct + type t = node + let compare a b = compare a.id b.id + end) + +let iter_all_nodes func init = + let visited = ref NodeSet.empty in + let rec aux n = + if not (NodeSet.mem n !visited) then ( + func n; visited := NodeSet.add n !visited; + List.iter aux n.eps; + List.iter (fun (_, n, _) -> aux n) n.trans + ) in + aux init + +let add_transition_action_to_all_internal_transitions act re : regexp = + { re with + nfa = begin + let nfa0 = re.nfa in + fun fin -> + let sub : node = nfa0 fin in + iter_all_nodes (fun node -> + node.trans <- + node.trans + |> List.map (fun (c, n, acts) -> c, n, act::acts)) sub; + sub + end + } + let cur_id = ref 0 let new_node () = incr cur_id; diff --git a/src/syntax/sedlex.mli b/src/syntax/sedlex.mli index 2f814908..a7b00f5e 100644 --- a/src/syntax/sedlex.mli +++ b/src/syntax/sedlex.mli @@ -5,7 +5,11 @@ type regexp type transition_action = [`step_capture_slot of string] -type node_action = [`save_offset of save_offset_action] +type node_action = [ + | `save_offset of save_offset_action + | `may_init_capture_slot of string + | `may_finish_capture_slot of string + ] and save_offset_action = | Save_begin_offset_assign of string | Save_end_offset_assign of string @@ -16,6 +20,7 @@ val get_slots: string -> regexp -> string * string val set_slots: string -> string * string -> regexp -> regexp val set_pre_action: node_action -> regexp -> regexp val set_post_action: node_action -> regexp -> regexp +val add_transition_action_to_all_internal_transitions : transition_action -> regexp -> regexp val chars: Sedlex_cset.t -> regexp val seq: regexp -> regexp -> regexp From c3a299a188130a9efe96750b5ab76a75f4243d71 Mon Sep 17 00:00:00 2001 From: hongoh Date: Fri, 25 Feb 2022 03:47:22 +0900 Subject: [PATCH 13/13] update using node actions and transition actions --- examples/as_pattern.ml | 55 +++++++---- src/syntax/ppx_sedlex.ml | 196 +++++++++++++++++++++++++-------------- src/syntax/sedlex.ml | 85 +++++++++++------ src/syntax/sedlex.mli | 17 +++- 4 files changed, 237 insertions(+), 116 deletions(-) diff --git a/examples/as_pattern.ml b/examples/as_pattern.ml index 40fe798e..f86805c9 100644 --- a/examples/as_pattern.ml +++ b/examples/as_pattern.ml @@ -1,28 +1,50 @@ -(* -let rec token buf = +let string_of_uchars us = + String.of_seq (Array.to_seq (Array.map Uchar.to_char us)) + +let rec token1 buf = match%sedlex buf with | eof -> print_endline "\tEnd" - | white_space -> print_endline "\tWhitespace"; token buf - | ((Plus ('a' .. 'z' | 'A' .. 'Z')) as text, (Star (white_space | ',' | '.'))) -> - print_string "as-pattern text:\t"; - print_endline (String.of_seq (Array.to_seq (Array.map Uchar.to_char text))); - token buf + | white_space -> print_endline "\tWhitespace"; token1 buf + | (Plus ('a' .. 'z' | 'A' .. 'Z')) as text -> + print_endline (string_of_uchars text); token1 buf | (',' | '.') as x -> - print_string "as-pattern x:\t"; - print_endline (String.of_seq (Array.to_seq (Array.map Uchar.to_char x))); - token buf - | any -> print_endline "other"; token buf + print_endline (string_of_uchars x); token1 buf + | any -> print_endline "other"; token1 buf + | _ -> failwith "Internal failure: Reached impossible place" + +let rec token2 buf = + match%sedlex buf with + | eof -> print_endline "\tEnd" + | white_space -> print_endline "\tWhitespace"; token2 buf + | "a" as x -> + print_endline (string_of_uchars x); token2 buf + | "b" as x -> + print_endline (string_of_uchars x); token2 buf + | "cde" as x -> + print_endline (string_of_uchars x); token2 buf + | ("f" as x), "g" -> + print_endline (string_of_uchars x); token2 buf + | "h", ("i" as x) -> + print_endline (string_of_uchars x); token2 buf + | ("j" as x | "k" as x) -> + print_endline (string_of_uchars x); token2 buf + | ("l" | "m") as x -> + print_endline (string_of_uchars x); token2 buf + | (Star "n" as x), (Star "o" as y) -> + print_endline (string_of_uchars x^"+"^string_of_uchars y); token2 buf + | (Plus "p" as x), (Plus "q" as y) -> + print_endline (string_of_uchars x^"+"^string_of_uchars y); token2 buf + | any -> print_endline "other"; token1 buf | _ -> failwith "Internal failure: Reached impossible place" let () = - let lexbuf = Sedlexing.Utf8.from_string "It takes all the running you can do, to keep in the same place." in - token lexbuf -*) + token1 (Sedlexing.Utf8.from_string "It takes all the running you can do, to keep in the same place."); + token2 (Sedlexing.Utf8.from_string "a b cde fg hi j k l m n o nn no oo nnn nno noo ooo pq ppq pqq pppq ppqq pqqq") - let string_of_uchars us = - String.of_seq (Array.to_seq (Array.map Uchar.to_char us)) + +(* let rec token buf = match%sedlex buf with | eof -> print_endline "\tEnd" @@ -34,3 +56,4 @@ let rec token buf = let () = let lexbuf = Sedlexing.Utf8.from_string "b ab aab aaab aaaab" in token lexbuf +*) diff --git a/src/syntax/ppx_sedlex.ml b/src/syntax/ppx_sedlex.ml index 2468f565..536f4b81 100644 --- a/src/syntax/ppx_sedlex.ml +++ b/src/syntax/ppx_sedlex.ml @@ -189,84 +189,130 @@ let best_final final = let gen_alisas_slots auto = let loc = default_loc in - let module S = Set.Make(struct type t = Sedlex.save_offset_action let compare = compare end) in + let module S = Set.Make(struct type t = string let compare = compare end) in let slots = ref [] in let seen = ref S.empty in - Array.iter (fun (_, _, acts) -> List.iter (function - | (`save_offset slot) -> - if S.mem slot !seen then () - else slots := slot :: !slots; seen := S.add slot !seen - | _ -> assert false) acts) auto; - List.map (function - | Sedlex.Save_begin_offset_assign var -> - value_binding ~loc ~pat:(pvar ~loc var) ~expr:[%expr ref None] - | Sedlex.Save_end_offset_assign var -> - value_binding ~loc ~pat:(pvar ~loc var) ~expr:[%expr ref None]) !slots - -let gen_aliases lexbuf re = + Array.iter (fun (trans, _, nacts) -> + List.iter (function + | `may_init_capture_slot slot | `may_finish_capture_slot slot -> + if S.mem slot !seen then () + else slots := slot :: !slots; seen := S.add slot !seen + | _ -> failwith "can't generate aliases: unsupported action") nacts; + Array.iter (fun (_, _, tacts) -> + List.iter (function + | `step_capture_slot slot -> + if S.mem slot !seen then () + else slots := slot :: !slots; seen := S.add slot !seen + | _ -> failwith "can't generate aliases: unsupported action") tacts) trans) auto; + List.map (fun slot -> value_binding ~loc ~pat:(pvar ~loc slot) ~expr:[%expr ref (None, None, false, [])]) !slots + +let gen_aliases re = let loc = default_loc in List.map (fun name -> - let begin_slot, end_slot = Sedlex.get_slots name re in + let slot = Sedlex.get_slot name re in value_binding ~loc ~pat:(pvar ~loc name) - ~expr:( -(* - pexp_sequence ~loc - [%expr print_endline - ("Sedlexing.sub_lexeme "^[%e estring ~loc lexbuf]^" " - ^"("^string_of_int (Option.get (! [%e evar ~loc begin_slot]))^" - " - ^string_of_int (Option.get (! [%e evar ~loc begin_slot]))^") " - ^"("^string_of_int (Option.get (! [%e evar ~loc end_slot]))^" - " - ^string_of_int (Option.get (! [%e evar ~loc begin_slot]))^")")] -*) - [%expr Sedlexing.sub_lexeme [%e evar ~loc lexbuf] - ((Option.get (! [%e evar ~loc begin_slot])) - fst (Sedlexing.loc [%e evar ~loc lexbuf])) - ((Option.get (! [%e evar ~loc end_slot])) - (Option.get (! [%e evar ~loc begin_slot])))])) + ~expr:([%expr (function + | (_, _, _, x :: _) -> x + | _ -> [||]) ![%e evar ~loc slot]])) (Sedlex.get_names re) let state_fun state = Printf.sprintf "__sedlex_state_%i" state -let eaction ~loc lexbuf = function - | `save_offset Sedlex.Save_begin_offset_assign var -> -(* - pexp_sequence ~loc - [%expr - print_endline ([%e estring ~loc var]^" := "^(match (! [%e evar ~loc var]) with - | None -> "Some "^string_of_int (snd (Sedlexing.loc [%e evar ~loc lexbuf])) - | Some _ -> "Some "^string_of_int (min (snd (Sedlexing.loc [%e evar ~loc lexbuf])) (Option.get (! [%e evar ~loc var])))))] -*) - [%expr [%e evar ~loc var] := match (! [%e evar ~loc var]) with - | None -> Some (snd (Sedlexing.loc [%e evar ~loc lexbuf])) - | Some _ -> Some (min (snd (Sedlexing.loc [%e evar ~loc lexbuf])) (Option.get (! [%e evar ~loc var])))] - | `save_offset Sedlex.Save_end_offset_assign var -> -(* - pexp_sequence ~loc - [%expr - print_endline ([%e estring ~loc var]^" := "^(match (! [%e evar ~loc var]) with - | None -> "Some "^string_of_int (snd (Sedlexing.loc [%e evar ~loc lexbuf])) - | Some _ -> "Some "^string_of_int (max (snd (Sedlexing.loc [%e evar ~loc lexbuf])) (Option.get (! [%e evar ~loc var])))))] -*) - [%expr [%e evar ~loc var] := match (! [%e evar ~loc var]) with - | None -> Some (snd (Sedlexing.loc [%e evar ~loc lexbuf])) - | Some _ -> Some (max (snd (Sedlexing.loc [%e evar ~loc lexbuf])) (Option.get (! [%e evar ~loc var]) ))] - | _ -> assert false - -let call_state lexbuf auto state = +let eaction ~loc lexbuf = + let lexbuf = evar ~loc lexbuf in + let init_pos = "__sedlex_eaction_init_pos" in + let curr_pos = "__sedlex_eaction_curr_pos" in + let fin_flag = "__sedlex_eaction_fin_flag" in + let result = "__sedlex_eaction_results" in + let init_pos_e = evar ~loc init_pos in + let curr_pos_e = evar ~loc curr_pos in + let fin_flag_e = evar ~loc fin_flag in + let result_e = evar ~loc result in + let init_pos_p = pvar ~loc init_pos in + let curr_pos_p = pvar ~loc curr_pos in + let fin_flag_p = pvar ~loc fin_flag in + let result_p = pvar ~loc result in + let get_pos = [%expr (snd (Sedlexing.loc [%e lexbuf]))] in + let get_sub init curr = + [%expr Sedlexing.sub_lexeme + [%e lexbuf] + ([%e init] - (fst (Sedlexing.loc [%e lexbuf]))) + ([%e curr] - [%e init])] in + function + | `may_init_capture_slot slot -> + let slot = evar ~loc slot in + [%expr match (! [%e slot]) with + | (None, None, [%p fin_flag_p], [%p result_p]) -> + [%e slot] := (Some [%e get_pos], None, [%e fin_flag_e], [%e result_e]) + | (None, Some [%p curr_pos_p], [%p fin_flag_p], [%p result_p]) -> + [%e slot] := (Some [%e get_pos], Some [%e curr_pos_e], [%e fin_flag_e], [%e result_e]) + | (Some [%p init_pos_p], None, [%p fin_flag_p], [%p result_p]) -> + [%e slot] := (Some [%e get_pos], None, [%e fin_flag_e], [%e result_e]) + | (Some [%p init_pos_p], Some [%p curr_pos_p], [%p fin_flag_p], [%p result_p]) -> + if [%e get_pos] = [%e curr_pos_e] + 1 then + [%e slot] := (Some [%e init_pos_e], Some [%e get_pos], [%e fin_flag_e], [%e result_e]) + else ()] + | `step_capture_slot slot -> + let slot = evar ~loc slot in + [%expr match (! [%e slot]) with + | (None, [%p curr_pos_p], _, [%p result_p]) -> + [%e slot] := (None, [%e curr_pos_e], false, [%e result_e]) + | (Some [%p init_pos_p], None, _, [%p result_p]) -> + if [%e get_pos] - [%e init_pos_e] < 2 then + [%e slot] := (Some [%e init_pos_e], Some [%e get_pos], false, [%e result_e]) + else () + | (Some [%p init_pos_p], Some [%p curr_pos_p], _, [%p result_p]) -> + if [%e get_pos] = [%e curr_pos_e] + 1 || [%e get_pos] = [%e init_pos_e] then + [%e slot] := (Some [%e init_pos_e], Some [%e get_pos], false, [%e result_e]) + else ()] + | `may_finish_capture_slot slot -> + let slot = evar ~loc slot in + [%expr match (! [%e slot]) with + | (None, None, _, [%p result_p]) -> () + | (Some [%p init_pos_p], None, _, [%p result_p]) -> + [%e slot] := (Some [%e init_pos_e], None, true, [||] :: [%e result_e]) + | (Some [%p init_pos_p], Some [%p curr_pos_p], [%p fin_flag_p], [%p result_p]) -> + if [%e get_pos] = [%e curr_pos_e] && not [%e fin_flag_e] then + [%e slot] := (Some [%e init_pos_e], Some [%e curr_pos_e], true, [%e get_sub init_pos_e curr_pos_e] :: [%e result_e]) + else + [%e slot] := (Some [%e init_pos_e], Some [%e curr_pos_e], true, [%e result_e]) + | ([%p init_pos_p], [%p curr_pos_p], _, [%p result_p]) -> + [%e slot] := ([%e init_pos_e], [%e curr_pos_e], true, [%e result_e])] + | `save_offset _ -> failwith "unsupported action" + +let reorder_actions : Sedlex.generic_action list -> Sedlex.generic_action list = fun acts -> + let int_of_action = function + | `may_init_capture_slot _ -> 0 + | `step_capture_slot _ -> 1 + | `may_finish_capture_slot _ -> 2 + | `save_offset _ -> 3 + | _ -> failwith "unsupported action" in + List.sort (fun a1 a2 -> compare (int_of_action a1) (int_of_action a2)) acts + +let call_state lexbuf auto state (trans_acts : Sedlex.transition_action list) = let loc = default_loc in - let (trans, final, actions) = auto.(state) in - let actions = List.map (eaction ~loc lexbuf) actions in + let (trans, final, (node_acts : Sedlex.node_action list)) = auto.(state) in + let acts : Sedlex.generic_action list = + (trans_acts :> Sedlex.generic_action list) + @ (node_acts :> Sedlex.generic_action list) in + let acts = acts |> reorder_actions |> List.map (eaction ~loc lexbuf) in + let trans_acts = trans_acts |> List.map (eaction ~loc lexbuf) in if Array.length trans = 0 then match best_final final with - | Some i -> esequence ~loc (actions @ [eint ~loc i]) + | Some i -> esequence ~loc (acts @ [eint ~loc i]) | None -> assert false - else appfun (state_fun state) [evar ~loc lexbuf] + else esequence ~loc (trans_acts @ [appfun (state_fun state) [evar ~loc lexbuf]]) let gen_state lexbuf auto i (trans, final, actions) = let loc = default_loc in - let partition = Array.map fst trans in - let actions = List.map (eaction ~loc lexbuf) actions in - let cases = Array.mapi (fun i (_, j) -> - case ~lhs:(pint ~loc i) ~guard:None ~rhs:(call_state lexbuf auto j)) trans in + let partition = Array.map (fun (f,_,_) -> f) trans in + let actions = + (actions :> Sedlex.generic_action list) + |> reorder_actions + |> List.map (eaction ~loc lexbuf) in + let cases = Array.mapi (fun i (_, j, acts) -> + case ~lhs:(pint ~loc i) ~guard:None ~rhs:(call_state lexbuf auto j acts)) trans in let cases = Array.to_list cases in let body () = esequence ~loc @@ -275,7 +321,9 @@ let gen_state lexbuf auto i (trans, final, actions) = (appfun (partition_name partition) [[%expr Sedlexing.__private__next_int [%e evar ~loc lexbuf]]]) (cases @ [case ~lhs:[%pat? _] ~guard:None ~rhs:[%expr Sedlexing.backtrack [%e evar ~loc lexbuf]]])]) in - let ret body = [ value_binding ~loc ~pat:(pvar ~loc (state_fun i)) ~expr:(pexp_function ~loc [case ~lhs:(pvar ~loc lexbuf) ~guard:None ~rhs:body]) ] in + let ret body = [ value_binding ~loc + ~pat:(pvar ~loc (state_fun i)) + ~expr:(pexp_function ~loc [case ~lhs:(pvar ~loc lexbuf) ~guard:None ~rhs:body]) ] in match best_final final with | None -> ret (body ()) | Some _ when Array.length trans = 0 -> [] @@ -288,7 +336,7 @@ let gen_recflag auto = Array.iter (fun (trans_i, _, _) -> Array.iter - (fun (_, j) -> + (fun (_, j, _) -> let (trans_j, _, _) = auto.(j) in if Array.length trans_j > 0 then raise Exit) trans_i) @@ -302,7 +350,7 @@ let gen_definition lexbuf l error = let brs = Array.of_list l in let auto = Sedlex.compile (Array.map fst brs) in let alias_slots = gen_alisas_slots auto in - let aliases = Array.map (fun (re, _) -> gen_aliases lexbuf re) brs in + let aliases = Array.map (fun (re, _) -> gen_aliases re) brs in let cases = Array.to_list (Array.mapi (fun i (_, e) -> let expression = match aliases.(i) with | [] -> e @@ -436,12 +484,22 @@ let regexp_of_pattern env = end | Ppat_alias (pat, {txt=var}) -> incr alias_slot_counter; - let begin_offset_slot_var = "__sedlex_"^var^"_begin_offset_"^(string_of_int !alias_slot_counter) in - let end_offset_slot_var = "__sedlex_"^var^"_end_offset_"^(string_of_int !alias_slot_counter) in + let capture_slot_var = "__sedlex_"^var^"_capture_slot"^(string_of_int !alias_slot_counter) in + (* let begin_offset_slot_var = "__sedlex_"^var^"_begin_offset_"^(string_of_int !alias_slot_counter) in *) + (* let end_offset_slot_var = "__sedlex_"^var^"_end_offset_"^(string_of_int !alias_slot_counter) in *) aux pat - |> Sedlex.set_slots var (begin_offset_slot_var, end_offset_slot_var) - |> Sedlex.set_post_action (`save_offset (Sedlex.Save_end_offset_assign end_offset_slot_var)) - |> Sedlex.set_pre_action (`save_offset (Sedlex.Save_begin_offset_assign begin_offset_slot_var)) + (* |> Sedlex.set_slots var (begin_offset_slot_var, end_offset_slot_var) *) + |> Sedlex.set_slot var capture_slot_var + |> Sedlex.add_transition_action_to_all_internal_transitions (`step_capture_slot capture_slot_var) + + (* XXX *) + |> Sedlex.set_pre_action (`may_init_capture_slot capture_slot_var) + |> Sedlex.set_post_action (`may_finish_capture_slot capture_slot_var) + + + (* |> Sedlex.set_post_action (`save_offset (Sedlex.Save_end_offset_assign end_offset_slot_var)) *) + (* |> Sedlex.set_pre_action (`save_offset (Sedlex.Save_begin_offset_assign begin_offset_slot_var)) *) + | _ -> err p.ppat_loc "this pattern is not a valid regexp" in diff --git a/src/syntax/sedlex.ml b/src/syntax/sedlex.ml index 45b67170..4534891a 100644 --- a/src/syntax/sedlex.ml +++ b/src/syntax/sedlex.ml @@ -15,33 +15,36 @@ type node = { id : int; mutable action : node_action list; mutable eps : node list; - mutable trans : (Cset.t * node) list; + mutable trans : (Cset.t * node * transition_action list) list; } -and node_action = [`save_offset of save_offset_action] +and node_action = [ + | `save_offset of save_offset_action + | `may_init_capture_slot of string + | `may_finish_capture_slot of string + ] +and transition_action = [`step_capture_slot of string] and save_offset_action = | Save_begin_offset_assign of string | Save_end_offset_assign of string +type generic_action = [ transition_action | node_action ] (* Compilation regexp -> NFA *) type regexp = { nfa : node -> node; - named_groups : named_slots StringMap.t; -} -and named_slots = { - begin_var : string; - end_var : string; + named_groups : string StringMap.t; } let get_names re = StringMap.fold (fun name _ acc -> name :: acc) re.named_groups [] -let get_slots name re = - let slots = StringMap.find name re.named_groups in - (slots.begin_var, slots.end_var) +let get_slot name re = + StringMap.find name re.named_groups -let set_slots name (begin_slot, end_slot) re = - {re with named_groups = StringMap.add name {begin_var=begin_slot; end_var=end_slot;} re.named_groups} +let set_slot name slot re = + {re with named_groups = + StringMap.add name slot re.named_groups} +(* XXX : consider name changing - set_ => add_ *) let set_pre_action act re = {re with nfa = (fun succ -> @@ -55,6 +58,35 @@ let set_post_action act re = succ.action <- act :: succ.action; re.nfa succ)} +module NodeSet = Set.Make(struct + type t = node + let compare a b = compare a.id b.id + end) + +let iter_all_nodes func init = + let visited = ref NodeSet.empty in + let rec aux n = + if not (NodeSet.mem n !visited) then ( + func n; visited := NodeSet.add n !visited; + List.iter aux n.eps; + List.iter (fun (_, n, _) -> aux n) n.trans + ) in + aux init + +let add_transition_action_to_all_internal_transitions act re : regexp = + { re with + nfa = begin + let nfa0 = re.nfa in + fun succ -> + let sub : node = nfa0 succ in + iter_all_nodes (fun node -> + node.trans <- + node.trans + |> List.map (fun (c, n, acts) -> c, n, act::acts)) sub; + sub + end + } + let cur_id = ref 0 let new_node () = incr cur_id; @@ -77,12 +109,12 @@ let seq r1 r2 = named_groups = merge_named_groups r1.named_groups r2.named_groups;} let is_chars final = function - | {eps = []; trans = [c, f]} when f == final -> Some c + | {eps = []; trans = [c, f, _]} when f == final -> Some c | _ -> None let chars c = regexp_of_nfa (fun succ -> let n = new_node () in - n.trans <- [c,succ]; + n.trans <- [c,succ,[]]; n) let alt r1 r2 = @@ -150,36 +182,35 @@ let rec add_node state node = and add_nodes state nodes = List.fold_left add_node state nodes - let transition (state : state) = (* Merge transition with the same target *) let rec norm = function - | (c1, n1)::((c2, n2)::q as l) -> - if n1 == n2 then norm ((Cset.union c1 c2, n1)::q) - else (c1, n1)::(norm l) + | (c1, n1, acts1)::((c2, n2, acts2)::q as l) -> + if n1 == n2 then norm ((Cset.union c1 c2, n1, acts1@acts2)::q) + else (c1, n1, acts1)::(norm l) | l -> l in let t = List.concat (List.map (fun n -> n.trans) state) in - let t = norm (List.sort (fun (_, n1) (_, n2) -> n1.id - n2.id) t) in + let t = norm (List.sort (fun (_, n1, _) (_, n2, _) -> n1.id - n2.id) t) in (* Split char sets so as to make them disjoint *) - let split (all, t) (c0, n0) = + let split (all, t) (c0, n0, acts0) = let t = - (Cset.difference c0 all, [n0]) :: - List.map (fun (c, ns) -> (Cset.intersection c c0, n0::ns)) t @ - List.map (fun (c, ns) -> (Cset.difference c c0, ns)) t + (Cset.difference c0 all, [n0], acts0) :: + List.map (fun (c, ns, acts) -> (Cset.intersection c c0, n0::ns, acts0@acts)) t @ + List.map (fun (c, ns, acts) -> (Cset.difference c c0, ns, acts)) t in Cset.union all c0, - List.filter (fun (c, _) -> not (Cset.is_empty c)) t + List.filter (fun (c, _, _) -> not (Cset.is_empty c)) t in let (_,t) = List.fold_left split (Cset.empty,[]) t in (* Epsilon closure of targets *) - let t = List.map (fun (c, ns) -> (c, add_nodes [] ns)) t in + let t = List.map (fun (c, ns, acts) -> (c, add_nodes [] ns, acts)) t in (* Canonical ordering *) let t = Array.of_list t in - Array.sort (fun (c1, _) (c2, _) -> compare c1 c2) t; + Array.sort (fun (c1, _, _) (c2, _, _) -> compare c1 c2) t; t let compile rs = @@ -196,7 +227,7 @@ let compile rs = let trans = transition state in let trans = Array.map - (fun (p, t) -> (p, aux t)) + (fun (p, t, acts) -> (p, aux t, acts)) trans in let finals = Array.map (fun (_, f) -> List.memq f state) rs in let actions = List.concat_map (fun n -> n.action) state in diff --git a/src/syntax/sedlex.mli b/src/syntax/sedlex.mli index d486c088..22f884be 100644 --- a/src/syntax/sedlex.mli +++ b/src/syntax/sedlex.mli @@ -4,16 +4,23 @@ type regexp -type node_action = [`save_offset of save_offset_action] +type transition_action = [`step_capture_slot of string] +type node_action = [ + | `save_offset of save_offset_action + | `may_init_capture_slot of string + | `may_finish_capture_slot of string + ] and save_offset_action = | Save_begin_offset_assign of string | Save_end_offset_assign of string +type generic_action = [ transition_action | node_action ] val get_names: regexp -> string list -val get_slots: string -> regexp -> string * string -val set_slots: string -> string * string -> regexp -> regexp +val get_slot: string -> regexp -> string +val set_slot: string -> string -> regexp -> regexp val set_pre_action: node_action -> regexp -> regexp val set_post_action: node_action -> regexp -> regexp +val add_transition_action_to_all_internal_transitions : transition_action -> regexp -> regexp val chars: Sedlex_cset.t -> regexp val seq: regexp -> regexp -> regexp @@ -32,4 +39,6 @@ val intersection: regexp -> regexp -> regexp option (* If each argument is a single [chars] regexp, returns a regexp which matches the intersection set. Otherwise returns [None]. *) -val compile: regexp array -> ((Sedlex_cset.t * int) array * bool array * node_action list) array +val compile: + regexp array -> + ((Sedlex_cset.t * int * transition_action list) array * bool array * node_action list) array