Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
59 changes: 59 additions & 0 deletions examples/as_pattern.ml
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
let string_of_uchars us =
String.of_seq (Array.to_seq (Array.map Uchar.to_char us))

let rec token1 buf =
match%sedlex buf with
| eof -> print_endline "\tEnd"
| white_space -> print_endline "\tWhitespace"; token1 buf
| (Plus ('a' .. 'z' | 'A' .. 'Z')) as text ->
print_endline (string_of_uchars text); token1 buf
| (',' | '.') as x ->
print_endline (string_of_uchars x); token1 buf
| any -> print_endline "other"; token1 buf
| _ -> failwith "Internal failure: Reached impossible place"

let rec token2 buf =
match%sedlex buf with
| eof -> print_endline "\tEnd"
| white_space -> print_endline "\tWhitespace"; token2 buf
| "a" as x ->
print_endline (string_of_uchars x); token2 buf
| "b" as x ->
print_endline (string_of_uchars x); token2 buf
| "cde" as x ->
print_endline (string_of_uchars x); token2 buf
| ("f" as x), "g" ->
print_endline (string_of_uchars x); token2 buf
| "h", ("i" as x) ->
print_endline (string_of_uchars x); token2 buf
| ("j" as x | "k" as x) ->
print_endline (string_of_uchars x); token2 buf
| ("l" | "m") as x ->
print_endline (string_of_uchars x); token2 buf
| (Star "n" as x), (Star "o" as y) ->
print_endline (string_of_uchars x^"+"^string_of_uchars y); token2 buf
| (Plus "p" as x), (Plus "q" as y) ->
print_endline (string_of_uchars x^"+"^string_of_uchars y); token2 buf
| any -> print_endline "other"; token1 buf
| _ -> failwith "Internal failure: Reached impossible place"


let () =
token1 (Sedlexing.Utf8.from_string "It takes all the running you can do, to keep in the same place.");
token2 (Sedlexing.Utf8.from_string "a b cde fg hi j k l m n o nn no oo nnn nno noo ooo pq ppq pqq pppq ppqq pqqq")



(*
let rec token buf =
match%sedlex buf with
| eof -> print_endline "\tEnd"
| white_space -> print_endline "\tWhitespace"; token buf
| (Star 'a' as a), ('b' as b) -> print_endline (string_of_uchars a^"\t"^string_of_uchars b); token buf
| any -> print_endline "Other"; token buf
| _ -> failwith "Internal failure: Reached impossible place"

let () =
let lexbuf = Sedlexing.Utf8.from_string "b ab aab aaab aaaab" in
token lexbuf
*)
9 changes: 8 additions & 1 deletion examples/dune
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
(executables
(names tokenizer regressions complement subtraction repeat performance)
(names tokenizer regressions complement subtraction repeat as_pattern performance)
(libraries sedlex sedlex_ppx)
(preprocess
(pps sedlex.ppx))
Expand Down Expand Up @@ -40,6 +40,13 @@
(action
(run %{<})))

(rule
(alias runtest)
(deps
(:< as_pattern.exe))
(action
(run %{<})))

(rule
(alias runtest)
(deps
Expand Down
194 changes: 170 additions & 24 deletions src/syntax/ppx_sedlex.ml
Original file line number Diff line number Diff line change
Expand Up @@ -175,6 +175,9 @@ let partition (name, p) =
[%e body]
]

(* Alias offset slot counter *)
let alias_slot_counter = ref (-1)

(* Code generation for the automata *)

let best_final final =
Expand All @@ -184,27 +187,143 @@ let best_final final =
done;
!fin

let gen_alisas_slots auto =
let loc = default_loc in
let module S = Set.Make(struct type t = string let compare = compare end) in
let slots = ref [] in
let seen = ref S.empty in
Array.iter (fun (trans, _, nacts) ->
List.iter (function
| `may_init_capture_slot slot | `may_finish_capture_slot slot ->
if S.mem slot !seen then ()
else slots := slot :: !slots; seen := S.add slot !seen
| _ -> failwith "can't generate aliases: unsupported action") nacts;
Array.iter (fun (_, _, tacts) ->
List.iter (function
| `step_capture_slot slot ->
if S.mem slot !seen then ()
else slots := slot :: !slots; seen := S.add slot !seen
| _ -> failwith "can't generate aliases: unsupported action") tacts) trans) auto;
List.map (fun slot -> value_binding ~loc ~pat:(pvar ~loc slot) ~expr:[%expr ref (None, None, false, [])]) !slots

let gen_aliases re =
let loc = default_loc in
List.map (fun name ->
let slot = Sedlex.get_slot name re in
value_binding ~loc
~pat:(pvar ~loc name)
~expr:([%expr (function
| (_, _, _, x :: _) -> x
| _ -> [||]) ![%e evar ~loc slot]]))
(Sedlex.get_names re)

let state_fun state = Printf.sprintf "__sedlex_state_%i" state

let call_state lexbuf auto state =
let (trans, final) = auto.(state) in
let eaction ~loc lexbuf =
let lexbuf = evar ~loc lexbuf in
let init_pos = "__sedlex_eaction_init_pos" in
let curr_pos = "__sedlex_eaction_curr_pos" in
let fin_flag = "__sedlex_eaction_fin_flag" in
let result = "__sedlex_eaction_results" in
let init_pos_e = evar ~loc init_pos in
let curr_pos_e = evar ~loc curr_pos in
let fin_flag_e = evar ~loc fin_flag in
let result_e = evar ~loc result in
let init_pos_p = pvar ~loc init_pos in
let curr_pos_p = pvar ~loc curr_pos in
let fin_flag_p = pvar ~loc fin_flag in
let result_p = pvar ~loc result in
let get_pos = [%expr (snd (Sedlexing.loc [%e lexbuf]))] in
let get_sub init curr =
[%expr Sedlexing.sub_lexeme
[%e lexbuf]
([%e init] - (fst (Sedlexing.loc [%e lexbuf])))
([%e curr] - [%e init])] in
function
| `may_init_capture_slot slot ->
let slot = evar ~loc slot in
[%expr match (! [%e slot]) with
| (None, None, [%p fin_flag_p], [%p result_p]) ->
[%e slot] := (Some [%e get_pos], None, [%e fin_flag_e], [%e result_e])
| (None, Some [%p curr_pos_p], [%p fin_flag_p], [%p result_p]) ->
[%e slot] := (Some [%e get_pos], Some [%e curr_pos_e], [%e fin_flag_e], [%e result_e])
| (Some [%p init_pos_p], None, [%p fin_flag_p], [%p result_p]) ->
[%e slot] := (Some [%e get_pos], None, [%e fin_flag_e], [%e result_e])
| (Some [%p init_pos_p], Some [%p curr_pos_p], [%p fin_flag_p], [%p result_p]) ->
if [%e get_pos] = [%e curr_pos_e] + 1 then
[%e slot] := (Some [%e init_pos_e], Some [%e get_pos], [%e fin_flag_e], [%e result_e])
else ()]
| `step_capture_slot slot ->
let slot = evar ~loc slot in
[%expr match (! [%e slot]) with
| (None, [%p curr_pos_p], _, [%p result_p]) ->
[%e slot] := (None, [%e curr_pos_e], false, [%e result_e])
| (Some [%p init_pos_p], None, _, [%p result_p]) ->
if [%e get_pos] - [%e init_pos_e] < 2 then
[%e slot] := (Some [%e init_pos_e], Some [%e get_pos], false, [%e result_e])
else ()
| (Some [%p init_pos_p], Some [%p curr_pos_p], _, [%p result_p]) ->
if [%e get_pos] = [%e curr_pos_e] + 1 || [%e get_pos] = [%e init_pos_e] then
[%e slot] := (Some [%e init_pos_e], Some [%e get_pos], false, [%e result_e])
else ()]
| `may_finish_capture_slot slot ->
let slot = evar ~loc slot in
[%expr match (! [%e slot]) with
| (None, None, _, [%p result_p]) -> ()
| (Some [%p init_pos_p], None, _, [%p result_p]) ->
[%e slot] := (Some [%e init_pos_e], None, true, [||] :: [%e result_e])
| (Some [%p init_pos_p], Some [%p curr_pos_p], [%p fin_flag_p], [%p result_p]) ->
if [%e get_pos] = [%e curr_pos_e] && not [%e fin_flag_e] then
[%e slot] := (Some [%e init_pos_e], Some [%e curr_pos_e], true, [%e get_sub init_pos_e curr_pos_e] :: [%e result_e])
else
[%e slot] := (Some [%e init_pos_e], Some [%e curr_pos_e], true, [%e result_e])
| ([%p init_pos_p], [%p curr_pos_p], _, [%p result_p]) ->
[%e slot] := ([%e init_pos_e], [%e curr_pos_e], true, [%e result_e])]
| `save_offset _ -> failwith "unsupported action"

let reorder_actions : Sedlex.generic_action list -> Sedlex.generic_action list = fun acts ->
let int_of_action = function
| `may_init_capture_slot _ -> 0
| `step_capture_slot _ -> 1
| `may_finish_capture_slot _ -> 2
| `save_offset _ -> 3
| _ -> failwith "unsupported action" in
List.sort (fun a1 a2 -> compare (int_of_action a1) (int_of_action a2)) acts

let call_state lexbuf auto state (trans_acts : Sedlex.transition_action list) =
let loc = default_loc in
let (trans, final, (node_acts : Sedlex.node_action list)) = auto.(state) in
let acts : Sedlex.generic_action list =
(trans_acts :> Sedlex.generic_action list)
@ (node_acts :> Sedlex.generic_action list) in
let acts = acts |> reorder_actions |> List.map (eaction ~loc lexbuf) in
let trans_acts = trans_acts |> List.map (eaction ~loc lexbuf) in
if Array.length trans = 0
then match best_final final with
| Some i -> eint ~loc:default_loc i
| Some i -> esequence ~loc (acts @ [eint ~loc i])
| None -> assert false
else appfun (state_fun state) [evar ~loc:default_loc lexbuf]
else esequence ~loc (trans_acts @ [appfun (state_fun state) [evar ~loc lexbuf]])

let gen_state lexbuf auto i (trans, final) =
let gen_state lexbuf auto i (trans, final, actions) =
let loc = default_loc in
let partition = Array.map fst trans in
let cases = Array.mapi (fun i (_, j) -> case ~lhs:(pint ~loc i) ~guard:None ~rhs:(call_state lexbuf auto j)) trans in
let partition = Array.map (fun (f,_,_) -> f) trans in
let actions =
(actions :> Sedlex.generic_action list)
|> reorder_actions
|> List.map (eaction ~loc lexbuf) in
let cases = Array.mapi (fun i (_, j, acts) ->
case ~lhs:(pint ~loc i) ~guard:None ~rhs:(call_state lexbuf auto j acts)) trans in
let cases = Array.to_list cases in
let body () =
pexp_match ~loc
(appfun (partition_name partition) [[%expr Sedlexing.__private__next_int [%e evar ~loc lexbuf]]])
(cases @ [case ~lhs:[%pat? _] ~guard:None ~rhs:[%expr Sedlexing.backtrack [%e evar ~loc lexbuf]]])
esequence ~loc
(actions @
[pexp_match ~loc
(appfun (partition_name partition) [[%expr Sedlexing.__private__next_int [%e evar ~loc lexbuf]]])
(cases @ [case ~lhs:[%pat? _] ~guard:None ~rhs:[%expr Sedlexing.backtrack [%e evar ~loc lexbuf]]])])
in
let ret body = [ value_binding ~loc ~pat:(pvar ~loc (state_fun i)) ~expr:(pexp_function ~loc [case ~lhs:(pvar ~loc lexbuf) ~guard:None ~rhs:body]) ] in
let ret body = [ value_binding ~loc
~pat:(pvar ~loc (state_fun i))
~expr:(pexp_function ~loc [case ~lhs:(pvar ~loc lexbuf) ~guard:None ~rhs:body]) ] in
match best_final final with
| None -> ret (body ())
| Some _ when Array.length trans = 0 -> []
Expand All @@ -215,10 +334,10 @@ let gen_recflag auto =
in states with no further transitions. *)
try
Array.iter
(fun (trans_i, _) ->
(fun (trans_i, _, _) ->
Array.iter
(fun (_, j) ->
let (trans_j, _) = auto.(j) in
(fun (_, j, _) ->
let (trans_j, _, _) = auto.(j) in
if Array.length trans_j > 0 then raise Exit)
trans_i)
auto;
Expand All @@ -230,16 +349,26 @@ let gen_definition lexbuf l error =
let loc = default_loc in
let brs = Array.of_list l in
let auto = Sedlex.compile (Array.map fst brs) in
let cases = Array.to_list (Array.mapi (fun i (_, e) -> case ~lhs:(pint ~loc i) ~guard:None ~rhs:e) brs) in
let alias_slots = gen_alisas_slots auto in
let aliases = Array.map (fun (re, _) -> gen_aliases re) brs in
let cases = Array.to_list (Array.mapi (fun i (_, e) ->
let expression = match aliases.(i) with
| [] -> e
| aliases -> pexp_let ~loc Nonrecursive aliases e in
case ~lhs:(pint ~loc i) ~guard:None ~rhs:expression) brs) in
let states = Array.mapi (gen_state lexbuf auto) auto in
let states = List.flatten (Array.to_list states) in
pexp_let ~loc (gen_recflag auto) states
(pexp_sequence ~loc
[%expr Sedlexing.start [%e evar ~loc lexbuf]]
(pexp_match ~loc (appfun (state_fun 0) [evar ~loc lexbuf])
(cases @ [case ~lhs:(ppat_any ~loc) ~guard:None ~rhs:error])
)
)
let body () =
pexp_let ~loc (gen_recflag auto) states
(pexp_sequence ~loc
[%expr Sedlexing.start [%e evar ~loc lexbuf]]
(pexp_match ~loc (appfun (state_fun 0) [evar ~loc lexbuf])
(cases @ [case ~lhs:(ppat_any ~loc) ~guard:None ~rhs:error])
)
) in
match alias_slots with
| [] -> body ()
| _ -> pexp_let ~loc Nonrecursive alias_slots (body ())

(* Lexer specification parser *)

Expand Down Expand Up @@ -353,12 +482,29 @@ let regexp_of_pattern env =
with Not_found ->
err p.ppat_loc (Printf.sprintf "unbound regexp %s" x)
end
| Ppat_alias (pat, {txt=var}) ->
incr alias_slot_counter;
let capture_slot_var = "__sedlex_"^var^"_capture_slot"^(string_of_int !alias_slot_counter) in
(* let begin_offset_slot_var = "__sedlex_"^var^"_begin_offset_"^(string_of_int !alias_slot_counter) in *)
(* let end_offset_slot_var = "__sedlex_"^var^"_end_offset_"^(string_of_int !alias_slot_counter) in *)
aux pat
(* |> Sedlex.set_slots var (begin_offset_slot_var, end_offset_slot_var) *)
|> Sedlex.set_slot var capture_slot_var
|> Sedlex.add_transition_action_to_all_internal_transitions (`step_capture_slot capture_slot_var)

(* XXX *)
|> Sedlex.set_pre_action (`may_init_capture_slot capture_slot_var)
|> Sedlex.set_post_action (`may_finish_capture_slot capture_slot_var)


(* |> Sedlex.set_post_action (`save_offset (Sedlex.Save_end_offset_assign end_offset_slot_var)) *)
(* |> Sedlex.set_pre_action (`save_offset (Sedlex.Save_begin_offset_assign begin_offset_slot_var)) *)

| _ ->
err p.ppat_loc "this pattern is not a valid regexp"
err p.ppat_loc "this pattern is not a valid regexp"
in
aux


let previous = ref []
let regexps = ref []
let should_set_cookies = ref false
Expand Down
Loading