(* parser.ml — parse --help output into structured flag/subcommand/positional data. * * this module is the core of inshellah's help-text understanding. it takes the * raw text that a cli tool prints when you run `cmd --help` and extracts: * - flag entries (short/long switches with optional parameters and descriptions) * - subcommand listings (name + description pairs) * - positional arguments (from usage lines) * * the parser is built on Angstrom (a monadic parser combinator library) for the * structured flag/subcommand extraction, with hand-rolled imperative parsers for * usage-line positional extraction (where the format is too varied for clean * combinator composition). * * key design decisions: * - the Angstrom parser runs in prefix-consume mode — it doesn't need to parse * the entire input, just extract what it can recognize. unrecognized lines are * skipped via skip_non_option_line. * - multi-line descriptions are handled via indentation-based continuation: * lines indented 8+ spaces that don't start with '-' are folded into the * previous entry's description. * - subcommand detection uses a heuristic: lines with a name followed by 2+ * spaces then a description, where the name is at least 2 chars. section * headers (like "arguments:") toggle whether name-description pairs are * treated as subcommands or positionals. * - positional extraction has two paths: usage-line parsing (the common case) * and CLI11's explicit "positionals:" section format. *) open Angstrom (* strip ansi escape sequences and osc hyperlinks from --help output. * many modern cli tools emit colored/styled output even when piped, * so we need to clean this before parsing. handles: * - csi sequences (esc [ ... final_byte) — colors, cursor movement, etc. * - osc sequences (esc ] ... bel/st) — hyperlinks, window titles, etc. 
* - other two-byte esc+char sequences *) let strip_ansi s = let buf = Buffer.create (String.length s) in let len = String.length s in let pos = ref 0 in while !pos < len do if !pos + 1 < len && Char.code s.[!pos] = 0x1b then begin let next = s.[!pos + 1] in if next = '[' then begin (* csi sequence: esc [ ... final_byte *) pos := !pos + 2; while !pos < len && not (s.[!pos] >= '@' && s.[!pos] <= '~') do incr pos done; if !pos < len then incr pos end else if next = ']' then begin (* osc sequence: esc ] ... (terminated by bel or esc \) *) pos := !pos + 2; let terminated = ref false in while !pos < len && not !terminated do if s.[!pos] = '\x07' then (incr pos; terminated := true) else if !pos + 1 < len && Char.code s.[!pos] = 0x1b && s.[!pos + 1] = '\\' then (pos := !pos + 2; terminated := true) else incr pos done end else begin (* other esc sequence, skip esc + one char *) pos := !pos + 2 end end else begin Buffer.add_char buf s.[!pos]; incr pos end done; Buffer.contents buf (* --- character class predicates --- * used throughout the Angstrom parsers to classify characters. * separated out for readability and reuse. *) let is_whitespace = function ' ' | '\t' -> true | _ -> false let is_alphanumeric = function | 'A' .. 'Z' | 'a' .. 'z' | '0' .. '9' -> true | _ -> false (* characters allowed inside parameter names like FILE, output-dir, etc. *) let is_param_char = function | 'A' .. 'Z' | 'a' .. 'z' | '0' .. '9' | '_' | '-' -> true | _ -> false (* used to detect ALL_CAPS parameter names like FILE, TIME_STYLE *) let is_upper_or_underscore = function | 'A' .. 'Z' | '_' -> true | _ -> false (* characters allowed in long flag names (--foo-bar, --enable-feature2) *) let is_long_char = function | 'A' .. 'Z' | 'a' .. 'z' | '0' .. '9' | '-' -> true | _ -> false (* --- core types --- * these types represent the structured output of parsing a help text. * they are shared across the entire codebase (nushell codegen, store, manpage parser). 
 *
 * switch: a flag can be short-only (-v), long-only (--verbose), or both (-v, --verbose).
 *   the both variant keeps the pair together so nushell can emit "--verbose(-v)".
 *
 * param: flags can take mandatory (--output FILE) or optional (--color[=WHEN]) values.
 *
 * entry: one complete flag definition — its switch form, optional parameter, and
 *   the description text (potentially multi-line, already joined).
 *
 * help_result: the complete parsed output for a single command. *)

(* the flag name(s): the short form carries the single character after '-',
 * the long form carries the text after "--" *)
type switch = Short of char | Long of string | Both of char * string

(* a flag's value placeholder; the string is the placeholder name as written
 * in the help text *)
type param = Mandatory of string | Optional of string

(* one flag line: switch, optional value placeholder, joined description *)
type entry = { switch : switch; param : param option; desc : string }

(* one subcommand listing line *)
type subcommand = { name : string; desc : string }

(* one positional argument with its optional / variadic markers *)
type positional = { pos_name : string; optional : bool; variadic : bool }

(* everything extracted from one help text *)
type help_result = { entries : entry list; subcommands : subcommand list; positionals : positional list; description : string }

(* --- low-level Angstrom combinators ---
 * building blocks for all the parsers below. *)

(* consume horizontal whitespace (spaces and tabs) without crossing lines.
 * succeeds on zero characters as well. *)
let inline_ws = skip_while (function ' ' | '\t' -> true | _ -> false)

(* end of line — matches either a newline or end of input.
 * this is the permissive version used in most places. *)
let eol = end_of_line <|> end_of_input

(* strict end of line — must consume an actual newline character.
 * used in skip_non_option_line so we don't accidentally match eof
 * and consume it when we shouldn't. *)
let eol_strict = end_of_line

(* --- switch and parameter parsers ---
 * parse the flag name portion of an option line, e.g. "-v", "--verbose" *)

(* '-' followed by exactly one letter or digit; yields that character *)
let short_switch = char '-' *> satisfy is_alphanumeric

(* "--" followed by one or more long-flag characters; yields the name
 * without the leading dashes *)
let long_switch = string "--" *> take_while1 is_long_char

(* a comma plus any horizontal whitespace after it *)
let comma = char ',' *> inline_ws

(* parameter parsers — handle the various syntaxes tools use to indicate
 * that a flag takes a value.
the formats are surprisingly diverse: * --output=FILE (eq_man_param — mandatory, common in gnu tools) * --color[=WHEN] (eq_opt_param — optional with = syntax) * --depth DEPTH (space_upper_param — space-separated ALL_CAPS) * --file (space_angle_param — angle brackets) * --file [] (space_opt_angle_param — optional angle brackets) * --format string (space_type_param — go/cobra lowercase type word) *) let eq_opt_param = string "[=" *> take_while1 is_param_char <* char ']' >>| fun a -> Optional a let eq_man_param = char '=' *> take_while1 is_param_char >>| fun a -> Mandatory a (* space-separated ALL_CAPS param: e.g. " FILE", " TIME_STYLE". * peek ahead and check the first char is uppercase, then validate * the entire word is ALL_CAPS. prevents false positives where a * description word like "Do" or "Set" immediately follows the flag name. * digits are allowed (e.g. "SHA256") but lowercase chars disqualify. *) let space_upper_param = char ' ' *> peek_char_fail >>= fun c -> if is_upper_or_underscore c then take_while1 is_param_char >>= fun name -> if String.length name >= 1 && String.for_all (fun c -> is_upper_or_underscore c || c >= '0' && c <= '9') name then return (Mandatory name) else fail "not an all-caps param" else fail "not an uppercase param" (* angle-bracket param: e.g. "", "" *) let angle_param = char '<' *> take_while1 (fun c -> c <> '>') <* char '>' >>| fun name -> Mandatory name (* space + angle bracket param *) let space_angle_param = char ' ' *> angle_param (* optional angle bracket param: [] *) let opt_angle_param = char '[' *> char '<' *> take_while1 (fun c -> c <> '>') <* char '>' <* char ']' >>| fun name -> Optional name let space_opt_angle_param = char ' ' *> opt_angle_param (* go/cobra style: space + lowercase type word like "string", "list", "int". * capped at 10 chars to avoid consuming description words. * go's flag libraries commonly emit "--timeout duration" or "--name string" * where the type name is a short lowercase word. 
longer words are almost * certainly the start of a description, not a type annotation. *) let space_type_param = char ' ' *> peek_char_fail >>= fun c -> if c >= 'a' && c <= 'z' then take_while1 (fun c -> c >= 'a' && c <= 'z') >>= fun name -> if String.length name <= 10 then return (Mandatory name) else fail "too long for type param" else fail "not a lowercase type param" (* try each parameter format in order of specificity. the ordering matters: * eq_opt_param must come before eq_man_param because "[=WHEN]" would otherwise * partially match as "=WHEN" then fail on the trailing "]". similarly, * space_opt_angle_param before space_angle_param to catch "[]" before "". *) let param_parser = option None (choice [ eq_opt_param; eq_man_param; space_opt_angle_param; space_angle_param; space_upper_param; space_type_param ] >>| fun a -> Some a) (* switch parser — handles the various ways help text presents flag names. * formats handled (in order of attempt): * -a, --all (short + comma + long — gnu style) * -a --all (short + space + long — some tools omit the comma) * --all / -a (long + slash + short — rare but seen in some tools) * -a (short only) * --all (long only) * * the ordering is critical because Angstrom's choice commits to * the first parser that makes progress. short_switch consumes "-a", so the * combined parsers must be tried before the short-only parser. *) let switch_parser = choice [ (short_switch >>= fun s -> comma *> long_switch >>| fun l -> Both (s, l)); (short_switch >>= fun s -> char ' ' *> long_switch >>| fun l -> Both (s, l)); (long_switch >>= fun l -> inline_ws *> char '/' *> inline_ws *> short_switch >>| fun s -> Both (s, l)); (short_switch >>| fun s -> Short s); (long_switch >>| fun l -> Long l); ] (* --- description parsing with multi-line continuation --- * descriptions in help text often wrap across multiple lines. 
the convention * is that continuation lines are deeply indented (8+ spaces) and don't start * with '-' (which would indicate a new flag entry). we peek ahead to check * indentation without consuming, then decide whether to fold the line in. *) (* take the rest of the line as text (does not consume the newline itself) *) let rest_of_line = take_till (fun c -> c = '\n' || c = '\r') (* check if a line is a continuation line: deeply indented, doesn't start with '-'. * tabs count as 8 spaces to match typical terminal rendering. * the 8-space threshold was chosen empirically — most help formatters indent * descriptions at least this much, while flag lines are indented 2-4 spaces. *) let continuation_line = peek_string 1 >>= fun _ -> (* must start with significant whitespace (8+ spaces or tab) *) let count_indent s = let indent = ref 0 in let pos = ref 0 in while !pos < String.length s do (match s.[!pos] with | ' ' -> incr indent | '\t' -> indent := !indent + 8 | _ -> pos := String.length s); incr pos done; !indent in available >>= fun avail -> if avail = 0 then fail "eof" else (* peek ahead to see indentation level *) peek_string (min avail 80) >>= fun preview -> let indent = count_indent preview in let trimmed = String.trim preview in let starts_with_dash = String.length trimmed > 0 && trimmed.[0] = '-' in if indent >= 8 && not starts_with_dash then (* this is a continuation line — consume whitespace + text *) inline_ws *> rest_of_line <* eol else fail "not a continuation line" (* parse description text: first line (after switch+param) plus any continuation lines. * blank continuation lines are filtered out, and all lines are trimmed and joined * with spaces into a single string. 
*) let description = inline_ws *> rest_of_line <* eol >>= fun first_line -> many continuation_line >>| fun cont_lines -> let all = first_line :: cont_lines in let all = List.filter (fun s -> String.length (String.trim s) > 0) all in String.concat " " (List.map String.trim all) (* description that appears on a separate line below the flag. * this handles the clap (rust) "long" help format where flags and descriptions * are on separate lines: * --verbose * increase verbosity * here there's no inline description — just deeply-indented continuation lines. *) let description_below = many1 continuation_line >>| fun lines -> let lines = List.filter (fun s -> String.length (String.trim s) > 0) lines in String.concat " " (List.map String.trim lines) (* --- line classification for skipping --- * the parser needs to skip lines it doesn't understand (section headers, * blank lines, description paragraphs not attached to a flag, etc.) * without consuming lines that are flag entries. *) (* peek ahead to check if the current line looks like a flag entry. * an option line starts with whitespace then '-'. *) let at_option_line = peek_string 1 >>= fun _ -> available >>= fun avail -> if avail = 0 then fail "eof" else peek_string (min avail 40) >>= fun preview -> let s = String.trim preview in if String.length s > 0 && s.[0] = '-' then return () else fail "not an option line" (* skip a non-option line (section header, blank, description-only, etc.). * uses eol_strict (not eol) so it won't match at eof — this prevents the * parser from infinitely skipping at the end of input. if the line looks * like an option line (at_option_line succeeds), we deliberately fail so * that the entry parser gets a chance at it instead. *) let skip_non_option_line = (at_option_line *> fail "this is an option line") <|> (rest_of_line *> eol_strict *> return ()) (* --- entry parsing --- *) (* parse a single flag entry: leading whitespace, then switch+param, then description. 
* the description can appear on the same line (inline) or on the next line (below). * if there's no description at all, we accept an empty string. * the (eol *> description_below) branch handles the clap long-help format. *) let entry = inline_ws *> lift2 (fun (sw, param) desc -> { switch = sw; param; desc }) (lift2 (fun a b -> (a, b)) switch_parser param_parser) (description <|> (eol *> (description_below <|> return ""))) (* --- subcommand parsing --- * subcommand lines in help text follow the pattern: * " name description" * where the name and description are separated by 2+ spaces. * some tools also include argument placeholders between name and description: * " start UNIT... start one or more units" * " list [PATTERN] list matching units" *) let is_subcommand_char = function | 'a' .. 'z' | 'A' .. 'Z' | '0' .. '9' | '-' | '_' -> true | _ -> false (* skip argument placeholders like UNIT..., [PATTERN...|PID...], * that appear between the subcommand name and the description. * only consumes single-space gaps — the two-space gap before the * description is left for the main parser to use as the delimiter. * * this is a recursive (fix-point) parser that peeks ahead to distinguish * single-space argument gaps from the double-space description separator. * it accepts tokens that start with [, <, or are ALL_CAPS (with dots/pipes/ * commas for variadic syntax). 
*) let skip_arg_placeholders = fix (fun self -> (* peek ahead: single space followed by arg-like token *) available >>= fun avail -> if avail < 2 then return () else peek_string (min avail 2) >>= fun peek_two -> if String.length peek_two >= 2 && peek_two.[0] = ' ' && peek_two.[1] <> ' ' then (* single space — could be an arg placeholder *) let next = peek_two.[1] in if next = '[' || next = '<' || (next >= 'A' && next <= 'Z') then (* peek the full token to check if it's ALL_CAPS/brackets *) peek_string (min avail 80) >>= fun preview -> (* extract the token after the single space *) let tok_start = 1 in let token_end = ref tok_start in while !token_end < String.length preview && preview.[!token_end] <> ' ' && preview.[!token_end] <> '\n' && preview.[!token_end] <> '\r' do incr token_end done; let tok = String.sub preview tok_start (!token_end - tok_start) in (* accept as placeholder if it starts with [ or < or is ALL_CAPS (possibly with dots, pipes, dashes) *) let is_placeholder = tok.[0] = '[' || tok.[0] = '<' || String.for_all (fun c -> (c >= 'A' && c <= 'Z') || c = '_' || c = '-' || c = '.' || c = '|' || c = ',' || (c >= '0' && c <= '9') ) tok in if is_placeholder then advance (1 + String.length tok) *> self else return () else return () else return ()) (* parse a subcommand entry line. * requires: name >= 2 chars, followed by 2+ spaces, then description. * the name is lowercased for consistent lookup. * * if the description starts with "- " (a dash-space prefix), it's stripped. * some tools format their subcommand lists as: * " add - add a new item" * where the "- " is decorative, not part of the description. 
*) let subcommand_entry = inline_ws *> take_while1 is_subcommand_char >>= fun name -> if String.length name < 2 then fail "subcommand name too short" else skip_arg_placeholders *> char ' ' *> char ' ' *> inline_ws *> rest_of_line <* eol >>| fun desc -> { name = String.lowercase_ascii name; desc = let trimmed = String.trim desc in if String.length trimmed >= 2 && trimmed.[0] = '-' && trimmed.[1] = ' ' then String.trim (String.sub trimmed 2 (String.length trimmed - 2)) else trimmed } (* --- section header detection --- * section headers are critical for disambiguating subcommands from positional * arguments. lines like "commands:" introduce subcommand sections, while * "arguments:" or "positionals:" introduce argument sections where the same * name+description format should not be treated as subcommands. *) (* detect section names that introduce positional argument listings. * the check is case-insensitive and strips trailing colons. *) let is_arg_section s = let lc = String.lowercase_ascii (String.trim s) in let base = if String.ends_with ~suffix:":" lc then String.sub lc 0 (String.length lc - 1) |> String.trim else lc in base = "arguments" || base = "args" || base = "positionals" || base = "positional arguments" (* a section header: left-aligned (or lightly indented, <= 4 spaces) text * ending with ':', not starting with '-'. must be consumed before * subcommand_entry in the choice combinator, otherwise "commands:" would * be parsed as a subcommand named "commands" with description ":". * * returns a bool indicating whether this is an argument section (true) * or some other section (false). this drives the subcommand filtering logic * in help_parser — entries under argument sections are excluded from the * subcommand list. 
*) let section_header = available >>= fun avail -> if avail = 0 then fail "eof" else peek_string (min avail 80) >>= fun preview -> (* extract just the first line from the preview *) let first_line = match String.index_opt preview '\n' with | Some pos -> String.sub preview 0 pos | None -> preview in let trimmed = String.trim first_line in let len = String.length trimmed in let indent = let pos = ref 0 in while !pos < String.length first_line && (first_line.[!pos] = ' ' || first_line.[!pos] = '\t') do incr pos done; !pos in if len >= 2 && trimmed.[len - 1] = ':' && trimmed.[0] <> '-' && indent <= 4 then rest_of_line <* eol_strict >>| fun line -> is_arg_section line else fail "not a section header" (* --- top-level parser --- * the main help parser: walks through all lines, trying each line as one of: * 1. a flag entry (starts with whitespace + '-') * 2. a section header (left-aligned text ending with ':') * 3. a subcommand line (name + 2+ spaces + description) * 4. anything else — skip * * the choice ordering matters: entries are tried first (highest priority), * then section headers (must beat subcommand_entry to avoid misparse), * then subcommands, then skip as fallback. * * after collecting all items, two post-processing steps happen: * - subcommands under argument sections are excluded (tracked via * a running in_arg_sec boolean toggled by section headers) * - duplicate subcommand names are deduplicated, keeping the entry * with the longer description (heuristic: more info = better) * * positionals are not extracted here — they come from the usage line * parser (extract_usage_positionals) or CLI11's explicit section parser * (extract_cli11_positionals), applied later in parse_help. 
*) let help_parser = let open Angstrom in fix (fun _self -> let try_entry = entry >>| fun e -> `Entry e in let try_section = section_header >>| fun is_arg -> `Section is_arg in let try_subcommand = subcommand_entry >>| fun sc -> `Subcommand sc in let try_skip = skip_non_option_line >>| fun () -> `Skip in many (choice [ try_entry; try_section; try_subcommand; try_skip ]) >>| fun items -> let entries = List.filter_map (function `Entry e -> Some e | _ -> None) items in let subcommands = List.fold_left (fun (in_arg_sec, acc) item -> match item with | `Section is_arg -> (is_arg, acc) | `Subcommand sc when not in_arg_sec -> (in_arg_sec, sc :: acc) | _ -> (in_arg_sec, acc) ) (false, []) items |> snd |> List.rev |> List.fold_left (fun acc sc -> match List.assoc_opt sc.name acc with | Some prev when String.length prev.desc >= String.length sc.desc -> acc | _ -> (sc.name, sc) :: List.remove_assoc sc.name acc ) [] |> List.rev_map snd in { entries; subcommands; positionals = []; description = "" }) (* --- usage line parsing --- * usage lines look like: "usage: git add [OPTIONS] [--] [...]" * to extract positional arguments, we first need to skip past the command * name prefix ("git add") to reach the argument portion. * * skip_command_prefix walks word-by-word, treating each space-separated * token as part of the command name as long as it: * - is made of "word chars" (alphanumeric, hyphen, underscore, slash, dot) * - contains at least one lowercase letter (to distinguish from ALL_CAPS * positional names like FILE) * - doesn't start with [, <, (, {, or - (which indicate arguments, not * command name components) * * this is an imperative index-walking parser rather than using Angstrom, * because usage lines are a single string (not line-oriented) and the format * is too varied for clean combinator composition. 
*) let skip_command_prefix s = let len = String.length s in let pos = ref 0 in let skip_ws () = while !pos < len && (s.[!pos] = ' ' || s.[!pos] = '\t') do incr pos done in let is_word_char = function | 'a'..'z' | 'A'..'Z' | '0'..'9' | '-' | '_' | '/' | '.' -> true | _ -> false in let rec loop () = skip_ws (); if !pos >= len then () else if s.[!pos] = '[' || s.[!pos] = '<' || s.[!pos] = '(' || s.[!pos] = '{' || s.[!pos] = '-' then () else if is_word_char s.[!pos] then begin let start = !pos in while !pos < len && is_word_char s.[!pos] do incr pos done; let word = String.sub s start (!pos - start) in let has_lower = ref false in String.iter (fun c -> if c >= 'a' && c <= 'z' then has_lower := true) word; if not !has_lower then pos := start else loop () end in loop (); !pos (* parse the argument portion of a usage line into positional definitions. * handles these syntactic forms: * - mandatory positional * [file] - optional positional * FILE - mandatory positional (ALL_CAPS convention) * ... - variadic (also handles utf-8 ellipsis) * [file...] - optional variadic * curly-brace alternatives - skipped, not a positional * -flag - flags (skipped) * * certain ALL_CAPS names are skipped because they're not real positionals — * "OPTIONS", "FLAGS", etc. are section labels that sometimes appear in usage * lines for readability. * * deduplication at the end ensures we don't emit the same positional twice * (can happen when usage lines are reformatted or repeated). *) let parse_usage_args s = let len = String.length s in let pos = ref 0 in let positionals = ref [] in let skip_ws () = while !pos < len && (s.[!pos] = ' ' || s.[!pos] = '\t') do incr pos done in let is_pos_char c = (c >= 'A' && c <= 'Z') || c = '_' || c = '-' || (c >= '0' && c <= '9') in (* detect trailing dots or utf-8 ellipsis indicating variadic args *) let read_dots () = skip_ws (); if !pos + 2 < len && s.[!pos] = '.' && s.[!pos+1] = '.' && s.[!pos+2] = '.' 
then (pos := !pos + 3; true) else if !pos + 2 < len && s.[!pos] = '\xe2' && s.[!pos+1] = '\x80' && s.[!pos+2] = '\xa6' then (pos := !pos + 3; true) (* utf-8 ellipsis *) else false in (* names that are section labels, not actual positional arguments *) let is_skip name = let u = String.uppercase_ascii name in u = "OPTIONS" || u = "OPTION" || u = "FLAGS" || u = "FLAG" in (* validate that a name contains only alphanumeric, underscore, hyphen chars *) let is_clean_name name = String.length name >= 2 && String.for_all (fun c -> (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || (c >= '0' && c <= '9') || c = '_' || c = '-') name in let is_letter c = (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') in (* skip {A|c|d|...} alternative blocks — not positional arguments *) let skip_braces () = if !pos < len && s.[!pos] = '{' then begin let depth = ref 1 in incr pos; while !pos < len && !depth > 0 do if s.[!pos] = '{' then incr depth else if s.[!pos] = '}' then decr depth; incr pos done; ignore (read_dots ()); true end else false in while !pos < len do skip_ws (); if !pos >= len then () else if skip_braces () then () else match s.[!pos] with | '[' -> (* optional positional: [name] or [] or [name...] *) incr pos; let start = !pos in let depth = ref 1 in while !pos < len && !depth > 0 do if s.[!pos] = '[' then incr depth else if s.[!pos] = ']' then decr depth; incr pos done; let bracket_end = !pos - 1 in let inner = String.sub s start (max 0 (bracket_end - start)) |> String.trim in let inner, has_inner_dots = if String.ends_with ~suffix:"..." 
inner then (String.sub inner 0 (String.length inner - 3) |> String.trim, true) else (inner, false) in let variadic = has_inner_dots || read_dots () in if String.length inner > 0 && inner.[0] <> '-' && (is_letter inner.[0] || inner.[0] = '<') then begin let name = if inner.[0] = '<' then let e = try String.index inner '>' with Not_found -> String.length inner in String.sub inner 1 (e - 1) else inner in if is_clean_name name && not (is_skip name) then positionals := { pos_name = String.lowercase_ascii name; optional = true; variadic } :: !positionals end | '<' -> (* mandatory positional in angle brackets: *) incr pos; let start = !pos in while !pos < len && s.[!pos] <> '>' do incr pos done; let name = String.sub s start (!pos - start) in if !pos < len then incr pos; let variadic = read_dots () in if is_clean_name name && not (is_skip name) then positionals := { pos_name = String.lowercase_ascii name; optional = false; variadic } :: !positionals | '-' -> (* flag — skip entirely, not a positional *) while !pos < len && s.[!pos] <> ' ' && s.[!pos] <> '\t' && s.[!pos] <> ']' do incr pos done | c when c >= 'A' && c <= 'Z' -> (* ALL_CAPS positional name *) let start = !pos in while !pos < len && is_pos_char s.[!pos] do incr pos done; let name = String.sub s start (!pos - start) in let variadic = read_dots () in if String.length name >= 2 && String.for_all (fun c -> (c >= 'A' && c <= 'Z') || c = '_' || c = '-' || (c >= '0' && c <= '9') ) name && not (is_skip name) then positionals := { pos_name = String.lowercase_ascii name; optional = false; variadic } :: !positionals | _ -> incr pos done; (* deduplicate positionals by name, keeping the first occurrence *) List.rev !positionals |> List.fold_left (fun (seen, acc) p -> if List.mem p.pos_name seen then (seen, acc) else (p.pos_name :: seen, p :: acc) ) ([], []) |> snd |> List.rev (* find the "usage:" line in the help text and extract positionals from it. 
* searches line-by-line for a line starting with "usage:" (case-insensitive). * handles both inline usage ("usage: cmd [OPTIONS] FILE") and the clap style * where the actual usage is on the next line: * USAGE: * cmd [OPTIONS] FILE * * also handles the bare "usage" header (no colon) followed by a next line. *) let extract_usage_positionals text = let lines = String.split_on_char '\n' text in let lines_arr = Array.of_list lines in let len = Array.length lines_arr in (* search through lines for the first usage header and return the usage content *) let find_usage_line () = let check_line idx = let trimmed = String.trim lines_arr.(idx) in let trimmed_len = String.length trimmed in let lc = String.lowercase_ascii trimmed in if trimmed_len >= 6 && String.sub lc 0 6 = "usage:" then begin let after = String.sub trimmed 6 (trimmed_len - 6) |> String.trim in if String.length after > 0 then Some after else if idx + 1 < len then (* clap style: USAGE:\n cmd [OPTIONS] PATTERN *) let next = String.trim lines_arr.(idx + 1) in if String.length next > 0 then Some next else None else None end else if lc = "usage" then begin if idx + 1 < len then let next = String.trim lines_arr.(idx + 1) in if String.length next > 0 then Some next else None else None end else None in (* use List.find_map over the index range to find the first matching line *) List.find_map check_line (List.init len Fun.id) in match find_usage_line () with | None -> [] | Some usage -> let cmd_end = skip_command_prefix usage in let args = String.sub usage cmd_end (String.length usage - cmd_end) in parse_usage_args args (* extract positionals from CLI11's explicit "POSITIONALS:" section. * CLI11 (a c++ arg parsing library) emits a dedicated section: * Positionals: * name TEXT description here * count INT another description * * this is preferred over usage-line extraction when present because it * provides more accurate type information. 
the parser looks for the * section header, then reads indented lines until a blank or unindented * line signals the end. type words (TEXT, INT, FLOAT, etc.) between the * name and description are skipped. *) let extract_cli11_positionals text = let lines = String.split_on_char '\n' text in (* parse a single indented positional line into a positional record *) let parse_one s = let len = String.length s in let pos = ref 0 in let is_name_char c = (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9') || c = '_' || c = '-' in while !pos < len && is_name_char s.[!pos] do incr pos done; if !pos < 2 then None else let name = String.sub s 0 !pos in while !pos < len && (s.[!pos] = ' ' || s.[!pos] = '\t') do incr pos done; (* skip type word: TEXT, INT, FLOAT, ENUM, BOOLEAN, etc. *) while !pos < len && s.[!pos] >= 'A' && s.[!pos] <= 'Z' do incr pos done; while !pos < len && (s.[!pos] = ' ' || s.[!pos] = '\t') do incr pos done; let variadic = !pos + 2 < len && s.[!pos] = '.' && s.[!pos+1] = '.' && s.[!pos+2] = '.' in Some { pos_name = String.lowercase_ascii name; optional = false; variadic } in (* parse consecutive indented lines under the section header *) let rec parse_lines lines acc = match lines with | [] -> List.rev acc | line :: rest -> let len = String.length line in if len = 0 || (line.[0] <> ' ' && line.[0] <> '\t') then List.rev acc else let trimmed = String.trim line in if String.length trimmed = 0 then List.rev acc else match parse_one trimmed with | Some p -> parse_lines rest (p :: acc) | None -> parse_lines rest acc in (* scan lines for the positionals section header, then parse the body *) let rec find_section = function | [] -> [] | line :: rest -> let trimmed = String.trim line in if trimmed = "POSITIONALS:" || trimmed = "Positionals:" then parse_lines rest [] else find_section rest in find_section lines (* top-level entry point: parse a --help text string into a help_result. * steps: * 1. strip ansi escapes (colors, hyperlinks, etc.) 
* 2. run the Angstrom help_parser for flags and subcommands * 3. extract positionals via CLI11 format (preferred) or usage line (fallback) * 4. merge positionals into the result * uses Angstrom's prefix-consume mode — we don't need to parse every byte. *) let parse_help txt = let clean = strip_ansi txt in match Angstrom.parse_string ~consume:Consume.Prefix help_parser clean with | Ok result -> let cli11 = extract_cli11_positionals clean in let usage = extract_usage_positionals clean in let positionals = if cli11 <> [] then cli11 else usage in Ok { result with positionals } | Error msg -> Error msg