(* parser.ml — parse --help output into structured flag/subcommand/positional data. * * this module is the core of inshellah's help-text understanding. it takes the * raw text that a cli tool prints when you run `cmd --help` and extracts: * - flag entries (short/long switches with optional parameters and descriptions) * - subcommand listings (name + description pairs) * - positional arguments (from usage lines) * * the parser is built on angstrom (a monadic parser combinator library) for the * structured flag/subcommand extraction, with hand-rolled imperative parsers for * usage-line positional extraction (where the format is too varied for clean * combinator composition). * * key design decisions: * - the angstrom parser runs in prefix-consume mode — it doesn't need to parse * the entire input, just extract what it can recognize. unrecognized lines are * skipped via skip_non_option_line. * - multi-line descriptions are handled via indentation-based continuation: * lines indented 8+ spaces that don't start with '-' are folded into the * previous entry's description. * - subcommand detection uses a heuristic: lines with a name followed by 2+ * spaces then a description, where the name is at least 2 chars. section * headers (like "arguments:") toggle whether name-description pairs are * treated as subcommands or positionals. * - positional extraction has two paths: usage-line parsing (the common case) * and cli11's explicit "positionals:" section format. *) open Angstrom (* strip ansi escape sequences and osc hyperlinks from --help output. * many modern cli tools emit colored/styled output even when piped, * so we need to clean this before parsing. handles: * - csi sequences (esc [ ... final_byte) — colors, cursor movement, etc. * - osc sequences (esc ] ... bel/st) — hyperlinks, window titles, etc. 
* - other two-byte esc+char sequences *)

(* [strip_ansi s] returns a copy of [s] with terminal escape sequences removed.
 *
 * recognised forms, all introduced by the ESC byte 0x1b:
 *   ESC [ ... final   CSI: skipped up to and including the first byte in the
 *                     range '@'..'~'
 *   ESC ] ... BEL     OSC: skipped up to and including BEL (0x07) or the
 *   ESC ] ... ESC \   two-byte string terminator
 *   ESC x             any other pair: both bytes are dropped
 *
 * a sequence cut short by the end of the string is dropped as far as it goes.
 * that includes a lone ESC in the very last position, which used to be copied
 * to the output unchanged. every other byte is copied verbatim. *)
let strip_ansi s =
  let len = String.length s in
  let buf = Buffer.create len in
  (* index just past the CSI sequence whose parameter bytes start at [i] *)
  let rec skip_csi i =
    if i >= len then i
    else if s.[i] >= '@' && s.[i] <= '~' then i + 1
    else skip_csi (i + 1)
  in
  (* index just past the OSC sequence whose payload starts at [i] *)
  let rec skip_osc i =
    if i >= len then i
    else if s.[i] = '\x07' then i + 1
    else if s.[i] = '\x1b' && i + 1 < len && s.[i + 1] = '\\' then i + 2
    else skip_osc (i + 1)
  in
  let rec copy_from i =
    if i < len then begin
      if s.[i] <> '\x1b' then begin
        Buffer.add_char buf s.[i];
        copy_from (i + 1)
      end
      else if i + 1 >= len then
        (* truncated escape at the very end: nothing left to copy *)
        ()
      else
        match s.[i + 1] with
        | '[' -> copy_from (skip_csi (i + 2))
        | ']' -> copy_from (skip_osc (i + 2))
        | _ -> copy_from (i + 2)
    end
  in
  copy_from 0;
  Buffer.contents buf

(* --- character class predicates ---
 * small ascii-only classifiers shared by the angstrom parsers below. *)

(* horizontal whitespace: space or tab (never a line break) *)
let is_whitespace c = c = ' ' || c = '\t'

(* ascii letters and digits *)
let is_alphanumeric = function
  | 'a' .. 'z' | 'A' .. 'Z' | '0' .. '9' -> true
  | _ -> false

(* characters allowed inside parameter names like FILE, output-dir, etc. *)
let is_param_char c = is_alphanumeric c || c = '_' || c = '-'

(* used to detect all-caps parameter names like FILE, TIME_STYLE *)
let is_upper_or_underscore c = (c >= 'A' && c <= 'Z') || c = '_'

(* characters allowed in long flag names (--foo-bar, --enable-feature2);
 * unlike is_param_char this excludes the underscore *)
let is_long_char c = is_alphanumeric c || c = '-'

(* --- core types ---
 * these types represent the structured output of parsing a help text.
 * they are shared across the entire codebase (nushell codegen, store, manpage parser).
 *
 * switch: a flag can be short-only (-v), long-only (--verbose), or both (-v, --verbose).
* the both variant keeps the pair together so nushell can emit "--verbose(-v)".
 *
 * param: flags can take mandatory (--output FILE) or optional (--color[=WHEN]) values.
 *
 * entry: one complete flag definition — its switch form, optional parameter, and
 * the description text (potentially multi-line, already joined).
 *
 * help_result: the complete parsed output for a single command. *)
type switch = Short of char | Long of string | Both of char * string

type param = Mandatory of string | Optional of string

type entry = { switch : switch; param : param option; desc : string }

type subcommand = { name : string; desc : string }

type positional = { pos_name : string; optional : bool; variadic : bool }

type help_result = {
  entries : entry list;
  subcommands : subcommand list;
  positionals : positional list;
  description : string;
}

(* --- low-level angstrom combinators ---
 * building blocks reused by every parser further down. *)

(* skip spaces and tabs; never crosses a line break and never fails *)
let inline_ws = skip_while is_whitespace

(* permissive end of line: a line break, or the end of the input *)
let eol = end_of_line <|> end_of_input

(* strict end of line: a real line break must be consumed, so this can never
 * succeed at end of input without making progress *)
let eol_strict = end_of_line

(* --- switch and parameter parsers ---
 * the flag-name portion of an option line, e.g. "-v" or "--verbose". *)

(* a single dash followed by one letter or digit; yields that character *)
let short_switch = char '-' *> satisfy is_alphanumeric

(* two dashes followed by one or more long-name characters; yields the name
 * without the dashes *)
let long_switch = string "--" *> take_while1 is_long_char

(* the comma between a short and a long switch, plus any blanks after it *)
let comma = char ',' *> inline_ws

(* parameter parsers — these handle the various syntaxes tools use to indicate
 * that a flag takes a value.
the formats are surprisingly diverse:
 *   --output=FILE     (eq_man_param — mandatory, common in gnu tools)
 *   --color[=WHEN]    (eq_opt_param — optional with = syntax)
 *   --depth DEPTH     (space_upper_param — space-separated ALL_CAPS)
 *   --file <PATH>     (space_angle_param — angle brackets)
 *   --file [<PATH>]   (space_opt_angle_param — optional angle brackets)
 *   --format string   (space_type_param — go/cobra lowercase type word) *)

(* "[=NAME]" directly after the switch: an optional value *)
let eq_opt_param =
  let bracketed = string "[=" *> take_while1 is_param_char <* char ']' in
  bracketed >>| fun name -> Optional name

(* "=NAME" directly after the switch: a mandatory value *)
let eq_man_param =
  let assigned = char '=' *> take_while1 is_param_char in
  assigned >>| fun name -> Mandatory name

(* space-separated ALL_CAPS param: e.g. " FILE", " TIME_STYLE".
 * the character after the single space is peeked first and must be an
 * uppercase letter or underscore; the whole word (read with is_param_char)
 * must then consist only of uppercase letters, underscores and digits.
 * this keeps capitalised description words such as "Do" or "Set" from being
 * taken for a parameter: digits are fine ("SHA256"), but a lowercase letter
 * or a dash anywhere in the word disqualifies it. *)
let space_upper_param =
  let is_caps_char c = is_upper_or_underscore c || (c >= '0' && c <= '9') in
  char ' ' *> peek_char_fail >>= fun first ->
  if not (is_upper_or_underscore first) then fail "not an uppercase param"
  else
    (* take_while1 guarantees a non-empty word *)
    take_while1 is_param_char >>= fun word ->
    if String.for_all is_caps_char word then return (Mandatory word)
    else fail "not an all-caps param"

(* the text between '<' and the next '>', which must be non-empty *)
let angle_name = char '<' *> take_while1 (fun c -> c <> '>') <* char '>'

(* angle-bracket param: e.g. "<file>", "<path>" *)
let angle_param = angle_name >>| fun name -> Mandatory name

(* space + angle-bracket param *)
let space_angle_param = char ' ' *> angle_param

(* optional angle-bracket param: "[<file>]" *)
let opt_angle_param =
  char '[' *> angle_name <* char ']' >>| fun name -> Optional name

(* space + optional angle-bracket param *)
let space_opt_angle_param = char ' ' *> opt_angle_param

(* go/cobra style: space + lowercase type word like "string", "list", "int".
 * peculiarity: capped at 10 chars to avoid consuming description words.
* go's flag libraries commonly emit "--timeout duration" or "--name string"
 * where the type name is a short lowercase word. longer words are almost
 * certainly the start of a description, not a type annotation. *)
let space_type_param =
  let is_lower c = c >= 'a' && c <= 'z' in
  char ' ' *> peek_char_fail >>= fun first ->
  if not (is_lower first) then fail "not a lowercase type param"
  else
    take_while1 is_lower >>= fun word ->
    if String.length word > 10 then fail "too long for type param"
    else return (Mandatory word)

(* try each parameter format in turn and yield [Some param] for the first one
 * that succeeds, or [None] (consuming nothing) when none of them does.
 * the list runs from the most specific syntax to the loosest heuristic: the
 * "=" and bracket forms first, the bare lowercase type word last. a failed
 * alternative leaves the input untouched for the next one. *)
let param_parser =
  let any_param =
    choice
      [
        eq_opt_param;
        eq_man_param;
        space_opt_angle_param;
        space_angle_param;
        space_upper_param;
        space_type_param;
      ]
  in
  option None (any_param >>| Option.some)

(* switch parser — handles the various ways help text presents flag names.
 * formats handled (in order of attempt):
 *   -a, --all   (short + comma + long — gnu style)
 *   -a --all    (short + space + long — some tools omit the comma)
 *   --all / -a  (long + slash + short — rare but seen in some tools)
 *   -a          (short only)
 *   --all       (long only)
 *
 * peculiarity: the ordering is critical because choice returns the result of
 * the first alternative that succeeds. short_switch on its own already
 * succeeds on the "-a" of "-a, --all", so the combined parsers must be tried
 * before the short-only parser.
*)
let switch_parser =
  let both short long = Both (short, long) in
  choice
    [
      (* -a, --all *)
      lift2 both short_switch (comma *> long_switch);
      (* -a --all *)
      lift2 both short_switch (char ' ' *> long_switch);
      (* --all / -a *)
      lift2
        (fun long short -> both short long)
        long_switch
        (inline_ws *> char '/' *> inline_ws *> short_switch);
      (* -a *)
      short_switch >>| (fun short -> Short short);
      (* --all *)
      long_switch >>| (fun long -> Long long);
    ]

(* --- description parsing with multi-line continuation ---
 * descriptions in help text often wrap across multiple lines. the convention
 * is that continuation lines are deeply indented (8+ spaces) and don't start
 * with '-' (which would indicate a new flag entry). we peek ahead to check
 * indentation without consuming, then decide whether to fold the line in. *)

(* everything up to, but not including, the next '\n' or '\r'.
 * may be empty; the line break itself is left in the input. *)
let rest_of_line = take_till (function '\n' | '\r' -> true | _ -> false)

(* check if a line is a continuation line: deeply indented, doesn't start with '-'.
 * peculiarity: we count tabs as 8 spaces to match typical terminal rendering.
 * the 8-space threshold was chosen empirically — most help formatters indent
 * descriptions at least this much, while flag lines are indented 2-4 spaces.
*)
let continuation_line =
  (* width of the leading blanks of [text]: a space counts 1, a tab counts 8;
   * counting stops at the first other character *)
  let indent_width text =
    let rec go i width =
      if i >= String.length text then width
      else
        match text.[i] with
        | ' ' -> go (i + 1) (width + 1)
        | '\t' -> go (i + 1) (width + 8)
        | _ -> width
    in
    go 0 0
  in
  (* peek_string 1 fails at end of input, so nothing is attempted there *)
  peek_string 1 *> available >>= fun avail ->
  if avail = 0 then fail "eof"
  else
    (* look at up to 80 bytes without consuming anything *)
    peek_string (min avail 80) >>= fun preview ->
    let visible = String.trim preview in
    let dash_first = visible <> "" && visible.[0] = '-' in
    if indent_width preview < 8 || dash_first then fail "not a continuation line"
    else
      (* accepted: consume the indentation, yield the text, eat the line end *)
      inline_ws *> rest_of_line <* eol

(* trim every line, drop the ones that are empty afterwards, and join what is
 * left with single spaces *)
let join_nonblank_lines lines =
  lines
  |> List.map String.trim
  |> List.filter (fun line -> line <> "")
  |> String.concat " "

(* parse description text: the remainder of the current line (after
 * switch+param) followed by any number of continuation lines, folded into a
 * single space-joined string. the result is empty when there is no text. *)
let description =
  lift2
    (fun first_line cont_lines -> join_nonblank_lines (first_line :: cont_lines))
    (inline_ws *> rest_of_line <* eol)
    (many continuation_line)

(* description that appears on a separate line below the flag.
 * this handles the clap (rust) "long" help format where flags and descriptions
 * are on separate lines:
 *     --verbose
 *             increase verbosity
 * at least one continuation line is required. *)
let description_below = many1 continuation_line >>| join_nonblank_lines

(* --- line classification for skipping ---
 * the parser needs to skip lines it doesn't understand (section headers,
 * blank lines, description paragraphs not attached to a flag, etc.)
 * without consuming lines that ARE flag entries.
*)

(* [at_option_line] succeeds, consuming nothing, when the first non-blank
 * character of the CURRENT line is '-'. it fails otherwise, and at end of
 * input. only the first 40 bytes are inspected.
 *
 * the look-ahead is cut at the first line break before trimming. trimming the
 * raw look-ahead also removed line breaks, so a blank line sitting directly in
 * front of a flag line was itself reported as an option line. *)
let at_option_line =
  available >>= fun avail ->
  if avail = 0 then fail "eof"
  else
    peek_string (min avail 40) >>= fun preview ->
    let current_line =
      match String.index_opt preview '\n' with
      | Some nl -> String.sub preview 0 nl
      | None -> preview
    in
    let visible = String.trim current_line in
    if visible <> "" && visible.[0] = '-' then return ()
    else fail "not an option line"

(* skip one line (section header, blank, description-only, etc.).
 * uses eol_strict (not eol) so it cannot succeed at end of input without
 * consuming anything — a parser that succeeds without progress would make the
 * enclosing many loop forever.
 *
 * NOTE(review): the first alternative can only fail (either at_option_line
 * fails, or the explicit fail fires), after which <|> rewinds and runs the
 * second alternative. the guard therefore never vetoes a skip: a line that
 * starts with '-' is skipped like any other line when this parser is reached.
 * the expression is kept as written so that behaviour stays exactly the same. *)
let skip_non_option_line =
  (at_option_line *> fail "this is an option line")
  <|> (rest_of_line *> eol_strict *> return ())

(* --- entry parsing --- *)

(* parse a single flag entry: leading blanks, then switch + optional param,
 * then the description. the description is taken from the same line (plus
 * continuation lines); failing that, the line must end right here and the
 * description is read from the deeply indented lines below (clap long-help
 * format), or is the empty string when there are none. *)
let entry =
  let desc_parser =
    description <|> (eol *> (description_below <|> return ""))
  in
  inline_ws
  *> lift3
       (fun switch param desc -> { switch; param; desc })
       switch_parser param_parser desc_parser

(* --- subcommand parsing ---
 * subcommand lines in help text follow the pattern:
 *   "  name    description"
 * where the name and description are separated by 2+ spaces.
 * some tools also include argument placeholders between name and description:
 *   "  start UNIT...    start one or more units"
 *   "  list [PATTERN]   list matching units" *)

(* characters allowed in a subcommand name *)
let is_subcommand_char = function
  | 'A' .. 'Z' | 'a' .. 'z' | '0' .. '9' | '_' | '-' -> true
  | _ -> false

(* skip argument placeholders like UNIT..., [PATTERN...|PID...] or <NAME>
 * that appear between the subcommand name and the description.
* only consumes single-space gaps — the two-space gap before the
 * description is left for the main parser to use as the delimiter.
 *
 * peculiarity: this is a recursive (fix-point) parser that peeks ahead
 * to distinguish single-space argument gaps from the double-space
 * description separator. it accepts tokens that start with [, <, or
 * are ALL_CAPS (with dots/pipes/commas for variadic syntax). *)
let skip_arg_placeholders =
  (* characters allowed in an ALL_CAPS placeholder token *)
  let is_caps_token_char = function
    | 'A' .. 'Z' | '0' .. '9' | '_' | '-' | '.' | '|' | ',' -> true
    | _ -> false
  in
  (* first characters that can start a placeholder at all *)
  let may_open_placeholder c = c = '[' || c = '<' || (c >= 'A' && c <= 'Z') in
  (* the token that follows the single leading space of [preview]; it ends at
   * the next space or line break, or at the end of the look-ahead *)
  let token_after_space preview =
    let n = String.length preview in
    let rec token_end i =
      if i < n && preview.[i] <> ' ' && preview.[i] <> '\n' && preview.[i] <> '\r'
      then token_end (i + 1)
      else i
    in
    String.sub preview 1 (token_end 1 - 1)
  in
  fix (fun self ->
      available >>= fun avail ->
      if avail < 2 then return ()
      else
        peek_string 2 >>= fun head ->
        (* exactly one space followed by something that may open a placeholder *)
        if head.[0] <> ' ' || not (may_open_placeholder head.[1]) then return ()
        else
          peek_string (min avail 80) >>= fun preview ->
          (* never empty: its first character is the one checked just above *)
          let tok = token_after_space preview in
          let is_placeholder =
            match tok.[0] with
            | '[' | '<' -> true
            | _ -> String.for_all is_caps_token_char tok
          in
          if is_placeholder then
            (* consume the space and the token, then look for another one *)
            advance (1 + String.length tok) *> self
          else return ())

(* parse a subcommand entry line.
 * requires: name >= 2 chars, followed by 2+ spaces, then description.
 * the name is lowercased for consistent lookup.
 *
 * peculiarity: if the description starts with "- " (a dash-space prefix),
 * it's stripped.
some tools format their subcommand lists as:
 *   "  add   - add a new item"
 * where the "- " is decorative, not part of the description. *)
let subcommand_entry =
  (* trimmed description, minus a decorative leading "- " if there is one *)
  let clean_description raw =
    let text = String.trim raw in
    if String.starts_with ~prefix:"- " text then
      String.trim (String.sub text 2 (String.length text - 2))
    else text
  in
  inline_ws *> take_while1 is_subcommand_char >>= fun name ->
  if String.length name < 2 then fail "subcommand name too short"
  else
    (* optional placeholders, then the mandatory two-space separator *)
    skip_arg_placeholders *> string "  " *> inline_ws *> rest_of_line <* eol
    >>| fun raw ->
    { name = String.lowercase_ascii name; desc = clean_description raw }

(* --- section header detection ---
 * section headers are critical for disambiguating subcommands from positional
 * arguments. lines like "commands:" introduce subcommand sections, while
 * "arguments:" or "positionals:" introduce argument sections where the same
 * name+description format should NOT be treated as subcommands. *)

(* [is_arg_section s] is true when [s] names a section that lists positional
 * arguments. the comparison ignores case, surrounding whitespace and one
 * trailing colon. *)
let is_arg_section s =
  let lowered = s |> String.trim |> String.lowercase_ascii in
  let base =
    if String.ends_with ~suffix:":" lowered then
      String.trim (String.sub lowered 0 (String.length lowered - 1))
    else lowered
  in
  List.mem base [ "arguments"; "args"; "positionals"; "positional arguments" ]

(* a section header: left-aligned (or lightly indented, <= 4 spaces) text
 * ending with ':', not starting with '-'. must be consumed BEFORE
 * subcommand_entry in the choice combinator, otherwise "commands:" would
 * be parsed as a subcommand named "commands" with description ":".
 *
 * returns a bool indicating whether this is an argument section (true)
 * or some other section (false). this drives the subcommand filtering logic
 * in help_parser — entries under argument sections are excluded from the
 * subcommand list.
*)

(* implementation notes:
 * - the decision is made on a look-ahead of at most 80 bytes, and only when
 *   that look-ahead contains the line break that ends the current line.
 *   without this check a line longer than the window was judged by whatever
 *   byte happened to sit at the cut, so a long prose line with ':' in column
 *   80 was swallowed as a header. a line with no line break at all could
 *   never be accepted anyway, because eol_strict needs one.
 * - indentation is counted in characters: a tab counts as one.
 * - on success the whole line, including its line break, is consumed. *)
let section_header =
  available >>= fun avail ->
  if avail = 0 then fail "eof"
  else
    peek_string (min avail 80) >>= fun preview ->
    match String.index_opt preview '\n' with
    | None -> fail "not a section header"
    | Some nl ->
      let first_line = String.sub preview 0 nl in
      let text = String.trim first_line in
      let len = String.length text in
      let rec leading_blanks i =
        if i < String.length first_line
           && (first_line.[i] = ' ' || first_line.[i] = '\t')
        then leading_blanks (i + 1)
        else i
      in
      let indent = leading_blanks 0 in
      if len >= 2 && text.[len - 1] = ':' && text.[0] <> '-' && indent <= 4 then
        rest_of_line <* eol_strict >>| is_arg_section
      else fail "not a section header"

(* --- top-level parser ---
 * the main help parser: walks through all lines, trying each line as one of:
 *   1. a flag entry (starts with whitespace + '-')
 *   2. a section header (left-aligned text ending with ':')
 *   3. a subcommand line (name + 2+ spaces + description)
 *   4. anything else → skip
 *
 * the choice ordering matters: entries are tried first (highest priority),
 * then section headers (must beat subcommand_entry to avoid misparse),
 * then subcommands, then skip as fallback.
 *
 * after collecting all items, two post-processing steps happen:
 *   - subcommands under argument sections are excluded (tracked via
 *     a running in_arg_sec boolean toggled by section headers)
 *   - duplicate subcommand names are deduplicated, keeping the entry
 *     with the longer description (heuristic: more info = better)
 *
 * peculiarity: positionals are NOT extracted here — they come from
 * the usage line parser (extract_usage_positionals) or cli11's
 * explicit section parser (extract_cli11_positionals), applied later
 * in parse_help.
*)
let help_parser =
  (* classify one line; the order of the alternatives is significant *)
  let line_item =
    choice
      [
        (entry >>| fun e -> `Entry e);
        (section_header >>| fun is_arg -> `Section is_arg);
        (subcommand_entry >>| fun sc -> `Subcommand sc);
        (skip_non_option_line >>| fun () -> `Skip);
      ]
  in
  (* flag entries, in input order *)
  let flag_entries items =
    List.filter_map (function `Entry e -> Some e | _ -> None) items
  in
  (* subcommands in input order, leaving out those listed under an argument
   * section; every section header resets the in_arg_sec state *)
  let rec outside_arg_sections in_arg_sec kept = function
    | [] -> List.rev kept
    | `Section is_arg :: rest -> outside_arg_sections is_arg kept rest
    | `Subcommand sc :: rest when not in_arg_sec ->
      outside_arg_sections in_arg_sec (sc :: kept) rest
    | _ :: rest -> outside_arg_sections in_arg_sec kept rest
  in
  (* one subcommand per name. a later duplicate replaces an earlier one only
   * when its description is strictly longer, and then takes the later
   * position in the result. *)
  let dedup_keep_longest subs =
    let step kept (sc : subcommand) =
      match List.assoc_opt sc.name kept with
      | Some prev when String.length prev.desc >= String.length sc.desc -> kept
      | Some _ | None -> (sc.name, sc) :: List.remove_assoc sc.name kept
    in
    List.fold_left step [] subs |> List.rev_map snd
  in
  many line_item >>| fun items ->
  {
    entries = flag_entries items;
    subcommands = outside_arg_sections false [] items |> dedup_keep_longest;
    positionals = [];
    description = "";
  }

(* --- usage line parsing ---
 * usage lines look like: "usage: git add [OPTIONS] [--] [<pathspec>...]"
 * to extract positional arguments, we first need to skip past the command
 * name prefix ("git add") to reach the argument portion.
 *
 * skip_command_prefix walks word-by-word, treating each space-separated
 * token as part of the command name as long as it:
 *   - is made of "word chars" (alphanumeric, hyphen, underscore, slash, dot)
 *   - contains at least one lowercase letter (to distinguish from ALL_CAPS
 *     positional names like FILE)
 *   - doesn't start with [, <, (, {, or - (which indicate arguments, not
 *     command name components)
 *
 * peculiarity: this is an imperative index-walking parser rather than using
 * angstrom, because usage lines are a single string (not line-oriented)
 * and the format is too varied for clean combinator composition.
*)

(* [skip_command_prefix s] returns the index in [s] at which the argument
 * portion starts. leading words count as command-name components while they
 * meet the conditions listed above. the returned index points at the first
 * token that does not (after any blanks before it), or is the length of [s]
 * when every word qualified. *)
let skip_command_prefix s =
  let len = String.length s in
  let is_blank c = c = ' ' || c = '\t' in
  let is_lower c = c >= 'a' && c <= 'z' in
  let is_word_char = function
    | 'a' .. 'z' | 'A' .. 'Z' | '0' .. '9' | '-' | '_' | '/' | '.' -> true
    | _ -> false
  in
  (* first index at or after [i] whose character does not satisfy [p] *)
  let rec scan p i = if i < len && p s.[i] then scan p (i + 1) else i in
  let rec walk i =
    let i = scan is_blank i in
    if i >= len then i
    else
      match s.[i] with
      | '[' | '<' | '(' | '{' | '-' -> i
      | c when is_word_char c ->
        let stop = scan is_word_char i in
        let word = String.sub s i (stop - i) in
        (* a word without any lowercase letter is taken for a positional *)
        if String.exists is_lower word then walk stop else i
      | _ -> i
  in
  walk 0

(* parse the argument portion of a usage line into positional definitions.
 * handles these syntactic forms:
 *   <file>      - mandatory positional
 *   [file]      - optional positional
 *   FILE        - mandatory positional (ALL_CAPS convention)
 *   <file>...   - variadic (three ascii dots or the utf-8 ellipsis)
 *   [file...]   - optional variadic (same two spellings inside the brackets)
 *   {a|b|c}     - curly-brace alternatives: skipped, not a positional
 *   -flag       - flags (skipped)
 *
 * peculiarity: certain names are skipped because they're not real
 * positionals — "OPTIONS", "OPTION", "FLAGS" and "FLAG" (in any letter case)
 * are labels that sometimes appear in usage lines for readability.
 *
 * names are lowercased, and must be at least two characters long. the result
 * is in order of appearance; when a name occurs more than once only its first
 * occurrence is kept.
 *
 * two fixes over the previous version:
 *   - the utf-8 ellipsis is now recognised inside brackets too ("[file…]"),
 *     matching how it was already handled after a token;
 *   - an opening bracket that is never closed no longer loses the last
 *     character of its content. *)
let parse_usage_args s =
  let len = String.length s in
  let i = ref 0 in
  let results = ref [] in
  let utf8_ellipsis = "\xe2\x80\xa6" in
  let add ~optional ~variadic name =
    results :=
      { pos_name = String.lowercase_ascii name; optional; variadic } :: !results
  in
  let skip_ws () =
    while !i < len && (s.[!i] = ' ' || s.[!i] = '\t') do
      incr i
    done
  in
  (* does [lit] occur at the current position? *)
  let at_literal lit =
    let n = String.length lit in
    !i + n <= len && String.sub s !i n = lit
  in
  (* skip blanks, then consume a variadic marker if one follows.
   * both spellings are three bytes long. *)
  let read_dots () =
    skip_ws ();
    if at_literal "..." || at_literal utf8_ellipsis then begin
      i := !i + 3;
      true
    end
    else false
  in
  (* split a trailing variadic marker off the content of a bracket *)
  let strip_trailing_dots text =
    let without suffix =
      String.sub text 0 (String.length text - String.length suffix)
      |> String.trim
    in
    if String.ends_with ~suffix:"..." text then (without "...", true)
    else if String.ends_with ~suffix:utf8_ellipsis text then
      (without utf8_ellipsis, true)
    else (text, false)
  in
  let is_skip name =
    List.mem (String.uppercase_ascii name) [ "OPTIONS"; "OPTION"; "FLAGS"; "FLAG" ]
  in
  let is_digit c = c >= '0' && c <= '9' in
  let is_upper c = c >= 'A' && c <= 'Z' in
  let is_letter c = is_upper c || (c >= 'a' && c <= 'z') in
  let is_pos_char c = is_upper c || is_digit c || c = '_' || c = '-' in
  let is_clean_name name =
    String.length name >= 2
    && String.for_all
         (fun c -> is_letter c || is_digit c || c = '_' || c = '-')
         name
  in
  (* skip a {a|b|...} alternative block, nested braces included, plus a
   * variadic marker after it; true when a block was skipped *)
  let skip_braces () =
    if !i < len && s.[!i] = '{' then begin
      let depth = ref 1 in
      incr i;
      while !i < len && !depth > 0 do
        (match s.[!i] with
         | '{' -> incr depth
         | '}' -> decr depth
         | _ -> ());
        incr i
      done;
      ignore (read_dots ());
      true
    end
    else false
  in
  while !i < len do
    skip_ws ();
    if !i < len && not (skip_braces ()) then
      match s.[!i] with
      | '[' ->
        incr i;
        let start = !i in
        let depth = ref 1 in
        while !i < len && !depth > 0 do
          (match s.[!i] with
           | '[' -> incr depth
           | ']' -> decr depth
           | _ -> ());
          incr i
        done;
        (* with a closing bracket the cursor sits one past it; without one the
         * content simply runs to the end of the string *)
        let stop = if !depth = 0 then !i - 1 else !i in
        let inner, inner_dots =
          String.sub s start (stop - start) |> String.trim |> strip_trailing_dots
        in
        (* dots after the bracket are only looked for when none were inside *)
        let variadic = inner_dots || read_dots () in
        if inner <> "" && (is_letter inner.[0] || inner.[0] = '<') then begin
          let name =
            if inner.[0] = '<' then
              let close =
                match String.index_opt inner '>' with
                | Some k -> k
                | None -> String.length inner
              in
              String.sub inner 1 (close - 1)
            else inner
          in
          if is_clean_name name && not (is_skip name) then
            add ~optional:true ~variadic name
        end
      | '<' ->
        incr i;
        let start = !i in
        while !i < len && s.[!i] <> '>' do
          incr i
        done;
        let name = String.sub s start (!i - start) in
        if !i < len then incr i;
        let variadic = read_dots () in
        if is_clean_name name && not (is_skip name) then
          add ~optional:false ~variadic name
      | '-' ->
        (* a flag: skip to the next blank or closing bracket *)
        while !i < len && s.[!i] <> ' ' && s.[!i] <> '\t' && s.[!i] <> ']' do
          incr i
        done
      | 'A' .. 'Z' ->
        let start = !i in
        while !i < len && is_pos_char s.[!i] do
          incr i
        done;
        let name = String.sub s start (!i - start) in
        let variadic = read_dots () in
        if String.length name >= 2 && not (is_skip name) then
          add ~optional:false ~variadic name
      | _ -> incr i
  done;
  (* results were accumulated newest-first: restore input order, then keep
   * only the first occurrence of each name *)
  let keep_first (seen, kept) p =
    if List.mem p.pos_name seen then (seen, kept)
    else (p.pos_name :: seen, p :: kept)
  in
  List.rev !results |> List.fold_left keep_first ([], []) |> snd |> List.rev

(* find the "usage:" line in the help text and extract positionals from it.
 * searches line-by-line for a line starting with "usage:" (case-insensitive).
 * handles both inline usage ("usage: cmd [OPTIONS] FILE") and the clap style
 * where the actual usage is on the next line:
 *   USAGE:
 *       cmd [OPTIONS] FILE
 *
 * also handles the bare "usage" header (no colon) followed by a next line.
*)

(* only the first usage header is considered: when the text after it is empty
 * and the following line is empty or missing, the result is the empty list
 * and no later line is examined. *)
let extract_usage_positionals text =
  (* the trimmed line that follows a header, unless it is blank or missing *)
  let following_line = function
    | [] -> None
    | next :: _ -> (
        match String.trim next with
        | "" -> None
        | usage -> Some usage)
  in
  let rec find_usage = function
    | [] -> None
    | line :: rest ->
      let trimmed = String.trim line in
      let lowered = String.lowercase_ascii trimmed in
      if String.starts_with ~prefix:"usage:" lowered then begin
        (* text after the six-character "usage:" prefix, in its original case *)
        match String.trim (String.sub trimmed 6 (String.length trimmed - 6)) with
        | "" -> following_line rest
        | inline_usage -> Some inline_usage
      end
      else if lowered = "usage" then following_line rest
      else find_usage rest
  in
  match find_usage (String.split_on_char '\n' text) with
  | None -> []
  | Some usage ->
    let cmd_end = skip_command_prefix usage in
    parse_usage_args (String.sub usage cmd_end (String.length usage - cmd_end))

(* extract positionals from cli11's explicit "POSITIONALS:" section.
 * cli11 (a c++ arg parsing library) emits a dedicated section:
 *   Positionals:
 *     name TEXT    description here
 *     count INT    another description
 *
 * this is preferred over usage-line extraction when present because it
 * provides more accurate type information. the parser looks for the
 * section header, then reads indented lines until a blank or unindented
 * line signals the end. type words (TEXT, INT, FLOAT, etc.) between the
 * name and description are skipped.
*)

(* details:
 * - the header must read exactly "POSITIONALS:" or "Positionals:" once
 *   surrounding whitespace is removed; only the first such header is used.
 * - a row needs a name of at least two name characters; rows that do not
 *   have one are ignored without ending the section.
 * - every positional is reported as mandatory. it is variadic when three dots
 *   follow the name and the run of uppercase letters after it. *)
let extract_cli11_positionals text =
  let is_blank c = c = ' ' || c = '\t' in
  let is_upper c = c >= 'A' && c <= 'Z' in
  let is_name_char = function
    | 'a' .. 'z' | 'A' .. 'Z' | '0' .. '9' | '_' | '-' -> true
    | _ -> false
  in
  (* one trimmed row of the section *)
  let parse_one row =
    let len = String.length row in
    (* first index at or after [i] whose character does not satisfy [p] *)
    let rec scan p i = if i < len && p row.[i] then scan p (i + 1) else i in
    let name_end = scan is_name_char 0 in
    if name_end < 2 then None
    else
      (* blanks, then the type word (TEXT, INT, FLOAT, ...), then blanks *)
      let pos = name_end |> scan is_blank |> scan is_upper |> scan is_blank in
      let variadic =
        pos + 2 < len
        && row.[pos] = '.'
        && row.[pos + 1] = '.'
        && row.[pos + 2] = '.'
      in
      Some
        {
          pos_name = String.lowercase_ascii (String.sub row 0 name_end);
          optional = false;
          variadic;
        }
  in
  (* rows of the section: stop at the first empty, unindented or blank line *)
  let rec section_rows acc = function
    | [] -> List.rev acc
    | line :: rest ->
      let indented = line <> "" && is_blank line.[0] in
      let row = String.trim line in
      if (not indented) || row = "" then List.rev acc
      else (
        match parse_one row with
        | Some p -> section_rows (p :: acc) rest
        | None -> section_rows acc rest)
  in
  let rec find_section = function
    | [] -> []
    | line :: rest -> (
        match String.trim line with
        | "POSITIONALS:" | "Positionals:" -> section_rows [] rest
        | _ -> find_section rest)
  in
  find_section (String.split_on_char '\n' text)

(* top-level entry point: parse a --help text string into a help_result.
 * steps:
 *   1. strip ansi escapes (colors, hyperlinks, etc.)
 *   2. run the angstrom help_parser for flags and subcommands
 *   3. extract positionals via cli11 format (preferred) or usage line (fallback)
 *   4. merge positionals into the result
 * uses angstrom's prefix-consume mode — we don't need to parse every byte.
*)

(* returns [Ok result] with the positionals filled in, or [Error msg] carrying
 * the message produced by angstrom when help_parser fails. the description
 * field is passed through exactly as help_parser produced it. *)
let parse_help txt =
  let clean = strip_ansi txt in
  let with_positionals result =
    let positionals =
      (* the usage line is only consulted when there is no cli11 section *)
      match extract_cli11_positionals clean with
      | [] -> extract_usage_positionals clean
      | from_section -> from_section
    in
    { result with positionals }
  in
  Angstrom.parse_string ~consume:Consume.Prefix help_parser clean
  |> Result.map with_positionals