inshellah/lib/parser.ml

(* parser.ml — parse --help output into structured flag/subcommand/positional data.
 *
 * this module is the core of inshellah's help-text understanding. it takes the
 * raw text that a cli tool prints when you run `cmd --help` and extracts:
 *   - flag entries (short/long switches with optional parameters and descriptions)
 *   - subcommand listings (name + description pairs)
 *   - positional arguments (from usage lines)
 *
 * the parser is built on Angstrom (a monadic parser combinator library) for the
 * structured flag/subcommand extraction, with hand-rolled imperative parsers for
 * usage-line positional extraction (where the format is too varied for clean
 * combinator composition).
 *
 * key design decisions:
 *   - the Angstrom parser runs in prefix-consume mode — it doesn't need to parse
 *     the entire input, just extract what it can recognize. unrecognized lines are
 *     skipped via skip_non_option_line.
 *   - multi-line descriptions are handled via indentation-based continuation:
 *     lines indented 8+ spaces that don't start with '-' are folded into the
 *     previous entry's description.
 *   - subcommand detection uses a heuristic: lines with a name followed by 2+
 *     spaces then a description, where the name is at least 2 chars. section
 *     headers (like "arguments:") toggle whether name-description pairs are
 *     treated as subcommands or positionals.
 *   - positional extraction has two paths: usage-line parsing (the common case)
 *     and CLI11's explicit "positionals:" section format.
 *)

open Angstrom

(* strip ansi escape sequences and osc hyperlinks from --help output.
 * many modern cli tools emit colored/styled output even when piped,
 * so the text has to be cleaned before parsing. handles:
 *   - csi sequences (esc [ ... final_byte) — colors, cursor movement, etc.
 *   - osc sequences (esc ] ... bel/st) — hyperlinks, window titles, etc.
 *   - other two-byte esc+char sequences
 *   - a lone esc as the very last byte (a truncated sequence), which is
 *     dropped rather than copied into the output
 * an unterminated csi/osc sequence swallows the rest of the input.
 * returns a fresh string; all non-escape bytes are copied unchanged. *)
let strip_ansi s =
  let len = String.length s in
  let buf = Buffer.create len in
  let pos = ref 0 in
  while !pos < len do
    if s.[!pos] <> '\x1b' then begin
      Buffer.add_char buf s.[!pos];
      incr pos
    end else if !pos + 1 >= len then
      (* truncated escape at end of input: drop the esc byte instead of
         leaking a raw control character to the parsers *)
      incr pos
    else begin
      match s.[!pos + 1] with
      | '[' ->
        (* csi: skip parameter/intermediate bytes up to and including the
           final byte, which lies in the range '@'..'~' *)
        pos := !pos + 2;
        while !pos < len && not (s.[!pos] >= '@' && s.[!pos] <= '~') do incr pos done;
        if !pos < len then incr pos
      | ']' ->
        (* osc: runs until bel, or until the two-byte terminator esc \ *)
        pos := !pos + 2;
        let terminated = ref false in
        while !pos < len && not !terminated do
          if s.[!pos] = '\x07' then
            (incr pos; terminated := true)
          else if !pos + 1 < len && s.[!pos] = '\x1b' && s.[!pos + 1] = '\\' then
            (pos := !pos + 2; terminated := true)
          else
            incr pos
        done
      | _ ->
        (* any other escape: skip esc plus the single byte that follows *)
        pos := !pos + 2
    end
  done;
  Buffer.contents buf

(* --- character class predicates ---
 * used throughout the Angstrom parsers to classify characters.
 * separated out for readability and reuse. *)

(* horizontal whitespace only: a space or a tab (line breaks do not count) *)
let is_whitespace c = c = ' ' || c = '\t'

(* ascii letters and digits only *)
let is_alphanumeric c =
  (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9')

(* bytes that may appear in a parameter name such as FILE or output-dir:
 * ascii letters, digits, underscore and hyphen *)
let is_param_char c =
  (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9')
  || c = '_' || c = '-'

(* uppercase ascii letter or underscore — the alphabet of ALL_CAPS parameter
 * names like FILE or TIME_STYLE (digits are checked separately by callers) *)
let is_upper_or_underscore c = (c >= 'A' && c <= 'Z') || c = '_'

(* bytes that may appear in a long flag name (--foo-bar, --enable-feature2):
 * ascii letters, digits and hyphen; underscore is not accepted *)
let is_long_char c =
  (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9')
  || c = '-'

(* --- core types ---
 * these types represent the structured output of parsing a help text.
 * they are shared across the entire codebase (nushell codegen, store, manpage parser).
 *
 * switch: a flag can be short-only (-v), long-only (--verbose), or both (-v, --verbose).
 *   the both variant keeps the pair together so nushell can emit "--verbose(-v)".
 *
 * param: flags can take mandatory (--output FILE) or optional (--color[=WHEN]) values.
 *
 * entry: one complete flag definition — its switch form, optional parameter, and
 *   the description text (potentially multi-line, already joined).
 *
 * help_result: the complete parsed output for a single command. *)
(* the spelling of a flag: one short character, one long name, or both forms
 * kept together as a pair. *)
type switch = Short of char | Long of string | Both of char * string
(* the value a flag takes, tagged by whether the value is required. the string
 * is the placeholder name exactly as written in the help text. *)
type param = Mandatory of string | Optional of string
(* one flag definition: how it is spelled, the value it takes (if any), and its
 * description text. *)
type entry = { switch : switch; param : param option; desc : string }
(* one subcommand listing: its name and its one-line description. *)
type subcommand = { name : string; desc : string }
(* one positional argument: its name, whether it may be omitted, and whether
 * it may be repeated. *)
type positional = { pos_name : string; optional : bool; variadic : bool }
(* everything extracted from one help text. *)
type help_result = { entries : entry list; subcommands : subcommand list; positionals : positional list; description : string }

(* --- low-level Angstrom combinators ---
 * building blocks for all the parsers below. *)

(* skip a (possibly empty) run of spaces and tabs; never crosses a line break
 * because newline bytes are not part of is_whitespace *)
let inline_ws = skip_while is_whitespace
(* permissive end of line: succeeds at a line terminator (consuming it) or at
 * end of input (consuming nothing). this is the version used in most places,
 * so the last line of a help text does not need a trailing newline.
 * per Angstrom's documentation, end_of_line accepts "\n" or "\r\n". *)
let eol = end_of_line <|> end_of_input
(* strict end of line: only succeeds by consuming an actual line terminator.
 * used by skip_non_option_line and section_header, which therefore fail at
 * end of input instead of succeeding without consuming anything. *)
let eol_strict = end_of_line

(* --- switch and parameter parsers ---
 * parse the flag name portion of an option line, e.g. "-v", "--verbose" *)

(* a short flag: one dash followed by exactly one alphanumeric byte; yields
 * that byte *)
let short_switch = char '-' >>= fun _ -> satisfy is_alphanumeric
(* a long flag: "--" followed by one or more long-flag bytes; yields the name
 * without the leading dashes *)
let long_switch = string "--" >>= fun _ -> take_while1 is_long_char
(* a comma separator together with any horizontal whitespace after it *)
let comma = char ',' >>= fun _ -> inline_ws

(* parameter parsers — handle the various syntaxes tools use to indicate
 * that a flag takes a value. the formats are surprisingly diverse:
 *   --output=FILE        (eq_man_param — mandatory, common in gnu tools)
 *   --color[=WHEN]       (eq_opt_param — optional with = syntax)
 *   --depth DEPTH        (space_upper_param — space-separated ALL_CAPS)
 *   --file <path>        (space_angle_param — angle brackets)
 *   --file [<path>]      (space_opt_angle_param — optional angle brackets)
 *   --format string      (space_type_param — go/cobra lowercase type word)
 *)
(* "[=NAME]" — an optional value attached with '='; yields Optional NAME *)
let eq_opt_param =
  lift (fun name -> Optional name)
    (string "[=" *> take_while1 is_param_char <* char ']')

(* "=NAME" — a required value attached with '='; yields Mandatory NAME *)
let eq_man_param =
  lift (fun name -> Mandatory name)
    (char '=' *> take_while1 is_param_char)

(* a single space followed by an ALL_CAPS word, e.g. " FILE" or " TIME_STYLE".
 * the byte after the space is peeked first and must be uppercase or '_'.
 * the whole word (read with is_param_char) must then consist only of
 * uppercase letters, underscores and digits, so "SHA256" is accepted while
 * capitalised description words such as "Do" or "Set" are rejected, as is
 * any word containing a hyphen. yields Mandatory word. *)
let space_upper_param =
  let is_caps_or_digit c = is_upper_or_underscore c || (c >= '0' && c <= '9') in
  let all_caps_word =
    take_while1 is_param_char >>= fun word ->
    if String.for_all is_caps_or_digit word then return (Mandatory word)
    else fail "not an all-caps param"
  in
  char ' ' *> peek_char_fail >>= fun first ->
  if not (is_upper_or_underscore first) then fail "not an uppercase param"
  else all_caps_word

(* "<name>" — a required value in angle brackets; the name is every byte up to
 * the closing '>' and must be non-empty *)
let angle_param =
  lift (fun name -> Mandatory name)
    (char '<' *> take_while1 (fun c -> c <> '>') <* char '>')

(* the same, preceded by exactly one space *)
let space_angle_param =
  char ' ' >>= fun _ -> angle_param

(* "[<name>]" — an optional value: angle brackets wrapped in square brackets *)
let opt_angle_param =
  lift (fun name -> Optional name)
    (char '[' *> char '<' *> take_while1 (fun c -> c <> '>') <* char '>' <* char ']')

(* the same, preceded by exactly one space *)
let space_opt_angle_param =
  char ' ' >>= fun _ -> opt_angle_param

(* go/cobra style value: a single space followed by a lowercase word such as
 * "string", "int" or "duration" ("--name string", "--timeout duration").
 * only the letters a-z are read, and a word longer than 10 letters is
 * rejected on the assumption that it starts the description rather than
 * naming a type. yields Mandatory word. *)
let space_type_param =
  let is_lower c = c >= 'a' && c <= 'z' in
  char ' ' *> peek_char_fail >>= fun first ->
  if not (is_lower first) then fail "not a lowercase type param"
  else
    take_while1 is_lower >>= fun word ->
    if String.length word > 10 then fail "too long for type param"
    else return (Mandatory word)

(* parse the optional value that follows a flag name. the formats are tried
 * in the order listed and the first one that succeeds wins: the '=' forms,
 * then the angle-bracket forms (bracketed-optional before plain), then the
 * ALL_CAPS word, and finally the lowercase type word as the loosest match.
 * yields Some param, or None (consuming nothing) when no format matches. *)
let param_parser =
  let any_param =
    eq_opt_param <|> eq_man_param
    <|> space_opt_angle_param <|> space_angle_param
    <|> space_upper_param <|> space_type_param
  in
  option None (lift Option.some any_param)

(* parse the name part of a flag. accepted layouts, in the order tried:
 *   -a, --all       (short, comma, long — gnu style)
 *   -a --all        (short, one space, long)
 *   --all / -a      (long, slash, short)
 *   -a              (short only)
 *   --all           (long only)
 *
 * the first alternative that succeeds is the result, so the combined forms
 * have to come before the single forms: short-only would already succeed on
 * the "-a" at the start of "-a, --all" and the long name would be lost. *)
let switch_parser =
  let both s l = Both (s, l) in
  let short_comma_long = lift2 both short_switch (comma *> long_switch) in
  let short_space_long = lift2 both short_switch (char ' ' *> long_switch) in
  let long_slash_short =
    lift2 (fun l s -> Both (s, l))
      long_switch
      (inline_ws *> char '/' *> inline_ws *> short_switch)
  in
  let short_only = lift (fun s -> Short s) short_switch in
  let long_only = lift (fun l -> Long l) long_switch in
  choice
    [ short_comma_long; short_space_long; long_slash_short;
      short_only; long_only ]

(* --- description parsing with multi-line continuation ---
 * descriptions in help text often wrap across multiple lines. the convention
 * is that continuation lines are deeply indented (8+ spaces) and don't start
 * with '-' (which would indicate a new flag entry). we peek ahead to check
 * indentation without consuming, then decide whether to fold the line in. *)

(* everything up to, but not including, the next '\n' or '\r'; may be empty
 * and leaves the line terminator unconsumed *)
let rest_of_line =
  take_till (function '\n' | '\r' -> true | _ -> false)

(* parse one continuation line of a wrapped description and yield its text
 * with the leading whitespace removed.
 * a line qualifies when its leading indentation is at least 8 columns (each
 * space counts 1, each tab counts 8) and the first non-blank byte is not '-'.
 * the decision is made on a peek of at most 80 bytes, so nothing is consumed
 * when the line is rejected. because the peek is trimmed as a whole, the
 * dash test on a whitespace-only line looks at the first visible byte of a
 * following line. fails with "eof" when no input is left. *)
let continuation_line =
  let rec indent_width s i acc =
    if i >= String.length s then acc
    else
      match s.[i] with
      | ' ' -> indent_width s (i + 1) (acc + 1)
      | '\t' -> indent_width s (i + 1) (acc + 8)
      | _ -> acc
  in
  let first_visible_is_dash s =
    String.starts_with ~prefix:"-" (String.trim s)
  in
  peek_string 1 >>= fun _ ->
  available >>= fun avail ->
  if avail = 0 then fail "eof"
  else
    peek_string (min avail 80) >>= fun preview ->
    if indent_width preview 0 0 < 8 || first_visible_is_dash preview then
      fail "not a continuation line"
    else
      inline_ws *> rest_of_line <* eol

(* parse a flag's description: the remainder of the current line followed by
 * any number of continuation lines. every piece is trimmed, empty pieces are
 * dropped, and the rest are joined with single spaces. the result may be the
 * empty string. *)
let description =
  let join_nonblank pieces =
    pieces
    |> List.map String.trim
    |> List.filter (fun piece -> piece <> "")
    |> String.concat " "
  in
  inline_ws *> rest_of_line <* eol >>= fun first_line ->
  many continuation_line >>| fun more ->
  join_nonblank (first_line :: more)

(* a description made only of continuation lines, for the clap (rust) long
 * help layout where the text sits below the flag:
 *   --verbose
 *           increase verbosity
 * requires at least one continuation line; lines are trimmed, empty ones
 * dropped, and the rest joined with single spaces. *)
let description_below =
  many1 continuation_line >>| fun lines ->
  lines
  |> List.map String.trim
  |> List.filter (fun line -> line <> "")
  |> String.concat " "

(* --- line classification for skipping ---
 * the parser needs to skip lines it doesn't understand (section headers,
 * blank lines, description paragraphs not attached to a flag, etc.)
 * without consuming lines that are flag entries. *)

(* succeed, without consuming anything, when the upcoming text looks like a
 * flag entry: the first non-blank byte within the next 40 bytes is '-'.
 * the peeked bytes are trimmed as a whole, so leading blank lines are
 * ignored by this test as well. fails with "eof" when no input is left. *)
let at_option_line =
  peek_string 1 >>= fun _ ->
  available >>= fun avail ->
  if avail = 0 then fail "eof"
  else
    peek_string (min avail 40) >>= fun preview ->
    if String.starts_with ~prefix:"-" (String.trim preview) then return ()
    else fail "not an option line"

(* skip one line that nothing else parsed (section header, blank line,
 * free-standing paragraph, ...).
 * the terminator is matched with eol_strict, so a final line without a
 * trailing newline is not skipped; every success therefore consumes a line
 * terminator, which keeps `many` in help_parser from looping at end of input.
 * the first alternative is meant to refuse lines that look like flag entries
 * (at_option_line) so that the entry parser gets them instead.
 * NOTE(review): at_option_line only peeks, and Angstrom's <|> retries its
 * right-hand side from the same position when the left side fails without
 * committing — so it looks like the refusal never actually prevents the line
 * from being skipped. confirm whether that is intended before relying on it. *)
let skip_non_option_line =
  let refuse_option_line =
    at_option_line >>= fun () -> fail "this is an option line"
  in
  let consume_line = rest_of_line >>= fun _ -> eol_strict in
  refuse_option_line <|> consume_line

(* --- entry parsing --- *)

(* parse one flag entry: optional leading whitespace, the switch, an optional
 * parameter, then the description.
 * the description is taken from the rest of the line plus continuation lines;
 * failing that, the line must end right here and the text is read from the
 * continuation lines below (clap long-help layout), defaulting to "" when
 * there are none. *)
let entry =
  let flag = lift2 (fun sw param -> (sw, param)) switch_parser param_parser in
  let desc_text = description <|> (eol *> (description_below <|> return "")) in
  inline_ws *> flag >>= fun (switch, param) ->
  desc_text >>| fun desc ->
  { switch; param; desc }

(* --- subcommand parsing ---
 * subcommand lines in help text follow the pattern:
 *   "  name   description"
 * where the name and description are separated by 2+ spaces.
 * some tools also include argument placeholders between name and description:
 *   "  start UNIT...   start one or more units"
 *   "  list [PATTERN]  list matching units"
 *)

(* bytes allowed in a subcommand name: ascii letters, digits, '-' and '_' *)
let is_subcommand_char c =
  (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9')
  || c = '-' || c = '_'

(* skip argument placeholders such as UNIT..., [PATTERN...|PID...] or <file>
 * that sit between a subcommand name and its description.
 *
 * each round peeks at the next two bytes. a placeholder is only considered
 * when they are exactly one space followed by '[', '<' or an uppercase
 * letter; a double space is the description separator and is left alone.
 * the token after the space (up to the next space, '\n' or '\r', within an
 * 80-byte peek) is skipped together with the space when it starts with '['
 * or '<', or consists solely of uppercase letters, digits and the
 * punctuation "_-.|,". the parser then recurses to look for another
 * placeholder. it always succeeds, consuming nothing when no placeholder
 * is present. *)
let skip_arg_placeholders =
  let is_caps_token_char c =
    (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9')
    || c = '_' || c = '-' || c = '.' || c = '|' || c = ','
  in
  let may_start_placeholder c =
    c = '[' || c = '<' || (c >= 'A' && c <= 'Z')
  in
  (* the token that starts at index 1 of the preview, right after the space *)
  let token_after_space preview =
    let n = String.length preview in
    let rec stop i =
      if i < n && preview.[i] <> ' ' && preview.[i] <> '\n' && preview.[i] <> '\r'
      then stop (i + 1)
      else i
    in
    String.sub preview 1 (stop 1 - 1)
  in
  let looks_like_placeholder tok =
    tok.[0] = '[' || tok.[0] = '<' || String.for_all is_caps_token_char tok
  in
  fix (fun self ->
    available >>= fun avail ->
    if avail < 2 then return ()
    else
      peek_string 2 >>= fun two ->
      if two.[0] <> ' ' || two.[1] = ' ' || not (may_start_placeholder two.[1]) then
        return ()
      else
        peek_string (min avail 80) >>= fun preview ->
        let tok = token_after_space preview in
        if looks_like_placeholder tok then
          advance (1 + String.length tok) *> self
        else return ())

(* parse one subcommand line of the form "  name   description".
 * the name must be at least 2 bytes long; argument placeholders after it are
 * skipped; then at least two spaces must separate it from the description,
 * which runs to the end of the line.
 * the name is lowercased. the description is trimmed, and a decorative
 * leading "- " (as in "  add   - add a new item") is removed. *)
let subcommand_entry =
  let strip_dash_prefix text =
    let t = String.trim text in
    if String.starts_with ~prefix:"- " t then
      String.trim (String.sub t 2 (String.length t - 2))
    else t
  in
  let name_token =
    inline_ws *> take_while1 is_subcommand_char >>= fun name ->
    if String.length name >= 2 then return name
    else fail "subcommand name too short"
  in
  let gap_and_desc =
    skip_arg_placeholders *> char ' ' *> char ' ' *> inline_ws
    *> rest_of_line <* eol
  in
  name_token >>= fun name ->
  gap_and_desc >>| fun raw ->
  { name = String.lowercase_ascii name; desc = strip_dash_prefix raw }

(* --- section header detection ---
 * section headers are critical for disambiguating subcommands from positional
 * arguments. lines like "commands:" introduce subcommand sections, while
 * "arguments:" or "positionals:" introduce argument sections where the same
 * name+description format should not be treated as subcommands. *)

(* true when a section title introduces positional arguments rather than
 * subcommands. the title is trimmed and lowercased, one trailing ':' (and
 * any whitespace before it) is removed, and the result must be one of
 * "arguments", "args", "positionals" or "positional arguments". *)
let is_arg_section s =
  let lowered = String.lowercase_ascii (String.trim s) in
  let label =
    match String.length lowered with
    | n when n > 0 && lowered.[n - 1] = ':' ->
      String.trim (String.sub lowered 0 (n - 1))
    | _ -> lowered
  in
  List.mem label [ "arguments"; "args"; "positionals"; "positional arguments" ]

(* parse a section header line and report whether it opens an argument
 * section (true) or any other section (false).
 * a header is a line that, once trimmed, is at least 2 bytes long, ends with
 * ':' and does not start with '-', and whose leading whitespace is at most
 * 4 bytes (a tab counts as one byte here). the test runs on the first line
 * of an 80-byte peek; only a matching line is consumed, and it must end with
 * a real line terminator. this parser has to run before subcommand_entry so
 * that headers are not mistaken for name/description pairs. *)
let section_header =
  let leading_ws line =
    let rec go i =
      if i < String.length line && (line.[i] = ' ' || line.[i] = '\t')
      then go (i + 1)
      else i
    in
    go 0
  in
  let looks_like_header line =
    let text = String.trim line in
    let n = String.length text in
    n >= 2 && text.[n - 1] = ':' && text.[0] <> '-' && leading_ws line <= 4
  in
  available >>= fun avail ->
  if avail = 0 then fail "eof"
  else
    peek_string (min avail 80) >>= fun preview ->
    (* split_on_char never returns an empty list, so the head is the text
       before the first newline (or the whole peek) *)
    let first_line = List.hd (String.split_on_char '\n' preview) in
    if looks_like_header first_line then
      rest_of_line <* eol_strict >>| is_arg_section
    else fail "not a section header"

(* --- top-level parser ---
 * walks the help text line by line. each line is classified by the first of
 * these parsers that succeeds:
 *   1. entry                 — a flag definition
 *   2. section_header        — text ending in ':' (tried before subcommands
 *                              so a header is not read as a subcommand)
 *   3. subcommand_entry      — name, 2+ spaces, description
 *   4. skip_non_option_line  — anything else
 * parsing stops at the first line none of them accepts.
 *
 * the collected items are then post-processed:
 *   - flags are returned in input order
 *   - a subcommand is kept only when the most recent section header was not
 *     an argument section
 *   - subcommands sharing a name are merged; a later one replaces an earlier
 *     one only when its description is strictly longer
 *
 * positionals and description are left empty here; parse_help fills in the
 * positionals afterwards. *)
let help_parser =
  let line =
    choice
      [ (entry >>| fun e -> `Entry e);
        (section_header >>| fun is_arg -> `Section is_arg);
        (subcommand_entry >>| fun sc -> `Subcommand sc);
        (skip_non_option_line >>| fun () -> `Skip) ]
  in
  let flags items =
    List.filter_map (function `Entry e -> Some e | _ -> None) items
  in
  (* subcommands in input order, minus those listed under argument sections *)
  let listed_subcommands items =
    let step (in_args, kept) = function
      | `Section is_arg -> (is_arg, kept)
      | `Subcommand sc when not in_args -> (in_args, sc :: kept)
      | _ -> (in_args, kept)
    in
    let _, kept = List.fold_left step (false, []) items in
    List.rev kept
  in
  (* one subcommand per name, preferring the longer description *)
  let dedupe subs =
    let merge seen (sc : subcommand) =
      match List.assoc_opt sc.name seen with
      | Some (prev : subcommand)
        when String.length prev.desc >= String.length sc.desc -> seen
      | _ -> (sc.name, sc) :: List.remove_assoc sc.name seen
    in
    List.fold_left merge [] subs |> List.rev_map snd
  in
  many line >>| fun items ->
  { entries = flags items;
    subcommands = dedupe (listed_subcommands items);
    positionals = [];
    description = "" }

(* --- usage line parsing ---
 * a usage line looks like "git add [OPTIONS] [--] [<pathspec>...]". before
 * positionals can be read from it, the command name ("git add") has to be
 * skipped.
 *
 * skip_command_prefix s returns the index in s at which the argument portion
 * starts. it walks whitespace-separated words and treats a word as part of
 * the command name while it
 *   - does not start with '[', '<', '(', '{' or '-'
 *   - is built from word bytes (letters, digits, '-', '_', '/', '.')
 *   - contains at least one lowercase letter (an ALL_CAPS word such as FILE
 *     is taken to be a positional, not a command component)
 * the index returned is that of the first word failing these tests, or the
 * length of s when every word passes. *)
let skip_command_prefix s =
  let len = String.length s in
  let is_word_char = function
    | 'a'..'z' | 'A'..'Z' | '0'..'9' | '-' | '_' | '/' | '.' -> true
    | _ -> false
  in
  let is_arg_start = function
    | '[' | '<' | '(' | '{' | '-' -> true
    | _ -> false
  in
  let rec skip_blanks i =
    if i < len && (s.[i] = ' ' || s.[i] = '\t') then skip_blanks (i + 1) else i
  in
  let rec word_end i =
    if i < len && is_word_char s.[i] then word_end (i + 1) else i
  in
  let has_lowercase word = String.exists (fun c -> c >= 'a' && c <= 'z') word in
  let rec walk i =
    let start = skip_blanks i in
    if start >= len || is_arg_start s.[start] || not (is_word_char s.[start]) then
      start
    else
      let stop = word_end start in
      if has_lowercase (String.sub s start (stop - start)) then walk stop
      else start
  in
  walk 0

(* parse the argument portion of a usage line (the text after the command
 * name) into positional definitions, in order of first appearance.
 * syntactic forms:
 *   <file>          - mandatory positional
 *   [file]          - optional positional ([<file>] works too)
 *   FILE            - mandatory positional (ALL_CAPS convention)
 *   <file>...       - variadic (ascii dots or the utf-8 ellipsis)
 *   [file...]       - optional variadic
 *   curly-brace alternatives - skipped, not a positional
 *   -flag           - skipped
 *
 * names are lowercased. a name must be at least 2 bytes long and consist of
 * letters, digits, '_' and '-'. the labels OPTIONS, OPTION, FLAGS and FLAG
 * (any case) are ignored because they stand for the flag list.
 * when a name occurs more than once only the first occurrence is kept.
 *
 * an unterminated "[" group is read up to the end of the string, so the
 * whole name is still recovered when the closing bracket is missing. *)
let parse_usage_args s =
  let len = String.length s in
  let pos = ref 0 in
  let positionals = ref [] in
  let skip_ws () =
    while !pos < len && (s.[!pos] = ' ' || s.[!pos] = '\t') do incr pos done in
  let is_pos_char c =
    (c >= 'A' && c <= 'Z') || c = '_' || c = '-' || (c >= '0' && c <= '9') in
  (* consume an optional variadic marker (after optional whitespace): three
     ascii dots or the three-byte utf-8 ellipsis. returns whether one was found *)
  let read_dots () =
    skip_ws ();
    if !pos + 2 < len && s.[!pos] = '.' && s.[!pos+1] = '.' && s.[!pos+2] = '.' then
      (pos := !pos + 3; true)
    else if !pos + 2 < len && s.[!pos] = '\xe2' && s.[!pos+1] = '\x80' && s.[!pos+2] = '\xa6' then
      (pos := !pos + 3; true)  (* utf-8 ellipsis *)
    else false
  in
  (* names that are section labels, not actual positional arguments *)
  let is_skip name =
    let u = String.uppercase_ascii name in
    u = "OPTIONS" || u = "OPTION" || u = "FLAGS" || u = "FLAG"
  in
  (* a usable name: 2+ bytes of letters, digits, underscore or hyphen *)
  let is_clean_name name =
    String.length name >= 2
    && String.for_all (fun c ->
         (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z')
         || (c >= '0' && c <= '9') || c = '_' || c = '-') name
  in
  let is_letter c = (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') in
  (* skip {A|c|d|...} alternative blocks (nesting-aware) and any variadic
     marker after them; returns whether a block was skipped *)
  let skip_braces () =
    if !pos < len && s.[!pos] = '{' then begin
      let depth = ref 1 in
      incr pos;
      while !pos < len && !depth > 0 do
        if s.[!pos] = '{' then incr depth
        else if s.[!pos] = '}' then decr depth;
        incr pos
      done;
      ignore (read_dots ());
      true
    end else false
  in
  while !pos < len do
    skip_ws ();
    if !pos >= len then ()
    else if skip_braces () then ()
    else match s.[!pos] with
    | '[' ->
      (* optional positional: [name] or [<name>] or [name...] *)
      incr pos;
      let start = !pos in
      let depth = ref 1 in
      while !pos < len && !depth > 0 do
        if s.[!pos] = '[' then incr depth
        else if s.[!pos] = ']' then decr depth;
        incr pos
      done;
      (* when the group was closed, the last byte consumed is its ']' and is
         excluded; when the input ended first, everything read is content *)
      let bracket_end = if !depth = 0 then !pos - 1 else !pos in
      let inner = String.sub s start (max 0 (bracket_end - start)) |> String.trim in
      let inner, has_inner_dots =
        if String.ends_with ~suffix:"..." inner then
          (String.sub inner 0 (String.length inner - 3) |> String.trim, true)
        else (inner, false)
      in
      let variadic = has_inner_dots || read_dots () in
      if String.length inner > 0
         && inner.[0] <> '-'
         && (is_letter inner.[0] || inner.[0] = '<') then begin
        let name =
          if inner.[0] = '<' then
            (* take the text between '<' and '>' (or to the end if unclosed) *)
            let e = try String.index inner '>' with Not_found -> String.length inner in
            String.sub inner 1 (e - 1)
          else inner
        in
        if is_clean_name name && not (is_skip name) then
          positionals := { pos_name = String.lowercase_ascii name;
                       optional = true; variadic } :: !positionals
      end
    | '<' ->
      (* mandatory positional in angle brackets: <name> *)
      incr pos;
      let start = !pos in
      while !pos < len && s.[!pos] <> '>' do incr pos done;
      let name = String.sub s start (!pos - start) in
      if !pos < len then incr pos;
      let variadic = read_dots () in
      if is_clean_name name && not (is_skip name) then
        positionals := { pos_name = String.lowercase_ascii name;
                     optional = false; variadic } :: !positionals
    | '-' ->
      (* flag — skip to the next blank or ']', not a positional *)
      while !pos < len && s.[!pos] <> ' ' && s.[!pos] <> '\t' && s.[!pos] <> ']' do incr pos done
    | c when c >= 'A' && c <= 'Z' ->
      (* ALL_CAPS positional name; every byte read satisfies is_pos_char, so
         only the length and the label list remain to be checked *)
      let start = !pos in
      while !pos < len && is_pos_char s.[!pos] do incr pos done;
      let name = String.sub s start (!pos - start) in
      let variadic = read_dots () in
      if String.length name >= 2 && not (is_skip name) then
        positionals := { pos_name = String.lowercase_ascii name;
                     optional = false; variadic } :: !positionals
    | _ ->
      incr pos
  done;
  (* deduplicate positionals by name, keeping the first occurrence *)
  List.rev !positionals
  |> List.fold_left (fun (seen, acc) p ->
       if List.mem p.pos_name seen then (seen, acc)
       else (p.pos_name :: seen, p :: acc)
     ) ([], [])
  |> snd |> List.rev

(* locate the usage line in a help text and return the positionals it names.
 * lines are examined top to bottom after trimming; the first one that yields
 * usage text wins:
 *   - "usage: cmd [OPTIONS] FILE" (prefix matched case-insensitively): the
 *     text after the colon is used
 *   - "USAGE:" with nothing after it, or a bare "usage": the next line is
 *     used, provided it is not blank
 * a header followed by a blank line (or by nothing) is passed over and the
 * search continues. returns [] when no usage text is found. *)
let extract_usage_positionals text =
  let lines = Array.of_list (String.split_on_char '\n' text) in
  let count = Array.length lines in
  (* the trimmed line after idx, if it exists and is not blank *)
  let following idx =
    if idx + 1 >= count then None
    else
      match String.trim lines.(idx + 1) with
      | "" -> None
      | next -> Some next
  in
  let usage_at idx =
    let line = String.trim lines.(idx) in
    let lowered = String.lowercase_ascii line in
    if String.starts_with ~prefix:"usage:" lowered then
      (match String.trim (String.sub line 6 (String.length line - 6)) with
       | "" -> following idx
       | inline -> Some inline)
    else if lowered = "usage" then following idx
    else None
  in
  let rec first_usage idx =
    if idx >= count then None
    else
      match usage_at idx with
      | Some _ as found -> found
      | None -> first_usage (idx + 1)
  in
  match first_usage 0 with
  | None -> []
  | Some usage ->
    let cmd_end = skip_command_prefix usage in
    parse_usage_args (String.sub usage cmd_end (String.length usage - cmd_end))

(* read positionals from an explicit positionals section, as printed by CLI11
 * (a c++ argument parsing library):
 *   Positionals:
 *     name TEXT           description here
 *     count INT           another description
 *
 * the section starts after the first line that trims to exactly
 * "POSITIONALS:" or "Positionals:" and covers the indented lines below it; a
 * blank, whitespace-only or unindented line ends it. on each line the leading
 * name (2+ bytes of letters, digits, '_' or '-') is taken and lowercased, a
 * following run of uppercase letters (the type word) is skipped, and "..."
 * right after that marks the positional as variadic. lines without a usable
 * name are ignored. every positional is reported as mandatory.
 * returns [] when the section is absent. *)
let extract_cli11_positionals text =
  let is_blank c = c = ' ' || c = '\t' in
  let is_upper c = c >= 'A' && c <= 'Z' in
  let is_name_char = function
    | 'a'..'z' | 'A'..'Z' | '0'..'9' | '_' | '-' -> true
    | _ -> false
  in
  (* index of the first byte at or after i that does not satisfy pred *)
  let rec scan pred s i =
    if i < String.length s && pred s.[i] then scan pred s (i + 1) else i
  in
  let parse_one s =
    let name_end = scan is_name_char s 0 in
    if name_end < 2 then None
    else begin
      let after_name = scan is_blank s name_end in
      let after_type = scan is_upper s after_name in
      let i = scan is_blank s after_type in
      let variadic =
        i + 2 < String.length s
        && s.[i] = '.' && s.[i + 1] = '.' && s.[i + 2] = '.'
      in
      Some { pos_name = String.lowercase_ascii (String.sub s 0 name_end);
             optional = false; variadic }
    end
  in
  let is_header line =
    match String.trim line with
    | "POSITIONALS:" | "Positionals:" -> true
    | _ -> false
  in
  (* collect the indented lines that make up the section body *)
  let rec body acc = function
    | [] -> List.rev acc
    | line :: rest ->
      let indented = line <> "" && is_blank line.[0] in
      let content = String.trim line in
      if not indented || content = "" then List.rev acc
      else
        match parse_one content with
        | Some p -> body (p :: acc) rest
        | None -> body acc rest
  in
  let rec after_header = function
    | [] -> []
    | line :: rest ->
      if is_header line then body [] rest else after_header rest
  in
  after_header (String.split_on_char '\n' text)

(* top-level entry point: turn the text of a --help screen into a help_result.
 *   1. escape sequences are removed with strip_ansi
 *   2. help_parser extracts flags and subcommands; it runs in prefix mode,
 *      so it does not have to consume the whole input
 *   3. positionals come from the CLI11 positionals section when that yields
 *      any, otherwise from the usage line
 * returns Ok result, or Error message when the Angstrom parser fails. *)
let parse_help txt =
  let clean = strip_ansi txt in
  let with_positionals result =
    let cli11 = extract_cli11_positionals clean in
    let usage = extract_usage_positionals clean in
    let positionals =
      match cli11 with
      | [] -> usage
      | _ :: _ -> cli11
    in
    { result with positionals }
  in
  Angstrom.parse_string ~consume:Consume.Prefix help_parser clean
  |> Result.map with_positionals