init

2026-03-18 15:40:47 +11:00 · 2026-03-18 15:40:47 +11:00 · daa0c24415
commit daa0c24415
23 changed files with 5336 additions and 0 deletions
--- a/lib/.ocamlformat
+++ b/lib/.ocamlformat
--- a/lib/dune
+++ b/lib/dune
@ -0,0 +1,3 @@
+(library
+ (name inshellah)
+ (libraries angstrom angstrom-unix camlzip str unix))
--- a/lib/manpage.ml
+++ b/lib/manpage.ml
--- a/lib/nushell.ml
+++ b/lib/nushell.ml
@ -0,0 +1,253 @@
+(* nushell.ml — generate nushell extern definitions from parsed help data.
+ *
+ * this module is the code generation backend. it takes a help_result (from
+ * the parser or manpage modules) and produces nushell source code that
+ * defines `extern` declarations — nushell's mechanism for teaching the shell
+ * about external commands' flags and subcommands so it can offer completions.
+ *
+ * it also maintains a list of nushell's built-in commands to avoid generating
+ * extern definitions that would shadow them.
+ *
+ * key responsibilities:
+ *   - deduplicating flag entries (same flag from multiple help sources)
+ *   - mapping parameter names to nushell types (path, int, string)
+ *   - formatting flags in nushell syntax: --flag(-f): type  # description
+ *   - handling positional arguments with nushell's ordering constraints
+ *   - escaping special characters for nushell string literals
+ *)
+
+open Parser
+
+module SSet = Set.Make(String)
+module SMap = Map.Make(String)
+module CSet = Set.Make(Char)
+
+(* nushell built-in commands and keywords — we must never generate `extern`
+ * definitions for these because it would shadow nushell's own implementations.
+ * this list is maintained manually and should be updated with new nushell releases.
+ *
+ * entries are plain command names, written in alphabetical order and grouped
+ * by first letter. membership tests should go through is_nushell_builtin,
+ * which consults a set built from this list instead of scanning it. *)
+let nushell_builtins = [
+  "alias"; "all"; "ansi"; "any"; "append"; "ast"; "attr";
+  "bits"; "break"; "bytes";
+  "cal"; "cd"; "char"; "chunk-by"; "chunks"; "clear"; "collect";
+  "columns"; "commandline"; "compact"; "complete"; "config"; "const";
+  "continue"; "cp";
+  "date"; "debug"; "decode"; "def"; "default"; "describe"; "detect";
+  "do"; "drop"; "du";
+  "each"; "echo"; "encode"; "enumerate"; "error"; "every"; "exec";
+  "exit"; "explain"; "explore"; "export"; "export-env"; "extern";
+  "fill"; "filter"; "find"; "first"; "flatten"; "for"; "format"; "from";
+  "generate"; "get"; "glob"; "grid"; "group-by";
+  "hash"; "headers"; "help"; "hide"; "hide-env"; "histogram";
+  "history"; "http";
+  "if"; "ignore"; "input"; "insert"; "inspect"; "interleave"; "into";
+  "is-admin"; "is-empty"; "is-not-empty"; "is-terminal"; "items";
+  "job"; "join";
+  "keybindings"; "kill";
+  "last"; "length"; "let"; "let-env"; "lines"; "load-env"; "loop"; "ls";
+  "match"; "math"; "merge"; "metadata"; "mkdir"; "mktemp"; "module";
+  "move"; "mut"; "mv";
+  "nu-check"; "nu-highlight";
+  "open"; "overlay";
+  "panic"; "par-each"; "parse"; "path"; "plugin"; "port"; "prepend"; "print"; "ps";
+  "query";
+  "random"; "reduce"; "reject"; "rename"; "return"; "reverse"; "rm";
+  "roll"; "rotate"; "run-external";
+  "save"; "schema"; "scope"; "select"; "seq"; "shuffle"; "skip"; "sleep";
+  "slice"; "sort"; "sort-by"; "source"; "source-env"; "split"; "start";
+  "stor"; "str"; "sys";
+  "table"; "take"; "tee"; "term"; "timeit"; "to"; "touch"; "transpose";
+  "try"; "tutor";
+  "ulimit"; "umask"; "uname"; "uniq"; "uniq-by"; "unlet"; "update";
+  "upsert"; "url"; "use";
+  "values"; "version"; "view";
+  "watch"; "where"; "which"; "while"; "whoami"; "window"; "with-env"; "wrap";
+  "zip";
+]
+
+(* set view of nushell_builtins. the set is built the first time it is forced
+ * and reused afterwards, so membership tests never walk the list. *)
+let builtin_set =
+  lazy
+    (List.fold_left
+       (fun names name -> SSet.add name names)
+       SSet.empty nushell_builtins)
+
+(* true when cmd is exactly the name of a nushell built-in command or keyword
+ * (the comparison is case-sensitive) *)
+let is_nushell_builtin cmd =
+  let builtins = Lazy.force builtin_set in
+  SSet.mem cmd builtins
+
+(* collapse repeated flag entries into one entry per flag.
+ *
+ * entries are identified by their long name when they have one (Long and
+ * Both share a key) and by their short letter otherwise. when several
+ * entries share a key, the one with the highest score wins; on a tie the
+ * earliest one is kept. the score is:
+ *   - switch has both short and long forms: +10
+ *   - entry takes a parameter:              +5
+ *   - one point per 10 description chars:   up to +5
+ *
+ * a standalone Short entry is also dropped when its letter is already the
+ * short half of a winning Both entry, so we never emit "-v" next to
+ * "--verbose(-v)", which nushell would reject as a duplicate.
+ *
+ * the result lists each surviving key once, at the position of its first
+ * occurrence in the input. *)
+let dedup_entries entries =
+  (* identity of an entry for deduplication purposes *)
+  let key_of entry =
+    match entry.switch with
+    | Short letter -> "-" ^ String.make 1 letter
+    | Long long_name | Both (_, long_name) -> "--" ^ long_name
+  in
+  (* how informative an entry is; higher is better *)
+  let score entry =
+    let for_switch =
+      match entry.switch with Both _ -> 10 | Short _ | Long _ -> 0
+    in
+    let for_param = if Option.is_some entry.param then 5 else 0 in
+    let for_desc = min 5 (String.length entry.desc / 10) in
+    for_switch + for_param + for_desc
+  in
+  (* key -> winning entry; an incumbent is only replaced by a strictly
+   * better candidate *)
+  let best =
+    List.fold_left
+      (fun winners candidate ->
+        SMap.update (key_of candidate)
+          (function
+            | Some current when score current >= score candidate -> Some current
+            | Some _ | None -> Some candidate)
+          winners)
+      SMap.empty entries
+  in
+  (* short letters that already appear inside a winning Both entry *)
+  let covered =
+    SMap.to_seq best
+    |> Seq.filter_map (fun (_, winner) ->
+         match winner.switch with
+         | Both (letter, _) -> Some letter
+         | Short _ | Long _ -> None)
+    |> CSet.of_seq
+  in
+  (* walk the input in order, emitting the winner for each key the first time
+   * the key is met; covered shorts are skipped without being marked seen *)
+  let rec emit seen kept = function
+    | [] -> List.rev kept
+    | entry :: rest ->
+      let key = key_of entry in
+      let hidden =
+        SSet.mem key seen
+        || (match entry.switch with
+            | Short letter -> CSet.mem letter covered
+            | Long _ | Both _ -> false)
+      in
+      if hidden then emit seen kept rest
+      else emit (SSet.add key seen) (SMap.find key best :: kept) rest
+  in
+  emit SSet.empty [] entries
+
+(* map a parameter name to a nushell type.
+ * nushell's `extern` declarations use typed parameters, so we infer the type
+ * from the parameter name: file/path-related names become "path" (enables
+ * path completion), numeric names become "int", everything else is "string".
+ *
+ * the name is compared case-insensitively. help texts spell the same
+ * placeholder as FILE, <file> or File, and all of them should get the same
+ * type; previously only a hand-picked mix of upper- and lower-case spellings
+ * was recognised (so "filename" or "count" fell through to "string").
+ *
+ * param: the raw placeholder name, without brackets or '=' decoration.
+ * returns: one of "path", "int" or "string". *)
+let nushell_type_of_param param =
+  match String.uppercase_ascii param with
+  | "FILE" | "PATH" | "DIR" | "DIRECTORY" | "FILENAME" | "PATTERNFILE" -> "path"
+  | "NUM" | "N" | "COUNT" | "NUMBER" | "INT" | "COLS" | "WIDTH" | "LINES"
+  | "DEPTH" -> "int"
+  | _ -> "string"
+
+(* make a string safe to splice between double quotes in nushell source.
+ * each double quote and each backslash gets a backslash in front of it; all
+ * other characters are copied through unchanged. a string containing
+ * neither character is returned as-is without being rebuilt. *)
+let escape_nu s =
+  let needs_escape c = Char.equal c '"' || Char.equal c '\\' in
+  if not (String.exists needs_escape s) then s
+  else
+    let out = Buffer.create (String.length s + 4) in
+    String.iter
+      (fun c ->
+        if needs_escape c then Buffer.add_char out '\\';
+        Buffer.add_char out c)
+      s;
+    Buffer.contents out
+
+(* render one flag entry as a parameter line of a nushell `extern` block.
+ * examples:
+ *   "    --verbose(-v)                       # increase verbosity"
+ *   "    --output(-o): path                  # write output to file"
+ *   "    -n: int                             # number of results"
+ *
+ * layout: four spaces of indent, the switch (nushell writes a combined
+ * short+long flag as "--long(-s)"), then ": type" when the flag takes a
+ * parameter — mandatory and optional parameters are rendered the same way.
+ * when there is a description, the line is padded with spaces to 40 columns
+ * (always at least one space) and the description follows after "# ".
+ * an entry with an empty description produces no padding and no comment. *)
+let format_flag entry =
+  let switch_text =
+    match entry.switch with
+    | Short c -> Printf.sprintf "-%c" c
+    | Long long_name -> Printf.sprintf "--%s" long_name
+    | Both (c, long_name) -> Printf.sprintf "--%s(-%c)" long_name c
+  in
+  let type_text =
+    match entry.param with
+    | None -> ""
+    | Some (Mandatory p | Optional p) -> ": " ^ nushell_type_of_param p
+  in
+  let line = String.concat "" [ "    "; switch_text; type_text ] in
+  match entry.desc with
+  | "" -> line
+  | desc ->
+    let gap = max 1 (40 - String.length line) in
+    String.concat "" [ line; String.make gap ' '; "# "; desc ]
+
+(* render one positional argument as a parameter line of a nushell `extern`
+ * block: "    ...name: type" for a variadic argument, "    name?: type" for
+ * an optional one, "    name: type" otherwise. a variadic argument never
+ * gets the "?" suffix, even when it is also marked optional.
+ * hyphens in the name are replaced by underscores. the type is looked up
+ * from the upper-cased original name via nushell_type_of_param. *)
+let format_positional positional =
+  let ident =
+    String.concat "_" (String.split_on_char '-' positional.pos_name)
+  in
+  let typ =
+    positional.pos_name |> String.uppercase_ascii |> nushell_type_of_param
+  in
+  let decorated =
+    match positional.variadic, positional.optional with
+    | true, _ -> "..." ^ ident
+    | false, true -> ident ^ "?"
+    | false, false -> ident
+  in
+  "    " ^ decorated ^ ": " ^ typ
+
+(* rewrite a positional list so it satisfies nushell's ordering rules:
+ *   1. no required positional may follow an optional one
+ *   2. at most one variadic ("rest") parameter is allowed
+ *
+ * the list is walked once, left to right, carrying two pieces of state:
+ *   - force_optional: set once an optional or variadic positional has been
+ *     kept; every later non-variadic positional is then marked optional
+ *   - rest_taken: set once a variadic positional has been kept; any further
+ *     variadic positional is dropped
+ * relative order of the kept positionals is preserved. *)
+let fixup_positionals positionals =
+  let rec walk ~force_optional ~rest_taken kept = function
+    | [] -> List.rev kept
+    | p :: tail ->
+      if p.variadic then
+        (if rest_taken then walk ~force_optional ~rest_taken kept tail
+         else walk ~force_optional:true ~rest_taken:true (p :: kept) tail)
+      else if force_optional then
+        walk ~force_optional ~rest_taken
+          ({ p with optional = true } :: kept)
+          tail
+      else walk ~force_optional:(p.optional) ~rest_taken (p :: kept) tail
+  in
+  walk ~force_optional:false ~rest_taken:false [] positionals
+
+(* build the nushell source for a command's `extern` declaration.
+ * output looks like:
+ *   export extern "git add" [
+ *     ...pathspec?: path
+ *     --verbose(-v)              # be verbose
+ *     --dry-run(-n)              # dry run
+ *   ]
+ *
+ * positionals come first (after fixup_positionals), then the flags (after
+ * dedup_entries), one per line. every subcommand listed in the result is
+ * then appended as a stub block that carries only its description:
+ *   export extern "git stash" [  # stash changes
+ *   ]
+ *
+ * the command name, subcommand names and subcommand descriptions are passed
+ * through escape_nu before being inserted. *)
+let extern_of cmd_name result =
+  let quoted_cmd = escape_nu cmd_name in
+  (* render each item with [format] and terminate every line with a newline *)
+  let lines_of format items =
+    items |> List.map (fun item -> format item ^ "\n") |> String.concat ""
+  in
+  let positional_text =
+    lines_of format_positional (fixup_positionals result.positionals)
+  in
+  let flag_text = lines_of format_flag (dedup_entries result.entries) in
+  let main_block =
+    Printf.sprintf "export extern \"%s\" [\n%s%s]\n" quoted_cmd positional_text
+      flag_text
+  in
+  (* description-only block for a subcommand *)
+  let stub (sub : subcommand) =
+    Printf.sprintf "\nexport extern \"%s %s\" [  # %s\n]\n"
+      quoted_cmd (escape_nu sub.name) (escape_nu sub.desc)
+  in
+  String.concat "" (main_block :: List.map stub result.subcommands)
+
+(* main entry point for callers: same arguments and result as extern_of *)
+let generate_extern cmd_name result = extern_of cmd_name result
+
+(* derive a nushell `module` name from a command name.
+ * ascii letters, digits, '-' and '_' are kept; every other character is
+ * replaced by a hyphen. "-completions" is then appended.
+ * e.g. "git" becomes "git-completions" and "docker-compose" becomes
+ * "docker-compose-completions". *)
+let module_name_of cmd_name =
+  let keep c =
+    match c with
+    | 'a' .. 'z' | 'A' .. 'Z' | '0' .. '9' | '-' | '_' -> true
+    | _ -> false
+  in
+  let sanitized = String.map (fun c -> if keep c then c else '-') cmd_name in
+  sanitized ^ "-completions"
+
+(* wrap a command's `extern` output in a nushell `module` and import it.
+ * output: "module git-completions { ... }\n\nuse git-completions *\n"
+ * the module name comes from module_name_of; the trailing `use` line brings
+ * the module's exports into scope right away. *)
+let generate_module cmd_name result =
+  let mod_name = module_name_of cmd_name in
+  let body = extern_of cmd_name result in
+  Printf.sprintf "module %s {\n%s}\n\nuse %s *\n" mod_name body mod_name
+
+(* shortcut for callers that only have flag data: builds a help_result with
+ * the given entries, no subcommands, no positionals and an empty
+ * description, and hands it to generate_extern. *)
+let generate_extern_from_entries cmd_name entries =
+  let flags_only =
+    { entries; subcommands = []; positionals = []; description = "" }
+  in
+  generate_extern cmd_name flags_only
--- a/lib/parser.ml
+++ b/lib/parser.ml
@ -0,0 +1,814 @@
+(* parser.ml — parse --help output into structured flag/subcommand/positional data.
+ *
+ * this module is the core of inshellah's help-text understanding. it takes the
+ * raw text that a cli tool prints when you run `cmd --help` and extracts:
+ *   - flag entries (short/long switches with optional parameters and descriptions)
+ *   - subcommand listings (name + description pairs)
+ *   - positional arguments (from usage lines)
+ *
+ * the parser is built on Angstrom (a monadic parser combinator library) for the
+ * structured flag/subcommand extraction, with hand-rolled imperative parsers for
+ * usage-line positional extraction (where the format is too varied for clean
+ * combinator composition).
+ *
+ * key design decisions:
+ *   - the Angstrom parser runs in prefix-consume mode — it doesn't need to parse
+ *     the entire input, just extract what it can recognize. unrecognized lines are
+ *     skipped via skip_non_option_line.
+ *   - multi-line descriptions are handled via indentation-based continuation:
+ *     lines indented 8+ spaces that don't start with '-' are folded into the
+ *     previous entry's description.
+ *   - subcommand detection uses a heuristic: lines with a name followed by 2+
+ *     spaces then a description, where the name is at least 2 chars. section
+ *     headers (like "arguments:") toggle whether name-description pairs are
+ *     treated as subcommands or positionals.
+ *   - positional extraction has two paths: usage-line parsing (the common case)
+ *     and CLI11's explicit "positionals:" section format.
+ *)
+
+open Angstrom
+
+(* strip ansi escape sequences and osc hyperlinks from --help output.
+ * many modern cli tools emit colored/styled output even when piped,
+ * so we need to clean this before parsing. handles:
+ *   - csi sequences (esc [ ... final_byte) — colors, cursor movement, etc.
+ *     the final byte is the first byte in the range '@' .. '~'.
+ *   - osc sequences (esc ] ... bel/st) — hyperlinks, window titles, etc.
+ *     terminated by bel (0x07) or by esc followed by a backslash.
+ *   - other two-byte esc+char sequences
+ *
+ * a sequence that is cut off by the end of the input is dropped entirely.
+ * this includes a lone esc as the very last byte, which used to be copied
+ * into the output even though truncated csi/osc sequences were dropped.
+ *
+ * returns a new string containing every byte of s that is not part of an
+ * escape sequence, in the original order. *)
+let strip_ansi s =
+  let len = String.length s in
+  let buf = Buffer.create len in
+  (* index just past the csi sequence whose parameter bytes start at i *)
+  let rec skip_csi i =
+    if i >= len then len
+    else if s.[i] >= '@' && s.[i] <= '~' then i + 1
+    else skip_csi (i + 1)
+  in
+  (* index just past the osc sequence whose payload starts at i *)
+  let rec skip_osc i =
+    if i >= len then len
+    else if s.[i] = '\x07' then i + 1
+    else if s.[i] = '\x1b' && i + 1 < len && s.[i + 1] = '\\' then i + 2
+    else skip_osc (i + 1)
+  in
+  let rec copy_from i =
+    if i < len then begin
+      if s.[i] <> '\x1b' then begin
+        Buffer.add_char buf s.[i];
+        copy_from (i + 1)
+      end
+      else if i + 1 >= len then
+        (* lone esc at the very end: a truncated sequence, drop it *)
+        ()
+      else
+        match s.[i + 1] with
+        | '[' -> copy_from (skip_csi (i + 2))
+        | ']' -> copy_from (skip_osc (i + 2))
+        | _ -> copy_from (i + 2)
+    end
+  in
+  copy_from 0;
+  Buffer.contents buf
+
+(* --- character class predicates ---
+ * used throughout the Angstrom parsers to classify characters.
+ * separated out for readability and reuse. *)
+
+(* horizontal whitespace only: space or tab (line breaks do not count) *)
+let is_whitespace c = Char.equal c ' ' || Char.equal c '\t'
+
+(* ascii letters and digits *)
+let is_alphanumeric c =
+  (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9')
+
+(* characters that may appear in a parameter name such as FILE or
+ * output-dir: ascii letters, digits, underscore and hyphen *)
+let is_param_char c =
+  match c with
+  | '_' | '-' -> true
+  | _ ->
+    (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9')
+
+(* uppercase ascii letter or underscore — the building blocks of ALL_CAPS
+ * parameter names like FILE or TIME_STYLE *)
+let is_upper_or_underscore c = Char.equal c '_' || (c >= 'A' && c <= 'Z')
+
+(* characters that may appear in a long flag name (--foo-bar,
+ * --enable-feature2): ascii letters, digits and hyphen *)
+let is_long_char c =
+  Char.equal c '-'
+  || (c >= 'a' && c <= 'z')
+  || (c >= 'A' && c <= 'Z')
+  || (c >= '0' && c <= '9')
+
+(* --- core types ---
+ * these types represent the structured output of parsing a help text.
+ * they are shared across the entire codebase (nushell codegen, store, manpage parser).
+ *
+ * switch: a flag can be short-only (-v), long-only (--verbose), or both (-v, --verbose).
+ *   the both variant keeps the pair together so nushell can emit "--verbose(-v)".
+ *   names are stored without their leading dashes.
+ *
+ * param: flags can take mandatory (--output FILE) or optional (--color[=WHEN]) values.
+ *   the payload is the placeholder name as written in the help text.
+ *
+ * entry: one complete flag definition — its switch form, optional parameter, and
+ *   the description text (potentially multi-line, already joined).
+ *
+ * subcommand: a name/description pair taken from a subcommand listing.
+ *   subcommand_entry stores the name lowercased.
+ *
+ * positional: one positional argument. optional and variadic are independent
+ *   flags; the nushell backend renders a variadic positional as "...name"
+ *   and an optional, non-variadic one as "name?".
+ *
+ * help_result: the complete parsed output for a single command. its
+ *   description field is free text; how it gets filled in is not visible
+ *   from these definitions. *)
+type switch = Short of char | Long of string | Both of char * string
+type param = Mandatory of string | Optional of string
+type entry = { switch : switch; param : param option; desc : string }
+type subcommand = { name : string; desc : string }
+type positional = { pos_name : string; optional : bool; variadic : bool }
+type help_result = { entries : entry list; subcommands : subcommand list; positionals : positional list; description : string }
+
+(* --- low-level Angstrom combinators ---
+ * building blocks for all the parsers below. *)
+
+(* skip any run of spaces and tabs (possibly empty) without ever crossing a
+ * line break; never fails *)
+let inline_ws = skip_while is_whitespace
+(* end of line — matches either a newline or end of input.
+ * this is the permissive version used in most places.
+ * end_of_line is tried first; end_of_input is only consulted when no line
+ * terminator is present, and succeeding there consumes nothing. *)
+let eol = end_of_line <|> end_of_input
+(* strict end of line — must consume an actual newline character.
+ * used by skip_non_option_line and section_header, which must not succeed
+ * at end of input: eol would match there without consuming anything, so a
+ * parser repeated over it could keep succeeding without making progress. *)
+let eol_strict = end_of_line
+
+(* --- switch and parameter parsers ---
+ * parse the flag name portion of an option line, e.g. "-v", "--verbose"
+ *
+ * short_switch: a single '-' followed by exactly one ascii letter or digit;
+ *   yields that character. it does not match the start of "--name", because
+ *   the second '-' is not alphanumeric.
+ * long_switch: "--" followed by one or more is_long_char characters; yields
+ *   the name without the leading dashes.
+ * comma: a ',' plus any spaces/tabs after it, as found between the two
+ *   halves of "-v, --verbose". *)
+
+let short_switch = char '-' *> satisfy is_alphanumeric
+let long_switch = string "--" *> take_while1 is_long_char
+let comma = char ',' *> inline_ws
+
+(* parameter parsers — the syntaxes tools use to say that a flag takes a
+ * value are surprisingly diverse. each format has its own small parser:
+ *   --output=FILE        (eq_man_param — mandatory, common in gnu tools)
+ *   --color[=WHEN]       (eq_opt_param — optional with = syntax)
+ *   --depth DEPTH        (space_upper_param — space-separated ALL_CAPS)
+ *   --file <path>        (space_angle_param — angle brackets)
+ *   --file [<path>]      (space_opt_angle_param — optional angle brackets)
+ *   --format string      (space_type_param — go/cobra lowercase type word)
+ *
+ * the two parsers below handle the '=' forms. both read the name as one or
+ * more is_param_char characters and start directly after the flag name,
+ * with no whitespace in between. *)
+let eq_opt_param =
+  lift
+    (fun name -> Optional name)
+    (string "[=" *> take_while1 is_param_char <* char ']')
+
+let eq_man_param =
+  lift (fun name -> Mandatory name) (char '=' *> take_while1 is_param_char)
+
+(* space-separated ALL_CAPS param: e.g. " FILE", " TIME_STYLE".
+ * exactly one space must separate the flag from the name. the name has to
+ * start with an uppercase letter or underscore, and the whole word (read
+ * with is_param_char) must consist only of uppercase letters, underscores
+ * and digits — so "SHA256" is accepted, while a description word such as
+ * "Do" or "Set", or a name containing a hyphen, is rejected.
+ * yields a Mandatory param. *)
+let space_upper_param =
+  let is_caps_char c = is_upper_or_underscore c || (c >= '0' && c <= '9') in
+  char ' ' *> peek_char_fail >>= fun first ->
+  if not (is_upper_or_underscore first) then fail "not an uppercase param"
+  else
+    take_while1 is_param_char >>= fun name ->
+    if String.for_all is_caps_char name then return (Mandatory name)
+    else fail "not an all-caps param"
+
+(* angle-bracket params. the name is everything between '<' and the next
+ * '>' and must not be empty.
+ *   angle_param            <file>      -> Mandatory
+ *   space_angle_param      one space, then <file>
+ *   opt_angle_param        [<file>]    -> Optional
+ *   space_opt_angle_param  one space, then [<file>] *)
+let angle_param =
+  lift
+    (fun name -> Mandatory name)
+    (char '<' *> take_while1 (fun c -> c <> '>') <* char '>')
+
+let space_angle_param = char ' ' *> angle_param
+
+let opt_angle_param =
+  let bracketed_name = char '<' *> take_while1 (fun c -> c <> '>') <* char '>' in
+  lift (fun name -> Optional name) (char '[' *> bracketed_name <* char ']')
+
+let space_opt_angle_param = char ' ' *> opt_angle_param
+
+(* go/cobra style: one space, then a lowercase type word such as "string",
+ * "list" or "int". go's flag libraries commonly print "--timeout duration"
+ * or "--name string", where the type name is a short lowercase word.
+ * the word is the full run of lowercase ascii letters after the space and
+ * may be at most 10 characters long; anything longer is almost certainly
+ * the start of a description rather than a type annotation.
+ * yields a Mandatory param. *)
+let space_type_param =
+  let is_lower c = c >= 'a' && c <= 'z' in
+  char ' ' *> peek_char_fail >>= fun first ->
+  if not (is_lower first) then fail "not a lowercase type param"
+  else
+    take_while1 is_lower >>= fun word ->
+    if String.length word > 10 then fail "too long for type param"
+    else return (Mandatory word)
+
+(* parse the optional parameter that may follow a flag name.
+ * the individual formats are tried in the order listed and the first one
+ * that succeeds wins: the two '=' forms, then the two angle-bracket forms,
+ * then ALL_CAPS, then the go-style lowercase type word. when none of them
+ * matches, nothing is consumed and the result is None. *)
+let param_parser =
+  let any_param =
+    choice
+      [ eq_opt_param; eq_man_param;
+        space_opt_angle_param; space_angle_param;
+        space_upper_param; space_type_param ]
+  in
+  option None (lift Option.some any_param)
+
+(* switch parser — handles the various ways help text presents flag names.
+ * formats handled (in order of attempt):
+ *   -a, --all       (short + comma + long — gnu style)
+ *   -a --all        (short + single space + long — some tools omit the comma)
+ *   --all / -a      (long + slash + short — rare but seen in some tools)
+ *   -a              (short only)
+ *   --all           (long only)
+ *
+ * the ordering is critical: choice returns the result of the first
+ * alternative that succeeds. short_switch on its own already succeeds on
+ * the "-a" prefix of "-a, --all", and long_switch on the "--all" prefix of
+ * "--all / -a", so the combined forms have to be tried before the
+ * single-form parsers. *)
+let switch_parser =
+  let both short_char long_name = Both (short_char, long_name) in
+  choice
+    [
+      lift2 both short_switch (comma *> long_switch);
+      lift2 both short_switch (char ' ' *> long_switch);
+      lift2
+        (fun long_name short_char -> Both (short_char, long_name))
+        long_switch
+        (inline_ws *> char '/' *> inline_ws *> short_switch);
+      lift (fun short_char -> Short short_char) short_switch;
+      lift (fun long_name -> Long long_name) long_switch;
+    ]
+
+(* --- description parsing with multi-line continuation ---
+ * descriptions in help text often wrap across multiple lines. the convention
+ * is that continuation lines are deeply indented (8+ spaces) and don't start
+ * with '-' (which would indicate a new flag entry). we peek ahead to check
+ * indentation without consuming, then decide whether to fold the line in. *)
+
+(* everything up to, but not including, the next line feed or carriage
+ * return (or the end of input). may return the empty string; never fails. *)
+let rest_of_line = take_while (fun c -> c <> '\n' && c <> '\r')
+
+(* check if a line is a continuation line: deeply indented, doesn't start with '-'.
+ * tabs count as 8 spaces to match typical terminal rendering.
+ * the 8-space threshold was chosen empirically — most help formatters indent
+ * descriptions at least this much, while flag lines are indented 2-4 spaces.
+ *
+ * on success the whole line is consumed, including its line ending (or the
+ * end of input), and the text after the leading whitespace is returned.
+ * on failure nothing is reported as a continuation.
+ *
+ * the decision is made on a preview of at most 80 available characters.
+ * the initial peek_string 1 fails at end of input, so the avail = 0 check
+ * below is only a defensive fallback.
+ *
+ * NOTE(review): the dash test trims the whole preview, which can span
+ * several lines. for a line that contains only whitespace, the character
+ * tested is therefore the first non-blank character of a following line. *)
+let continuation_line =
+  peek_string 1 >>= fun _ ->
+  (* count_indent: width of the leading run of spaces and tabs in s
+   * (a tab counts as 8); counting stops at the first other character *)
+  let count_indent s =
+    let indent = ref 0 in
+    let pos = ref 0 in
+    while !pos < String.length s do
+      (match s.[!pos] with
+       | ' ' -> incr indent
+       | '\t' -> indent := !indent + 8
+       | _ -> pos := String.length s);
+      incr pos
+    done;
+    !indent
+  in
+  available >>= fun avail ->
+  if avail = 0 then fail "eof"
+  else
+    (* peek ahead to see indentation level *)
+    peek_string (min avail 80) >>= fun preview ->
+    let indent = count_indent preview in
+    let trimmed = String.trim preview in
+    let starts_with_dash =
+      String.length trimmed > 0 && trimmed.[0] = '-'
+    in
+    if indent >= 8 && not starts_with_dash then
+      (* this is a continuation line — consume whitespace + text *)
+      inline_ws *> rest_of_line <* eol
+    else
+      fail "not a continuation line"
+
+(* parse description text: the remainder of the current line (after
+ * switch+param) followed by any number of continuation lines.
+ * every line is trimmed, lines that are empty after trimming are dropped,
+ * and the rest are joined with single spaces. the result is the empty
+ * string when no text is left. *)
+let description =
+  let tidy lines =
+    lines
+    |> List.filter_map (fun line ->
+         match String.trim line with "" -> None | text -> Some text)
+    |> String.concat " "
+  in
+  inline_ws *> rest_of_line <* eol >>= fun first_line ->
+  many continuation_line >>| fun more_lines -> tidy (first_line :: more_lines)
+
+(* description that sits on its own lines below the flag, as in the clap
+ * (rust) "long" help format:
+ *   --verbose
+ *           increase verbosity
+ * requires at least one continuation line. lines are trimmed, blank ones
+ * are dropped, and the rest are joined with single spaces. *)
+let description_below =
+  let tidy lines =
+    lines
+    |> List.filter_map (fun line ->
+         match String.trim line with "" -> None | text -> Some text)
+    |> String.concat " "
+  in
+  lift tidy (many1 continuation_line)
+
+(* --- line classification for skipping ---
+ * the parser needs to skip lines it doesn't understand (section headers,
+ * blank lines, description paragraphs not attached to a flag, etc.)
+ * without consuming lines that are flag entries. *)
+
+(* peek ahead to check if the current line looks like a flag entry.
+ * an option line starts with optional whitespace and then '-'.
+ * nothing is consumed: the parser either returns () or fails.
+ *
+ * only the current line is examined. the preview of up to 40 available
+ * characters is cut at the first line feed before it is trimmed; without
+ * that cut, trimming would skip over the line break and a blank line
+ * sitting directly above an option line would itself be reported as an
+ * option line. section_header isolates its first line the same way.
+ *
+ * fails at end of input (the initial peek_string 1 cannot succeed there;
+ * the avail = 0 check is a defensive fallback). *)
+let at_option_line =
+  peek_string 1 >>= fun _ ->
+  available >>= fun avail ->
+  if avail = 0 then fail "eof"
+  else
+    peek_string (min avail 40) >>= fun preview ->
+    let line_end =
+      match String.index_opt preview '\n' with
+      | Some pos -> pos
+      | None -> String.length preview
+    in
+    let line = String.trim (String.sub preview 0 line_end) in
+    if String.length line > 0 && line.[0] = '-' then return ()
+    else fail "not an option line"
+
+(* skip a non-option line (section header, blank, description-only, etc.).
+ * uses eol_strict (not eol) so it won't match at eof — this prevents the
+ * parser from infinitely skipping at the end of input. the first
+ * alternative is meant to fail when the line looks like an option line
+ * (at_option_line succeeds), so that the entry parser gets a chance at it
+ * instead.
+ *
+ * NOTE(review): as written, the guard does not stop option lines from being
+ * skipped. the first alternative fails in both cases (either at_option_line
+ * fails, or the explicit fail runs), and <|> then runs the second
+ * alternative, so any line that ends in a newline is consumed. whether an
+ * option line ever reaches this parser depends on what the caller tries
+ * before it — confirm the intended behaviour before relying on the guard. *)
+let skip_non_option_line =
+  (at_option_line *> fail "this is an option line")
+  <|> (rest_of_line *> eol_strict *> return ())
+
+(* --- entry parsing --- *)
+
+(* parse a single flag entry: leading whitespace, then switch+param, then description.
+ * the description can appear on the same line (inline) or on the next line (below).
+ * if there's no description at all, we accept an empty string.
+ * the (eol *> description_below) branch handles the clap long-help format.
+ *
+ * yields an entry record built from the parsed switch, the optional param
+ * and the joined description text.
+ *
+ * NOTE(review): description accepts an empty first line and then folds in
+ * continuation lines itself, so it already covers the clap layout. it only
+ * fails when the rest of the line ends in a carriage return that is not
+ * followed by a line feed, and eol cannot succeed directly after the param
+ * in that situation, so the fallback branch looks unreachable — confirm
+ * before simplifying. *)
+let entry =
+  inline_ws *>
+  lift2 (fun (sw, param) desc -> { switch = sw; param; desc })
+    (lift2 (fun a b -> (a, b)) switch_parser param_parser)
+    (description <|> (eol *> (description_below <|> return "")))
+
+(* --- subcommand parsing ---
+ * subcommand lines in help text follow the pattern:
+ *   "  name   description"
+ * where the name and description are separated by 2+ spaces.
+ * some tools also include argument placeholders between name and description:
+ *   "  start UNIT...   start one or more units"
+ *   "  list [PATTERN]  list matching units"
+ *)
+
+(* characters that may appear in a subcommand name: ascii letters, digits,
+ * hyphen and underscore *)
+let is_subcommand_char c =
+  match c with
+  | '-' | '_' -> true
+  | _ ->
+    (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9')
+
+(* skip argument placeholders like UNIT..., [PATTERN...|PID...], <file>
+ * that appear between the subcommand name and the description.
+ * only consumes single-space gaps — the two-space gap before the
+ * description is left for the main parser to use as the delimiter.
+ *
+ * this is a recursive (fix-point) parser that peeks ahead to distinguish
+ * single-space argument gaps from the double-space description separator.
+ * it accepts tokens that start with [, <, or are ALL_CAPS (with dots/pipes/
+ * commas for variadic syntax).
+ *
+ * it always succeeds with (): when no placeholder follows (or fewer than two
+ * characters are available) nothing is consumed. each accepted placeholder
+ * is consumed together with the single space in front of it, and the parser
+ * then tries again at the new position.
+ *
+ * a token runs up to the next space or line break within a preview of at
+ * most 80 available characters. NOTE(review): a bracketed placeholder that
+ * contains spaces is therefore only recognised up to its first space; the
+ * remainder is judged as a separate token. *)
+let skip_arg_placeholders =
+  fix (fun self ->
+    (* peek ahead: single space followed by arg-like token *)
+    available >>= fun avail ->
+    if avail < 2 then return ()
+    else
+    peek_string (min avail 2) >>= fun peek_two ->
+    if String.length peek_two >= 2 && peek_two.[0] = ' ' && peek_two.[1] <> ' ' then
+      (* single space — could be an arg placeholder *)
+      let next = peek_two.[1] in
+      if next = '[' || next = '<'
+         || (next >= 'A' && next <= 'Z') then
+        (* peek the full token to check if it's ALL_CAPS/brackets *)
+        peek_string (min avail 80) >>= fun preview ->
+        (* extract the token after the single space *)
+        let tok_start = 1 in
+        let token_end = ref tok_start in
+        while !token_end < String.length preview
+              && preview.[!token_end] <> ' '
+              && preview.[!token_end] <> '\n'
+              && preview.[!token_end] <> '\r' do
+          incr token_end
+        done;
+        let tok = String.sub preview tok_start (!token_end - tok_start) in
+        (* accept as placeholder if it starts with [ or <, or consists only
+           of uppercase letters, digits, and the punctuation _ - . | ,
+           (tok is never empty here: its first character is next) *)
+        let is_placeholder =
+          tok.[0] = '[' || tok.[0] = '<'
+          || String.for_all (fun c ->
+               (c >= 'A' && c <= 'Z') || c = '_' || c = '-'
+               || c = '.' || c = '|' || c = ',' || (c >= '0' && c <= '9')
+             ) tok
+        in
+        if is_placeholder then
+          advance (1 + String.length tok) *> self
+        else return ()
+      else return ()
+    else return ())
+
+(* parse one line of a subcommand listing.
+ * shape: optional indent, a name of at least 2 is_subcommand_char
+ * characters, optional argument placeholders, at least two spaces, then
+ * the description up to the end of the line.
+ * the name is stored lowercased for consistent lookup.
+ *
+ * some tools decorate their listings like
+ *   "  add   - add a new item"
+ * so a leading "- " on the (trimmed) description is removed and the
+ * remainder trimmed again. *)
+let subcommand_entry =
+  let clean_desc raw =
+    let text = String.trim raw in
+    if String.starts_with ~prefix:"- " text then
+      String.trim (String.sub text 2 (String.length text - 2))
+    else text
+  in
+  inline_ws *> take_while1 is_subcommand_char >>= fun name ->
+  if String.length name < 2 then fail "subcommand name too short"
+  else
+    skip_arg_placeholders *> char ' ' *> char ' ' *> inline_ws *> rest_of_line
+    <* eol
+    >>| fun raw ->
+    { name = String.lowercase_ascii name; desc = clean_desc raw }
+
+(* --- section header detection ---
+ * section headers are critical for disambiguating subcommands from positional
+ * arguments. lines like "commands:" introduce subcommand sections, while
+ * "arguments:" or "positionals:" introduce argument sections where the same
+ * name+description format should not be treated as subcommands. *)
+
+(* true when s is the title of a section that lists positional arguments:
+ * "arguments", "args", "positionals" or "positional arguments".
+ * the comparison ignores case and surrounding whitespace, and tolerates one
+ * trailing colon (with optional whitespace before it). *)
+let is_arg_section s =
+  let lowered = s |> String.trim |> String.lowercase_ascii in
+  let n = String.length lowered in
+  let base =
+    if n > 0 && lowered.[n - 1] = ':' then
+      String.trim (String.sub lowered 0 (n - 1))
+    else lowered
+  in
+  List.mem base [ "arguments"; "args"; "positionals"; "positional arguments" ]
+
+(* a section header: left-aligned (or lightly indented, <= 4 spaces) text
+ * ending with ':', not starting with '-'. must be consumed before
+ * subcommand_entry in the choice combinator, otherwise "commands:" would
+ * be parsed as a subcommand named "commands" with description ":".
+ *
+ * returns a bool indicating whether this is an argument section (true)
+ * or some other section (false). this drives the subcommand filtering logic
+ * in help_parser — entries under argument sections are excluded from the
+ * subcommand list.
+ *
+ * on success the whole line and its line ending are consumed. because
+ * eol_strict is used, a header on a final line without a trailing newline
+ * is not accepted.
+ *
+ * the decision is made on a preview of at most 80 available characters,
+ * cut at the first line feed. indentation is counted in characters, with a
+ * tab counting as one. NOTE(review): for a line longer than the preview,
+ * the ':' test looks at the last previewed character rather than the real
+ * end of the line. *)
+let section_header =
+  available >>= fun avail ->
+  if avail = 0 then fail "eof"
+  else
+    peek_string (min avail 80) >>= fun preview ->
+    (* extract just the first line from the preview *)
+    let first_line = match String.index_opt preview '\n' with
+      | Some pos -> String.sub preview 0 pos
+      | None -> preview in
+    let trimmed = String.trim first_line in
+    let len = String.length trimmed in
+    let indent = let pos = ref 0 in
+      while !pos < String.length first_line && (first_line.[!pos] = ' ' || first_line.[!pos] = '\t') do incr pos done;
+      !pos in
+    if len >= 2 && trimmed.[len - 1] = ':' && trimmed.[0] <> '-' && indent <= 4 then
+      rest_of_line <* eol_strict >>| fun line -> is_arg_section line
+    else fail "not a section header"
+
+(* --- top-level parser ---
+ * the main help parser: walks through all lines, trying each line as one of:
+ *   1. a flag entry (starts with whitespace + '-')
+ *   2. a section header (left-aligned text ending with ':')
+ *   3. a subcommand line (name + 2+ spaces + description)
+ *   4. anything else — skip
+ *
+ * the choice ordering matters: entries are tried first (highest priority),
+ * then section headers (must beat subcommand_entry to avoid misparse),
+ * then subcommands, then skip as fallback.
+ *
+ * after collecting all items, two post-processing steps happen:
+ *   - subcommands under argument sections are excluded (tracked via
+ *     a running in_arg_sec boolean toggled by section headers)
+ *   - duplicate subcommand names are deduplicated, keeping the entry
+ *     with the longer description (heuristic: more info = better)
+ *
+ * positionals are not extracted here — they come from the usage line
+ * parser (extract_usage_positionals) or CLI11's explicit section parser
+ * (extract_cli11_positionals), applied later in parse_help. *)
+let help_parser =
+  let open Angstrom in
+  fix (fun _self ->
+    (* classify one line; the order of the alternatives is significant *)
+    let classified_line =
+      choice [
+        (entry >>| fun e -> `Entry e);
+        (section_header >>| fun is_arg -> `Section is_arg);
+        (subcommand_entry >>| fun sc -> `Subcommand sc);
+        (skip_non_option_line >>| fun () -> `Skip);
+      ]
+    in
+    many classified_line >>| fun items ->
+    let entries =
+      List.filter_map (function `Entry e -> Some e | _ -> None) items
+    in
+    (* walk the lines in order, remembering whether the most recent section
+     * header opened an argument section; subcommand-shaped lines seen while
+     * it did are dropped *)
+    let keep_outside_arg_sections (in_arg_sec, kept) = function
+      | `Section is_arg -> (is_arg, kept)
+      | `Subcommand sc when not in_arg_sec -> (in_arg_sec, sc :: kept)
+      | _ -> (in_arg_sec, kept)
+    in
+    let candidates =
+      List.fold_left keep_outside_arg_sections (false, []) items
+      |> snd |> List.rev
+    in
+    (* one entry per name: a later duplicate replaces an earlier one only
+     * when its description is strictly longer *)
+    let subcommands =
+      candidates
+      |> List.fold_left (fun by_name sc ->
+           match List.assoc_opt sc.name by_name with
+           | Some prev
+             when String.length prev.desc >= String.length sc.desc -> by_name
+           | _ -> (sc.name, sc) :: List.remove_assoc sc.name by_name
+         ) []
+      |> List.rev_map snd
+    in
+    { entries; subcommands; positionals = []; description = "" })
+
+(* --- usage line parsing ---
+ * usage lines look like: "usage: git add [OPTIONS] [--] [<pathspec>...]"
+ * to extract positional arguments, we first need to skip past the command
+ * name prefix ("git add") to reach the argument portion.
+ *
+ * skip_command_prefix walks word-by-word, treating each space-separated
+ * token as part of the command name as long as it:
+ *   - is made of "word chars" (alphanumeric, hyphen, underscore, slash, dot)
+ *   - contains at least one lowercase letter (to distinguish from ALL_CAPS
+ *     positional names like FILE)
+ *   - doesn't start with [, <, (, {, or - (which indicate arguments, not
+ *     command name components)
+ *
+ * this is an imperative index-walking parser rather than using Angstrom,
+ * because usage lines are a single string (not line-oriented) and the format
+ * is too varied for clean combinator composition. *)
+let skip_command_prefix s =
+  let len = String.length s in
+  let is_blank c = c = ' ' || c = '\t' in
+  let is_word_char = function
+    | 'a'..'z' | 'A'..'Z' | '0'..'9' | '-' | '_' | '/' | '.' -> true
+    | _ -> false
+  in
+  (* characters that open an argument rather than continue the command name *)
+  let starts_argument = function
+    | '[' | '<' | '(' | '{' | '-' -> true
+    | _ -> false
+  in
+  (* first index >= i whose character does not satisfy pred (or len) *)
+  let rec advance_while pred i =
+    if i < len && pred s.[i] then advance_while pred (i + 1) else i
+  in
+  (* returns the offset where the argument portion begins *)
+  let rec walk from =
+    let i = advance_while is_blank from in
+    if i >= len || starts_argument s.[i] || not (is_word_char s.[i]) then i
+    else
+      let stop = advance_while is_word_char i in
+      let word = String.sub s i (stop - i) in
+      (* a word without any lowercase letter is taken to be a positional
+       * name, so the prefix ends where that word starts *)
+      if String.exists (fun c -> c >= 'a' && c <= 'z') word then walk stop
+      else i
+  in
+  walk 0
+
+(* parse the argument portion of a usage line into positional definitions.
+ * handles these syntactic forms:
+ *   <file>          - mandatory positional
+ *   [file]          - optional positional
+ *   FILE            - mandatory positional (ALL_CAPS convention)
+ *   <file>...       - variadic (also handles utf-8 ellipsis)
+ *   [file...]       - optional variadic
+ *   curly-brace alternatives - skipped, not a positional
+ *   -flag           - flags (skipped)
+ *
+ * certain ALL_CAPS names are skipped because they're not real positionals —
+ * "OPTIONS", "FLAGS", etc. are section labels that sometimes appear in usage
+ * lines for readability.
+ *
+ * deduplication at the end ensures we don't emit the same positional twice
+ * (can happen when usage lines are reformatted or repeated). *)
+let parse_usage_args s =
+  let len = String.length s in
+  let pos = ref 0 in
+  let positionals = ref [] in
+  let skip_ws () =
+    while !pos < len && (s.[!pos] = ' ' || s.[!pos] = '\t') do incr pos done in
+  let is_pos_char c =
+    (c >= 'A' && c <= 'Z') || c = '_' || c = '-' || (c >= '0' && c <= '9') in
+  (* the two spellings of "variadic": three ascii dots, or the 3-byte utf-8
+   * encoding of the ellipsis character *)
+  let ascii_dots = "..." in
+  let utf8_ellipsis = "\xe2\x80\xa6" in
+  let at_marker marker =
+    let n = String.length marker in
+    !pos + n <= len && String.sub s !pos n = marker in
+  (* skip whitespace, then consume a variadic marker if one follows *)
+  let read_dots () =
+    skip_ws ();
+    if at_marker ascii_dots || at_marker utf8_ellipsis then
+      (pos := !pos + 3; true)
+    else false
+  in
+  (* split a trailing variadic marker off the text inside [...]. both
+   * spellings are accepted so that "[file…]" is treated like "[file...]",
+   * matching what read_dots accepts after a closing bracket *)
+  let strip_trailing_ellipsis text =
+    let drop marker =
+      if String.ends_with ~suffix:marker text then
+        Some (String.trim
+                (String.sub text 0 (String.length text - String.length marker)))
+      else None
+    in
+    match drop ascii_dots with
+    | Some rest -> (rest, true)
+    | None ->
+      (match drop utf8_ellipsis with
+       | Some rest -> (rest, true)
+       | None -> (text, false))
+  in
+  (* names that are section labels, not actual positional arguments *)
+  let is_skip name =
+    let u = String.uppercase_ascii name in
+    u = "OPTIONS" || u = "OPTION" || u = "FLAGS" || u = "FLAG"
+  in
+  (* at least 2 chars, all of them ascii alphanumeric, '_' or '-' *)
+  let is_clean_name name =
+    String.length name >= 2
+    && String.for_all (fun c ->
+         (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z')
+         || (c >= '0' && c <= '9') || c = '_' || c = '-') name
+  in
+  let is_letter c = (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') in
+  (* record a positional (name lowercased) unless the name is malformed or
+   * is one of the section labels *)
+  let add name ~optional ~variadic =
+    if is_clean_name name && not (is_skip name) then
+      positionals :=
+        { pos_name = String.lowercase_ascii name; optional; variadic }
+        :: !positionals
+  in
+  (* skip {A|c|d|...} alternative blocks — not positional arguments *)
+  let skip_braces () =
+    if !pos < len && s.[!pos] = '{' then begin
+      let depth = ref 1 in
+      incr pos;
+      while !pos < len && !depth > 0 do
+        if s.[!pos] = '{' then incr depth
+        else if s.[!pos] = '}' then decr depth;
+        incr pos
+      done;
+      ignore (read_dots ());
+      true
+    end else false
+  in
+  while !pos < len do
+    skip_ws ();
+    if !pos >= len then ()
+    else if skip_braces () then ()
+    else match s.[!pos] with
+    | '[' ->
+      (* optional positional: [name] or [<name>] or [name...] *)
+      incr pos;
+      let start = !pos in
+      let depth = ref 1 in
+      while !pos < len && !depth > 0 do
+        if s.[!pos] = '[' then incr depth
+        else if s.[!pos] = ']' then decr depth;
+        incr pos
+      done;
+      (* with a matching ']' pos sits just past it, so the content ends one
+       * byte earlier. with an unterminated group (e.g. a usage line that
+       * wraps mid-bracket) pos = len and every remaining byte is content;
+       * subtracting one there would chop the last character of the name *)
+      let inner_end = if !depth = 0 then !pos - 1 else !pos in
+      let inner = String.sub s start (inner_end - start) |> String.trim in
+      let inner, has_inner_dots = strip_trailing_ellipsis inner in
+      (* read_dots runs (and may consume input) even if the name is rejected *)
+      let variadic = has_inner_dots || read_dots () in
+      if String.length inner > 0
+         && (is_letter inner.[0] || inner.[0] = '<') then begin
+        let name =
+          if inner.[0] = '<' then
+            let close =
+              Option.value (String.index_opt inner '>')
+                ~default:(String.length inner) in
+            String.sub inner 1 (close - 1)
+          else inner
+        in
+        add name ~optional:true ~variadic
+      end
+    | '<' ->
+      (* mandatory positional in angle brackets: <name> *)
+      incr pos;
+      let start = !pos in
+      while !pos < len && s.[!pos] <> '>' do incr pos done;
+      let name = String.sub s start (!pos - start) in
+      if !pos < len then incr pos;
+      let variadic = read_dots () in
+      add name ~optional:false ~variadic
+    | '-' ->
+      (* flag — skip entirely, not a positional *)
+      while !pos < len && s.[!pos] <> ' ' && s.[!pos] <> '\t' && s.[!pos] <> ']' do incr pos done
+    | c when c >= 'A' && c <= 'Z' ->
+      (* ALL_CAPS positional name; every char taken here satisfies
+       * is_pos_char, so only the length and section-label checks in add
+       * can reject it *)
+      let start = !pos in
+      while !pos < len && is_pos_char s.[!pos] do incr pos done;
+      let name = String.sub s start (!pos - start) in
+      let variadic = read_dots () in
+      add name ~optional:false ~variadic
+    | _ ->
+      incr pos
+  done;
+  (* deduplicate positionals by name, keeping the first occurrence *)
+  List.rev !positionals
+  |> List.fold_left (fun (seen, acc) p ->
+       if List.mem p.pos_name seen then (seen, acc)
+       else (p.pos_name :: seen, p :: acc)
+     ) ([], [])
+  |> snd |> List.rev
+
+(* locate the usage text in a help message and extract positionals from it.
+ * lines are scanned in order; a line counts as a usage header when, after
+ * trimming and lowercasing, it starts with "usage:" or is exactly "usage".
+ *   - "usage: cmd [OPTIONS] FILE"  -> the text after the colon is used
+ *   - "USAGE:" / "usage" alone     -> the next line (trimmed) is used
+ * a header that yields no text (nothing after the colon and a blank or
+ * missing next line) is passed over and the scan continues.
+ * returns [] when no usage text is found. *)
+let extract_usage_positionals text =
+  let non_empty str = if String.length str > 0 then Some str else None in
+  (* the trimmed line right after a header, if present and not blank *)
+  let following = function
+    | next :: _ -> non_empty (String.trim next)
+    | [] -> None
+  in
+  (* usage text contributed by [line], given the lines that follow it *)
+  let usage_of line rest =
+    let trimmed = String.trim line in
+    let lc = String.lowercase_ascii trimmed in
+    if String.starts_with ~prefix:"usage:" lc then
+      let after =
+        String.trim (String.sub trimmed 6 (String.length trimmed - 6)) in
+      (match non_empty after with
+       | Some _ as inline -> inline
+       | None -> following rest)
+    else if lc = "usage" then following rest
+    else None
+  in
+  let rec first_usage = function
+    | [] -> None
+    | line :: rest ->
+      (match usage_of line rest with
+       | Some _ as found -> found
+       | None -> first_usage rest)
+  in
+  match first_usage (String.split_on_char '\n' text) with
+  | None -> []
+  | Some usage ->
+    (* drop the command-name prefix, then parse what remains *)
+    let cmd_end = skip_command_prefix usage in
+    parse_usage_args (String.sub usage cmd_end (String.length usage - cmd_end))
+
+(* extract positionals from an explicit positionals section of the form
+ * emitted by CLI11:
+ *   Positionals:
+ *     name TEXT           description here
+ *     count INT           another description
+ *
+ * the first line that trims to "POSITIONALS:" or "Positionals:" starts the
+ * section. the body is every following line that begins with a space or tab
+ * and is not blank; the first line that is empty, unindented or blank ends
+ * it. on each body line the name is the leading run of [A-Za-z0-9_-]
+ * (at least 2 chars, otherwise the line is ignored), an uppercase type word
+ * after it is skipped, and "..." right after that marks the positional as
+ * variadic. every positional is reported as mandatory. returns [] when the
+ * text has no such section. *)
+let extract_cli11_positionals text =
+  let is_blank c = c = ' ' || c = '\t' in
+  let is_name_char = function
+    | 'a'..'z' | 'A'..'Z' | '0'..'9' | '_' | '-' -> true
+    | _ -> false
+  in
+  let is_upper c = c >= 'A' && c <= 'Z' in
+  (* turn one trimmed body line into a positional, if it starts with a name *)
+  let parse_one s =
+    let len = String.length s in
+    let rec skip pred i =
+      if i < len && pred s.[i] then skip pred (i + 1) else i in
+    let name_end = skip is_name_char 0 in
+    if name_end < 2 then None
+    else
+      (* past the gap, the type word (TEXT, INT, ...) and the next gap *)
+      let after = name_end |> skip is_blank |> skip is_upper |> skip is_blank in
+      let variadic =
+        after + 2 < len
+        && s.[after] = '.' && s.[after + 1] = '.' && s.[after + 2] = '.' in
+      Some { pos_name = String.lowercase_ascii (String.sub s 0 name_end);
+             optional = false; variadic }
+  in
+  (* consume indented, non-blank lines; stop at the first other line *)
+  let rec body acc = function
+    | line :: rest
+      when String.length line > 0 && is_blank line.[0]
+           && String.trim line <> "" ->
+      (match parse_one (String.trim line) with
+       | Some p -> body (p :: acc) rest
+       | None -> body acc rest)
+    | _ -> List.rev acc
+  in
+  let is_header line =
+    match String.trim line with
+    | "POSITIONALS:" | "Positionals:" -> true
+    | _ -> false
+  in
+  let rec find = function
+    | [] -> []
+    | line :: rest -> if is_header line then body [] rest else find rest
+  in
+  find (String.split_on_char '\n' text)
+
+(* entry point: turn a --help text into a help_result.
+ *   1. strip_ansi is applied to the input
+ *   2. help_parser runs over the cleaned text in Angstrom's prefix-consume
+ *      mode, so unparsed trailing input is not an error
+ *   3. positionals come from the CLI11 section when it yields any,
+ *      otherwise from the usage line
+ * a parser failure is returned unchanged as Error. *)
+let parse_help txt =
+  let clean = strip_ansi txt in
+  Angstrom.parse_string ~consume:Consume.Prefix help_parser clean
+  |> Result.map (fun result ->
+       let positionals =
+         match extract_cli11_positionals clean with
+         | [] -> extract_usage_positionals clean
+         | cli11 -> cli11
+       in
+       { result with positionals })
--- a/lib/store.ml
+++ b/lib/store.ml
@ -0,0 +1,498 @@
+(* store.ml — filesystem-backed cache of parsed completion data.
+ *
+ * this module handles persistence of completion data to disk. each command's
+ * help_result is serialized to JSON and stored as a file in a cache directory
+ * (default: $XDG_CACHE_HOME/inshellah). commands with native nushell completions
+ * are stored as .nu files instead.
+ *
+ * the store also provides lookup, listing, and subcommand discovery by
+ * scanning filenames in the cache directory.
+ *
+ * file naming convention:
+ *   - spaces in command names become underscores (e.g. "git add" -> "git_add.json")
+ *   - subcommands of a parent share the prefix (e.g. "git_add.json", "git_commit.json")
+ *   - .json files contain serialized help_result
+ *   - .nu files contain native nushell extern source code
+ *
+ * the module includes a minimal hand-rolled JSON parser/serializer because
+ * we only need to handle our own output format (no need for a full JSON library).
+ *)
+
+open Parser
+
+(* default store directory: $XDG_CACHE_HOME/inshellah, or
+ * $HOME/.cache/inshellah when XDG_CACHE_HOME is unset, empty, or not an
+ * absolute path (the XDG base directory spec asks for the default in the
+ * unset/empty case and for relative values to be ignored).
+ * raises Not_found when the fallback is needed and HOME is not set. *)
+let default_store_path () =
+  let cache =
+    match Sys.getenv_opt "XDG_CACHE_HOME" with
+    | Some dir when dir <> "" && not (Filename.is_relative dir) -> dir
+    | Some _ | None -> Filename.concat (Sys.getenv "HOME") ".cache"
+  in
+  Filename.concat cache "inshellah"
+
+(* create a directory and any missing parents (like mkdir -p).
+ * the path is split on the directory separator and rebuilt one component
+ * at a time; each prefix that does not exist yet is created with mode
+ * 0o755. empty components (leading, trailing or doubled separators) are
+ * skipped. a path starting with the separator is rebuilt from the root,
+ * anything else relative to the current directory. *)
+let ensure_dir dir =
+  let sep = Filename.dir_sep in
+  let is_absolute = String.length dir > 0 && dir.[0] = sep.[0] in
+  (* append one component without doubling the separator after the root *)
+  let extend prefix part =
+    if prefix = sep then sep ^ part
+    else if prefix = "" then part
+    else prefix ^ sep ^ part
+  in
+  let rec create prefix = function
+    | [] -> ()
+    | "" :: rest -> create prefix rest
+    | part :: rest ->
+      let next = extend prefix part in
+      if not (Sys.file_exists next) then Unix.mkdir next 0o755;
+      create next rest
+  in
+  create (if is_absolute then sep else "") (String.split_on_char sep.[0] dir)
+
+(* map a command name to a safe file base name, byte by byte:
+ * a space becomes '_'; ascii letters, digits, '-', '_' and '.' are kept;
+ * every other byte becomes '-'.
+ * e.g. "git add" -> "git_add", "docker-compose" -> "docker-compose" *)
+let filename_of_command cmd =
+  let sanitize c =
+    if c = ' ' then '_'
+    else
+      match c with
+      | 'a'..'z' | 'A'..'Z' | '0'..'9' | '-' | '_' | '.' -> c
+      | _ -> '-'
+  in
+  String.map sanitize cmd
+
+(* turn a file base name back into a command name by replacing every '_'
+ * with a space. this does not round-trip exactly: an underscore that was
+ * part of the original command name (e.g. "my_tool") also comes back as a
+ * space, and bytes that filename_of_command replaced with '-' are not
+ * restored. *)
+let command_of_filename base_name =
+  String.concat " " (String.split_on_char '_' base_name)
+
+(* --- JSON serialization of help_result ---
+ * hand-rolled JSON emitters. we don't use a JSON library because:
+ *   1. the schema is fixed and simple — we only serialize our own types
+ *   2. avoiding dependencies keeps the binary small
+ *   3. printf-style emission is fast and straightforward for our types *)
+
+(* escape a string for use inside a JSON string literal (without the
+ * surrounding quotes). '"' and '\' get a backslash, newline/tab/carriage
+ * return use their short escapes, any other byte below 0x20 becomes a
+ * \u00XX escape, and everything else is copied through unchanged. *)
+let escape_json contents =
+  let out = Buffer.create (String.length contents + 4) in
+  (* replacement text for a byte that must be escaped, None otherwise *)
+  let escaped = function
+    | '"' -> Some "\\\""
+    | '\\' -> Some "\\\\"
+    | '\n' -> Some "\\n"
+    | '\t' -> Some "\\t"
+    | '\r' -> Some "\\r"
+    | c when Char.code c < 0x20 -> Some (Printf.sprintf "\\u%04x" (Char.code c))
+    | _ -> None
+  in
+  String.iter (fun c ->
+    match escaped c with
+    | Some replacement -> Buffer.add_string out replacement
+    | None -> Buffer.add_char out c
+  ) contents;
+  Buffer.contents out
+
+(* render text as a JSON string literal: escaped, with surrounding quotes *)
+let json_string text = "\"" ^ escape_json text ^ "\""
+
+(* JSON spelling of a missing value; json_param_of emits it for a flag
+ * that takes no parameter *)
+let json_null = "null"
+
+(* serialize a switch to a JSON object tagged by "type":
+ *   Short c         -> {"type":"short","char":"c"}
+ *   Long name       -> {"type":"long","name":"name"}
+ *   Both (c, name)  -> {"type":"both","char":"c","name":"name"} *)
+let json_switch_of switch =
+  let char_field c = "\"char\":" ^ json_string (String.make 1 c) in
+  let name_field name = "\"name\":" ^ json_string name in
+  let obj kind fields =
+    "{" ^ String.concat "," (("\"type\":\"" ^ kind ^ "\"") :: fields) ^ "}"
+  in
+  match switch with
+  | Short c -> obj "short" [ char_field c ]
+  | Long name -> obj "long" [ name_field name ]
+  | Both (c, name) -> obj "both" [ char_field c; name_field name ]
+
+(* serialize an optional parameter spec: null when the flag has no
+ * parameter, otherwise an object with "kind" ("mandatory" or "optional")
+ * and the parameter "name" *)
+let json_param_of param =
+  let obj kind name =
+    Printf.sprintf "{\"kind\":\"%s\",\"name\":%s}" kind (json_string name)
+  in
+  match param with
+  | None -> json_null
+  | Some (Mandatory name) -> obj "mandatory" name
+  | Some (Optional name) -> obj "optional" name
+
+(* serialize one flag entry as {"switch":...,"param":...,"desc":...} *)
+let json_entry_of entry =
+  let switch = json_switch_of entry.switch in
+  let param = json_param_of entry.param in
+  let desc = json_string entry.desc in
+  "{\"switch\":" ^ switch ^ ",\"param\":" ^ param ^ ",\"desc\":" ^ desc ^ "}"
+
+(* serialize one subcommand as {"name":...,"desc":...} *)
+let json_subcommand_of sc =
+  let name = json_string sc.name in
+  let desc = json_string sc.desc in
+  "{\"name\":" ^ name ^ ",\"desc\":" ^ desc ^ "}"
+
+(* serialize one positional as {"name":...,"optional":b,"variadic":b},
+ * with the two flags written as bare true/false *)
+let json_positional_of p =
+  let name = json_string p.pos_name in
+  "{\"name\":" ^ name
+  ^ ",\"optional\":" ^ string_of_bool p.optional
+  ^ ",\"variadic\":" ^ string_of_bool p.variadic ^ "}"
+
+(* render items as a JSON array: each item is passed through formatter
+ * (which must return valid JSON text) and the results are comma-separated
+ * inside brackets, with no added whitespace *)
+let json_list formatter items =
+  let out = Buffer.create 64 in
+  Buffer.add_char out '[';
+  List.iteri (fun i item ->
+    if i > 0 then Buffer.add_char out ',';
+    Buffer.add_string out (formatter item)
+  ) items;
+  Buffer.add_char out ']';
+  Buffer.contents out
+
+(* serialize a help_result as one JSON object with the keys "source",
+ * "description", "entries", "subcommands" and "positionals", in that
+ * order. source is a free-form tag stored alongside the data and
+ * defaults to "help". *)
+let json_of_help_result ?(source="help") result =
+  let field key value = json_string key ^ ":" ^ value in
+  let fields = [
+    field "source" (json_string source);
+    field "description" (json_string result.description);
+    field "entries" (json_list json_entry_of result.entries);
+    field "subcommands" (json_list json_subcommand_of result.subcommands);
+    field "positionals" (json_list json_positional_of result.positionals);
+  ] in
+  "{" ^ String.concat "," fields ^ "}"
+
+(* --- JSON deserialization ---
+ * minimal hand-rolled recursive-descent JSON parser. only handles the subset
+ * we emit: strings, booleans, nulls, arrays, and objects. no number parsing
+ * (we don't emit numbers). this is intentionally minimal — we only read back
+ * our own serialized format, so robustness against arbitrary JSON is not needed.
+ *
+ * note: the \u escape handler does basic UTF-8 encoding for code points
+ * up to 0xFFFF but doesn't handle surrogate pairs. this is fine for our use
+ * case since we only escape control characters below 0x20. *)
+
+(* in-memory JSON value; there is no number case *)
+type json =
+  | Jnull
+  | Jbool of bool
+  | Jstring of string
+  | Jarray of json list
+  | Jobject of (string * json) list
+
+(* accessors below never raise: a missing key or a value of the wrong
+ * shape yields a neutral default instead *)
+
+(* value of the first binding of key in an object; Jnull when the key is
+ * absent or the node is not an object *)
+let json_get key node =
+  match node with
+  | Jobject pairs -> Option.value (List.assoc_opt key pairs) ~default:Jnull
+  | Jnull | Jbool _ | Jstring _ | Jarray _ -> Jnull
+
+(* contents of a string node, "" for anything else *)
+let json_to_string node =
+  match node with
+  | Jstring text -> text
+  | Jnull | Jbool _ | Jarray _ | Jobject _ -> ""
+
+(* value of a boolean node, false for anything else *)
+let json_to_bool node =
+  match node with
+  | Jbool value -> value
+  | Jnull | Jstring _ | Jarray _ | Jobject _ -> false
+
+(* elements of an array node, [] for anything else *)
+let json_to_list node =
+  match node with
+  | Jarray items -> items
+  | Jnull | Jbool _ | Jstring _ | Jobject _ -> []
+
+(* raised by parse_json on malformed input; carries a message *)
+exception Json_error of string
+
+(* imperative recursive-descent JSON parser over a string, driven by a
+ * mutable position. supports strings, true/false/null, arrays and objects
+ * (no numbers). parses one value and returns it; any text after that value
+ * is ignored.
+ *
+ * every malformed-input case raises Json_error, including the ones a
+ * truncated cache file produces: a string with no closing quote (which
+ * must not be allowed to run past the end of the input), an escape cut off
+ * by the end of input, a \u escape without four hex digits, and a
+ * misspelled true/false/null literal. *)
+let parse_json contents =
+  let len = String.length contents in
+  let pos = ref 0 in
+  (* raise Json_error with the current offset appended *)
+  let error what = raise (Json_error (Printf.sprintf "%s at %d" what !pos)) in
+  (* current character, or '\x00' at end of input; does not consume *)
+  let peek () = if !pos < len then contents.[!pos] else '\x00' in
+  let advance () = incr pos in
+  let skip_ws () =
+    while !pos < len && (contents.[!pos] = ' ' || contents.[!pos] = '\t'
+                         || contents.[!pos] = '\n' || contents.[!pos] = '\r') do
+      advance ()
+    done in
+  (* skip whitespace, then consume exactly char_val or fail *)
+  let expect char_val =
+    skip_ws ();
+    if peek () <> char_val then
+      error (Printf.sprintf "expected '%c'" char_val);
+    advance () in
+  (* consume the exact keyword at the current position and yield value *)
+  let literal word value =
+    let n = String.length word in
+    if !pos + n <= len && String.sub contents !pos n = word then begin
+      pos := !pos + n;
+      value
+    end else error (Printf.sprintf "expected '%s'" word) in
+  (* read the four hex digits of a \u escape starting at the current
+   * position; leaves the position on the last digit *)
+  let hex4 () =
+    if !pos + 4 > len then error "truncated \\u escape";
+    let digit c =
+      match c with
+      | '0'..'9' -> Char.code c - Char.code '0'
+      | 'a'..'f' -> Char.code c - Char.code 'a' + 10
+      | 'A'..'F' -> Char.code c - Char.code 'A' + 10
+      | _ -> error "invalid \\u escape" in
+    let code = ref 0 in
+    for i = 0 to 3 do
+      code := (!code lsl 4) lor digit contents.[!pos + i]
+    done;
+    pos := !pos + 3;
+    !code in
+  (* append a code point <= 0xFFFF as 1-3 bytes of utf-8; surrogate halves
+   * are encoded individually rather than combined into pairs *)
+  let add_utf8 buf code =
+    if code < 0x80 then Buffer.add_char buf (Char.chr code)
+    else if code < 0x800 then begin
+      Buffer.add_char buf (Char.chr (0xc0 lor (code lsr 6)));
+      Buffer.add_char buf (Char.chr (0x80 lor (code land 0x3f)))
+    end else begin
+      Buffer.add_char buf (Char.chr (0xe0 lor (code lsr 12)));
+      Buffer.add_char buf (Char.chr (0x80 lor ((code lsr 6) land 0x3f)));
+      Buffer.add_char buf (Char.chr (0x80 lor (code land 0x3f)))
+    end in
+  (* mutually recursive parsers for each JSON value type *)
+  let rec parse_value () =
+    skip_ws ();
+    match peek () with
+    | '"' -> Jstring (parse_string ())
+    | '{' -> parse_object ()
+    | '[' -> parse_array ()
+    | 'n' -> literal "null" Jnull
+    | 't' -> literal "true" (Jbool true)
+    | 'f' -> literal "false" (Jbool false)
+    | char_val -> error (Printf.sprintf "unexpected '%c'" char_val)
+  (* parse a quoted string, decoding escape sequences *)
+  and parse_string () =
+    expect '"';
+    let buf = Buffer.create 32 in
+    let closed = ref false in
+    while not !closed do
+      (* peek's '\x00' sentinel is indistinguishable from data here, so the
+       * end of input has to be checked explicitly *)
+      if !pos >= len then error "unterminated string";
+      (match contents.[!pos] with
+       | '"' -> closed := true
+       | '\\' ->
+         advance ();
+         if !pos >= len then error "unterminated escape";
+         (match contents.[!pos] with
+          | 'n' -> Buffer.add_char buf '\n'
+          | 't' -> Buffer.add_char buf '\t'
+          | 'r' -> Buffer.add_char buf '\r'
+          | 'b' -> Buffer.add_char buf '\b'
+          | 'f' -> Buffer.add_char buf '\012'
+          | 'u' ->
+            advance ();
+            add_utf8 buf (hex4 ())
+          (* \" \\ \/ and any unknown escape: keep the escaped char *)
+          | char_val -> Buffer.add_char buf char_val)
+       | char_val -> Buffer.add_char buf char_val);
+      (* step past the char just handled (closing quote included) *)
+      advance ()
+    done;
+    Buffer.contents buf
+  (* parse a JSON object: { "key": value, ... } *)
+  and parse_object () =
+    expect '{';
+    skip_ws ();
+    if peek () = '}' then (advance (); Jobject [])
+    else begin
+      let pairs = ref [] in
+      let more = ref true in
+      while !more do
+        skip_ws ();
+        let key = parse_string () in
+        expect ':';
+        let value = parse_value () in
+        pairs := (key, value) :: !pairs;
+        skip_ws ();
+        if peek () = ',' then advance ()
+        else more := false
+      done;
+      expect '}';
+      Jobject (List.rev !pairs)
+    end
+  (* parse a JSON array: [ value, value, ... ] *)
+  and parse_array () =
+    expect '[';
+    skip_ws ();
+    if peek () = ']' then (advance (); Jarray [])
+    else begin
+      let items = ref [] in
+      let more = ref true in
+      while !more do
+        let value = parse_value () in
+        items := value :: !items;
+        skip_ws ();
+        if peek () = ',' then advance ()
+        else more := false
+      done;
+      expect ']';
+      Jarray (List.rev !items)
+    end
+  in
+  parse_value ()
+
+(* --- JSON to OCaml type converters ---
+ * these reconstruct our parser types from their JSON representations.
+ * they mirror the json_*_of serializers above. *)
+
+(* rebuild a switch from the object written by json_switch_of.
+ * the "type" field selects the constructor; a missing or empty "char"
+ * becomes '?', and an unrecognised "type" yields Long "?". *)
+let switch_of_json json_node =
+  let field key = json_to_string (json_get key json_node) in
+  let short_char () =
+    match field "char" with
+    | "" -> '?'
+    | str -> str.[0]
+  in
+  match field "type" with
+  | "short" -> Short (short_char ())
+  | "long" -> Long (field "name")
+  | "both" ->
+    let c = short_char () in
+    Both (c, field "name")
+  | _ -> Long "?"
+
+(* rebuild an optional parameter spec from the value written by
+ * json_param_of: null gives None, an object is read through its "kind"
+ * and "name" fields, and an unrecognised "kind" also gives None *)
+let param_of_json json_node =
+  match json_node with
+  | Jnull -> None
+  | Jbool _ | Jstring _ | Jarray _ | Jobject _ ->
+    let field key = json_to_string (json_get key json_node) in
+    let name = field "name" in
+    (match field "kind" with
+     | "mandatory" -> Some (Mandatory name)
+     | "optional" -> Some (Optional name)
+     | _ -> None)
+
+(* rebuild a flag entry from its "switch", "param" and "desc" fields *)
+let entry_of_json json_node =
+  let field key = json_get key json_node in
+  let switch = switch_of_json (field "switch") in
+  let param = param_of_json (field "param") in
+  let desc = json_to_string (field "desc") in
+  { switch; param; desc }
+
+(* rebuild a subcommand from its "name" and "desc" fields; a missing
+ * field reads as "" *)
+let subcommand_of_json json_node =
+  let text key = json_to_string (json_get key json_node) in
+  let name = text "name" in
+  let desc = text "desc" in
+  { name; desc }
+
+(* rebuild a positional from its "name", "optional" and "variadic"
+ * fields; a missing flag reads as false and a missing name as "" *)
+let positional_of_json json_node =
+  let flag key = json_to_bool (json_get key json_node) in
+  let pos_name = json_to_string (json_get "name" json_node) in
+  let optional = flag "optional" in
+  let variadic = flag "variadic" in
+  { pos_name; optional; variadic }
+
+(* rebuild a help_result from the object written by json_of_help_result.
+ * the "source" field is not read; a missing list field reads as [] and a
+ * missing description as "". *)
+let help_result_of_json json_node =
+  (* convert every element of the array stored under key *)
+  let items key convert =
+    List.map convert (json_to_list (json_get key json_node)) in
+  let entries = items "entries" entry_of_json in
+  let subcommands = items "subcommands" subcommand_of_json in
+  let positionals = items "positionals" positional_of_json in
+  let description = json_to_string (json_get "description" json_node) in
+  { entries; subcommands; positionals; description }
+
+(* --- filesystem operations --- *)
+
+(* write contents to path, creating the file or replacing what it held.
+ * raises Sys_error if the file cannot be opened or written. the channel
+ * is released even when the write fails part-way. *)
+let write_file path contents =
+  let oc = open_out path in
+  Fun.protect
+    (* no-op after the close_out below succeeded; on the failure path it
+     * releases the descriptor without masking the original exception *)
+    ~finally:(fun () -> close_out_noerr oc)
+    (fun () ->
+      output_string oc contents;
+      (* closed here, not only in finally, so that a failing flush is
+       * reported to the caller *)
+      close_out oc)
+
+(* read a whole file into a string. returns None when the file cannot be
+ * opened or when reading it fails for any reason. the channel is closed
+ * on every path, including a failed read. *)
+let read_file path =
+  match open_in path with
+  | exception _ -> None
+  | ic ->
+    Fun.protect
+      ~finally:(fun () -> close_in_noerr ic)
+      (fun () ->
+        (* the length is taken up front; a short read raises End_of_file,
+         * which is reported as None like any other failure *)
+        match really_input_string ic (in_channel_length ic) with
+        | contents -> Some contents
+        | exception _ -> None)
+
+(* store a help_result for command under dir, as <file name>.json where
+ * the file name comes from filename_of_command. source is recorded in
+ * the JSON and defaults to "help". *)
+let write_result ~dir ?(source="help") command result =
+  let file = filename_of_command command ^ ".json" in
+  json_of_help_result ~source result
+  |> write_file (Filename.concat dir file)
+
+(* store native nushell completion source for command under dir, as
+ * <file name>.nu; data is written verbatim *)
+let write_native ~dir command data =
+  let file = filename_of_command command ^ ".nu" in
+  write_file (Filename.concat dir file) data
+
+(* true when path exists and refers to a directory *)
+let is_dir path =
+  if Sys.file_exists path then Sys.is_directory path else false
+
+(* path of the stored data file for command, if any.
+ * directories are tried in the order given; within one directory the
+ * .json file wins over the .nu file. returns the first path that exists. *)
+let find_file dirs command =
+  let base_name = filename_of_command command in
+  let candidates directory =
+    List.map (fun ext -> Filename.concat directory (base_name ^ ext))
+      [ ".json"; ".nu" ]
+  in
+  List.find_map
+    (fun directory -> List.find_opt Sys.file_exists (candidates directory))
+    dirs
+
+(* load the help_result stored for command.
+ * only .json files are considered (a .nu file cannot be turned back into
+ * a help_result). directories are tried in order; one whose file is
+ * missing, unreadable or fails to parse is skipped and the next one is
+ * tried. returns None when no directory yields a result. *)
+let lookup dirs command =
+  let file = filename_of_command command ^ ".json" in
+  let load directory =
+    Option.bind (read_file (Filename.concat directory file)) (fun data ->
+      try Some (help_result_of_json (parse_json data))
+      with _ -> None)
+  in
+  List.find_map load dirs
+
+(* fetch a command's stored data as raw text, without parsing it.
+ * in each directory the ".json" file is tried first and the ".nu" file
+ * second; the contents of the first file that can be read are returned.
+ * used by the "query" command to dump stored data as-is. *)
+let lookup_raw dirs command =
+  let stem = filename_of_command command in
+  let slurp directory ext =
+    read_file (Filename.concat directory (stem ^ ext)) in
+  dirs |> List.find_map (fun directory ->
+    match slurp directory ".json" with
+    | None -> slurp directory ".nu"
+    | found -> found)
+
+(* remove a store extension from a filename.
+ * ".json" is checked before ".nu"; the result is the filename without the
+ * first extension that matches, or None when it ends in neither. *)
+let chop_extension filename =
+  [".json"; ".nu"]
+  |> List.find_map (fun ext ->
+    if Filename.check_suffix filename ext
+    then Some (Filename.chop_suffix filename ext)
+    else None)
+
+(* discover subcommands of a command by scanning filenames in the store.
+ * looks for files whose names start with the command's filename + "_"
+ * (e.g. for "git", finds "git_add.json", "git_commit.json", etc.)
+ *
+ * only returns immediate subcommands (no nested underscores beyond the
+ * prefix). for ".json" entries the description is taken from the JSON
+ * "description" field; it is "" for ".nu" entries and for JSON that cannot
+ * be read or parsed.
+ *
+ * directories are scanned in the order given and the first entry found for
+ * a subcommand name wins. within a directory the entries are visited in
+ * sorted order, so "x.json" is always seen before "x.nu" and the JSON
+ * description is not lost to readdir ordering. a directory that is missing
+ * or cannot be listed contributes nothing instead of aborting the scan.
+ *
+ * the result is sorted by subcommand name.
+ *
+ * note: this filesystem-based discovery is used as a fallback when the
+ * command's own help_result doesn't list subcommands. it enables completion
+ * for subcommands that were indexed from separate manpages or help runs. *)
+let subcommands_of dirs command =
+  let prefix = filename_of_command command ^ "_" in
+  let prefix_len = String.length prefix in
+  let module SMap = Map.Make(String) in
+  (* sorted entries of a directory; empty when it cannot be listed *)
+  let sorted_entries directory =
+    if not (is_dir directory) then [||]
+    else
+      match Sys.readdir directory with
+      | entries -> Array.sort String.compare entries; entries
+      | exception Sys_error _ -> [||] in
+  (* description stored in a JSON entry, or "" if unreadable/unparseable *)
+  let description_of path =
+    match read_file path with
+    | None -> ""
+    | Some data ->
+      (try json_to_string (json_get "description" (parse_json data))
+       with _ -> "") in
+  let add_entry directory subs filename =
+    if not (String.starts_with ~prefix filename) then subs
+    else
+      match chop_extension filename with
+      | None -> subs
+      | Some base_name ->
+        let rest =
+          String.sub base_name prefix_len
+            (String.length base_name - prefix_len) in
+        (* skip empty names, nested subcommands, and names already found *)
+        if String.length rest = 0
+           || String.contains rest '_'
+           || SMap.mem rest subs
+        then subs
+        else
+          let desc =
+            if Filename.check_suffix filename ".json"
+            then description_of (Filename.concat directory filename)
+            else "" in
+          SMap.add rest { name = rest; desc } subs in
+  let subs =
+    List.fold_left (fun subs directory ->
+      Array.fold_left (add_entry directory) subs (sorted_entries directory)
+    ) SMap.empty dirs in
+  SMap.fold (fun _ sc acc -> sc :: acc) subs [] |> List.rev
+
+(* list all indexed commands across all store directories.
+ * every file ending in ".json" or ".nu" contributes the command name that
+ * command_of_filename derives from its base name. returns a sorted,
+ * deduplicated list of command names.
+ *
+ * a directory that is missing, is not a directory, or cannot be listed
+ * (Sys.readdir raising Sys_error) is skipped rather than aborting the
+ * whole listing. *)
+let all_commands dirs =
+  let module SSet = Set.Make(String) in
+  (* entries of a directory; empty when it cannot be listed *)
+  let entries_of directory =
+    if not (is_dir directory) then [||]
+    else try Sys.readdir directory with Sys_error _ -> [||] in
+  List.fold_left (fun cmds directory ->
+    Array.fold_left (fun cmds filename ->
+      match chop_extension filename with
+      | Some base_name -> SSet.add (command_of_filename base_name) cmds
+      | None -> cmds
+    ) cmds (entries_of directory)
+  ) SSet.empty dirs
+  |> SSet.elements
+
+(* report how a command was indexed: "help", "manpage", "native", etc.
+ * directories are tried in order. if a directory has the command's ".json"
+ * file, the answer is that file's "source" field, or "json" when the file
+ * cannot be read or the field cannot be extracted. otherwise, if it has
+ * the ".nu" file, the answer is "native". returns None when no directory
+ * has either file. used by the "dump" command to show provenance. *)
+let file_type_of dirs command =
+  let stem = filename_of_command command in
+  (* "source" field of a JSON store file, falling back to "json" *)
+  let source_of_json path =
+    match read_file path with
+    | None -> "json"
+    | Some data ->
+      (try json_to_string (json_get "source" (parse_json data))
+       with _ -> "json") in
+  dirs |> List.find_map (fun directory ->
+    let in_dir ext = Filename.concat directory (stem ^ ext) in
+    let json_path = in_dir ".json" in
+    if Sys.file_exists json_path then Some (source_of_json json_path)
+    else if Sys.file_exists (in_dir ".nu") then Some "native"
+    else None)