init

2026-03-18 15:40:47 +11:00 · 2026-03-18 15:40:47 +11:00 · d16ece28e2
commit d16ece28e2
22 changed files with 4798 additions and 0 deletions
--- a/bin/.ocamlformat
+++ b/bin/.ocamlformat
--- a/bin/dune
+++ b/bin/dune
@ -0,0 +1,4 @@
+(executable
+ (public_name inshellah)
+ (name main)
+ ; main.ml calls Unix and Str directly, so both are declared here instead of
+ ; being picked up through inshellah's transitive dependencies.
+ (libraries inshellah unix str))
--- a/bin/main.ml
+++ b/bin/main.ml
@ -0,0 +1,992 @@
+(* main.ml — cli entry point for inshellah, a nushell completions engine.
+ *
+ * inshellah generates nushell "extern" definitions for external commands by
+ * parsing their manpages and --help output. it has two main modes:
+ *
+ *   1. indexing (batch): scan a prefix directory's bin/ and share/man/,
+ *      extract completions for every binary, and write them to a cache dir.
+ *      this is typically run once per nix profile or system update.
+ *
+ *   2. completing (interactive): given a command and its current arguments,
+ *      look up the cached data and return json completion candidates for
+ *      nushell's custom completer protocol.
+ *
+ * the indexing pipeline for each binary:
+ *   a. classify the binary (skip? try --help? try native completions?)
+ *   b. if the tool has native nushell completion support, try various
+ *      subcommand patterns ("completions nushell", "--completion nushell", etc.)
+ *   c. otherwise, run the tool with --help/-h and parse the output
+ *   d. recursively resolve subcommands (depth-limited to 5)
+ *   e. after binaries, parse manpages for any commands not yet covered
+ *
+ * parallelism: indexing forks per binary, and subcommand resolution forks
+ * per subcommand. results are marshaled back via pipes. this gives good
+ * throughput on multi-core systems while keeping the code simple (no threads,
+ * no async runtime — just unix fork/pipe/waitpid).
+ *)
+
+open Inshellah.Parser
+open Inshellah.Manpage
+open Inshellah.Nushell
+open Inshellah.Store
+
+module SSet = Set.Make(String)
+
+(* write the command-line synopsis to stderr, then terminate the process
+ * with exit status 1. meant for the case where no valid subcommand was
+ * given. never returns. *)
+let usage () =
+  prerr_string
+    {|inshellah - nushell completions engine
+
+Usage:
+  inshellah index PREFIX... [--dir PATH] [--ignore FILE] [--help-only FILE]
+      Index completions into a directory of JSON/nu files.
+      PREFIX is a directory containing bin/ and share/man/.
+      Default dir: $XDG_CACHE_HOME/inshellah
+      --ignore FILE     skip listed commands entirely
+      --help-only FILE  skip manpages for listed commands, use --help instead
+  inshellah complete CMD [ARGS...] [--dir PATH] [--system-dir PATH]
+      Nushell custom completer. Outputs JSON completion candidates.
+      Falls back to --help resolution if command is not indexed.
+  inshellah query CMD [--dir PATH] [--system-dir PATH]
+      Print stored completion data for CMD.
+  inshellah dump [--dir PATH] [--system-dir PATH]
+      List indexed commands.
+  inshellah manpage FILE            Parse a manpage and emit nushell extern
+  inshellah manpage-dir DIR         Batch-process manpages under DIR
+
+|};
+  exit 1
+
+(* manpage sections that contain command documentation.
+ * section 1 = user commands, section 8 = system administration commands.
+ * each number N selects a "manN" subdirectory of a man directory; only the
+ * subdirectories named by this list are scanned for manpages. *)
+let command_sections = [1; 8]
+
+(* true when [sub] occurs somewhere in [s]. the needle is compiled with
+ * Str.regexp_string, so it is matched literally — regex metacharacters in
+ * [sub] carry no special meaning. *)
+let contains_str s sub =
+  let literal = Str.regexp_string sub in
+  match Str.search_forward literal s 0 with
+  | _ -> true
+  | exception Not_found -> false
+
+(* heuristic: does [text] look like nushell source code?
+ * anything of 20 characters or fewer is rejected outright (this keeps short
+ * error messages from matching). longer text qualifies when it contains
+ * "export extern", or "export def", or both "module " and "export". *)
+let is_nushell_source text =
+  let has = contains_str text in
+  if String.length text <= 20 then false
+  else
+    has "export extern"
+    || has "export def"
+    || (has "module " && has "export")
+
+(* derive a command name from a manpage path.
+ * takes the basename, drops a trailing ".gz" if present, then drops the last
+ * remaining extension (the section suffix): "ls.1.gz" → "ls.1" → "ls".
+ * a name with no extension is returned as-is. *)
+let cmd_name_of_manpage path =
+  let file = Filename.basename path in
+  let uncompressed =
+    match Filename.chop_suffix_opt ~suffix:".gz" file with
+    | Some stem -> stem
+    | None -> file in
+  Filename.remove_extension uncompressed
+
+(* sanitized environment for child processes, computed once on first use.
+ * it is the current process environment minus the DISPLAY, WAYLAND_DISPLAY,
+ * DBUS_SESSION_BUS_ADDRESS and XAUTHORITY bindings. the intent is to keep gui
+ * tools from opening windows or hanging on a display connection when they
+ * are run with --help (e.g. ckb-next). *)
+let safe_env = lazy (
+  let blocked =
+    ["DISPLAY="; "WAYLAND_DISPLAY="; "DBUS_SESSION_BUS_ADDRESS="; "XAUTHORITY="] in
+  let keep binding =
+    not (List.exists (fun prefix -> String.starts_with ~prefix binding) blocked) in
+  Unix.environment ()
+  |> Array.to_list
+  |> List.filter keep
+  |> Array.of_list)
+
+(* Non-blocking drain of pipe descriptor [rd] into [buf].
+   Polls with a zero-timeout select and reads 8 KiB at a time for as long as
+   data is ready. Stops when nothing is ready, at end of file, or when the
+   read itself fails with a Unix error. Safe to call repeatedly; the
+   fork-pipe sites use it to keep pipes drained so children don't block on
+   write. *)
+let drain_fd rd buf =
+  let chunk = Bytes.create 8192 in
+  let rec pump () =
+    match Unix.select [rd] [] [] 0.0 with
+    | ([], _, _) -> ()
+    | _ ->
+      (match Unix.read rd chunk 0 8192 with
+       | 0 -> ()
+       | n -> Buffer.add_subbytes buf chunk 0 n; pump ()
+       | exception Unix.Unix_error _ -> ())
+  in
+  pump ()
+
+(* run a command with a timeout, capturing its stdout+stderr.
+ * [args] is the full argv (program first); [timeout_ms] is the wall-clock
+ * budget in milliseconds. the child gets stdin from /dev/null, the sanitized
+ * environment (safe_env), and stdout+stderr merged onto one pipe, which is
+ * read with select() polling until end of file or the deadline.
+ *
+ * peculiarity: the child is run in /tmp to prevent tools that create
+ * side-effect files (like ckb-next-dev-detect-report.gz) from polluting
+ * the user's working directory. we chdir to /tmp before spawning and restore
+ * the previous directory right after.
+ *
+ * peculiarity: the select timeout is capped at 0.05s per iteration to ensure
+ * we check the deadline frequently even when no data is available.
+ *
+ * on timeout the child is killed with SIGKILL. output captured before the
+ * deadline is still returned.
+ *
+ * returns None if [args] is empty, if the process couldn't be started, or
+ * if it produced no output at all; otherwise Some of the captured bytes. *)
+let run_cmd args timeout_ms =
+  match args with
+  | [] -> None  (* nothing to execute; bail out before touching fds or cwd *)
+  | prog :: _ ->
+    let (rd, wr) = Unix.pipe () in
+    let devnull = Unix.openfile "/dev/null" [Unix.O_RDONLY] 0 in
+    let argv = Array.of_list args in
+    (* Run subprocesses in /tmp so commands that write side-effect files
+       (e.g. ckb-next-dev-detect-report.gz) don't pollute the working dir *)
+    let saved_cwd = Sys.getcwd () in
+    Sys.chdir "/tmp";
+    (* a failed spawn must not close anything here: every descriptor is
+       closed exactly once on the path below, shared with success. closing
+       in the handler as well would make the second close raise EBADF. *)
+    let pid =
+      try Some (Unix.create_process_env prog argv
+                  (Lazy.force safe_env) devnull wr wr)
+      with Unix.Unix_error _ -> None in
+    Sys.chdir saved_cwd;
+    Unix.close wr; Unix.close devnull;
+    match pid with
+    | None -> Unix.close rd; None
+    | Some pid ->
+      let buf = Buffer.create 4096 in
+      let deadline = Unix.gettimeofday () +. (float_of_int timeout_ms /. 1000.0) in
+      let chunk = Bytes.create 8192 in
+      (* pump the pipe into [buf]; true when the deadline passed before the
+         child closed its output, false on end of file. an interrupted
+         syscall is retried; any other read error is treated as end of file
+         so the descriptor and the child are still cleaned up below. *)
+      let rec timed_out () =
+        let remaining = deadline -. Unix.gettimeofday () in
+        if remaining <= 0.0 then true
+        else
+          match Unix.select [rd] [] [] (min remaining 0.05) with
+          | (_ :: _, _, _) ->
+            (match Unix.read rd chunk 0 (Bytes.length chunk) with
+             | 0 -> false
+             | n -> Buffer.add_subbytes buf chunk 0 n; timed_out ()
+             | exception Unix.Unix_error (Unix.EINTR, _, _) -> timed_out ()
+             | exception Unix.Unix_error _ -> false)
+          | _ -> timed_out ()
+          | exception Unix.Unix_error (Unix.EINTR, _, _) -> timed_out ()
+      in
+      let expired = timed_out () in
+      Unix.close rd;
+      if expired then
+        (try Unix.kill pid Sys.sigkill with Unix.Unix_error _ -> ());
+      (* always reap the child so it does not linger as a zombie *)
+      ignore (Unix.waitpid [] pid);
+      if Buffer.length buf > 0 then Some (Buffer.contents buf) else None
+
+(* true when [path] stats as a regular file with at least one execute
+ * permission bit (user, group or other) set. any stat failure — including a
+ * missing file — yields false. *)
+let is_executable path =
+  match Unix.stat path with
+  | { Unix.st_kind = Unix.S_REG; st_perm; _ } -> st_perm land 0o111 <> 0
+  | _ -> false
+  | exception Unix.Unix_error _ -> false
+
+(* check if a file is a script by looking for a #! shebang in its first two
+ * bytes. follows symlinks via realpath before reading.
+ * returns false for files shorter than two bytes and for any failure to
+ * resolve, open or read the file. the channel is closed on every path,
+ * including when the read fails. *)
+let is_script path =
+  try
+    let ic = open_in_bin (Unix.realpath path) in
+    Fun.protect ~finally:(fun () -> close_in_noerr ic) (fun () ->
+      match really_input_string ic 2 with
+      | "#!" -> true
+      | _ -> false
+      | exception End_of_file -> false)
+  with _ -> false
+
+(* scan an elf binary for string needles without loading the entire file.
+ * returns a table whose keys are the needles that were found.
+ *
+ * the file (resolved through realpath) must start with the 4-byte elf magic;
+ * anything else — including a file too short to hold the magic — is not an
+ * elf binary and yields an empty table. an elf file is read in 64kb chunks,
+ * searching each chunk for the needles still missing. a sliding window
+ * (carry) of max_needle bytes is kept between chunks to handle needles that
+ * span chunk boundaries. scanning stops early once every needle is found.
+ *
+ * peculiarity: on read failure (e.g. if the path cannot be resolved or
+ * opened), all needles are marked as found. this is a conservative
+ * fallback — we'd rather try --help on an unreadable binary than skip it.
+ *
+ * the search is a manual byte-by-byte comparison rather than Str — this
+ * runs on every binary in the prefix, so it needs to be fast. the channel is
+ * closed on every path. *)
+let elf_scan path needles =
+  let found = Hashtbl.create 4 in
+  let remaining () = List.filter (fun n -> not (Hashtbl.mem found n)) needles in
+  (* does [needle] occur in [buf] starting exactly at offset [pos]? *)
+  let matches_at buf pos needle =
+    let nlen = String.length needle in
+    let rec go j =
+      j >= nlen || (Bytes.get buf (pos + j) = needle.[j] && go (j + 1)) in
+    go 0 in
+  (* record [needle] as found if it occurs within the first [total] bytes *)
+  let scan_chunk buf total needle =
+    let last = total - String.length needle in
+    let rec from i = i <= last && (matches_at buf i needle || from (i + 1)) in
+    if from 0 then Hashtbl.replace found needle true in
+  (try
+    let ic = open_in_bin (Unix.realpath path) in
+    Fun.protect ~finally:(fun () -> close_in_noerr ic) (fun () ->
+      (* a file shorter than the magic is simply not elf, not a read error *)
+      let is_elf =
+        match really_input_string ic 4 with
+        | "\x7fELF" -> true
+        | _ -> false
+        | exception End_of_file -> false in
+      if is_elf then begin
+        let max_needle =
+          List.fold_left (fun m n -> max m (String.length n)) 0 needles in
+        let chunk_size = 65536 in
+        let buf = Bytes.create (chunk_size + max_needle) in
+        let carry = ref 0 in
+        let eof = ref false in
+        while not !eof && remaining () <> [] do
+          (* new bytes land after the carried tail of the previous chunk *)
+          let n = input ic buf !carry chunk_size in
+          if n = 0 then eof := true
+          else begin
+            let total = !carry + n in
+            List.iter (scan_chunk buf total) (remaining ());
+            let new_carry = min max_needle total in
+            Bytes.blit buf (total - new_carry) buf 0 new_carry;
+            carry := new_carry
+          end
+        done
+      end)
+  with _ ->
+    List.iter (fun n -> Hashtbl.replace found n true) needles);
+  found
+
+(* detect nix-generated c wrappers and extract the real binary path.
+ * nix's makeCWrapper creates small c programs that set up the environment
+ * and exec the real binary. these wrappers won't contain "-h" or "completion"
+ * in their own binary (they're just wrappers), so elf_scan would say "skip".
+ * this function reads the file's bytes, requires the marker text
+ * "makeCWrapper" to be present, and returns the first
+ * /nix/store/.../bin/... path found in it — provided that path exists —
+ * so we can try --help on the real binary instead.
+ *
+ * peculiarity: files larger than 64kb are rejected without being read, to
+ * avoid pulling a large non-wrapper binary into memory.
+ *
+ * returns None on any failure. the channel is closed on every path. *)
+let nix_wrapper_target path =
+  try
+    let ic = open_in_bin (Unix.realpath path) in
+    let contents =
+      Fun.protect ~finally:(fun () -> close_in_noerr ic) (fun () ->
+        let n = in_channel_length ic in
+        if n > 65536 then None else Some (really_input_string ic n)) in
+    match contents with
+    | None -> None
+    | Some s when not (contains_str s "makeCWrapper") -> None
+    | Some s ->
+      let re = Str.regexp "/nix/store/[a-z0-9]+-[^' \n\r\x00]+/bin/[a-zA-Z0-9._-]+" in
+      (match Str.search_forward re s 0 with
+       | _ ->
+         let target = Str.matched_string s in
+         if Sys.file_exists target then Some target else None
+       | exception Not_found -> None)
+  with _ -> None
+
+(* heuristic filter: true for binary names that should never be indexed.
+ * rejected are the empty name, "-", names starting with '.', names starting
+ * with "lib", names ending in "-daemon", "-wrapped" or ".so", and names
+ * without a single ascii letter or digit (punctuation-only names). *)
+let skip_name name =
+  let is_alnum = function
+    | 'a' .. 'z' | 'A' .. 'Z' | '0' .. '9' -> true
+    | _ -> false in
+  match name with
+  | "" | "-" -> true
+  | _ ->
+    name.[0] = '.'
+    || String.starts_with ~prefix:"lib" name
+    || List.exists (fun suffix -> String.ends_with ~suffix name)
+         ["-daemon"; "-wrapped"; ".so"]
+    || not (String.exists is_alnum name)
+
+(* indexing strategy chosen for a binary. *)
+type bin_class =
+  | Skip                 (* don't index this binary at all *)
+  | Try_help             (* only try --help (scripts, binaries without "completion" string) *)
+  | Try_native_and_help  (* try native nushell completion first, fall back to --help *)
+
+(* pick the indexing strategy for the binary [name] inside [bindir].
+ * checks, in order:
+ *   1. nushell builtin or rejected by skip_name → Skip
+ *   2. not an executable regular file → Skip
+ *   3. starts with a shebang → Try_help
+ *   4. elf_scan finds "completion" → Try_native_and_help
+ *   5. elf_scan finds "-h" → Try_help
+ *   6. nix wrapper with a resolvable target → Try_help
+ *   7. anything else → Skip *)
+let classify_binary bindir name =
+  let path = Filename.concat bindir name in
+  let classify_elf () =
+    let hits = elf_scan path ["-h"; "completion"] in
+    match Hashtbl.mem hits "completion", Hashtbl.mem hits "-h" with
+    | true, _ -> Try_native_and_help
+    | false, true -> Try_help
+    | false, false ->
+      (match nix_wrapper_target path with
+       | Some _ -> Try_help
+       | None -> Skip) in
+  if is_nushell_builtin name || skip_name name then Skip
+  else if not (is_executable path) then Skip
+  else if is_script path then Try_help
+  else classify_elf ()
+
+(* number of cpu cores, taken as the count of lines in /proc/cpuinfo that
+ * start with "processor" (never less than 1). if the file can't be opened
+ * or read — e.g. on non-linux systems — the answer is 4. *)
+let num_cores () =
+  try
+    let ic = open_in "/proc/cpuinfo" in
+    let rec count acc =
+      match input_line ic with
+      | line ->
+        count (if String.starts_with ~prefix:"processor" line then acc + 1 else acc)
+      | exception End_of_file -> acc in
+    let n = count 0 in
+    close_in ic;
+    max 1 n
+  with _ -> 4
+
+(* ask a binary for its own nushell completions.
+ * runs [bin_path] with each known "<verb> nushell" argument pair in turn,
+ * giving every attempt 500ms, and returns the output of the first attempt
+ * that passes is_nushell_source. returns None when no attempt does.
+ *
+ * the verbs are meant to cover cobra (go), clap (rust), click (python) and
+ * various ad-hoc implementations. *)
+let try_native_completion bin_path =
+  let verbs = [
+    "completions";
+    "completion";
+    "--completions";
+    "--completion";
+    "generate-completion";
+    "--generate-completion";
+    "shell-completions";
+  ] in
+  let attempt verb =
+    match run_cmd [bin_path; verb; "nushell"] 500 with
+    | Some text when is_nushell_source text -> Some text
+    | Some _ | None -> None in
+  List.find_map attempt verbs
+
+(* parse the manpage at [file].
+ * the command name comes from the synopsis when one can be extracted,
+ * otherwise from the file name. returns None when that name is a nushell
+ * builtin; otherwise Some (cmd, result, subs) where:
+ *   - result is the parsed page; if the page has per-subcommand sections,
+ *     its subcommand list is replaced by the names/descriptions of those
+ *     sections
+ *   - subs pairs "cmd sub" with the parse result of each such section *)
+let parse_manpage_for_command file =
+  let contents = read_manpage_file file in
+  let cmd =
+    match extract_synopsis_command contents with
+    | Some name -> name
+    | None -> cmd_name_of_manpage file in
+  if is_nushell_builtin cmd then None
+  else begin
+    let parsed = parse_manpage_string contents in
+    let sections = extract_subcommand_sections contents in
+    let result =
+      match sections with
+      | [] -> parsed
+      | _ :: _ ->
+        { parsed with
+          subcommands =
+            List.map (fun (name, desc, _) -> { name; desc }) sections } in
+    let subs =
+      List.map (fun (name, _, sub_result) -> (cmd ^ " " ^ name, sub_result))
+        sections in
+    Some (cmd, result, subs)
+  end
+
+(* "inshellah manpage FILE" — parse one manpage and print its nushell extern
+ * to stdout. prints nothing when the page is skipped or has no entries. *)
+let cmd_manpage file =
+  match parse_manpage_for_command file with
+  | None -> ()
+  | Some (_, { entries = []; _ }, _) -> ()
+  | Some (cmd, result, _) -> print_string (generate_extern cmd result)
+
+(* "inshellah manpage-dir DIR" — run cmd_manpage over every file in the
+ * manN subdirectories of [dir] selected by command_sections. a file whose
+ * processing raises is silently skipped so one bad page doesn't stop the
+ * batch. *)
+let cmd_manpage_dir dir =
+  let emit_section section =
+    let subdir = Filename.concat dir (Printf.sprintf "man%d" section) in
+    if is_dir subdir then
+      Sys.readdir subdir
+      |> Array.iter (fun file ->
+           try cmd_manpage (Filename.concat subdir file) with _ -> ()) in
+  List.iter emit_section command_sections
+
+(* safety limit for subcommand resolution: once this many results have been
+ * collected for a binary, help_resolve_par stops enqueueing newly discovered
+ * subcommands. items already queued are still processed, so the final count
+ * can exceed this number. prevents runaway recursion on tools with enormous
+ * subcommand trees. *)
+let max_resolve_results = 500
+
+(* exception-safe front end to parse_manpage_for_command.
+ * yields the parsed triple only when it carries something useful — at least
+ * one entry or at least one subcommand section. skipped pages, empty pages
+ * and any exception all yield None. *)
+let process_manpage file =
+  match parse_manpage_for_command file with
+  | None | Some (_, { entries = []; _ }, []) -> None
+  | Some _ as useful -> useful
+  | exception _ -> None
+
+(* set of command names that have a manpage under [mandir], judged by file
+ * name only (cmd_name_of_manpage) across the manN subdirectories selected
+ * by command_sections. indexing uses it to skip --help for commands that the
+ * manpage phase will handle instead. *)
+let manpaged_commands mandir =
+  let names = ref SSet.empty in
+  List.iter (fun section ->
+    let subdir = Filename.concat mandir (Printf.sprintf "man%d" section) in
+    if is_dir subdir then
+      Array.iter (fun f -> names := SSet.add (cmd_name_of_manpage f) !names)
+        (Sys.readdir subdir)
+  ) command_sections;
+  !names
+
+(* parallel structured help resolver — recursively resolves a command and
+ * all its subcommands by running --help on each, forking a child process
+ * per subcommand for parallelism.
+ *
+ * arguments:
+ *   ?timeout — per-invocation budget in ms handed to run_cmd (default 200)
+ *   cmd      — the executable to run
+ *   rest     — subcommand path placed between cmd and --help for the root
+ *   name     — display name of the root; subcommands get "name sub ..."
+ * returns the (name, help_result) pairs in the order they were collected.
+ *
+ * the resolver works as a breadth-first queue:
+ *   1. start with the root command in the queue
+ *   2. fork a child for each queued item (up to num_cores concurrent)
+ *   3. the child runs --help (falling back to -h when that yields no
+ *      output), parses the output, marshals the result via pipe
+ *   4. the parent collects results and enqueues discovered subcommands
+ *   5. repeat until queue is empty and all children have finished
+ *
+ * depth is limited to 5 levels, and no new subcommands are enqueued once
+ * max_resolve_results results exist, to prevent runaway recursion on
+ * pathological command trees.
+ *
+ * peculiarity: the child process detects "self-listing" — when a subcommand's
+ * --help lists itself as a subcommand (e.g. "git help" listing "help" as a
+ * subcommand of itself). this would cause infinite recursion, so such results
+ * are discarded.
+ *
+ * peculiarity: children close all pipe fds from other pending children
+ * immediately after fork to prevent fd leaks. the parent drains pipes
+ * regularly to prevent children from blocking on full pipe buffers.
+ *
+ * peculiarity: a child never leaves its branch. every exception is caught
+ * there (the parent then just sees an empty pipe), and the child terminates
+ * with Unix._exit so that at_exit handlers don't run and stdio buffers
+ * inherited from the parent are not flushed a second time. *)
+let help_resolve_par ?(timeout=200) cmd rest name =
+  let max_jobs = num_cores () in
+  let queue = Queue.create () in
+  Queue.push (rest, name, 0) queue;
+  let results = ref [] in
+  (* pending: (pid, rd, buf, rest, name, depth) *)
+  let pending = ref [] in
+  (* a child has finished: read what is left in its pipe, decode the
+     marshaled result, record it and enqueue its subcommands *)
+  let collect rd buf q_rest q_name q_depth =
+    drain_fd rd buf;
+    (try Unix.close rd with _ -> ());
+    let data = Buffer.contents buf in
+    let result : (help_result * subcommand list) option =
+      if String.length data > 0 then
+        try Marshal.from_string data 0 with _ -> None
+      else None in
+    match result with
+    | None -> ()
+    | Some (r, subs) ->
+      let at_limit = q_depth >= 5 || List.length !results >= max_resolve_results in
+      results := (q_name, r) :: !results;
+      if not at_limit then
+        List.iter (fun (sc : subcommand) ->
+          Queue.push (q_rest @ [sc.name], q_name ^ " " ^ sc.name, q_depth + 1) queue
+        ) subs in
+  (* drain every pending pipe and collect the children that have exited *)
+  let reap () =
+    pending := List.filter (fun (pid, rd, buf, q_rest, q_name, q_depth) ->
+      drain_fd rd buf;
+      match Unix.waitpid [Unix.WNOHANG] pid with
+      | (0, _) -> true
+      | _ -> collect rd buf q_rest q_name q_depth; false
+      | exception Unix.Unix_error (Unix.ECHILD, _, _) ->
+        (try Unix.close rd with _ -> ()); false
+    ) !pending in
+  (* block until fewer than max_jobs children are running *)
+  let wait_for_slot () =
+    while List.length !pending >= max_jobs do
+      reap ();
+      if List.length !pending >= max_jobs then begin
+        let fds = List.map (fun (_, rd, _, _, _, _) -> rd) !pending in
+        ignore (Unix.select fds [] [] 0.05)
+      end
+    done in
+  while not (Queue.is_empty queue) || !pending <> [] do
+    while not (Queue.is_empty queue) do
+      let (q_rest, q_name, q_depth) = Queue.pop queue in
+      wait_for_slot ();
+      let (rd, wr) = Unix.pipe () in
+      let pid = Unix.fork () in
+      if pid = 0 then begin
+        (* child: do the work, report through the pipe, and leave *)
+        (try
+          Unix.close rd;
+          List.iter (fun (_, prd, _, _, _, _) ->
+            try Unix.close prd with _ -> ()) !pending;
+          let result =
+            let text = match run_cmd (cmd :: q_rest @ ["--help"]) timeout with
+              | Some _ as r -> r
+              | None -> run_cmd (cmd :: q_rest @ ["-h"]) timeout in
+            match text with
+            | None -> None
+            | Some text ->
+              (match parse_help text with
+               | Error _ -> None
+               | Ok r when r.entries = [] && r.subcommands = [] && r.positionals = [] -> None
+               | Ok r ->
+                 let self_listed = match q_rest with
+                   | [] -> false
+                   | _ ->
+                     let leaf = List.nth q_rest (List.length q_rest - 1) in
+                     List.exists (fun (sc : subcommand) -> sc.name = leaf) r.subcommands in
+                 if self_listed then None
+                 else
+                   let at_limit = q_depth >= 5 in
+                   let subs = if at_limit then [] else r.subcommands in
+                   Some (r, subs)) in
+          let oc = Unix.out_channel_of_descr wr in
+          Marshal.to_channel oc (result : (help_result * subcommand list) option) [];
+          close_out oc
+        with _ -> ());
+        Unix._exit 0
+      end else begin
+        Unix.close wr;
+        pending := (pid, rd, Buffer.create 4096, q_rest, q_name, q_depth) :: !pending
+      end
+    done;
+    if !pending <> [] then begin
+      reap ();
+      if !pending <> [] && Queue.is_empty queue then begin
+        let fds = List.map (fun (_, rd, _, _, _, _) -> rd) !pending in
+        ignore (Unix.select fds [] [] 0.05)
+      end
+    end
+  done;
+  List.rev !results
+
+(* "inshellah index" — the main indexing command.
+ * processes all binaries and manpages in the given prefix directories,
+ * writing completion data to the cache dir [dir]. [bindirs] and [mandirs]
+ * are walked pairwise and must have the same length. prints the number of
+ * indexed commands to stdout when done.
+ *
+ * the pipeline has two phases:
+ *
+ * phase 1 (binaries): fork one child per binary. each child:
+ *   - tries native nushell completions (if classified as Try_native_and_help)
+ *   - falls back to help_resolve_par (which itself forks per subcommand)
+ *   - marshals the result back via pipe as a tagged variant:
+ *     `Native of string — raw nushell source
+ *     `Parsed of (string * help_result) list — parsed flag data
+ *     `None — nothing useful extracted
+ *
+ * phase 2 (manpages): sequentially parse manpages for commands not yet
+ *   covered by phase 1. manpages are more reliable than --help for many
+ *   gnu tools, but slower to process.
+ *
+ * commands on the ignorelist are skipped entirely. commands on the
+ * help_only list skip manpage parsing and only use --help. commands
+ * with manpages skip --help in phase 1 (they'll be handled in phase 2).
+ *
+ * peculiarity: the done_cmds set tracks which commands have already been
+ * indexed to prevent duplicates across phases and across multiple prefix
+ * directories. the first result written for a command wins; this holds for
+ * native, parsed and manpage results alike.
+ *
+ * peculiarity: a child terminates with Unix._exit, so at_exit handlers don't
+ * run and stdio buffers inherited from the parent (e.g. a buffered
+ * "skipping ..." message) are not flushed once more by every child. *)
+let cmd_index bindirs mandirs ignorelist help_only dir =
+  ensure_dir dir;
+  let done_cmds = ref SSet.empty in
+  let n_results = ref 0 in
+  let index_bindir bindir mandir =
+    if not (is_dir bindir) then
+      Printf.eprintf "skipping %s (not found)\n" bindir
+    else begin
+      let bins = Sys.readdir bindir in
+      Array.sort String.compare bins;
+      let manpaged = if is_dir mandir
+        then manpaged_commands mandir else SSet.empty in
+      let max_jobs = num_cores () in
+      let classified = Array.map (fun name ->
+        if SSet.mem name ignorelist then (name, Skip)
+        else if SSet.mem name help_only then (name, classify_binary bindir name)
+        else if SSet.mem name manpaged then (name, Skip)
+        else (name, classify_binary bindir name)
+      ) bins in
+      let pending = ref [] in
+      (* a child has finished: decode what it sent and write it to the store *)
+      let process_result name rd buf =
+        drain_fd rd buf;
+        (try Unix.close rd with _ -> ());
+        let data = Buffer.contents buf in
+        if String.length data > 0 then begin
+          let result : [`Native of string | `Parsed of (string * help_result) list | `None] =
+            try Marshal.from_string data 0 with _ -> `None in
+          (match result with
+          | `Native src ->
+            (* same first-writer-wins rule as the parsed and manpage paths *)
+            if not (SSet.mem name !done_cmds) then begin
+              write_native ~dir name src;
+              incr n_results
+            end
+          | `Parsed pairs ->
+            List.iter (fun (cmd_name, r) ->
+              if not (SSet.mem cmd_name !done_cmds) then begin
+                write_result ~dir ~source:"help" cmd_name r;
+                done_cmds := SSet.add cmd_name !done_cmds;
+                incr n_results
+              end
+            ) pairs
+          | `None -> ())
+        end;
+        done_cmds := SSet.add name !done_cmds in
+      (* drain every pending pipe and process the children that have exited *)
+      let reap () =
+        pending := List.filter (fun (pid, rd, buf, name) ->
+          drain_fd rd buf;
+          match Unix.waitpid [Unix.WNOHANG] pid with
+          | (0, _) -> true
+          | _ ->
+            process_result name rd buf;
+            false
+          | exception Unix.Unix_error (Unix.ECHILD, _, _) ->
+            (try Unix.close rd with _ -> ()); false
+        ) !pending in
+      (* block until fewer than max_jobs children are running *)
+      let wait_for_slot () =
+        while List.length !pending >= max_jobs do
+          reap ();
+          if List.length !pending >= max_jobs then begin
+            let fds = List.map (fun (_, rd, _, _) -> rd) !pending in
+            ignore (Unix.select fds [] [] 0.05)
+          end
+        done in
+      Array.iter (fun (name, cls) ->
+        match cls with
+        | Skip -> ()
+        | Try_help | Try_native_and_help ->
+          wait_for_slot ();
+          let (rd, wr) = Unix.pipe () in
+          let pid = Unix.fork () in
+          if pid = 0 then begin
+            (* child: do the work, report through the pipe, and leave.
+               nothing may escape this branch into the parent's code path. *)
+            (try
+              Unix.close rd;
+              List.iter (fun (_, prd, _, _) ->
+                try Unix.close prd with _ -> ()) !pending;
+              let result =
+                try
+                  let path = Filename.concat bindir name in
+                  let native = match cls with
+                    | Try_native_and_help -> try_native_completion path
+                    | Try_help | Skip -> None in
+                  match native with
+                  | Some src -> `Native src
+                  | None ->
+                    let pairs = help_resolve_par ~timeout:200 path [] name in
+                    if pairs <> [] then `Parsed pairs else `None
+                with _ -> `None in
+              let oc = Unix.out_channel_of_descr wr in
+              Marshal.to_channel oc
+                (result : [`Native of string | `Parsed of (string * help_result) list | `None]) [];
+              close_out oc
+            with _ -> ());
+            Unix._exit 0
+          end else begin
+            Unix.close wr;
+            pending := (pid, rd, Buffer.create 4096, name) :: !pending
+          end
+      ) classified;
+      (* wait for the remaining children of this prefix *)
+      while !pending <> [] do
+        reap ();
+        if !pending <> [] then begin
+          let fds = List.map (fun (_, rd, _, _) -> rd) !pending in
+          ignore (Unix.select fds [] [] 0.05)
+        end
+      done;
+      (* Phase 2: manpages *)
+      if is_dir mandir then
+        List.iter (fun section ->
+          let subdir = Filename.concat mandir (Printf.sprintf "man%d" section) in
+          if is_dir subdir then begin
+            let files = Sys.readdir subdir in
+            Array.sort String.compare files;
+            Array.iter (fun file ->
+              let base_cmd = cmd_name_of_manpage file in
+              if SSet.mem base_cmd help_only then ()
+              else match process_manpage (Filename.concat subdir file) with
+              | None -> ()
+              | Some (cmd, result, subs) ->
+                if not (SSet.mem cmd !done_cmds) then begin
+                  write_result ~dir ~source:"manpage" cmd result;
+                  done_cmds := SSet.add cmd !done_cmds;
+                  incr n_results
+                end;
+                List.iter (fun (sub_cmd, sub_result) ->
+                  if not (SSet.mem sub_cmd !done_cmds) then begin
+                    write_result ~dir ~source:"manpage" sub_cmd sub_result;
+                    done_cmds := SSet.add sub_cmd !done_cmds;
+                    incr n_results
+                  end
+                ) subs
+            ) files
+          end
+        ) command_sections
+    end in
+  List.iter2 index_bindir bindirs mandirs;
+  Printf.printf "indexed %d commands into %s\n" !n_results dir
+
+(* "inshellah dump" — print the number of indexed commands, then one line
+ * per command with its source type in brackets ("?" when the store reports
+ * none). *)
+let cmd_dump dirs =
+  let cmds = all_commands dirs in
+  Printf.printf "%d commands\n" (List.length cmds);
+  let print_entry cmd =
+    let src = Option.value (file_type_of dirs cmd) ~default:"?" in
+    Printf.printf "  %-40s [%s]\n" cmd src in
+  List.iter print_entry cmds
+
+(* look [name] up in $PATH: the directories are tried left to right and the
+ * first one holding an executable regular file of that name wins. returns
+ * its path, or None when nothing matches or $PATH is unset.
+ * used during completion to find binaries for on-the-fly resolution. *)
+let find_in_path name =
+  match Sys.getenv_opt "PATH" with
+  | None -> None
+  | Some path ->
+    let candidate dir =
+      let p = Filename.concat dir name in
+      if is_executable p then Some p else None in
+    List.find_map candidate (String.split_on_char ':' path)
+
+(* resolve completions for [name] (executable at [path]) on the fly via
+ * help_resolve_par and store every resulting command under [dir], creating
+ * the directory if needed. returns Some of the resolved pairs, or None when
+ * resolution produced nothing (in which case nothing is written).
+ * called during "complete" when a command isn't in the index. *)
+let resolve_and_cache ~dir name path =
+  match help_resolve_par ~timeout:200 path [] name with
+  | [] -> None
+  | pairs ->
+    ensure_dir dir;
+    List.iter (fun (cmd_name, r) -> write_result ~dir cmd_name r) pairs;
+    Some pairs
+
+(* render one completion candidate as a json object with "value" and
+ * "description" members, for nushell's completer protocol. both strings go
+ * through escape_json first. *)
+let completion_json value desc =
+  let value = escape_json value and desc = escape_json desc in
+  Printf.sprintf "{\"value\":\"%s\",\"description\":\"%s\"}" value desc
+
+(* fuzzy matching: score how well [needle] matches [haystack]; 0 means no
+ * match, higher is better. tiers, checked in this order:
+ *   - empty needle: 1
+ *   - needle longer than haystack: 0
+ *   - exact (case-sensitive) match: 1000
+ *   - case-insensitive prefix: 900 + percentage of the haystack covered
+ *   - case-insensitive subsequence, greedy left to right; each matched char
+ *     scores 50 at a word boundary (index 0, after '-' or '_', or at a
+ *     lower→upper camelCase step in the original haystack) and 10 elsewhere,
+ *     plus 20 when it directly follows the previous matched char
+ *   - otherwise 0
+ *
+ * peculiarity: a match at index 0 also receives the +20 adjacency bonus,
+ * because the "previous match" position starts out at -1.
+ *
+ * this drives the completion candidate ranking. users typing "ser" should see
+ * "--server" ranked above "--preserve" even though both contain "ser" as a
+ * subsequence. the word-boundary bonus achieves this. *)
+let fuzzy_score needle haystack =
+  let nlen = String.length needle and hlen = String.length haystack in
+  if nlen = 0 then 1
+  else if nlen > hlen then 0
+  else if needle = haystack then 1000
+  else begin
+    let pat = String.lowercase_ascii needle in
+    let hay = String.lowercase_ascii haystack in
+    if String.starts_with ~prefix:pat hay then 900 + (nlen * 100 / hlen)
+    else begin
+      let is_lower c = c >= 'a' && c <= 'z'
+      and is_upper c = c >= 'A' && c <= 'Z' in
+      (* boundaries are judged on the original, case-preserving haystack *)
+      let at_boundary i =
+        i = 0
+        || (match haystack.[i - 1] with
+            | '-' | '_' -> true
+            | prev -> is_lower prev && is_upper haystack.[i]) in
+      (* hi: haystack index, ni: needle index, last_hit: index of the
+         previous matched haystack char *)
+      let rec walk hi ni score last_hit =
+        if ni = nlen then score
+        else if hi = hlen then 0
+        else if hay.[hi] = pat.[ni] then
+          let base = if at_boundary hi then 50 else 10 in
+          let streak = if last_hit = hi - 1 then 20 else 0 in
+          walk (hi + 1) (ni + 1) (score + base + streak) hi
+        else walk (hi + 1) ni score last_hit in
+      walk 0 0 0 (-1)
+    end
+  end
+
+(* known privilege-escalation wrappers. when one of these is the first token,
+ * we strip it and its options before completing the real command.
+ *
+ * rather than maintaining per-command option tables (fragile — e.g. sudo's
+ * -h is --help not --host, flags differ across implementations), we find the
+ * real command by scanning for the first non-flag token that is a known
+ * command (exists in the completion store or in $PATH). tokens like "root"
+ * in "sudo -u root" are skipped because they aren't commands. *)
+let elevation_commands =
+  (* membership is tested with List.mem against the first token of the
+   * command line being completed *)
+  [ "sudo"; "run0"; "doas"; "pkexec";
+    "su"; "calife"; "sux"; "sudoedit";
+    "please"; "super"; "priv" ]
+
+(* walk the tokens that follow an elevation command until the real command
+ * shows up. [is_command] decides whether a token names a known command.
+ *   - a literal "--" ends the scan: everything after it is returned as-is
+ *     (possibly the empty list, and without consulting is_command)
+ *   - tokens starting with '-' are skipped as flags
+ *   - the first remaining token accepted by is_command is returned together
+ *     with every token after it
+ *   - any other token (e.g. the value of a flag) is skipped
+ * returns None when the tokens run out before a command is found. *)
+let find_real_command is_command args =
+  let looks_like_flag tok = String.length tok > 0 && tok.[0] = '-' in
+  let rec scan tokens =
+    match tokens with
+    | [] -> None
+    | "--" :: tail -> Some tail
+    | tok :: tail ->
+      if looks_like_flag tok then scan tail
+      else if is_command tok then Some tokens
+      else scan tail
+  in
+  scan args
+
+(* "inshellah complete CMD [ARGS...]" — the nushell custom completer.
+ * this is the hot path — called every time the user presses tab in nushell.
+ *
+ * the completion logic:
+ *   1. try to find the command (or longest subcommand prefix) in the store
+ *   2. if not found, try on-the-fly resolution (find in $PATH, run --help, cache)
+ *   3. score all candidate completions against the partial input using fuzzy_score
+ *   4. output scored candidates as a json array
+ *
+ * subcommand resolution: the lookup tries longest prefix first.
+ * for "git add --", it first looks for "git add", then "git".
+ * this ensures subcommand-specific flags are shown.
+ *
+ * peculiarity: nushell sends a trailing empty token when the cursor is after
+ * a space ("git add "). in this case all_tokens includes the empty string.
+ * when the last token is non-empty, the user is still typing it, so we use
+ * it as the fuzzy filter. when empty, we show all candidates.
+ *
+ * peculiarity: if only a parent command matched (e.g. "git" matched but not
+ * "git add"), we suppress subcommand suggestions and only show flags. this
+ * prevents showing sibling subcommands when the user has already committed
+ * to a specific subcommand path. *)
+let cmd_complete spans user_dir system_dirs =
+  (* user_dir is placed ahead of system_dirs in the lookup list.
+   * NOTE(review): presumably lookup consults the dirs in list order, so the
+   * user's store would win — confirm against Store.lookup. *)
+  let dirs = user_dir :: system_dirs in
+  (* if the command line starts with a privilege-escalation wrapper, scan past
+   * it to find the real command. we identify the command by checking the store
+   * and $PATH — this avoids needing per-command option tables which are fragile
+   * across different implementations. if no real command is found, fall back to
+   * completing the elevation command itself. *)
+  let spans = match spans with
+    | cmd :: rest when List.mem cmd elevation_commands ->
+      let is_command name =
+        name <> "" && (lookup dirs name <> None || find_in_path name <> None)
+      in
+      (* an empty remainder (e.g. "sudo -- " with nothing after it) also falls
+       * back to the original spans *)
+      (match find_real_command is_command rest with
+       | Some (_ :: _ as real_spans) -> real_spans
+       | _ -> spans)
+    | _ -> spans in
+  match spans with
+  | [] -> print_string "[]\n"
+  | cmd_name :: rest ->
+    (* Try longest prefix match: "git add" before "git".
+       Prefixes of length n, n-1, ..., 1 are joined with single spaces and
+       looked up in turn; the first hit is returned as
+       (matched name, result, number of tokens in the matched prefix). *)
+    let find_result tokens =
+      let n = List.length tokens in
+      List.init n Fun.id |> List.find_map (fun drop ->
+        let prefix = List.filteri (fun i _ -> i < n - drop) tokens in
+        match prefix with
+        | [] -> None
+        | _ ->
+          let try_name = String.concat " " prefix in
+          match lookup dirs try_name with
+          | Some r -> Some (try_name, r, List.length prefix)
+          | None -> None) in
+    let all_tokens = cmd_name :: rest in
+    (* the token under the cursor: the last span, or "" when only the command
+       name was given *)
+    let last_token = match rest with
+      | [] -> "" | _ -> List.nth rest (List.length rest - 1) in
+    (* Only treat the last token as a completed subcommand when nushell
+       sends a trailing empty token (cursor is after a space).
+       Otherwise the user is still typing and we treat it as partial:
+       it is dropped from the lookup tokens and used only as the filter. *)
+    let lookup_tokens = if last_token = "" then all_tokens
+      else match rest with
+        | _ :: _ -> cmd_name :: List.rev (List.tl (List.rev rest))
+        | _ -> [cmd_name] in
+    (* NOTE(review): both arms below produce the same pair, so this is
+       equivalent to (find_result tokens, partial). *)
+    let resolve tokens partial =
+      match find_result tokens with
+      | Some _ as found -> (found, partial)
+      | None -> (None, partial) in
+    let found, partial = resolve lookup_tokens last_token in
+    (* Try on-the-fly resolution when no match or only a parent matched *)
+    let n_lookup = List.length lookup_tokens in
+    let result, partial = match found with
+      | Some (_, _, depth) when depth >= n_lookup - 1 ->
+        (* Exact or near-exact match — use it. "near-exact" means the match
+           covers every lookup token except possibly the last one (which is
+           the trailing "" whenever the cursor sits after a space). *)
+        (found, partial)
+      | _ ->
+        (* No match, or only a parent matched — try on-the-fly resolution.
+           On success the lookup is repeated so the freshly written entries
+           are picked up; otherwise the earlier (possibly shallow or empty)
+           result is kept. *)
+        (match find_in_path cmd_name with
+         | Some path ->
+           (match resolve_and_cache ~dir:user_dir cmd_name path with
+            | Some _pairs -> resolve lookup_tokens last_token
+            | None -> (found, partial))
+         | None -> (found, partial)) in
+    (* each candidate is a (score, json) pair until the final sort *)
+    let candidates = match result with
+      | None -> []
+      | Some (_matched_name, r, depth) ->
+        (* When the match is shallower than requested, the user already
+           typed a subcommand beyond the matched level — don't show
+           sibling subcommands, only flags *)
+        let sub_candidates = if depth < n_lookup - 1 then [] else
+        (* prefer the subcommands recorded on the result itself; when there
+           are none, ask the store via subcommands_of *)
+        let subs = match r.subcommands with
+          | _ :: _ -> r.subcommands
+          | [] -> subcommands_of dirs _matched_name in
+        List.filter_map (fun (sc : subcommand) ->
+          let s = fuzzy_score partial sc.name in
+          if s > 0 then Some (s, completion_json sc.name sc.desc) else None
+        ) subs in
+        (* build flag completion candidates from the entry list.
+         * for flags with both short and long forms (Both), we pick which form
+         * to display based on what the user is currently typing:
+         *   - if the partial input matches the short flag better, show the short
+         *     flag as the value and note the long form in the description
+         *   - otherwise (including empty partial), prefer the long flag and note
+         *     the short form in the description
+         * this keeps the candidate list clean (one entry per flag) while still
+         * surfacing the alternate form so the user knows about it.
+         *
+         * parameter names are appended to descriptions in angle brackets for
+         * mandatory params and square brackets for optional ones, matching the
+         * conventions users expect from cli help text. *)
+        let flag_candidates = List.filter_map (fun (e : entry) ->
+          let base_desc = match e.param with
+            | Some (Mandatory p) -> if e.desc <> "" then e.desc ^ " <" ^ p ^ ">" else "<" ^ p ^ ">"
+            | Some (Optional p) -> if e.desc <> "" then e.desc ^ " [" ^ p ^ "]" else "[" ^ p ^ "]"
+            | None -> e.desc in
+          let flag, desc = match e.switch with
+            | Long l -> ("--" ^ l, base_desc)
+            | Short c -> (Printf.sprintf "-%c" c, base_desc)
+            | Both (c, l) ->
+              (* score the partial against both forms to decide which to present.
+               * e.g. typing "-s" scores higher against "-s" than "--squeeze-blank",
+               * so we show "-s (aka --squeeze-blank)". when the partial is empty or
+               * matches the long form better, we default to the long form. *)
+              let long_flag = "--" ^ l in
+              let short_flag = Printf.sprintf "-%c" c in
+              let long_score = fuzzy_score partial long_flag in
+              let short_score = fuzzy_score partial short_flag in
+              if short_score > long_score then
+                (short_flag, Printf.sprintf "(aka %s) %s" long_flag base_desc)
+              else
+                (long_flag, Printf.sprintf "(aka %s) %s" short_flag base_desc) in
+          (* the entry is kept only if the chosen form matches the partial *)
+          let s = fuzzy_score partial flag in
+          if s > 0 then Some (s, completion_json flag desc) else None
+        ) r.entries in
+        let scored = sub_candidates @ flag_candidates in
+        (* highest score first; the scores are dropped once ordering is done *)
+        List.sort (fun (a, _) (b, _) -> compare b a) scored
+        |> List.map snd in
+    (* the candidates are already json objects; wrap them in a json array *)
+    Printf.printf "[%s]\n" (String.concat "," candidates)
+
+(* "inshellah query CMD" — dump the stored data for [cmd] verbatim.
+ * lookup_raw is consulted with [dirs]; on a hit the data is written to
+ * stdout followed by a newline, on a miss "not found: CMD" goes to stderr
+ * and the process exits with status 1. *)
+let cmd_query cmd dirs =
+  match lookup_raw dirs cmd with
+  | Some data -> print_endline data
+  | None ->
+    Printf.eprintf "not found: %s\n" cmd;
+    exit 1
+
+(* load a newline-separated list of command names from the file at [path]
+ * and return it as a string set.
+ * every line is trimmed; blank lines and lines starting with '#' are skipped.
+ *
+ * loading is best-effort: if the file cannot be opened or read (Sys_error),
+ * a warning is printed to stderr and the empty set is returned, so a missing
+ * list never aborts indexing. only Sys_error is handled — any other
+ * exception propagates to the caller instead of being silently swallowed. *)
+let load_ignorelist path =
+  match In_channel.with_open_text path In_channel.input_all with
+  | contents ->
+    contents
+    |> String.split_on_char '\n'
+    |> List.filter_map (fun line ->
+         let line = String.trim line in
+         if line <> "" && line.[0] <> '#' then Some line else None)
+    |> SSet.of_list
+  | exception Sys_error msg ->
+    Printf.eprintf "warning: cannot read list file: %s\n" msg;
+    SSet.empty
+
+(* parse the arguments of the "index" subcommand.
+ *
+ * recognized options (each takes one PATH value and may appear anywhere):
+ *   --dir PATH        cache directory to write to; the last one given wins,
+ *                     the default is default_store_path ()
+ *   --ignore PATH     list file loaded with load_ignorelist; repeated
+ *                     options are unioned
+ *   --help-only PATH  list file loaded with load_ignorelist; repeated
+ *                     options are unioned
+ * every other argument is collected as a prefix directory.
+ *
+ * returns (prefixes in command-line order, dir, ignore set, help-only set).
+ *
+ * an option given as the very last argument has no value; that is reported
+ * on stderr and the process exits with status 1, rather than silently
+ * treating the option name as a prefix directory. *)
+let parse_index_args args =
+  let missing_value opt =
+    Printf.eprintf "error: %s requires a PATH argument\n" opt;
+    exit 1 in
+  let rec go prefixes dir ignore help_only = function
+    | [] -> (List.rev prefixes, dir, ignore, help_only)
+    | "--dir" :: path :: rest -> go prefixes path ignore help_only rest
+    | "--ignore" :: path :: rest ->
+      go prefixes dir (SSet.union ignore (load_ignorelist path)) help_only rest
+    | "--help-only" :: path :: rest ->
+      go prefixes dir ignore (SSet.union help_only (load_ignorelist path)) rest
+    | [("--dir" | "--ignore" | "--help-only") as opt] -> missing_value opt
+    | prefix :: rest -> go (prefix :: prefixes) dir ignore help_only rest in
+  go [] (default_store_path ()) SSet.empty SSet.empty args
+
+(* split the arguments shared by complete/query/dump into store locations
+ * and positional arguments.
+ *   --dir PATH         replaces the user store dir (last one wins; the
+ *                      default is default_store_path ())
+ *   --system-dir PATH  adds a system store dir; each new one is consed onto
+ *                      the front, so the result lists them in reverse
+ *                      command-line order
+ * everything else is kept, in order, as a positional argument. an option
+ * name without a following value is treated as positional too.
+ *
+ * note: the option pairs are recognized at any position, so a "--dir X"
+ * that appears among the positional arguments is consumed as an option
+ * rather than passed through.
+ *
+ * returns (user_dir, system_dirs, positional arguments). *)
+let parse_dir_args args =
+  let rec go user_dir system_dirs positional remaining =
+    match remaining with
+    | "--dir" :: path :: tl -> go path system_dirs positional tl
+    | "--system-dir" :: path :: tl ->
+      go user_dir (path :: system_dirs) positional tl
+    | other :: tl -> go user_dir system_dirs (other :: positional) tl
+    | [] -> (user_dir, system_dirs, List.rev positional) in
+  go (default_store_path ()) [] [] args
+
+(* --- entry point ---
+ * the first command-line argument selects the subcommand; the remaining
+ * arguments are handed to that subcommand's own argument parser. anything
+ * unrecognized (including no arguments at all) ends up in usage (). *)
+let () =
+  let argv = List.tl (Array.to_list Sys.argv) in
+  match argv with
+  | "index" :: opts ->
+    let (prefixes, dir, ignorelist, help_only) = parse_index_args opts in
+    if prefixes = [] then begin
+      Printf.eprintf "error: index requires at least one prefix dir\n";
+      exit 1
+    end;
+    (* every prefix contributes its bin/ and its share/man/ directory *)
+    let under sub = List.map (fun p -> Filename.concat p sub) prefixes in
+    let bindirs = under "bin" in
+    let mandirs = under "share/man" in
+    cmd_index bindirs mandirs ignorelist help_only dir
+  | "complete" :: opts ->
+    let (user_dir, system_dirs, spans) = parse_dir_args opts in
+    cmd_complete spans user_dir system_dirs
+  | "query" :: opts ->
+    (* exactly one positional argument (the command name) is required *)
+    begin match parse_dir_args opts with
+    | (user_dir, system_dirs, [cmd]) -> cmd_query cmd (user_dir :: system_dirs)
+    | _ ->
+      Printf.eprintf "error: query CMD [--dir PATH] [--system-dir PATH]\n";
+      exit 1
+    end
+  | "dump" :: opts ->
+    (* positional arguments are ignored by dump *)
+    let (user_dir, system_dirs, _) = parse_dir_args opts in
+    cmd_dump (user_dir :: system_dirs)
+  | ["manpage"; file] -> cmd_manpage file
+  | ["manpage-dir"; dir] -> cmd_manpage_dir dir
+  | _ -> usage ()