From 2126311b5711d089257f56353fb5c083a94fb68f Mon Sep 17 00:00:00 2001 From: atagen Date: Wed, 18 Mar 2026 15:40:47 +1100 Subject: [PATCH] init --- .envrc | 1 + .gitignore | 3 + README.md | 11 + bin/.ocamlformat | 0 bin/dune | 4 + bin/main.ml | 998 +++++++++++++++++++++++++++++++++ doc/nixos.md | 192 +++++++ doc/nushell-integration.md | 184 ++++++ doc/runtime-completions.md | 84 +++ dune-project | 28 + flake.lock | 27 + flake.nix | 71 +++ inshellah.opam | 35 ++ lib/.ocamlformat | 0 lib/dune | 3 + lib/manpage.ml | 1088 ++++++++++++++++++++++++++++++++++++ lib/nushell.ml | 242 ++++++++ lib/parser.ml | 802 ++++++++++++++++++++++++++ lib/store.ml | 444 +++++++++++++++ nix/module.nix | 107 ++++ test/dune | 3 + test/test_inshellah.ml | 492 ++++++++++++++++ 22 files changed, 4819 insertions(+) create mode 100644 .envrc create mode 100644 .gitignore create mode 100644 README.md create mode 100644 bin/.ocamlformat create mode 100644 bin/dune create mode 100644 bin/main.ml create mode 100644 doc/nixos.md create mode 100644 doc/nushell-integration.md create mode 100644 doc/runtime-completions.md create mode 100644 dune-project create mode 100644 flake.lock create mode 100644 flake.nix create mode 100644 inshellah.opam create mode 100644 lib/.ocamlformat create mode 100644 lib/dune create mode 100644 lib/manpage.ml create mode 100644 lib/nushell.ml create mode 100644 lib/parser.ml create mode 100644 lib/store.ml create mode 100644 nix/module.nix create mode 100644 test/dune create mode 100644 test/test_inshellah.ml diff --git a/.envrc b/.envrc new file mode 100644 index 0000000..3550a30 --- /dev/null +++ b/.envrc @@ -0,0 +1 @@ +use flake diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..6e66774 --- /dev/null +++ b/.gitignore @@ -0,0 +1,3 @@ +/target +/_build +/.direnv diff --git a/README.md b/README.md new file mode 100644 index 0000000..899b887 --- /dev/null +++ b/README.md @@ -0,0 +1,11 @@ +# inshellah + +nushell completions engine. 
indexes completions from manpages, native +generators, and `--help` output, then serves them to nushell's external +completer. + +see `doc/` for details: + +- [nushell integration](doc/nushell-integration.md) — setup, usage, examples +- [nixos module](doc/nixos.md) — automatic build-time indexing +- [runtime completions](doc/runtime-completions.md) — on-the-fly caching via the completer diff --git a/bin/.ocamlformat b/bin/.ocamlformat new file mode 100644 index 0000000..e69de29 diff --git a/bin/dune b/bin/dune new file mode 100644 index 0000000..4bb8309 --- /dev/null +++ b/bin/dune @@ -0,0 +1,4 @@ +(executable + (public_name inshellah) + (name main) + (libraries inshellah)) diff --git a/bin/main.ml b/bin/main.ml new file mode 100644 index 0000000..fcbde6e --- /dev/null +++ b/bin/main.ml @@ -0,0 +1,998 @@ +(* main.ml — cli entry point for inshellah, a nushell completions engine. + * + * inshellah generates nushell "extern" definitions for external commands by + * parsing their manpages and --help output. it has two main modes: + * + * 1. indexing (batch): scan a prefix directory's bin/ and share/man/, + * extract completions for every binary, and write them to a cache dir. + * this is typically run once per nix profile or system update. + * + * 2. completing (interactive): given a command and its current arguments, + * look up the cached data and return json completion candidates for + * nushell's custom completer protocol. + * + * the indexing pipeline for each binary: + * a. classify the binary (skip? try --help? try native completions?) + * b. if the tool has native nushell completion support, try various + * subcommand patterns ("completions nushell", "--completion nushell", etc.) + * c. otherwise, run the tool with --help/-h and parse the output + * d. recursively resolve subcommands (depth-limited to 5) + * e. 
after binaries, parse manpages for any commands not yet covered
 *
 * parallelism: indexing forks per binary, and subcommand resolution forks
 * per subcommand. results are marshaled back via pipes. this gives good
 * throughput on multi-core systems while keeping the code simple (no threads,
 * no async runtime — just unix fork/pipe/waitpid).
 *)

open Inshellah.Parser
open Inshellah.Manpage
open Inshellah.Nushell
open Inshellah.Store

module SSet = Set.Make(String)

(* write the command-line synopsis to stderr and terminate with status 1.
 * invoked when the arguments do not select a valid subcommand. *)
let usage () =
  prerr_string
    {|inshellah - nushell completions engine

Usage:
  inshellah index PREFIX... [--dir PATH] [--ignore FILE] [--help-only FILE]
      Index completions into a directory of JSON/nu files.
      PREFIX is a directory containing bin/ and share/man/.
      Default dir: $XDG_CACHE_HOME/inshellah
      --ignore FILE      skip listed commands entirely
      --help-only FILE   skip manpages for listed commands, use --help instead
  inshellah complete CMD [ARGS...] [--dir PATH] [--system-dir PATH]
      Nushell custom completer. Outputs JSON completion candidates.
      Falls back to --help resolution if command is not indexed.
  inshellah query CMD [--dir PATH] [--system-dir PATH]
      Print stored completion data for CMD.
  inshellah dump [--dir PATH] [--system-dir PATH]
      List indexed commands.
  inshellah manpage FILE          Parse a manpage and emit nushell extern
  inshellah manpage-dir DIR       Batch-process manpages under DIR

|};
  exit 1

(* the manpage sections that document commands:
 * 1 holds user commands, 8 holds system administration commands. *)
let command_sections = [1; 8]

(* true when [sub] occurs literally somewhere in [s].
 * the needle is quoted with Str.regexp_string, so it is never read as a
 * regular expression. *)
let contains_str s sub =
  match Str.search_forward (Str.regexp_string sub) s 0 with
  | (_ : int) -> true
  | exception Not_found -> false

(* heuristic to detect whether text is valid nushell source code.
 * checks for common nushell declaration keywords.
the length > 20 check avoids false positives on short error messages. *)
let is_nushell_source text =
  let mentions = contains_str text in
  String.length text > 20
  && (mentions "export extern"
      || mentions "export def"
      || (mentions "module " && mentions "export"))

(* derive the command name from a manpage path: take the basename, drop a
 * trailing ".gz" when present, then drop the remaining extension.
 * "ls.1.gz" → "ls.1" → "ls". a name without any extension is returned as is. *)
let cmd_name_of_manpage path =
  let base = Filename.basename path in
  let uncompressed =
    match Filename.chop_suffix_opt ~suffix:".gz" base with
    | Some stem -> stem
    | None -> base in
  match Filename.chop_extension uncompressed with
  | name -> name
  | exception Invalid_argument _ -> uncompressed

(* environment handed to every child process, computed once on first use.
 * it is the current environment minus the display/session variables
 * (DISPLAY, WAYLAND_DISPLAY, DBUS_SESSION_BUS_ADDRESS, XAUTHORITY), so that
 * gui tools invoked with --help cannot open windows or hang waiting for a
 * display connection (ckb-next is one tool that did). *)
let safe_env = lazy (
  let blocked =
    ["DISPLAY="; "WAYLAND_DISPLAY="; "DBUS_SESSION_BUS_ADDRESS="; "XAUTHORITY="] in
  let keep binding =
    not (List.exists (fun prefix -> String.starts_with ~prefix binding) blocked) in
  Unix.environment ()
  |> Array.to_list
  |> List.filter keep
  |> Array.of_list)

(* append to [buf] whatever can be read from [rd] right now, never blocking.
 * polls with a zero select timeout and stops as soon as the fd is not
 * readable, reports end-of-file, or a read fails. the fork/pipe call sites
 * call this repeatedly so that children never stall on a full pipe. *)
let drain_fd rd buf =
  let chunk = Bytes.create 8192 in
  let rec pump () =
    match Unix.select [rd] [] [] 0.0 with
    | ([], _, _) -> ()
    | _ ->
      (match Unix.read rd chunk 0 8192 with
       | 0 -> ()
       | n -> Buffer.add_subbytes buf chunk 0 n; pump ()
       | exception Unix.Unix_error _ -> ())
  in
  pump ()

(* run a command with a timeout, capturing its stdout+stderr.
* forks a child process, redirects stdin from /dev/null, and merges
 * stdout+stderr onto a pipe. reads from the pipe with select() polling
 * until either the child closes its output or the deadline is reached.
 *
 * peculiarity: the child is run in /tmp to prevent tools that create
 * side-effect files (like ckb-next-dev-detect-report.gz) from polluting
 * the user's working directory. we chdir to /tmp before spawning and
 * restore the previous directory afterwards, also when spawning fails.
 *
 * peculiarity: the select timeout is capped at 0.05s per iteration to ensure
 * we check the deadline frequently even when no data is available.
 *
 * args is the full argv (program first); timeout_ms is the budget in
 * milliseconds. returns none if args is empty, the process couldn't be
 * started, or it produced no output. when the deadline passes the child is
 * killed and whatever output was captured until then is still returned.
 * every descriptor opened here is closed on all paths and the child is
 * always reaped. *)
let run_cmd args timeout_ms =
  match args with
  | [] -> None
  | prog :: _ ->
    let close_quietly fd = try Unix.close fd with Unix.Unix_error _ -> () in
    let (rd, wr) = Unix.pipe () in
    (* spawn with stdin on /dev/null and stdout/stderr on the pipe; the
     * helper owns devnull and the temporary chdir, and undoes both whether
     * or not the spawn succeeds. *)
    let spawn () =
      let devnull = Unix.openfile "/dev/null" [Unix.O_RDONLY] 0 in
      Fun.protect ~finally:(fun () -> close_quietly devnull) (fun () ->
        let saved_cwd = Sys.getcwd () in
        Sys.chdir "/tmp";
        Fun.protect
          ~finally:(fun () -> try Sys.chdir saved_cwd with Sys_error _ -> ())
          (fun () ->
            Unix.create_process_env prog (Array.of_list args)
              (Lazy.force safe_env) devnull wr wr)) in
    let pid = try spawn () with Unix.Unix_error _ | Sys_error _ -> -1 in
    (* the write end is closed exactly once, here, on every path *)
    close_quietly wr;
    if pid < 0 then (close_quietly rd; None)
    else begin
      let buf = Buffer.create 4096 in
      let deadline =
        Unix.gettimeofday () +. (float_of_int timeout_ms /. 1000.0) in
      let chunk = Bytes.create 8192 in
      (* true once the child closed its output, false when time ran out *)
      let rec read_until_eof () =
        let remaining = deadline -. Unix.gettimeofday () in
        if remaining <= 0.0 then false
        else
          match Unix.select [rd] [] [] (min remaining 0.05) with
          | ([], _, _) -> read_until_eof ()
          | _ ->
            let n = Unix.read rd chunk 0 8192 in
            if n = 0 then true
            else begin
              Buffer.add_subbytes buf chunk 0 n;
              read_until_eof ()
            end
          | exception Unix.Unix_error (Unix.EINTR, _, _) -> read_until_eof ()
      in
      (* a failing read is handled like a timeout: stop and kill the child *)
      let saw_eof = try read_until_eof () with Unix.Unix_error _ -> false in
      close_quietly rd;
      let kill_and_reap () =
        (try Unix.kill pid Sys.sigkill with Unix.Unix_error _ -> ());
        (try ignore (Unix.waitpid [] pid) with Unix.Unix_error _ -> ()) in
      (* a child may close its output and keep running; keep honouring the
       * deadline instead of blocking in waitpid forever. *)
      let rec reap_before_deadline () =
        match Unix.waitpid [Unix.WNOHANG] pid with
        | (0, _) ->
          if Unix.gettimeofday () >= deadline then kill_and_reap ()
          else (Unix.sleepf 0.001; reap_before_deadline ())
        | _ -> ()
        | exception Unix.Unix_error _ -> ()
      in
      if saw_eof then reap_before_deadline () else kill_and_reap ();
      if Buffer.length buf > 0 then Some (Buffer.contents buf) else None
    end

(* true when [path] (symlinks followed by stat) is a regular file with at
 * least one execute permission bit set. a failing stat yields false. *)
let is_executable path =
  match Unix.stat path with
  | { Unix.st_kind = Unix.S_REG; st_perm; _ } -> st_perm land 0o111 <> 0
  | _ -> false
  | exception Unix.Unix_error _ -> false

(* true when the file starts with the two bytes "#!".
 * the path is resolved with realpath before it is opened; files that cannot
 * be resolved, opened or read, or that are shorter than two bytes, are not
 * scripts. the channel is closed on every path. *)
let is_script path =
  try
    let ic = open_in_bin (Unix.realpath path) in
    Fun.protect ~finally:(fun () -> close_in_noerr ic) (fun () ->
      match really_input_string ic 2 with
      | magic -> String.equal magic "#!"
      | exception End_of_file -> false)
  with Unix.Unix_error _ | Sys_error _ -> false

(* scan an elf binary for string needles without loading the entire file.
 * reads the file in 64kb chunks, searching each chunk for the needle strings.
 * uses a sliding window (carry) of max_needle bytes between chunks to handle
 * needles that span chunk boundaries.
 *
 * peculiarity: on read failure (e.g. if the path resolves to something
 * unreadable), all needles are marked as found. this is a conservative
 * fallback — we'd rather try --help on an unreadable binary than skip it.
 *
 * the inner loop is a manual byte-by-byte comparison rather than using
 * String.contains or Str for performance — this runs on every binary
 * in the prefix, so it needs to be fast.
*)
let elf_scan path needles =
  (* result table: a needle is a key exactly when it counts as present *)
  let found = Hashtbl.create 4 in
  let unmatched () =
    List.filter (fun n -> not (Hashtbl.mem found n)) needles in
  let scan ic =
    (* only files carrying the elf magic are searched at all *)
    if String.equal (really_input_string ic 4) "\x7fELF" then begin
      let max_needle =
        List.fold_left (fun m n -> max m (String.length n)) 0 needles in
      let chunk_size = 65536 in
      let buf = Bytes.create (chunk_size + max_needle) in
      let carry = ref 0 in
      let eof = ref false in
      while not !eof && unmatched () <> [] do
        let n = try input ic buf !carry chunk_size with End_of_file -> 0 in
        if n = 0 then eof := true
        else begin
          let total = !carry + n in
          List.iter (fun needle ->
            let nlen = String.length needle in
            (* compare the rest of the needle, stopping at the first mismatch *)
            let rec tail_matches pos j =
              j >= nlen
              || (Bytes.get buf (pos + j) = needle.[j]
                  && tail_matches pos (j + 1)) in
            let i = ref 0 in
            while !i <= total - nlen do
              if Bytes.get buf !i = needle.[0] && tail_matches !i 1 then begin
                Hashtbl.replace found needle true;
                i := total            (* found: leave the search loop *)
              end else incr i
            done
          ) (unmatched ());
          (* keep the tail of this window so that a needle straddling two
           * reads is still seen in the next iteration *)
          let new_carry = min max_needle total in
          Bytes.blit buf (total - new_carry) buf 0 new_carry;
          carry := new_carry
        end
      done
    end in
  (try
     let ic = open_in_bin (Unix.realpath path) in
     (* close the channel on every path, including files too short to hold
      * the 4-byte magic, which previously left the descriptor open *)
     Fun.protect ~finally:(fun () -> close_in_noerr ic) (fun () -> scan ic)
   with Unix.Unix_error _ | Sys_error _ | End_of_file | Invalid_argument _ ->
     (* conservative fallback: treat every needle as present *)
     List.iter (fun n -> Hashtbl.replace found n true) needles);
  found

(* detect nix-generated c wrapper scripts and extract the real binary path.
 * nix's makeCWrapper creates small c programs that set up the environment
 * and exec the real binary. these wrappers won't contain "-h" or "completion"
 * in their own binary (they're just wrappers), so elf_scan would say "skip".
 * this function reads the wrapper source to find the actual /nix/store/.../bin/...
 * target path, so we can try --help on the real binary instead.
*
 * peculiarity: caps the read at 64kb to avoid accidentally reading a large
 * non-wrapper binary into memory.
 *
 * returns the first matching store path that exists on disk, or none when
 * the file is larger than 64kb, lacks the makeCWrapper marker, names no such
 * path, or cannot be resolved or read. the channel is closed on every path. *)
let nix_wrapper_target path =
  try
    let ic = open_in_bin (Unix.realpath path) in
    let contents =
      Fun.protect ~finally:(fun () -> close_in_noerr ic) (fun () ->
        let n = in_channel_length ic in
        if n > 65536 then None else Some (really_input_string ic n)) in
    (match contents with
     | None -> None
     | Some s ->
       if not (contains_str s "makeCWrapper") then None
       else begin
         let re = Str.regexp "/nix/store/[a-z0-9]+-[^' \n\r\x00]+/bin/[a-zA-Z0-9._-]+" in
         match Str.search_forward re s 0 with
         | (_ : int) ->
           let target = Str.matched_string s in
           if Sys.file_exists target then Some target else None
         | exception Not_found -> None
       end)
  with Unix.Unix_error _ | Sys_error _ | End_of_file -> None

(* name-based filter for entries that are never indexed:
 * the empty name, "-", dotfiles, names starting with "lib", names ending in
 * "-daemon", "-wrapped" or ".so", and names without a single ascii letter
 * or digit. *)
let skip_name name =
  let is_alnum = function
    | 'a' .. 'z' | 'A' .. 'Z' | '0' .. '9' -> true
    | _ -> false in
  String.length name = 0 || name = "-" || name.[0] = '.'
  || String.starts_with ~prefix:"lib" name
  || List.exists (fun suffix -> String.ends_with ~suffix name)
       ["-daemon"; "-wrapped"; ".so"]
  || not (String.exists is_alnum name)

(* how a binary is handled during indexing.
 *   Skip                — leave the binary out of the index
 *   Try_help            — run it with --help only
 *   Try_native_and_help — ask for native nushell completions first, then
 *                         fall back to --help *)
type bin_class = Skip | Try_help | Try_native_and_help

(* classify a binary to decide the indexing strategy.
 * decision tree:
 *   1. nushell builtin or bad name → Skip
 *   2. not executable → Skip
 *   3. script (has shebang) → Try_help (scripts can't have native completions)
 *   4. elf binary containing "completion" → Try_native_and_help
 *   5.
elf binary containing "-h" → Try_help
 *   6. nix wrapper → Try_help (the wrapper itself is just an exec shim)
 *   7. otherwise → Skip (binary has no help infrastructure) *)
let classify_binary bindir name =
  let path = Filename.concat bindir name in
  (* steps 4-7: decided from the bytes of the file itself *)
  let classify_by_contents () =
    let hits = elf_scan path ["-h"; "completion"] in
    if Hashtbl.mem hits "completion" then Try_native_and_help
    else if Hashtbl.mem hits "-h" then Try_help
    else
      match nix_wrapper_target path with
      | Some _ -> Try_help
      | None -> Skip in
  if is_nushell_builtin name || skip_name name then Skip
  else if not (is_executable path) then Skip
  else if is_script path then Try_help
  else classify_by_contents ()

(* number of cpu cores, taken as the count of lines in /proc/cpuinfo that
 * start with "processor" (never less than 1).
 * yields 4 when the file cannot be opened or read, e.g. on non-linux. *)
let num_cores () =
  try
    let ic = open_in "/proc/cpuinfo" in
    let rec count seen =
      match input_line ic with
      | line ->
        count (if String.starts_with ~prefix:"processor" line
               then seen + 1 else seen)
      | exception End_of_file -> seen in
    let total = count 0 in
    close_in ic;
    max 1 total
  with _ -> 4

(* try to get native nushell completions from a binary.
 * tries several common subcommand patterns that tools use for shell completions.
 * returns the first one that produces valid nushell source code.
 * the 500ms timeout is generous enough for most tools but prevents hangs.
 *
 * the patterns cover: cobra (go), clap (rust), click (python), and various
 * ad-hoc implementations.
*)
let try_native_completion bin_path =
  let probes = [
    ["completions"; "nushell"];
    ["completion"; "nushell"];
    ["--completions"; "nushell"];
    ["--completion"; "nushell"];
    ["generate-completion"; "nushell"];
    ["--generate-completion"; "nushell"];
    ["shell-completions"; "nushell"];
  ] in
  let attempt extra_args =
    match run_cmd (bin_path :: extra_args) 500 with
    | Some text -> if is_nushell_source text then Some text else None
    | None -> None in
  List.find_map attempt probes

(* read and parse one manpage file.
 * the command name is the one found in the synopsis, or else the name
 * derived from the file name. yields none for nushell builtins; otherwise
 * some (cmd, result, subs) where subs pairs "cmd sub" with the parse of
 * each per-subcommand section. when such sections exist, they also replace
 * the subcommand list of the top-level result. *)
let parse_manpage_for_command file =
  let contents = read_manpage_file file in
  let from_filename = cmd_name_of_manpage file in
  let cmd =
    Option.value (extract_synopsis_command contents) ~default:from_filename in
  if is_nushell_builtin cmd then None
  else begin
    let result = parse_manpage_string contents in
    let sub_sections = extract_subcommand_sections contents in
    let result =
      match sub_sections with
      | [] -> result
      | _ :: _ ->
        { result with subcommands = List.map (fun (name, desc, _) ->
            { name; desc }) sub_sections } in
    let subs =
      List.map (fun (name, _desc, r) -> (cmd ^ " " ^ name, r)) sub_sections in
    Some (cmd, result, subs)
  end

(* "inshellah manpage FILE" — print the nushell extern generated from one
 * manpage. prints nothing when the page is skipped or has no entries. *)
let cmd_manpage file =
  match parse_manpage_for_command file with
  | None -> ()
  | Some (cmd, result, _) ->
    if result.entries <> [] then print_string (generate_extern cmd result)

(* "inshellah manpage-dir DIR" — run cmd_manpage over every file in the
 * man1 and man8 subdirectories of DIR that exist. a page that raises is
 * skipped so that one bad file does not abort the batch. *)
let cmd_manpage_dir dir =
  let process_section section =
    let subdir = Filename.concat dir (Printf.sprintf "man%d" section) in
    if is_dir subdir then
      Sys.readdir subdir
      |> Array.iter (fun file ->
           try cmd_manpage (Filename.concat subdir file) with _ -> ()) in
  List.iter process_section command_sections

(* safety limit: don't accumulate more
than 500 subcommand resolution results
 * per binary. prevents runaway recursion on tools with enormous subcommand trees. *)
let max_resolve_results = 500

(* exception-free front end to parse_manpage_for_command.
 * yields the parsed triple only when it carries something useful (entries
 * on the command itself, or at least one subcommand section); a skipped
 * page, an empty parse and any exception all become none. *)
let process_manpage file =
  match parse_manpage_for_command file with
  | Some (_, result, subs) as parsed ->
    if result.entries <> [] || subs <> [] then parsed else None
  | None -> None
  | exception _ -> None

(* names of all commands that have a page in the man1/man8 subdirectories of
 * [mandir], derived from the file names alone (nothing is parsed).
 * indexing uses this set to leave those commands to the manpage phase
 * instead of running them with --help. *)
let manpaged_commands mandir =
  let add_section names section =
    let subdir = Filename.concat mandir (Printf.sprintf "man%d" section) in
    if not (is_dir subdir) then names
    else
      Sys.readdir subdir
      |> Array.fold_left
           (fun acc file -> SSet.add (cmd_name_of_manpage file) acc) names in
  List.fold_left add_section SSet.empty command_sections

(* parallel structured help resolver — recursively resolves a command and
 * all its subcommands by running --help on each, forking a child process
 * per subcommand for parallelism.
 *
 * the resolver works as a breadth-first queue:
 *   1. start with the root command in the queue
 *   2. fork a child for each queued item (up to num_cores concurrent)
 *   3. the child runs --help, parses the output, marshals the result via pipe
 *   4. the parent collects results and enqueues discovered subcommands
 *   5. repeat until queue is empty and all children have finished
 *
 * depth is limited to 5 levels and total results to max_resolve_results
 * to prevent runaway recursion on pathological command trees.
 *
 * peculiarity: the child process detects "self-listing" — when a subcommand's
 * --help lists itself as a subcommand (e.g. "git help" listing "help" as a
 * subcommand of itself). this would cause infinite recursion, so such results
 * are discarded.
*
 * peculiarity: children close all pipe fds from other pending children
 * immediately after fork to prevent fd leaks. the parent drains pipes
 * regularly to prevent children from blocking on full pipe buffers.
 *
 * peculiarity: a forked child never returns into the caller's frames. any
 * exception raised while resolving becomes an empty result, and the child
 * leaves through Unix._exit, so stdio buffers inherited from the parent are
 * not flushed a second time.
 *
 * timeout is the per-invocation budget in milliseconds (default 200), cmd
 * the executable to run, rest the subcommand words already committed to,
 * and name the display name of that command. the result is a list of
 * (command name, parsed help) pairs in the order they were collected. *)
let help_resolve_par ?(timeout=200) cmd rest name =
  let max_jobs = num_cores () in
  let queue = Queue.create () in
  Queue.push (rest, name, 0) queue;
  let results = ref [] in
  (* pending: (pid, rd, buf, rest, name, depth) *)
  let pending = ref [] in
  (* decode what a finished child wrote, record it, and enqueue the
   * subcommands it reported unless a limit has been reached *)
  let collect rd buf q_rest q_name q_depth =
    drain_fd rd buf;
    (try Unix.close rd with _ -> ());
    let data = Buffer.contents buf in
    let result : (help_result * subcommand list) option =
      if String.length data > 0 then
        try Marshal.from_string data 0 with _ -> None
      else None in
    match result with
    | None -> ()
    | Some (r, subs) ->
      let at_limit = q_depth >= 5 || List.length !results >= max_resolve_results in
      results := (q_name, r) :: !results;
      if not at_limit then
        List.iter (fun (sc : subcommand) ->
          Queue.push (q_rest @ [sc.name], q_name ^ " " ^ sc.name, q_depth + 1) queue
        ) subs in
  (* drain every pending pipe and collect the children that have exited *)
  let reap () =
    pending := List.filter (fun (pid, rd, buf, q_rest, q_name, q_depth) ->
      drain_fd rd buf;
      match Unix.waitpid [Unix.WNOHANG] pid with
      | (0, _) -> true
      | _ -> collect rd buf q_rest q_name q_depth; false
      | exception Unix.Unix_error (Unix.ECHILD, _, _) ->
        (try Unix.close rd with _ -> ()); false
    ) !pending in
  (* block until fewer than max_jobs children are running *)
  let wait_for_slot () =
    while List.length !pending >= max_jobs do
      reap ();
      if List.length !pending >= max_jobs then begin
        let fds = List.map (fun (_, rd, _, _, _, _) -> rd) !pending in
        ignore (Unix.select fds [] [] 0.05)
      end
    done in
  while not (Queue.is_empty queue) || !pending <> [] do
    while not (Queue.is_empty queue) do
      let (q_rest, q_name, q_depth) = Queue.pop queue in
      wait_for_slot ();
      let (rd, wr) = Unix.pipe () in
      let pid = Unix.fork () in
      if pid = 0 then begin
        (* child *)
        Unix.close rd;
        List.iter (fun (_, prd, _, _, _, _) ->
          try Unix.close prd with _ -> ()) !pending;
        let result =
          (* contain every failure here: an exception escaping from the child
           * would unwind into handlers that belong to the parent's logic *)
          try
            let text = match run_cmd (cmd :: q_rest @ ["--help"]) timeout with
              | Some _ as r -> r
              | None -> run_cmd (cmd :: q_rest @ ["-h"]) timeout in
            (match text with
             | None -> None
             | Some text ->
               (match parse_help text with
                | Error _ -> None
                | Ok r when r.entries = [] && r.subcommands = [] && r.positionals = [] -> None
                | Ok r ->
                  (* a subcommand whose help lists itself would recurse forever *)
                  let self_listed = match List.rev q_rest with
                    | [] -> false
                    | leaf :: _ ->
                      List.exists (fun (sc : subcommand) -> sc.name = leaf) r.subcommands in
                  if self_listed then None
                  else
                    let at_limit = q_depth >= 5 in
                    let subs = if at_limit then [] else r.subcommands in
                    Some (r, subs)))
          with _ -> None in
        (try
           let oc = Unix.out_channel_of_descr wr in
           Marshal.to_channel oc (result : (help_result * subcommand list) option) [];
           close_out oc
         with _ -> ());
        (* _exit skips at_exit handlers, so buffered stdout/stderr data
         * copied from the parent at fork time is not written again *)
        Unix._exit 0
      end else begin
        Unix.close wr;
        pending := (pid, rd, Buffer.create 4096, q_rest, q_name, q_depth) :: !pending
      end
    done;
    if !pending <> [] then begin
      reap ();
      if !pending <> [] && Queue.is_empty queue then begin
        let fds = List.map (fun (_, rd, _, _, _, _) -> rd) !pending in
        ignore (Unix.select fds [] [] 0.05)
      end
    end
  done;
  List.rev !results

(* "inshellah index" — the main indexing command.
 * processes all binaries and manpages in the given prefix directories,
 * writing completion data to the cache dir.
 *
 * the pipeline has two phases:
 *
 * phase 1 (binaries): fork one child per binary. each child:
 *   - tries native nushell completions (if classified as Try_native_and_help)
 *   - falls back to help_resolve_par (which itself forks per subcommand)
 *   - marshals the result back via pipe as a tagged variant:
 *     `Native of string — raw nushell source
 *     `Parsed of (string * help_result) list — parsed flag data
 *     `None — nothing useful extracted
 *
 * phase 2 (manpages): sequentially parse manpages for commands not yet
 * covered by phase 1.
manpages are more reliable than --help for many
 * gnu tools, but slower to process.
 *
 * commands on the ignorelist are skipped entirely. commands on the
 * help_only list skip manpage parsing and only use --help. commands
 * with manpages skip --help in phase 1 (they'll be handled in phase 2).
 *
 * peculiarity: the done_cmds set tracks which commands have already been
 * indexed to prevent duplicates across phases and across multiple prefix
 * directories. *)

(* privilege-escalation wrappers. the list sits ahead of cmd_index and
 * cmd_complete because both use it: cmd_index writes an "@complete external"
 * stub for each, cmd_complete strips the wrapper to reach the real command. *)
let elevation_commands =
  ["sudo"; "run0"; "doas"; "pkexec"; "su"; "calife"; "sux"; "sudoedit";
   "please"; "super"; "priv"]

(* bindirs and mandirs are walked pairwise (List.iter2, so they must have the
 * same length); ignorelist and help_only are sets of command names; dir is
 * the output directory, created if needed. prints the number of written
 * results when done.
 *
 * forked children leave through Unix._exit rather than exit: exit would run
 * at_exit handlers and flush stdio buffers inherited from the parent (for
 * instance a pending "skipping ..." message) once more per child. *)
let cmd_index bindirs mandirs ignorelist help_only dir =
  ensure_dir dir;
  let done_cmds = ref SSet.empty in
  let n_results = ref 0 in
  let index_bindir bindir mandir =
    if not (is_dir bindir) then
      Printf.eprintf "skipping %s (not found)\n" bindir
    else begin
      let bins = Sys.readdir bindir in
      Array.sort String.compare bins;
      let manpaged = if is_dir mandir
        then manpaged_commands mandir else SSet.empty in
      let max_jobs = num_cores () in
      (* ignorelist wins; help_only overrides the "has a manpage" skip *)
      let classified = Array.map (fun name ->
        if SSet.mem name ignorelist then (name, Skip)
        else if SSet.mem name help_only then (name, classify_binary bindir name)
        else if SSet.mem name manpaged then (name, Skip)
        else (name, classify_binary bindir name)
      ) bins in
      (* pending: (pid, rd, buf, name) *)
      let pending = ref [] in
      (* decode what a finished child wrote and store it *)
      let process_result name rd buf =
        drain_fd rd buf;
        (try Unix.close rd with _ -> ());
        let data = Buffer.contents buf in
        if String.length data > 0 then begin
          let result : [`Native of string | `Parsed of (string * help_result) list | `None] =
            try Marshal.from_string data 0 with _ -> `None in
          (match result with
           | `Native src ->
             write_native ~dir name src;
             incr n_results
           | `Parsed pairs ->
             List.iter (fun (cmd_name, r) ->
               if not (SSet.mem cmd_name !done_cmds) then begin
                 write_result ~dir ~source:"help" cmd_name r;
                 done_cmds := SSet.add cmd_name !done_cmds;
                 incr n_results
               end
             ) pairs
           | `None -> ())
        end;
        done_cmds := SSet.add name !done_cmds in
      (* drain every pending pipe and handle the children that have exited *)
      let reap () =
        pending := List.filter (fun (pid, rd, buf, name) ->
          drain_fd rd buf;
          match Unix.waitpid [Unix.WNOHANG] pid with
          | (0, _) -> true
          | _ ->
            process_result name rd buf;
            false
          | exception Unix.Unix_error (Unix.ECHILD, _, _) ->
            (try Unix.close rd with _ -> ()); false
        ) !pending in
      (* block until fewer than max_jobs children are running *)
      let wait_for_slot () =
        while List.length !pending >= max_jobs do
          reap ();
          if List.length !pending >= max_jobs then begin
            let fds = List.map (fun (_, rd, _, _) -> rd) !pending in
            ignore (Unix.select fds [] [] 0.05)
          end
        done in
      (* Phase 1: binaries *)
      Array.iter (fun (name, cls) ->
        match cls with
        | Skip -> ()
        | Try_help | Try_native_and_help ->
          wait_for_slot ();
          let (rd, wr) = Unix.pipe () in
          let pid = Unix.fork () in
          if pid = 0 then begin
            (* child *)
            Unix.close rd;
            List.iter (fun (_, prd, _, _) ->
              try Unix.close prd with _ -> ()) !pending;
            let result =
              try
                let path = Filename.concat bindir name in
                let native = match cls with
                  | Try_native_and_help -> try_native_completion path
                  | Skip | Try_help -> None in
                match native with
                | Some src -> `Native src
                | None ->
                  let pairs = help_resolve_par ~timeout:200 path [] name in
                  if pairs <> [] then `Parsed pairs else `None
              with _ -> `None in
            (try
               let oc = Unix.out_channel_of_descr wr in
               Marshal.to_channel oc
                 (result : [`Native of string | `Parsed of (string * help_result) list | `None]) [];
               close_out oc
             with _ -> ());
            (* leave without running at_exit handlers or flushing buffers
             * that were copied from the parent at fork time *)
            Unix._exit 0
          end else begin
            Unix.close wr;
            pending := (pid, rd, Buffer.create 4096, name) :: !pending
          end
      ) classified;
      while !pending <> [] do
        reap ();
        if !pending <> [] then begin
          let fds = List.map (fun (_, rd, _, _) -> rd) !pending in
          ignore (Unix.select fds [] [] 0.05)
        end
      done;
      (* Phase 2: manpages *)
      if is_dir mandir then
        List.iter (fun section ->
          let subdir = Filename.concat mandir (Printf.sprintf "man%d" section) in
          if is_dir subdir then begin
            let files = Sys.readdir subdir in
            Array.sort String.compare files;
            Array.iter (fun file ->
              let base_cmd = cmd_name_of_manpage file in
              if SSet.mem base_cmd help_only then ()
              else match process_manpage (Filename.concat subdir file) with
                | None -> ()
                | Some (cmd, result, subs) ->
                  if not (SSet.mem cmd !done_cmds) then begin
                    write_result ~dir ~source:"manpage" cmd result;
                    done_cmds := SSet.add cmd !done_cmds;
                    incr n_results
                  end;
                  List.iter (fun (sub_cmd, sub_result) ->
                    if not (SSet.mem sub_cmd !done_cmds) then begin
                      write_result ~dir ~source:"manpage" sub_cmd sub_result;
                      done_cmds := SSet.add sub_cmd !done_cmds;
                      incr n_results
                    end
                  ) subs
            ) files
          end
        ) command_sections
    end in
  List.iter2 index_bindir bindirs mandirs;
  (* write @complete external stubs for elevation commands (sudo, doas, etc.)
   * so nushell routes their completions through the external completer.
   * without this, nushell hardcodes sudo/doas to show command-name completion
   * and never calls the external completer for their own flags.
   * a stub is only written for wrappers that already have a json result. *)
  List.iter (fun cmd ->
    let json_path = Filename.concat dir (filename_of_command cmd ^ ".json") in
    if Sys.file_exists json_path then
      write_native ~dir cmd
        (Printf.sprintf "@complete external\nextern \"%s\" []\n" cmd)
  ) elevation_commands;
  Printf.printf "indexed %d commands into %s\n" !n_results dir

(* "inshellah dump" — print the number of indexed commands, then one line
 * per command with its source type ("?" when the type is unknown). *)
let cmd_dump dirs =
  let cmds = all_commands dirs in
  Printf.printf "%d commands\n" (List.length cmds);
  List.iter (fun cmd ->
    let src = Option.value (file_type_of dirs cmd) ~default:"?" in
    Printf.printf " %-40s [%s]\n" cmd src
  ) cmds

(* search $PATH for an executable with the given name.
* used during completion to find binaries for on-the-fly resolution. *)
let find_in_path name =
  match Sys.getenv_opt "PATH" with
  | None -> None
  | Some path ->
    String.split_on_char ':' path
    |> List.find_map (fun dir ->
         let candidate = Filename.concat dir name in
         if is_executable candidate then Some candidate else None)

(* on-the-fly fallback for "complete" when a command is missing from the
 * index: resolve it with help_resolve_par (200ms per invocation) and store
 * every resulting pair under [dir], creating the directory first.
 * yields the pairs, or none when resolution produced nothing (in which case
 * nothing is written). *)
let resolve_and_cache ~dir name path =
  match help_resolve_par ~timeout:200 path [] name with
  | [] -> None
  | pairs ->
    ensure_dir dir;
    List.iter (fun (cmd_name, r) -> write_result ~dir cmd_name r) pairs;
    Some pairs

(* render one completion candidate as the json object nushell's completer
 * protocol expects; both fields are passed through escape_json first. *)
let completion_json value desc =
  let value = escape_json value in
  let description = escape_json desc in
  Printf.sprintf "{\"value\":\"%s\",\"description\":\"%s\"}" value description

(* fuzzy matching: returns a score > 0 if needle is a subsequence of haystack.
 * higher scores = better match. scoring tiers:
 *   - exact match: 1000
 *   - prefix match: 900 + length bonus (how much of the haystack is covered)
 *   - subsequence: base 10 per char + bonuses for:
 *     - word boundary alignment (50): matching at '-', '_', or camelCase transitions
 *     - consecutive matches (20): matching adjacent characters
 *
 * this drives the completion candidate ranking. users typing "ser" should see
 * "--server" ranked above "--preserve" even though both contain "ser" as a
 * subsequence. the word-boundary bonus achieves this.
*) +let fuzzy_score needle haystack = + let nlen = String.length needle and hlen = String.length haystack in + if nlen = 0 then 1 + else if nlen > hlen then 0 + else if needle = haystack then 1000 + else + let needle = String.lowercase_ascii needle + and haystack_lc = String.lowercase_ascii haystack in + if String.starts_with ~prefix:needle haystack_lc then + 900 + (nlen * 100 / hlen) + else + let is_boundary hi = + hi = 0 || haystack.[hi - 1] = '-' || haystack.[hi - 1] = '_' + || (haystack.[hi - 1] >= 'a' && haystack.[hi - 1] <= 'z' + && haystack.[hi] >= 'A' && haystack.[hi] <= 'Z') in + (* Walk haystack matching needle chars as a subsequence *) + let ni, score, _, _ = + String.fold_left (fun (ni, score, hi, prev_match) c -> + if ni >= nlen then (ni, score, hi + 1, prev_match) + else if c = needle.[ni] then + let bonus = (if is_boundary hi then 50 else 10) + + (if prev_match = hi - 1 then 20 else 0) in + (ni + 1, score + bonus, hi + 1, hi) + else (ni, score, hi + 1, prev_match) + ) (0, 0, 0, -1) haystack_lc in + if ni = nlen then score else 0 + +(* scan past the elevation command's flags and arguments to find the real + * command. is_command checks whether a token names a known command. + * returns Some (real_cmd :: args) or None if no command was found. *) +let find_real_command is_command args = + let rec scan = function + | [] -> None + | "--" :: rest -> Some rest + | arg :: rest when String.length arg > 0 && arg.[0] = '-' -> + scan rest + | arg :: _ as cmd_and_rest when is_command arg -> + Some cmd_and_rest + | _ :: rest -> scan rest + in + scan args + +(* "inshellah complete CMD [ARGS...]" — the nushell custom completer. + * this is the hot path — called every time the user presses tab in nushell. + * + * the completion logic: + * 1. try to find the command (or longest subcommand prefix) in the store + * 2. if not found, try on-the-fly resolution (find in $PATH, run --help, cache) + * 3. 
score all candidate completions against the partial input using fuzzy_score
 *   4. output scored candidates as a json array
 *
 * subcommand resolution: the lookup tries longest prefix first.
 * for "git add --", it first looks for "git add", then "git".
 * this ensures subcommand-specific flags are shown.
 *
 * peculiarity: nushell sends a trailing empty token when the cursor is after
 * a space ("git add "). in this case all_tokens includes the empty string.
 * when the last token is non-empty, the user is still typing it, so we use
 * it as the fuzzy filter. when empty, we show all candidates.
 *
 * peculiarity: if only a parent command matched (e.g. "git" matched but not
 * "git add"), we suppress subcommand suggestions and only show flags. this
 * prevents showing sibling subcommands when the user has already committed
 * to a specific subcommand path.
 *
 * arguments: [spans] is the tokenised command line, [user_dir] the writable
 * cache (searched first, and the target of on-the-fly results), and
 * [system_dirs] further read-only index directories.
 * output: a single line holding a json array on stdout ("[]" when there is
 * nothing to offer). *)
let cmd_complete spans user_dir system_dirs =
  let dirs = user_dir :: system_dirs in
  (* if the command line starts with a privilege-escalation wrapper, scan past
   * it to find the real command. we identify the command by checking the store
   * and $PATH — this avoids needing per-command option tables which are fragile
   * across different implementations. if no real command is found, fall back to
   * completing the elevation command itself. *)
  let spans = match spans with
    | cmd :: rest when List.mem cmd elevation_commands ->
      let is_command name =
        name <> ""
        && (Option.is_some (lookup dirs name)
            || Option.is_some (find_in_path name)) in
      (match find_real_command is_command rest with
       | Some (_ :: _ as real_spans) -> real_spans
       | Some [] | None -> spans)
    | _ -> spans in
  match spans with
  | [] -> print_string "[]\n"
  | cmd_name :: rest ->
    (* longest prefix match: "git add" before "git". yields the matched name,
     * its stored result and how many tokens the match consumed. *)
    let find_result tokens =
      let n = List.length tokens in
      List.init n (fun drop -> n - drop)
      |> List.find_map (fun keep ->
        let try_name =
          tokens
          |> List.filteri (fun i _ -> i < keep)
          |> String.concat " " in
        lookup dirs try_name
        |> Option.map (fun r -> (try_name, r, keep))) in
    (* split off the last argument with a single reversal; with no arguments
     * at all the partial input is the empty string *)
    let before_last_rev, last_token = match List.rev rest with
      | [] -> ([], "")
      | last :: before -> (before, last) in
    (* only treat the last token as a completed subcommand when nushell sends
     * a trailing empty token (cursor is after a space). otherwise the user is
     * still typing it, so it is the fuzzy filter and not part of the lookup. *)
    let lookup_tokens =
      if last_token = "" then cmd_name :: rest
      else cmd_name :: List.rev before_last_rev in
    let partial = last_token in
    let n_lookup = List.length lookup_tokens in
    let found = find_result lookup_tokens in
    let result = match found with
      | Some (_, _, depth) when depth >= n_lookup - 1 ->
        (* exact or near-exact match — use it *)
        found
      | Some _ | None ->
        (* no match, or only a parent matched — resolve on the fly, then
         * repeat the lookup against the freshly written cache. keep the
         * earlier (possibly parent) match when resolution is impossible. *)
        (match find_in_path cmd_name with
         | None -> found
         | Some path ->
           (match resolve_and_cache ~dir:user_dir cmd_name path with
            | Some _ -> find_result lookup_tokens
            | None -> found)) in
    let candidates = match result with
      | None -> []
      | Some (matched_name, r, depth) ->
        (* when the match is shallower than requested, the user already typed
         * a subcommand beyond the matched level — don't show sibling
         * subcommands, only flags *)
        let sub_candidates =
          if depth < n_lookup - 1 then []
          else
            let subs = match r.subcommands with
              | _ :: _ -> r.subcommands
              | [] -> subcommands_of dirs matched_name in
            List.filter_map (fun (sc : subcommand) ->
              let s = fuzzy_score partial sc.name in
              if s > 0 then Some (s, completion_json sc.name sc.desc) else None
            ) subs in
        (* parameter names are appended to descriptions in angle brackets for
         * mandatory params and square brackets for optional ones, matching
         * the conventions users expect from cli help text *)
        let describe (e : entry) =
          let with_param shown =
            if e.desc <> "" then e.desc ^ " " ^ shown else shown in
          match e.param with
          | Some (Mandatory p) -> with_param ("<" ^ p ^ ">")
          | Some (Optional p) -> with_param ("[" ^ p ^ "]")
          | None -> e.desc in
        (* one candidate per flag. for flags that have both a short and a long
         * form we present whichever form scores higher against the partial
         * input (the long form on ties, which includes an empty partial) and
         * mention the other one as "(aka ...)" in the description. the score
         * of the presented form is reused instead of being computed again. *)
        let flag_candidates = List.filter_map (fun (e : entry) ->
          let base_desc = describe e in
          let score, flag, desc = match e.switch with
            | Long l ->
              let flag = "--" ^ l in
              (fuzzy_score partial flag, flag, base_desc)
            | Short c ->
              let flag = Printf.sprintf "-%c" c in
              (fuzzy_score partial flag, flag, base_desc)
            | Both (c, l) ->
              let long_flag = "--" ^ l in
              let short_flag = Printf.sprintf "-%c" c in
              let long_score = fuzzy_score partial long_flag in
              let short_score = fuzzy_score partial short_flag in
              if short_score > long_score then
                (short_score, short_flag,
                 Printf.sprintf "(aka %s) %s" long_flag base_desc)
              else
                (long_score, long_flag,
                 Printf.sprintf "(aka %s) %s" short_flag base_desc) in
          if score > 0 then Some (score, completion_json flag desc) else None
        ) r.entries in
        (* best score first *)
        (sub_candidates @ flag_candidates)
        |> List.sort (fun (a, _) (b, _) -> compare b a)
        |> List.map snd in
    Printf.printf "[%s]\n" (String.concat "," candidates)

(* "inshellah query CMD" — print the raw stored data for a command.
 * writes the data followed by a newline to stdout; when lookup_raw finds
 * nothing, reports "not found" on stderr and exits with status 1. *)
let cmd_query cmd dirs =
  match lookup_raw dirs cmd with
  | Some data ->
    print_string data; print_newline ()
  | None ->
    Printf.eprintf "not found: %s\n" cmd; exit 1

(* load a newline-separated list of command names to ignore.
 * blank lines and lines starting with '#' are skipped.
*) +let load_ignorelist path = + try + In_channel.with_open_text path In_channel.input_all + |> String.split_on_char '\n' + |> List.filter_map (fun line -> + let line = String.trim line in + if String.length line > 0 && line.[0] <> '#' then Some line else None) + |> SSet.of_list + with _ -> SSet.empty + +(* parse "index" subcommand arguments: prefix dirs + optional --dir, --ignore, --help-only *) +let parse_index_args args = + let rec go prefixes dir ignore help_only = function + | [] -> (List.rev prefixes, dir, ignore, help_only) + | "--dir" :: path :: rest -> go prefixes path ignore help_only rest + | "--ignore" :: path :: rest -> go prefixes dir (SSet.union ignore (load_ignorelist path)) help_only rest + | "--help-only" :: path :: rest -> go prefixes dir ignore (SSet.union help_only (load_ignorelist path)) rest + | prefix :: rest -> go (prefix :: prefixes) dir ignore help_only rest in + go [] (default_store_path ()) SSet.empty SSet.empty args + +(* parse common --dir/--system-dir arguments for complete/query/dump commands *) +let parse_dir_args args = + let rec go user_dir system_dirs rest_args = function + | [] -> (user_dir, system_dirs, List.rev rest_args) + | "--dir" :: path :: rest -> go path system_dirs rest_args rest + | "--system-dir" :: path :: rest -> go user_dir (path :: system_dirs) rest_args rest + | arg :: rest -> go user_dir system_dirs (arg :: rest_args) rest in + go (default_store_path ()) [] [] args + +(* --- entry point --- + * dispatch on the first argument to the appropriate subcommand handler. 
*) +let () = + match Array.to_list Sys.argv |> List.tl with + | "index" :: rest -> + let (prefixes, dir, ignorelist, help_only) = parse_index_args rest in + if prefixes = [] then (Printf.eprintf "error: index requires at least one prefix dir\n"; exit 1); + let bindirs = List.map (fun p -> Filename.concat p "bin") prefixes in + let mandirs = List.map (fun p -> Filename.concat p "share/man") prefixes in + cmd_index bindirs mandirs ignorelist help_only dir + | "complete" :: rest -> + let (user_dir, system_dirs, spans) = parse_dir_args rest in + cmd_complete spans user_dir system_dirs + | "query" :: rest -> + let (user_dir, system_dirs, args) = parse_dir_args rest in + (match args with + | [cmd] -> cmd_query cmd (user_dir :: system_dirs) + | _ -> Printf.eprintf "error: query CMD [--dir PATH] [--system-dir PATH]\n"; exit 1) + | "dump" :: rest -> + let (user_dir, system_dirs, _) = parse_dir_args rest in + cmd_dump (user_dir :: system_dirs) + | ["manpage"; file] -> cmd_manpage file + | ["manpage-dir"; dir] -> cmd_manpage_dir dir + | _ -> usage () diff --git a/doc/nixos.md b/doc/nixos.md new file mode 100644 index 0000000..a2cb934 --- /dev/null +++ b/doc/nixos.md @@ -0,0 +1,192 @@ +# nixos integration + +inshellah provides a nixos module that automatically indexes nushell +completions for all installed packages at system build time. + +## enabling + +```nix +# in your flake.nix outputs: +{ + nixosConfigurations.myhost = nixpkgs.lib.nixosSystem { + modules = [ + inshellah.nixosModules.default + { + programs.inshellah.enable = true; + } + ]; + }; +} +``` + +or if importing the module directly: + +```nix +# configuration.nix +{ pkgs, ... 
}: { + imports = [ ./path/to/inshellah/nix/module.nix ]; + programs.inshellah = { + enable = true; + package = pkgs.inshellah; # or your local build + }; +} +``` + +## what happens at build time + +the module hooks into `environment.extraSetup`, which runs during the +system profile build (the `buildEnv` that creates `/run/current-system/sw`). +at that point, all system packages are merged, so `$out/bin` contains every +executable and `$out/share/man` contains every manpage. + +inshellah runs a single command: + +``` +inshellah index "$out" --dir $out/share/inshellah +``` + +this executes a three-phase pipeline: + +### phase 1: native completion detection (parallel) + +for each executable, inshellah scans the elf binary for the string +`completion`. if found, it probes common patterns like +`CMD completions nushell` to see if the program can generate its own +nushell completions. native output is used verbatim — these are always +higher quality than parsed completions. + +programs like `niri`, and any clap/cobra tool with nushell support, +are handled this way. + +### phase 2: manpage parsing (sequential) + +for commands not covered by phase 1, inshellah parses manpages from +man1 (user commands) and man8 (sysadmin commands). it handles: + +- gnu `.TP` style (coreutils, help2man) +- `.IP` style (curl, hand-written) +- `.PP`+`.RS`/`.RE` style (git, docbook) +- nix3 bullet+hyperlink style (`nix run`, `nix build`, etc.) +- mdoc (bsd) format +- deroff fallback for unusual formats + +synopsis sections are parsed to detect subcommands: `git-commit.1` +generates `export extern "git commit"`, not `export extern "git-commit"`. + +### phase 3: --help fallback (parallel) + +remaining executables without manpages get `--help` (or `-h`) called +with a 200ms timeout. elf binaries are pre-scanned for the `-h` string +to skip those that don't support help flags. shell scripts are run +directly (they're fast). execution is parallelized to available cores. 
+ +### output + +each command gets its own file in `/share/inshellah` under the system +profile. native generators produce `.nu` files; parsed results produce +`.json` files. the `complete` command reads both formats. + +nushell built-in commands (ls, cd, cp, mv, etc.) are excluded since +nushell provides its own completions. + +### performance + +on a typical nixos system (~950 executables, ~1600 manpages): +- total time: ~4-10 seconds +- native gzip decompression (camlzip, no process spawning) +- parallel --help with core-scaled forking +- elf string scanning to skip ~15% of binaries + +## module options + +```nix +programs.inshellah = { + enable = true; + + # the inshellah package (set automatically by the flake module) + package = pkgs.inshellah; + + # where to place indexed completion files under the system profile + # default: "/share/inshellah" + completionsPath = "/share/inshellah"; + + # commands to skip entirely during indexing + ignoreCommands = [ "problematic-tool" ]; + + # commands to skip manpage parsing for (uses --help instead) + helpOnlyCommands = [ "nix" ]; +}; +``` + +## using the completer + +the flake module sets a read-only `snippet` option containing the nushell +config needed to wire up the completer. you can access it via +`config.programs.inshellah.snippet` and paste it into your nushell config, +or source it from a file generated by your nixos config. + +the snippet sets up the external completer pointing at the system index +at `/run/current-system/sw/share/inshellah`: + +```nu +let inshellah_complete = {|spans| + inshellah complete ...$spans --system-dir /run/current-system/sw/share/inshellah | from json +} +$env.config.completions.external = { + enable: true + max_results: 100 + completer: $inshellah_complete +} +``` + +## home manager and other user-level package managers + +the nixos module only indexes packages installed at the system level +(those that end up in `/run/current-system/sw`). 
if you use home-manager, +nix-env, or another user-level package manager, those binaries and +manpages live elsewhere — typically under `/etc/profiles/per-user/` +or `~/.nix-profile`. + +to get completions for user-installed packages, run `inshellah index` +against those prefixes separately: + +```sh +# home-manager / per-user profile +inshellah index /etc/profiles/per-user/$USER + +# classic nix-env profile +inshellah index ~/.nix-profile +``` + +this indexes into the default user cache (`$XDG_CACHE_HOME/inshellah`), +which the completer searches automatically. you can re-run this after +installing new packages, or add it to a home-manager activation script. + +if you want to automate this in home-manager: + +```nix +# home.nix +home.activation.inshellah-index = lib.hm.dag.entryAfter [ "writeBoundary" ] '' + ${pkgs.inshellah}/bin/inshellah index /etc/profiles/per-user/$USER 2>/dev/null || true +''; +``` + +the completer will then search both the system index (`--system-dir`) +and the user cache, so completions from both sources are available. + +## troubleshooting + +**completions not appearing**: ensure the completer is configured in +your nushell config (see above). check that the system index exists: +`ls /run/current-system/sw/share/inshellah/`. + +**missing completions for a specific command**: check if it's a nushell +built-in (`help commands | where name == "thecommand"`). built-ins are +excluded because nushell serves its own completions for them. + +**stale completions after update**: completions regenerate on every +`nixos-rebuild`. if a command changed its flags, rebuild to pick up +the changes. + +**build-time errors**: indexing failures are non-fatal (`|| true`). +check `journalctl` for the build log if completions are missing. 
diff --git a/doc/nushell-integration.md b/doc/nushell-integration.md new file mode 100644 index 0000000..06279f1 --- /dev/null +++ b/doc/nushell-integration.md @@ -0,0 +1,184 @@ +# using inshellah completions in nushell + +inshellah indexes completions from three sources (in priority order): +1. **native generators** — programs that can emit nushell completions directly +2. **manpages** — groff/troff/mdoc manpage parsing +3. **`--help` output** — parsing help text as a fallback + +indexed data is stored as `.json` and `.nu` files in a directory that the +`complete` command reads from at tab-completion time. + +## quick start + +index completions from a system prefix: + +```sh +# index from a prefix containing bin/ and share/man/ +inshellah index /usr + +# index from multiple prefixes +inshellah index /usr /usr/local + +# store in a custom directory +inshellah index /usr --dir ~/my-completions +``` + +parse a single manpage: + +```sh +inshellah manpage /usr/share/man/man1/git.1.gz +``` + +batch-process all manpages under a directory (man1 and man8): + +```sh +inshellah manpage-dir /usr/share/man +``` + +## commands + +``` +inshellah index PREFIX... [--dir PATH] [--ignore FILE] [--help-only FILE] + index completions into a directory of json/nu files. + PREFIX is a directory containing bin/ and share/man/. + default dir: $XDG_CACHE_HOME/inshellah + --ignore FILE skip listed commands entirely + --help-only FILE skip manpages for listed commands, use --help instead + +inshellah complete CMD [ARGS...] [--dir PATH] [--system-dir PATH] + nushell custom completer. outputs json completion candidates. + falls back to --help resolution if command is not indexed. + +inshellah query CMD [--dir PATH] [--system-dir PATH] + print stored completion data for CMD. + +inshellah dump [--dir PATH] [--system-dir PATH] + list indexed commands. + +inshellah manpage FILE + parse a manpage and emit nushell extern block. 
+ +inshellah manpage-dir DIR + batch-process manpages under DIR (man1 and man8 sections). +``` + +## the index pipeline + +the `index` command runs a three-phase pipeline over all executables +in each `PREFIX/bin`: + +### phase 1: native completion detection (parallel) + +for each executable, inshellah scans the elf binary for the string +`completion`. if found, it probes common patterns like +`CMD completions nushell` to see if the program can generate its own +nushell completions. native output is used verbatim — these are always +higher quality than parsed completions. + +programs like `niri`, and any clap/cobra tool with nushell support, +are handled this way. + +### phase 2: manpage parsing (sequential) + +for commands not covered by phase 1, inshellah parses manpages from +man1 (user commands) and man8 (sysadmin commands). it handles: + +- gnu `.TP` style (coreutils, help2man) +- `.IP` style (curl, hand-written) +- `.PP`+`.RS`/`.RE` style (git, docbook) +- nix3 bullet+hyperlink style (`nix run`, `nix build`, etc.) +- mdoc (bsd) format +- deroff fallback for unusual formats + +synopsis sections are parsed to detect subcommands: `git-commit.1` +generates `export extern "git commit"`, not `export extern "git-commit"`. + +### phase 3: --help fallback (parallel) + +remaining executables without manpages get `--help` (or `-h`) called +with a 200ms timeout. elf binaries are pre-scanned for the `-h` string +to skip those that don't support help flags. shell scripts are run +directly (they're fast). execution is parallelized to available cores. + +subcommands are recursively resolved — if `--help` output lists +subcommands, inshellah runs `CMD SUBCMD --help` for each. + +### output + +each command gets its own file in the index directory. native generators +produce `.nu` files; parsed results produce `.json` files. the `complete` +command reads both formats. + +nushell built-in commands (ls, cd, cp, mv, etc.) are excluded since +nushell provides its own completions. 
+ +### performance + +on a typical nixos system (~950 executables, ~1600 manpages): +- total time: ~4-10 seconds +- native gzip decompression (camlzip, no process spawning) +- parallel --help with core-scaled forking +- elf string scanning to skip ~15% of binaries + +## the completer + +the `complete` command is designed to be wired into nushell as an +external completer. it reads from the index directory (`--dir`) and +optional system directories (`--system-dir`), performs fuzzy matching, +and outputs json completion candidates. + +if a command is not indexed, `complete` falls back to on-the-fly +`--help` resolution — it runs the command's help, caches the result +in the user directory, and returns completions immediately. + +### setting up the completer + +```nu +# ~/.config/nushell/config.nu +$env.config.completions.external = { + enable: true + completer: {|spans| + inshellah complete ...$spans + | from json + } +} +``` + +with the nixos module, use the provided `snippet` option value (see +[nixos.md](nixos.md)) which points at the system index automatically. + +## nixos module + +enable automatic completion indexing at system build time: + +```nix +{ + imports = [ ./path/to/inshellah/nix/module.nix ]; + programs.inshellah.enable = true; +} +``` + +this runs `inshellah index` during the system profile build. see +[nixos.md](nixos.md) for full details. + +## what gets generated + +the `manpage` and `manpage-dir` commands emit nushell `extern` blocks +with flags, parameter types, and descriptions: + +```nu +export extern "rg" [ + --regexp(-e): string # a pattern to search for + --file(-f): path # search for patterns from the given file + --count(-c) # only show the count of matching lines + --color: string # controls when to use color + --max-depth: int # limit the depth of directory traversal +] +``` + +subcommand manpages (e.g. `git-commit.1`) are detected via synopsis +parsing and generate the correct nushell name (`git commit` not +`git-commit`). 
+ +nushell built-in commands (ls, cd, mv, etc.) are excluded since nushell +provides its own completions for these. diff --git a/doc/runtime-completions.md b/doc/runtime-completions.md new file mode 100644 index 0000000..5be5250 --- /dev/null +++ b/doc/runtime-completions.md @@ -0,0 +1,84 @@ +# runtime completion resolution + +the `complete` command has built-in on-the-fly resolution: when a command +is not found in the index, it falls back to running `--help`, caches the +result, and returns completions immediately. this means commands installed +outside the system profile (via cargo, pip, npm, go, etc.) get completions +on first tab-press with no manual setup. + +## how it works + +when you type `docker compose up --`: + +1. nushell calls `inshellah complete docker compose up --` +2. inshellah looks up the index for the longest matching prefix +3. if found, it fuzzy-matches flags and subcommands against the partial input +4. if not found, it locates the binary in `$PATH`, runs `--help`, + recursively resolves subcommands, caches the results in the user + directory (`$XDG_CACHE_HOME/inshellah`), and returns completions + +all subsequent completions for that command are instant (served from cache). + +## setup + +the completer works with no extra configuration beyond the basic setup: + +```nu +# ~/.config/nushell/config.nu +$env.config.completions.external = { + enable: true + completer: {|spans| + inshellah complete ...$spans + | from json + } +} +``` + +with the nixos module, add `--system-dir` to also search the system index: + +```nu +$env.config.completions.external = { + enable: true + completer: {|spans| + inshellah complete ...$spans --system-dir /run/current-system/sw/share/inshellah + | from json + } +} +``` + +or use the `snippet` option provided by the flake module (see +[nixos.md](nixos.md)). + +## cache management + +the user cache lives at `$XDG_CACHE_HOME/inshellah` (typically +`~/.cache/inshellah`). 
+ +```sh +# list cached commands +inshellah dump + +# view cached data for a command +inshellah query docker + +# clear cache +rm -rf ~/.cache/inshellah/ + +# re-index from a prefix +inshellah index /usr --dir ~/.cache/inshellah +``` + +## when to use this vs build-time indexing + +the nixos module (`programs.inshellah.enable = true`) handles system +packages at build time. runtime resolution covers: + +- commands installed outside the system profile (cargo, pip, npm, go) +- subcommand completions at arbitrary depth +- systems without the nixos module + +for upfront indexing on non-nixos systems: + +```sh +inshellah index /usr /usr/local +``` diff --git a/dune-project b/dune-project new file mode 100644 index 0000000..4d29412 --- /dev/null +++ b/dune-project @@ -0,0 +1,28 @@ +(lang dune 3.20) + +(name inshellah) + +(generate_opam_files true) + +(source + (github username/reponame)) + +(authors "atagen ") + +(maintainers "atagen ") + +(license GPL-3.0-or-later) + +(package + (name inshellah) + (synopsis "Nushell completions generator") + (description + "Inshellah parses manpages and --help switches to generate completions for nushell.") + (depends + ocaml + dune + angstrom + angstrom-unix + camlzip) + (tags + (shell completions nushell parser angstrom))) diff --git a/flake.lock b/flake.lock new file mode 100644 index 0000000..3adb309 --- /dev/null +++ b/flake.lock @@ -0,0 +1,27 @@ +{ + "nodes": { + "nixpkgs": { + "locked": { + "lastModified": 1773385838, + "narHash": "sha256-ylF2AGl08seexxlLvMqj3jd+yZq56W9zicwe51mp0Pw=", + "owner": "nixos", + "repo": "nixpkgs", + "rev": "fef542e7a88eec2b698389e6279464fd479926b6", + "type": "github" + }, + "original": { + "owner": "nixos", + "ref": "nixpkgs-unstable", + "repo": "nixpkgs", + "type": "github" + } + }, + "root": { + "inputs": { + "nixpkgs": "nixpkgs" + } + } + }, + "root": "root", + "version": 7 +} diff --git a/flake.nix b/flake.nix new file mode 100644 index 0000000..f24111d --- /dev/null +++ b/flake.nix @@ -0,0 +1,71 
@@ +{ + inputs.nixpkgs.url = "github:nixos/nixpkgs/nixpkgs-unstable"; + + outputs = + { self, nixpkgs }: + let + forAllSystems = + f: + nixpkgs.lib.genAttrs [ "x86_64-linux" "aarch64-linux" ] ( + system: f (import nixpkgs { inherit system; }) + ); + in + { + devShells = forAllSystems (pkgs: { + default = pkgs.mkShell { + packages = with pkgs.ocamlPackages; [ + dune_3 + ocaml + angstrom + angstrom-unix + camlzip + ppx_inline_test + ocaml-lsp + ocamlformat + ocamlformat-rpc-lib + utop + ]; + }; + }); + + packages = forAllSystems (pkgs: { + default = pkgs.ocamlPackages.buildDunePackage { + pname = "inshellah"; + version = "0.1"; + src = ./.; + nativeBuildInputs = [ pkgs.git ]; + buildInputs = with pkgs.ocamlPackages; [ + dune_3 + ocaml + angstrom + angstrom-unix + camlzip + ]; + + meta.mainProgram = "inshellah"; + }; + }); + + nixosModules.default = + { + pkgs, + lib, + config, + ... + }: + { + imports = [ ./nix/module.nix ]; + programs.inshellah.package = self.packages.${pkgs.stdenv.hostPlatform.system}.default; + programs.inshellah.snippet = '' + let inshellah_complete = {|spans| + ${lib.getExe config.programs.inshellah.package} complete ...$spans --system-dir /run/current-system/sw/${config.programs.inshellah.completionsPath} | from json + } + $env.config.completions.external = { + enable: true + max_results: 100 + completer: $inshellah_complete + } + ''; + }; + }; +} diff --git a/inshellah.opam b/inshellah.opam new file mode 100644 index 0000000..9888aa7 --- /dev/null +++ b/inshellah.opam @@ -0,0 +1,35 @@ +# This file is generated by dune, edit dune-project instead +opam-version: "2.0" +synopsis: "Nushell completions generator" +description: + "Inshellah parses manpages and --help switches to generate completions for nushell." 
+maintainer: ["atagen "] +authors: ["atagen "] +license: "GPL-3.0-or-later" +tags: ["shell" "completions" "nushell" "parser" "angstrom"] +homepage: "https://github.com/username/reponame" +bug-reports: "https://github.com/username/reponame/issues" +depends: [ + "ocaml" + "dune" {>= "3.20"} + "angstrom" + "angstrom-unix" + "camlzip" + "odoc" {with-doc} +] +build: [ + ["dune" "subst"] {dev} + [ + "dune" + "build" + "-p" + name + "-j" + jobs + "@install" + "@runtest" {with-test} + "@doc" {with-doc} + ] +] +dev-repo: "git+https://github.com/username/reponame.git" +x-maintenance-intent: ["(latest)"] diff --git a/lib/.ocamlformat b/lib/.ocamlformat new file mode 100644 index 0000000..e69de29 diff --git a/lib/dune b/lib/dune new file mode 100644 index 0000000..38defe1 --- /dev/null +++ b/lib/dune @@ -0,0 +1,3 @@ +(library + (name inshellah) + (libraries angstrom angstrom-unix camlzip str unix)) diff --git a/lib/manpage.ml b/lib/manpage.ml new file mode 100644 index 0000000..156dff2 --- /dev/null +++ b/lib/manpage.ml @@ -0,0 +1,1088 @@ +(* manpage.ml — parse unix manpages (groff/mdoc format) into help_result. + * + * manpages are written in roff/groff markup — a decades-old typesetting language + * used by man(1). this module strips the formatting and extracts structured data + * (flags, subcommands, positionals) from the raw groff source. + * + * there are two major manpage macro packages: + * - man (groff) — used by gnu/linux tools. uses macros like .SH, .TP, .IP, .PP + * - mdoc (bsd) — used by bsd tools. uses .Sh, .Fl, .Ar, .Op, .It, .Bl/.El + * + * this module handles both, auto-detecting the format by checking for .Sh macros. 
+ * + * for groff manpages, flag extraction uses multiple "strategies" that target + * different common formatting patterns: + * - strategy_tp: .TP tagged paragraphs (gnu coreutils, help2man) + * - strategy_ip: .IP indented paragraphs (curl, hand-written) + * - strategy_pp_rs: .PP + .RS/.RE blocks (git, docbook) + * - strategy_nix: nix3-style bullet .IP with .UR/.UE hyperlinks + * - strategy_deroff: fallback — strip all groff, feed to help text parser + * + * the module tries all applicable strategies and picks the one that extracts + * the most flag entries, on the theory that more results = better match. + * + * key peculiarities: + * - groff has an enormous escape syntax (font changes, named characters, + * size changes, color, string variables, etc.) — strip_groff_escapes + * handles the common cases but is not exhaustive + * - font escapes like \fI (italic) need to insert spaces at word boundaries + * to prevent flag names from fusing with their parameter names + * - the strategies share the angstrom-based switch_parser from parser.ml + * for parsing the actual flag syntax out of the stripped text + *) + +open Parser + +(* --- groff escape/formatting stripper --- + * groff escapes start with backslash and use various continuation syntaxes. + * this function strips them, replacing named characters (like \(aq for + * apostrophe) with their text equivalents and discarding formatting directives. *) + +let strip_groff_escapes s = + let buf = Buffer.create (String.length s) in + let len = String.length s in + let i = ref 0 in + let last = ref '\000' in + let put c = Buffer.add_char buf c; last := c in + let is_alnum c = + (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9') + in + while !i < len do + if s.[!i] = '\\' && !i + 1 < len then begin + let next = s.[!i + 1] in + match next with + | 'f' -> + (* Font escape: \fB, \fI, \fP, \fR, \f(XX, \f[...] 
*) + if !i + 2 < len then begin + let fc = s.[!i + 2] in + (* Insert space before italic font to preserve word boundaries + e.g. \fB--max-results\fR\fIcount\fR → "--max-results count" *) + if fc = 'I' && is_alnum !last then put ' '; + if fc = '(' then + i := !i + 5 (* \f(XX *) + else if fc = '[' then begin + i := !i + 3; + while !i < len && s.[!i] <> ']' do incr i done; + if !i < len then incr i + end else + i := !i + 3 (* \fX *) + end else + i := !i + 2 + | '-' -> + put '-'; + i := !i + 2 + | '&' | '/' | ',' -> + (* Zero-width characters *) + i := !i + 2 + | '(' -> + (* Two-char named character: \(aq, \(lq, \(rq, etc. *) + if !i + 3 < len then begin + let name = String.sub s (!i + 2) 2 in + (match name with + | "aq" -> put '\'' + | "lq" | "Lq" -> put '"' + | "rq" | "Rq" -> put '"' + | "em" | "en" -> put '-' + | _ -> ()); + i := !i + 4 + end else + i := !i + 2 + | '[' -> + (* Named character: \[...] *) + i := !i + 2; + let start = !i in + while !i < len && s.[!i] <> ']' do incr i done; + if !i < len then begin + let name = String.sub s start (!i - start) in + (match name with + | "aq" -> put '\'' + | "lq" | "Lq" -> put '"' + | "rq" | "Rq" -> put '"' + | _ -> ()); + incr i + end + | 's' -> + (* Size escape: \sN, \s+N, \s-N, \s'N' *) + i := !i + 2; + if !i < len && (s.[!i] = '+' || s.[!i] = '-') then incr i; + if !i < len && s.[!i] >= '0' && s.[!i] <= '9' then incr i; + if !i < len && s.[!i] >= '0' && s.[!i] <= '9' then incr i + | 'm' -> + (* Color escape: \m[...] *) + i := !i + 2; + if !i < len && s.[!i] = '[' then begin + incr i; + while !i < len && s.[!i] <> ']' do incr i done; + if !i < len then incr i + end + | 'X' -> + (* Device control: \X'...' *) + i := !i + 2; + if !i < len && s.[!i] = '\'' then begin + incr i; + while !i < len && s.[!i] <> '\'' do incr i done; + if !i < len then incr i + end + | '*' -> + (* String variable: \*X or \*(XX or \*[...] 
*) + i := !i + 2; + if !i < len then begin + if s.[!i] = '(' then + i := !i + 2 + else if s.[!i] = '[' then begin + incr i; + while !i < len && s.[!i] <> ']' do incr i done; + if !i < len then incr i + end else + incr i + end + | 'n' -> + (* Number register: \nX or \n(XX or \n[...] *) + i := !i + 2; + if !i < len then begin + if s.[!i] = '(' then + i := !i + 2 + else if s.[!i] = '[' then begin + incr i; + while !i < len && s.[!i] <> ']' do incr i done; + if !i < len then incr i + end else + incr i + end + | 'e' -> + put '\\'; + i := !i + 2 + | '\\' -> + put '\\'; + i := !i + 2 + | ' ' -> + put ' '; + i := !i + 2 + | _ -> + (* Unknown escape, skip *) + i := !i + 2 + end else begin + put s.[!i]; + incr i + end + done; + Buffer.contents buf + +(* strip inline macro formatting: .BI, .BR, .IR, etc. + * these macros alternate between fonts for their arguments, e.g.: + * .BI "--output " "FILE" + * becomes "--outputFILE" (arguments concatenated without spaces). + * + * peculiarity: quoted strings are kept together (quotes stripped), but + * unquoted spaces are consumed. this matches groff's actual rendering + * of these macros, where alternating-font arguments are concatenated. *) +let strip_inline_macro_args s = + let buf = Buffer.create (String.length s) in + let len = String.length s in + let i = ref 0 in + while !i < len do + if s.[!i] = '"' then begin + incr i; + while !i < len && s.[!i] <> '"' do + Buffer.add_char buf s.[!i]; + incr i + done; + if !i < len then incr i + end else if s.[!i] = ' ' || s.[!i] = '\t' then begin + incr i + end else begin + Buffer.add_char buf s.[!i]; + incr i + end + done; + Buffer.contents buf + +let strip_groff line = + let s = strip_groff_escapes line in + String.trim s + +(* --- line classification --- + * every line in a manpage is classified as one of four types. + * this classification drives all subsequent parsing — strategies + * pattern-match on sequences of classified lines. 
*)

type groff_line =
  | Macro of string * string  (* macro name + args, e.g. ("SH", "OPTIONS") or ("TP", "") *)
  | Text of string            (* plain text after groff stripping *)
  | Blank                     (* empty line *)
  | Comment                   (* groff comment: .backslash-quote or backslash-quote *)

(* classify a single line of manpage source.
 *   - empty line                         -> Blank
 *   - dot-backslash-quote / backslash-quote prefix -> Comment
 *   - first char '.' or '\''             -> Macro (name, args)
 *   - anything else                      -> Text (escapes stripped), or Blank
 *                                           if nothing is left after stripping
 * the macro name ends at the FIRST blank, whether that is a space or a tab
 * (whichever comes first). args are trimmed, and one pair of enclosing
 * double quotes is removed. *)
let classify_line line =
  let len = String.length line in
  if len = 0 then Blank
  else if len >= 2 && line.[0] = '.' && line.[1] = '\\'
          && (len < 3 || line.[2] = '"') then
    Comment
  else if len >= 3 && line.[0] = '\\' && line.[1] = '"' then
    Comment
  else if line.[0] = '.' || line.[0] = '\'' then begin
    (* macro line: drop the control char, then split name from arguments *)
    let rest = String.trim (String.sub line 1 (len - 1)) in
    let rlen = String.length rest in
    (* index of the earliest space or tab, if any *)
    let rec first_blank k =
      if k >= rlen then None
      else if rest.[k] = ' ' || rest.[k] = '\t' then Some k
      else first_blank (k + 1)
    in
    match first_blank 0 with
    | None -> Macro (rest, "")
    | Some pos ->
      let name = String.sub rest 0 pos in
      let args = String.trim (String.sub rest (pos + 1) (rlen - pos - 1)) in
      let alen = String.length args in
      let args =
        if alen >= 2 && args.[0] = '"' && args.[alen - 1] = '"' then
          String.sub args 1 (alen - 2)
        else args
      in
      Macro (name, args)
  end else begin
    match strip_groff line with
    | "" -> Blank
    | stripped -> Text stripped
  end

(* comment detection used by the classify_line wrapper below: true for lines
 * starting with dot-backslash-quote (3+ chars) or backslash-quote (2+ chars). *)
let is_comment_line line =
  let len = String.length line in
  (len >= 3 && line.[0] = '.' && line.[1] = '\\' && line.[2] = '"')
  || (len >= 2 && line.[0] = '\\' && line.[1] = '"')

(* public classifier: check is_comment_line first, then fall through to the
 * base classifier defined above (which this definition shadows). *)
let classify_line line =
  if is_comment_line line then Comment
  else classify_line line

(* --- section extraction ---
 * manpages are divided into sections by .SH macros. the options section
 * contains the flag definitions we want. if there's no OPTIONS section,
 * we fall back to DESCRIPTION (some simple tools put flags there).
 *
 * old-style nix manpages (nix-build, nix-env-install, etc.) split flags
 * across multiple .SH sections with option-like names: e.g. "Options" for
 * command-specific flags and "Common Options" for flags shared by all nix
 * commands. collecting only the first such section misses the majority of
 * flags, so we collect and concatenate ALL option-like sections.
 *
 * takes raw manpage lines, returns the classified lines of the selected
 * section(s); empty if neither an option-like nor a DESCRIPTION section
 * exists. *)
let extract_options_section lines =
  let classified = List.map classify_line lines in
  (* collect lines until the next .SH header, returning (content, rest)
   * where rest starts at the .SH line (or is empty at end of file). *)
  let rec collect_section lines acc =
    match lines with
    | [] -> (List.rev acc, [])
    | Macro ("SH", _) :: _ -> (List.rev acc, lines)
    | line :: rest -> collect_section rest (line :: acc)
  in
  (* any heading containing "OPTION" (case-insensitive) counts *)
  let is_options_section name =
    let s = String.uppercase_ascii (String.trim name) in
    match Str.search_forward (Str.regexp_string "OPTION") s 0 with
    | _ -> true
    | exception Not_found -> false
  in
  (* concatenate every option-like section. a synthetic Macro ("SH", "")
   * separator is inserted between sections so that description collectors
   * which stop on SH/SS do not let the last entry of one section bleed into
   * the intro text of the next. *)
  let rec find_all_options lines acc =
    match lines with
    | [] -> acc
    | Macro ("SH", args) :: rest when is_options_section args ->
      let (section, remaining) = collect_section rest [] in
      let sep = if acc = [] then [] else [Macro ("SH", "")] in
      find_all_options remaining (acc @ sep @ section)
    | _ :: rest -> find_all_options rest acc
  in
  (* fallback: body of the first DESCRIPTION section *)
  let rec find_description = function
    | [] -> []
    | Macro ("SH", args) :: rest
      when String.uppercase_ascii (String.trim args) = "DESCRIPTION" ->
      fst (collect_section rest [])
    | _ :: rest -> find_description rest
  in
  match find_all_options classified [] with
  | [] -> find_description classified
  | sections -> sections

(* --- strategy-based entry extraction ---
 * rather than a single monolithic parser, we use multiple "strategies" that
 * each target a specific groff formatting pattern. this is necessary because
 * manpage authors use very different macro combinations for the same purpose.
 *
 * the shared building blocks:
 *   - collect_text_lines: gather consecutive Text lines into one description string
 *   - parse_tag_to_entry: run the angstrom switch parser on a tag string to
 *     extract the flag definition. this reuses the same parser that handles
 *     --help output, giving consistent extraction across both sources.
 *   - tag_of_macro: extract the "tag" text from formatting macros like .B, .BI, etc.
 *)

(* collect the leading run of Text lines, joined with single spaces.
 * [acc] holds already-collected pieces in reverse order; returns the joined
 * text and the remaining lines starting at the first non-Text line. *)
let rec collect_text_lines lines acc =
  match lines with
  | Text s :: rest -> collect_text_lines rest (s :: acc)
  | _ -> (String.concat " " (List.rev acc), lines)

(* attempt to parse a tag string (e.g. "-v, --verbose FILE") into an entry.
 * uses the angstrom switch_parser + param_parser from parser.ml.
 * returns none if the tag doesn't look like a flag definition.
*)
let parse_tag_to_entry tag desc =
  let cleaned = String.trim (strip_groff_escapes tag) in
  let flag_with_param =
    Angstrom.lift2 (fun sw p -> (sw, p)) switch_parser param_parser
  in
  (* Prefix consumption: trailing text after the flag syntax is ignored *)
  match
    Angstrom.parse_string ~consume:Angstrom.Consume.Prefix flag_with_param cleaned
  with
  | Ok (switch, param) -> Some { switch; param; desc }
  | Error _ -> None

(* tag text carried by a formatting macro line.
 * .B and .I are single-font, so their argument keeps its spacing; every
 * other macro is treated as alternating-font and has its arguments
 * concatenated first. escapes are stripped and the result trimmed. *)
let tag_of_macro name args =
  let joined =
    match name with
    | "B" | "I" -> args
    | _ -> strip_inline_macro_args args
  in
  String.trim (strip_groff_escapes joined)

(* strategy a: .TP style (most common — gnu coreutils, help2man).
 * .TP opens a tagged paragraph: the line right after it is the tag (the flag
 * syntax), either as plain text or inside .B/.I/.BI/.BR/.IR, and the Text
 * lines that follow form the description.
 *
 * example groff:
 *   .TP
 *   \fB\-v\fR, \fB\-\-verbose\fR
 *   increase verbosity
 *
 * a .TP followed by anything else is skipped; tags that do not parse as a
 * flag are dropped. entries are returned in document order. *)
let strategy_tp lines =
  (* parse [tag] with the description found at the head of [after];
   * returns the lines left over and the updated accumulator *)
  let tagged tag after acc =
    let (desc, remaining) = collect_text_lines after [] in
    let acc =
      match parse_tag_to_entry tag desc with
      | Some e -> e :: acc
      | None -> acc
    in
    (remaining, acc)
  in
  let rec walk acc = function
    | [] -> List.rev acc
    | Macro ("TP", _) :: Text tag :: after ->
      let (remaining, acc) = tagged tag after acc in
      walk acc remaining
    | Macro ("TP", _) :: Macro (("B" | "I" | "BI" | "BR" | "IR") as m, args) :: after ->
      let (remaining, acc) = tagged (tag_of_macro m args) after acc in
      walk acc remaining
    | _ :: rest -> walk acc rest
  in
  walk [] lines

(* strategy b: .IP style (curl, hand-written manpages).
 * .IP takes an inline tag argument: .IP "-v, --verbose"
 * the description follows as text lines. simpler than .TP because
 * the tag is on the macro line itself.
*)
let strategy_ip lines =
  let rec walk acc = function
    | [] -> List.rev acc
    | Macro ("IP", tag) :: rest ->
      let (desc, rest) = collect_text_lines rest [] in
      let acc =
        match parse_tag_to_entry (strip_groff_escapes tag) desc with
        | Some e -> e :: acc
        | None -> acc
      in
      walk acc rest
    | _ :: rest -> walk acc rest
  in
  walk [] lines

(* strategy c: .PP + .RS/.RE style (git, docbook-generated manpages).
 * an entry is a .PP immediately followed by a Text line holding the flag
 * syntax. the description is any Text directly after the tag plus the Text
 * inside the following .RS ... .RE block. inside the block other macros are
 * ignored, and a .PP or .SH ends the block even without an .RE. *)
let strategy_pp_rs lines =
  let join rev_parts = String.concat " " (List.rev rev_parts) in
  (* inside an .RS block: gather Text until .RE (consumed) or .PP/.SH (kept) *)
  let rec inside_rs parts = function
    | [] -> (join parts, [])
    | Macro ("RE", _) :: rest -> (join parts, rest)
    | Text s :: rest -> inside_rs (s :: parts) rest
    | (Macro ("PP", _) | Macro ("SH", _)) :: _ as stop -> (join parts, stop)
    | _ :: rest -> inside_rs parts rest
  in
  (* after the tag: Text may precede the .RS; anything else ends the entry *)
  let rec before_rs parts = function
    | Macro ("RS", _) :: rest -> inside_rs parts rest
    | Text s :: rest -> before_rs (s :: parts) rest
    | stop -> (join parts, stop)
  in
  let rec walk acc = function
    | [] -> List.rev acc
    | Macro ("PP", _) :: Text tag :: rest ->
      let (desc, rest) = before_rs [] rest in
      let acc =
        match parse_tag_to_entry tag desc with
        | Some e -> e :: acc
        | None -> acc
      in
      walk acc rest
    | _ :: rest -> walk acc rest
  in
  walk [] lines

(* strategy d: deroff fallback — strip all groff markup, then feed the
 * resulting plain text through the --help parser from parser.ml.
+ * this is the last resort when no structured macro pattern is recognized. + * it works surprisingly well for simple manpages but may miss entries + * in heavily formatted ones. *) +let strategy_deroff_lines lines = + let buf = Buffer.create 256 in + List.iter (fun line -> + match line with + | Text s -> + Buffer.add_string buf s; + Buffer.add_char buf '\n' + | Macro (("BI" | "BR" | "IR" | "B" | "I"), args) -> + let text = strip_inline_macro_args args in + let text = strip_groff_escapes text in + Buffer.add_string buf text; + Buffer.add_char buf '\n' + | Blank -> Buffer.add_char buf '\n' + | _ -> () + ) lines; + let text = Buffer.contents buf in + match parse_help text with + | Ok result -> result.entries + | Error _ -> [] + +(* strategy e: nix3-style bullet .IP with .UR/.UE hyperlinks. + * nix's manpages use .IP with bullet markers for flag entries, interleaved + * with .UR/.UE hyperlink macros. the flag tag is in text lines after the + * bullet .IP, and the description follows a non-bullet .IP marker. + * + * peculiarity: nix manpages nest .RS/.RE blocks inside descriptions for + * sub-examples. the skip_rs helper tracks nesting depth to skip these + * without losing the rest of the description. 
*)
let strategy_nix lines =
  (* an .IP with a non-blank argument is a bullet, i.e. starts an entry *)
  let is_bullet_ip args = String.trim args <> "" in
  let join rev_parts = String.concat " " (List.rev rev_parts) in
  (* tag = the Text lines after the bullet, ignoring .UR/.UE *)
  let rec collect_tag parts = function
    | Macro (("UR" | "UE"), _) :: rest -> collect_tag parts rest
    | Text s :: rest -> collect_tag (s :: parts) rest
    | stop -> (join parts, stop)
  in
  (* description body: Text is kept, the next bullet .IP or a section
   * header ends it, .RS blocks are skipped, all other lines are ignored *)
  let rec desc_text parts = function
    | [] -> (join parts, [])
    | Text s :: rest -> desc_text (s :: parts) rest
    | Macro ("IP", args) :: _ as stop when is_bullet_ip args -> (join parts, stop)
    | Macro (("SS" | "SH"), _) :: _ as stop -> (join parts, stop)
    | Macro ("RS", _) :: rest -> skip_rs parts 1 rest
    | (Macro _ | Blank | Comment) :: rest -> desc_text parts rest
  (* skip a (possibly nested) .RS block; [depth] counts open blocks *)
  and skip_rs parts depth = function
    | [] -> (join parts, [])
    | Macro ("RE", _) :: rest ->
      if depth <= 1 then desc_text parts rest
      else skip_rs parts (depth - 1) rest
    | Macro ("RS", _) :: rest -> skip_rs parts (depth + 1) rest
    | _ :: rest -> skip_rs parts depth rest
  in
  (* the description only exists if a non-bullet .IP follows the tag *)
  let collect_desc = function
    | Macro ("IP", dargs) :: rest when not (is_bullet_ip dargs) -> desc_text [] rest
    | stop -> ("", stop)
  in
  let rec walk acc = function
    | [] -> List.rev acc
    | Macro ("IP", args) :: rest when is_bullet_ip args ->
      let (tag, rest) = collect_tag [] rest in
      let (desc, rest) = collect_desc rest in
      let acc =
        match parse_tag_to_entry tag desc with
        | Some e -> e :: acc
        | None -> acc
      in
      walk acc rest
    | _ :: rest -> walk acc rest
  in
  walk [] lines

(* number of macro lines named [name] in [lines].
 * used by extract_entries to decide which strategies are worth trying. *)
let count_macro name lines =
  lines
  |> List.filter (function Macro (m, _) -> m = name | _ -> false)
  |> List.length

(* run every applicable strategy and return the largest result.
 * applicability is decided by macro presence: TP -> strategy_tp,
 * IP -> strategy_ip, PP and RS -> strategy_pp_rs, UR and IP -> strategy_nix.
 * strategies that yield nothing are discarded; if none is left, the deroff
 * fallback is used instead. on equal entry counts the strategy listed
 * later wins.
 *
 * peculiarity: this "try everything, pick the best" approach is intentional.
 * manpage formatting is too varied to reliably detect the format from macro
 * counts alone, so results are compared instead. *)
let extract_entries lines =
  let has name = count_macro name lines > 0 in
  let attempt applicable strategy = if applicable then strategy lines else [] in
  let specialized =
    List.filter (fun entries -> entries <> []) [
      attempt (has "TP") strategy_tp;
      attempt (has "IP") strategy_ip;
      attempt (has "PP" && has "RS") strategy_pp_rs;
      attempt (has "UR" && has "IP") strategy_nix;
    ]
  in
  let candidates =
    match specialized with
    | [] -> [strategy_deroff_lines lines]
    | found -> found
  in
  List.fold_left (fun best entries ->
    if List.length entries >= List.length best then entries else best
  ) [] candidates

(* --- NAME section description extraction ---
 * the NAME section in manpages follows the convention:
 *   "command \- short description"
 * we extract the part after "\-" as the command's description.
+ * handles both "\-" (groff) and " - " (plain text) separators. *) + +let extract_name_description contents = + let lines = String.split_on_char '\n' contents in + let classified = List.map classify_line lines in + let rec find = function + | [] -> None + | Macro ("SH", args) :: rest + when String.uppercase_ascii (String.trim args) = "NAME" -> + collect rest [] + | _ :: rest -> find rest + and collect lines acc = + match lines with + | Macro ("SH", _) :: _ | [] -> finish acc + | Text s :: rest -> collect rest (s :: acc) + | Macro (("B" | "BI" | "BR" | "I" | "IR"), args) :: rest -> + let s = strip_inline_macro_args args |> strip_groff_escapes |> String.trim in + collect rest (if String.length s > 0 then s :: acc else acc) + | Macro ("Nm", args) :: rest -> + let s = strip_groff_escapes args |> String.trim in + collect rest (if String.length s > 0 then s :: acc else acc) + | Macro ("Nd", args) :: rest -> + let s = strip_groff_escapes args |> String.trim in + collect rest (if String.length s > 0 then ("\\- " ^ s) :: acc else acc) + | _ :: rest -> collect rest acc + and finish acc = + let full = String.concat " " (List.rev acc) |> String.trim in + (* NAME lines look like: "git-add \- Add file contents to the index" *) + let sep = Str.regexp {| *\\- *\| +- +|} in + match Str.bounded_split sep full 2 with + | [_; desc] -> Some (String.trim desc) + | _ -> None + in + find classified + +(* --- SYNOPSIS command name extraction --- + * the SYNOPSIS section shows how to invoke the command: + * .SH SYNOPSIS + * .B git add + * [\fIOPTIONS\fR] [\fB\-\-\fR] [\fI\fR...] + * + * we extract the command name by taking consecutive "word" tokens until + * we hit something that looks like an argument (starts with [, <, -, etc.). 
*)

let extract_synopsis_command_lines lines =
  let is_cmd_char = function
    | 'a'..'z' | 'A'..'Z' | '0'..'9' | '-' | '_' | '.' -> true
    | _ -> false
  in
  (* words are non-empty here, so indexing the first char is safe *)
  let starts_argument w =
    match w.[0] with
    | '[' | '-' | '<' | '(' | '{' -> true
    | _ -> false
  in
  (* longest prefix of words that look like command-name components *)
  let rec leading_words = function
    | w :: rest when not (starts_argument w) && String.for_all is_cmd_char w ->
      w :: leading_words rest
    | _ -> []
  in
  let extract_cmd line =
    let words =
      String.trim line
      |> String.split_on_char ' '
      |> List.filter (fun w -> String.length w > 0)
    in
    match leading_words words with
    | [] -> None
    | cmd -> Some (String.concat " " cmd)
  in
  let command_of s = if String.length s > 0 then extract_cmd s else None in
  (* only the first Text or .B/.BI/.BR line of the section is considered *)
  let rec in_synopsis = function
    | [] | Macro ("SH", _) :: _ -> None
    | Text s :: _ -> command_of (String.trim s)
    | Macro (("B" | "BI" | "BR"), args) :: _ ->
      command_of (String.trim (strip_groff_escapes (strip_inline_macro_args args)))
    | _ :: rest -> in_synopsis rest
  in
  let rec find = function
    | [] -> None
    | Macro ("SH", args) :: rest
      when String.uppercase_ascii (String.trim args) = "SYNOPSIS" ->
      in_synopsis rest
    | _ :: rest -> find rest
  in
  find (List.map classify_line lines)

(* same as extract_synopsis_command_lines, starting from the raw manpage
 * source as a single string *)
let extract_synopsis_command contents =
  contents |> String.split_on_char '\n' |> extract_synopsis_command_lines

(* --- SYNOPSIS positional extraction ---
 * extract positional arguments from the SYNOPSIS section by collecting
 * all text/formatting macro lines, joining them, skipping the command
 * name prefix, then running parse_usage_args from parser.ml on the remainder.
*)

let extract_synopsis_positionals_lines lines =
  let clean s = String.trim (strip_groff_escapes s) in
  let keep s acc = if String.length s > 0 then s :: acc else acc in
  (* join the gathered pieces, drop the command name, parse what is left *)
  let positionals_of rev_parts =
    let full = String.trim (String.concat " " (List.rev rev_parts)) in
    if String.length full = 0 then []
    else begin
      let cmd_end = skip_command_prefix full in
      parse_usage_args (String.sub full cmd_end (String.length full - cmd_end))
    end
  in
  (* the synopsis ends at the next .SH, .SS or .br, or at end of input;
   * only the first invocation form is therefore taken into account *)
  let rec gather acc = function
    | [] | Macro (("SH" | "SS" | "br"), _) :: _ -> positionals_of acc
    | Text s :: rest -> gather (keep (clean s) acc) rest
    | Macro (("B" | "BI" | "BR" | "I" | "IR" | "IB" | "RB" | "RI"), args) :: rest ->
      gather (keep (clean (strip_inline_macro_args args)) acc) rest
    | _ :: rest -> gather acc rest
  in
  let rec find = function
    | [] -> []
    | Macro ("SH", args) :: rest
      when String.uppercase_ascii (String.trim args) = "SYNOPSIS" ->
      gather [] rest
    | _ :: rest -> find rest
  in
  find (List.map classify_line lines)

(* --- mdoc (bsd) format support ---
 * mdoc is the bsd manpage macro package. it uses semantic macros rather than
 * presentation macros:
 *   .Fl v → flag: -v
 *   .Ar file → argument: file
 *   .Op ... → optional: [...]
 *   .Bl/.It/.El → list begin/item/end
 *   .Sh → section header (note lowercase 'h', vs groff's .SH)
 *
 * the parser walks through classified lines looking for .Bl (list begin)
 * blocks containing .It (items) with .Fl (flag) entries.
*)

(* a page is treated as mdoc if any raw line is an .Sh macro *)
let is_mdoc lines =
  List.exists (fun l ->
    match classify_line l with Macro ("Sh", _) -> true | _ -> false
  ) lines

(* text contribution of one classified line to an mdoc description.
 * Text lines contribute their escape-stripped content; structural macros
 * (paragraph, list, section, document-header and option-enclosure macros)
 * contribute nothing; any other macro contributes its non-empty trimmed
 * arguments. *)
let mdoc_text_of line =
  match line with
  | Text s -> Some (strip_groff_escapes s)
  | Macro (m, args) ->
    (match m with
     | "Pp" | "Bl" | "El" | "Sh" | "Ss" | "Os" | "Dd" | "Dt"
     | "Oo" | "Oc" | "Op" -> None
     | _ ->
       let s = strip_groff_escapes args |> String.trim in
       if s = "" then None else Some s)
  | _ -> None

(* parse the arguments of an mdoc .It (list item) line into a flag entry.
 * e.g. the arguments "Fl v Ar file" describe flag -v taking "file".
 * "Ns" words are ignored. only the first flag of the item is read:
 *   - "Fl c" with a single alphanumeric char   -> Short c
 *   - "Fl -name" (leading dash, 2+ chars)      -> Long name
 * (.Fl itself supplies one dash, hence "Fl -verbose" means --verbose.)
 * a parameter is recognised only as "Ar p" directly after the flag word.
 * the returned entry has an empty desc; None if the item is not a flag. *)
let parse_mdoc_it args =
  let words = String.split_on_char ' ' args
              |> List.filter (fun w -> w <> "" && w <> "Ns") in
  let param = match words with
    | _ :: _ :: "Ar" :: p :: _ -> Some (Mandatory p)
    | _ -> None
  in
  match words with
  | "Fl" :: c :: _ when String.length c = 1 && is_alphanumeric c.[0] ->
    Some { switch = Short c.[0]; param; desc = "" }
  | "Fl" :: name :: _ when String.length name > 1 && name.[0] = '-' ->
    Some { switch = Long (String.sub name 1 (String.length name - 1)); param; desc = "" }
  | _ -> None

(* build a positional from the operand words of an .Ar macro.
 * [args] must start with the operand name itself (not with a macro word).
 * names shorter than 2 chars are rejected; the name is lowercased and the
 * positional is variadic if a "..." word is present. *)
let positional_of_mdoc_line optional args =
  let words = String.split_on_char ' ' args
              |> List.filter (fun w -> w <> "") in
  match words with
  | name :: _ when String.length name >= 2 ->
    Some { pos_name = String.lowercase_ascii name;
           optional; variadic = List.mem "..." words }
  | _ -> None

(* parse an entire mdoc-format manpage.
 * walks through all classified lines looking for:
 *   1. .Bl/.It/.El list blocks containing flag definitions
 *   2. .Sh SYNOPSIS sections containing positional arguments (.Ar, .Op Ar)
 *
 * list handling: if the first .It of a .Bl starts with .Fl, the list is
 * parsed as options up to the next .El; otherwise it is skipped up to the
 * next .El (it might be an example or description list). nesting depth is
 * not tracked — the first .El encountered ends the skip or parse.
 *
 * positionals are de-duplicated by name, keeping the first occurrence.
 * subcommands and description are left empty. *)
let parse_mdoc_lines lines =
  let classified = List.map classify_line lines in
  let words_of args =
    String.split_on_char ' ' args |> List.filter (fun w -> w <> "")
  in
  let rec skip_to_el = function
    | [] -> []
    | Macro ("El", _) :: rest -> rest
    | _ :: rest -> skip_to_el rest
  in
  (* description = everything up to the next item, list end or section *)
  let rec collect_desc acc = function
    | [] -> (acc, [])
    | (Macro ("It", _) | Macro ("El", _)
      | Macro ("Sh", _) | Macro ("Ss", _)) :: _ as rest -> (acc, rest)
    | line :: rest ->
      collect_desc (match mdoc_text_of line with Some s -> s :: acc | None -> acc) rest
  in
  let desc_of rest =
    let parts, rest = collect_desc [] rest in
    (String.concat " " (List.rev parts) |> String.trim, rest)
  in
  (* the description is consumed even when the item is not a flag *)
  let parse_it args rest entries =
    let desc, rest = desc_of rest in
    let entries = match parse_mdoc_it args with
      | Some e -> { e with desc } :: entries
      | None -> entries
    in
    (entries, rest)
  in
  let rec parse_option_list entries = function
    | [] -> (entries, [])
    | Macro ("El", _) :: rest -> (entries, rest)
    | Macro ("It", args) :: rest ->
      let entries, rest = parse_it args rest entries in
      parse_option_list entries rest
    | _ :: rest -> parse_option_list entries rest
  in
  let rec scan entries positionals = function
    | [] -> (entries, positionals)
    | Macro ("Bl", _) :: Macro ("It", it_args) :: rest ->
      if (match words_of it_args with "Fl" :: _ -> true | _ -> false) then
        let entries, rest = parse_it it_args rest entries in
        let entries, rest = parse_option_list entries rest in
        scan entries positionals rest
      else
        scan entries positionals (skip_to_el rest)
    | Macro ("Bl", _) :: rest -> scan entries positionals (skip_to_el rest)
    | Macro ("Sh", args) :: rest
      when String.uppercase_ascii (String.trim args) = "SYNOPSIS" ->
      let positionals, rest = parse_synopsis positionals rest in
      scan entries positionals rest
    | _ :: rest -> scan entries positionals rest
  and parse_synopsis positionals = function
    | [] -> (positionals, [])
    | Macro ("Sh", _) :: _ as rest -> (positionals, rest)
    | Macro ("Ar", args) :: rest ->
      let positionals = match positional_of_mdoc_line false args with
        | Some p -> p :: positionals | None -> positionals in
      parse_synopsis positionals rest
    | Macro ("Op", args) :: rest ->
      let positionals = match words_of args with
        | "Ar" :: operands ->
          (* hand over only the operand words: passing the leading "Ar"
           * along would make the macro name itself the positional name *)
          (match positional_of_mdoc_line true (String.concat " " operands) with
           | Some p -> p :: positionals | None -> positionals)
        | _ -> positionals in
      parse_synopsis positionals rest
    | _ :: rest -> parse_synopsis positionals rest
  in
  let entries, positionals = scan [] [] classified in
  (* both accumulators are in reverse document order at this point *)
  let positionals =
    List.rev positionals
    |> List.fold_left (fun (seen, acc) p ->
        if List.mem p.pos_name seen then (seen, acc)
        else (p.pos_name :: seen, p :: acc)
      ) ([], [])
    |> snd |> List.rev
  in
  { entries = List.rev entries; subcommands = []; positionals; description = "" }

(* --- COMMANDS section subcommand extraction ---
 * some manpages (notably systemctl) have a dedicated COMMANDS section
 * listing subcommands with descriptions. these use .PP + bold name +
 * .RS/.RE blocks:
 *   .PP
 *   \fBstart\fR \fIUNIT\fR...
 *   .RS 4
 *   Start (activate) one or more units.
 *   .RE
 *
 * we extract the bold command name and first sentence of description.
*)

(* return the classified lines of every COMMANDS / COMMAND section,
 * concatenated in document order. a section runs from just after its
 * .SH header up to (not including) the next .SH or the end of input. *)
let extract_commands_section lines =
  let is_commands_header title =
    match String.uppercase_ascii (String.trim title) with
    | "COMMANDS" | "COMMAND" -> true
    | _ -> false
  in
  let rec take_body collected = function
    | [] | Macro ("SH", _) :: _ -> List.rev collected
    | item :: tl -> take_body (item :: collected) tl
  in
  let rec gather found = function
    | [] -> List.rev found
    | Macro ("SH", title) :: tl when is_commands_header title ->
      (* scanning resumes right after the header, so the terminating .SH
       * is examined again as a possible new COMMANDS header *)
      gather (take_body [] tl :: found) tl
    | _ :: tl -> gather found tl
  in
  lines |> List.map classify_line |> gather [] |> List.concat

(* extract subcommand name from a bold groff text like
 * "\fBlist\-units\fR [\fIPATTERN\fR...]" → "list-units"
 *
 * validates that the extracted name looks like a subcommand: lowercase,
 * at least 2 chars, no leading dash. falls back to stripping all groff
 * and taking the first word if no \fB...\fR wrapper is found. 
*)
let extract_bold_command_name text =
  (* a plausible subcommand: 2+ chars, no leading dash, and made only of
   * lowercase letters, digits, '-' and '_' *)
  let looks_like_subcommand name =
    String.length name >= 2
    && name.[0] <> '-'
    && String.for_all (function
         | 'a' .. 'z' | '0' .. '9' | '-' | '_' -> true
         | _ -> false) name
  in
  let s = String.trim text in
  let len = String.length s in
  let candidate =
    if len >= 4 && s.[0] = '\\' && s.[1] = 'f' && s.[2] = 'B' then begin
      (* fix: the bold run ends at the next font escape (backslash + f),
       * not at the first backslash. cutting at any backslash truncated
       * hyphenated names written with the groff minus escape, so
       * "list\-units" came out as "list".
       * NOTE(review): relies on strip_groff_escapes turning that escape
       * into a plain '-' — confirm against its definition. *)
      let rec font_escape_at i =
        if i + 1 >= len then len
        else if s.[i] = '\\' && s.[i + 1] = 'f' then i
        else font_escape_at (i + 1)
      in
      let stop = font_escape_at 3 in
      strip_groff_escapes ("\\fB" ^ String.sub s 3 (stop - 3) ^ "\\fR")
      |> String.trim
    end else
      (* no bold wrapper: strip everything and take the first word *)
      match String.split_on_char ' ' (strip_groff_escapes s) with
      | word :: _ -> word
      | [] -> ""
  in
  if looks_like_subcommand candidate then Some candidate else None

(* walk the lines of a COMMANDS section and build the subcommand list.
 * a subcommand is a .PP immediately followed by a text line whose leading
 * word passes extract_bold_command_name. its description is the text that
 * follows, normally wrapped in an .RS/.RE block; only the first sentence
 * (up to the first '.') is kept. *)
let extract_subcommands_from_commands lines =
  let join parts = String.concat " " (List.rev parts) in
  (* inside .RS: collect text until .RE (consumed) or until the next
   * .PP/.SH/.SS (left in place); other macros are skipped *)
  let rec inside_rs parts = function
    | [] -> (join parts, [])
    | Macro ("RE", _) :: tl -> (join parts, tl)
    | Text s :: tl -> inside_rs (s :: parts) tl
    | Macro (("PP" | "SH" | "SS"), _) :: _ as remaining ->
      (join parts, remaining)
    | _ :: tl -> inside_rs parts tl
  in
  (* before .RS: text lines are collected; anything else ends the
   * description *)
  let rec before_rs parts = function
    | Macro ("RS", _) :: tl -> inside_rs parts tl
    | Text s :: tl -> before_rs (s :: parts) tl
    | remaining -> (join parts, remaining)
  in
  let first_sentence desc =
    match String.split_on_char '.' desc with
    | first :: _ when String.length first > 0 -> String.trim first
    | _ -> desc
  in
  let rec walk acc = function
    | [] -> List.rev acc
    | Macro ("PP", _) :: Text tag :: tl ->
      (match extract_bold_command_name tag with
       | Some name ->
         let raw, remaining = before_rs [] tl in
         (* fix: description text lines are raw groff, so strip font and
          * character escapes before cutting the first sentence *)
         let desc =
           first_sentence (String.trim (strip_groff_escapes raw))
         in
         let sc : subcommand = { name; desc } in
         walk (sc :: acc) remaining
       | None -> walk acc tl)
    | _ :: tl -> walk acc tl
  in
  walk [] lines

(* --- top-level api --- *)

(* parse a manpage from its classified lines.
 * auto-detects mdoc vs groff format. for groff, runs the multi-strategy
 * extraction pipeline: extract OPTIONS section → try all strategies →
 * pick best → extract SYNOPSIS positionals → extract COMMANDS subcommands. *)
let parse_manpage_lines lines =
  if is_mdoc lines then
    parse_mdoc_lines lines
  else begin
    let options_section = extract_options_section lines in
    let entries = extract_entries options_section in
    let positionals = extract_synopsis_positionals_lines lines in
    let commands_section = extract_commands_section lines in
    let subcommands = extract_subcommands_from_commands commands_section in
    { entries; subcommands; positionals; description = "" }
  end

(* parse a manpage from its raw string contents.
 * splits into lines, parses, then extracts the NAME section description
 * (empty when none is found). *)
let parse_manpage_string contents =
  let lines = String.split_on_char '\n' contents in
  let result = parse_manpage_lines lines in
  let description =
    Option.value (extract_name_description contents) ~default:""
  in
  { result with description }

(* --- clap-style SUBCOMMAND section extraction ---
 * manpages generated by clap (rust's cli arg parser) put each subcommand
 * under its own .SH SUBCOMMAND header with a Usage: line giving the name. 
* this is unusual — most tools list subcommands under a single COMMANDS section.
 *
 * we collect all .SH SUBCOMMAND/SUBCOMMANDS sections, find the Usage: line
 * in each to get the subcommand name, then extract flag entries from the
 * section body. returns triples of (name, description, help_result). *)
let extract_subcommand_sections contents =
  let classified =
    String.split_on_char '\n' contents |> List.map classify_line
  in
  (* push the section being accumulated (if any) onto the result list *)
  let flush current_name current_lines acc =
    match current_name with
    | Some n -> (n, List.rev current_lines) :: acc
    | None -> acc
  in
  (* split into sections at .SH boundaries, keeping only SUBCOMMAND(S) *)
  let rec collect_sections acc current_name current_lines = function
    | [] -> List.rev (flush current_name current_lines acc)
    | Macro ("SH", args) :: rest ->
      let acc = flush current_name current_lines acc in
      let name = String.uppercase_ascii (String.trim args) in
      let next =
        if name = "SUBCOMMAND" || name = "SUBCOMMANDS" then Some name
        else None
      in
      collect_sections acc next [] rest
    | line :: rest ->
      collect_sections acc current_name (line :: current_lines) rest
  in
  let sections = collect_sections [] None [] classified in
  (* both patterns are compiled once, not once per section *)
  let usage_re = Str.regexp {|Usage: \([a-zA-Z0-9_-]+\)|} in
  let backtick_re = Str.regexp "`\\([^`]*\\)`" in
  let matches_usage s =
    match Str.search_forward usage_re s 0 with
    | _ -> Some (Str.matched_group 1 s)
    | exception Not_found -> None
  in
  List.filter_map (fun (_header, section_lines) ->
    (* scan up to the first Usage: line; text lines seen before it form
     * the description. once a name is found the fold stops collecting. *)
    let name, desc_lines =
      List.fold_left (fun (name, desc_lines) line ->
        match name, line with
        | Some _, _ -> (name, desc_lines)
        | None, Text s ->
          (match matches_usage s with
           | Some _ as found -> (found, desc_lines)
           | None -> (None, s :: desc_lines))
        | None, Macro (("TP" | "B" | "BI" | "BR"), args) ->
          let s =
            strip_inline_macro_args args |> strip_groff_escapes |> String.trim
          in
          (matches_usage s, desc_lines)
        | None, _ -> (None, desc_lines)
      ) (None, []) section_lines
    in
    match name with
    | None -> None
    | Some subcmd_name ->
      let entries = extract_entries section_lines in
      let desc =
        String.concat " " (List.rev desc_lines)
        |> strip_groff_escapes |> String.trim
      in
      (* drop markdown-style backticks around inline code *)
      let desc = Str.global_replace backtick_re "\\1" desc in
      Some (subcmd_name, desc,
            { entries; subcommands = []; positionals = []; description = desc })
  ) sections

(* read a manpage file from disk. handles .gz compressed files (the common
 * case — most installed manpages are gzipped) using the Gzip library.
 * plain files are read directly, in binary mode so the length reported by
 * in_channel_length matches the number of bytes actually read.
 *
 * the channel is closed on every exit path, including when reading raises
 * (e.g. a corrupt archive). open/read errors still propagate to the caller. *)
let read_manpage_file path =
  if Filename.check_suffix path ".gz" then begin
    let ic = Gzip.open_in path in
    Fun.protect ~finally:(fun () -> Gzip.close_in ic) (fun () ->
      let buf = Buffer.create 8192 in
      let chunk = Bytes.create 8192 in
      (* Gzip.input returns 0 once the stream is exhausted *)
      let rec pump () =
        let n = Gzip.input ic chunk 0 (Bytes.length chunk) in
        if n > 0 then begin
          Buffer.add_subbytes buf chunk 0 n;
          pump ()
        end
      in
      (* as before, End_of_file is treated as a normal end of data *)
      (try pump () with End_of_file -> ());
      Buffer.contents buf)
  end else begin
    let ic = open_in_bin path in
    Fun.protect ~finally:(fun () -> close_in_noerr ic) (fun () ->
      really_input_string ic (in_channel_length ic))
  end

(* convenience: read + parse a manpage file in one step *)
let parse_manpage_file path =
  read_manpage_file path |> parse_manpage_string
diff --git a/lib/nushell.ml b/lib/nushell.ml new file mode 100644 index 0000000..d1eb782 --- /dev/null +++ b/lib/nushell.ml @@ -0,0 +1,242 @@
(* nushell.ml — generate nushell extern definitions from parsed help data.
 *
 * this module is the code generation backend. it takes a help_result (from
 * the parser or manpage modules) and produces nushell source code that
 * defines "extern" declarations — nushell's mechanism for teaching the shell
 * about external commands' flags and subcommands so it can offer completions.
 *
 * it also maintains a list of nushell's built-in commands to avoid generating
 * extern definitions that would shadow them. 
*
 * key responsibilities:
 *   - deduplicating flag entries (same flag from multiple help sources)
 *   - mapping parameter names to nushell types (path, int, string)
 *   - formatting flags in nushell syntax: --flag(-f): type # description
 *   - handling positional arguments with nushell's ordering constraints
 *   - escaping special characters for nushell string literals
 *)

open Parser

module SSet = Set.Make(String)
module SMap = Map.Make(String)
module CSet = Set.Make(Char)

(* names of nushell's own commands and keywords. an extern generated for
 * any of these would shadow the built-in, so callers consult this list
 * (through is_nushell_builtin) before emitting one. maintained by hand;
 * revisit when nushell adds or removes commands. *)
let nushell_builtins = [
  "alias"; "all"; "ansi"; "any"; "append"; "ast"; "attr";
  "bits"; "break"; "bytes";
  "cal"; "cd"; "char"; "chunk-by"; "chunks"; "clear"; "collect";
  "columns"; "commandline"; "compact"; "complete"; "config"; "const";
  "continue"; "cp";
  "date"; "debug"; "decode"; "def"; "default"; "describe"; "detect";
  "do"; "drop"; "du";
  "each"; "echo"; "encode"; "enumerate"; "error"; "every"; "exec";
  "exit"; "explain"; "explore"; "export"; "export-env"; "extern";
  "fill"; "filter"; "find"; "first"; "flatten"; "for"; "format"; "from";
  "generate"; "get"; "glob"; "grid"; "group-by";
  "hash"; "headers"; "help"; "hide"; "hide-env"; "histogram";
  "history"; "http";
  "if"; "ignore"; "input"; "insert"; "inspect"; "interleave"; "into";
  "is-admin"; "is-empty"; "is-not-empty"; "is-terminal"; "items";
  "job"; "join";
  "keybindings"; "kill";
  "last"; "length"; "let"; "let-env"; "lines"; "load-env"; "loop"; "ls";
  "match"; "math"; "merge"; "metadata"; "mkdir"; "mktemp"; "module";
  "move"; "mut"; "mv";
  "nu-check"; "nu-highlight";
  "open"; "overlay";
  "panic"; "par-each"; "parse"; "path"; "plugin"; "port"; "prepend"; "print"; "ps";
  "query";
  "random"; "reduce"; "reject"; "rename"; "return"; "reverse"; "rm";
  "roll"; "rotate"; "run-external";
  "save"; "schema"; "scope"; "select"; "seq"; "shuffle"; "skip"; "sleep";
  "slice"; "sort"; "sort-by"; "source"; "source-env"; "split"; "start";
  "stor"; "str"; "sys";
  "table"; "take"; "tee"; "term"; "timeit"; "to"; "touch"; "transpose";
  "try"; "tutor";
  "ulimit"; "umask"; "uname"; "uniq"; "uniq-by"; "unlet"; "update";
  "upsert"; "url"; "use";
  "values"; "version"; "view";
  "watch"; "where"; "which"; "while"; "whoami"; "window"; "with-env"; "wrap";
  "zip";
]

(* the same names as a set, built on first use *)
let builtin_set = lazy (SSet.of_list nushell_builtins)

(* true when [cmd] is exactly one of the names in nushell_builtins *)
let is_nushell_builtin cmd =
  let known = Lazy.force builtin_set in
  SSet.mem cmd known

(* deduplicate flag entries that refer to the same flag.
 * when the same flag appears multiple times (e.g. from overlapping manpage
 * sections or repeated help text), we keep the "best" version using a score:
 *   - both short+long form: +10 (most informative)
 *   - has a parameter: +5
 *   - description length bonus: up to +5
 *
 * peculiarity: after deduplication by long name, we also remove standalone
 * short flags whose letter is already covered by a Both(short, long) entry.
 * this prevents emitting both "-v" and "--verbose(-v)" which nushell would
 * reject as a duplicate. the filtering preserves original ordering from the
 * help text. 
*)
let dedup_entries entries =
  (* identity of a flag: its long name when it has one, else its letter *)
  let key_of entry =
    match entry.switch with
    | Short c -> Printf.sprintf "-%c" c
    | Long l | Both (_, l) -> Printf.sprintf "--%s" l
  in
  let score entry =
    let sw = match entry.switch with Both _ -> 10 | _ -> 0 in
    let p = match entry.param with Some _ -> 5 | None -> 0 in
    let d = min 5 (String.length entry.desc / 10) in
    sw + p + d
  in
  (* best-scoring entry per key; on a tie the earliest one is kept *)
  let best = List.fold_left (fun acc e ->
    let k = key_of e in
    match SMap.find_opt k acc with
    | Some prev when score prev >= score e -> acc
    | _ -> SMap.add k e acc
  ) SMap.empty entries in
  (* short letters already provided by a winning short+long entry *)
  let covered = SMap.fold (fun _ e acc ->
    match e.switch with
    | Both (c, _) -> CSet.add c acc
    | _ -> acc
  ) best CSet.empty in
  (* emit winners at the position where their key first appears *)
  List.fold_left (fun (seen, acc) e ->
    let k = key_of e in
    if SSet.mem k seen then (seen, acc)
    else match e.switch with
      | Short c when CSet.mem c covered -> (seen, acc)
      | _ -> (SSet.add k seen, SMap.find k best :: acc)
  ) (SSet.empty, []) entries |> snd |> List.rev

(* map parameter names to nushell types.
 * nushell's extern declarations use typed parameters, so we infer the type
 * from the parameter name. file/path-related names become "path" (enables
 * path completion), numeric names become "int", everything else is "string".
 *
 * matching ignores ASCII case, so "<file>", "FILE" and "File" all map to
 * "path". previously only a few lowercase spellings were listed, which left
 * e.g. "filename", "directory" and "count" typed as plain strings. *)
let nushell_type_of_param name =
  match String.uppercase_ascii name with
  | "FILE" | "PATH" | "DIR" | "DIRECTORY" | "FILENAME" | "PATTERNFILE" ->
    "path"
  | "NUM" | "N" | "COUNT" | "NUMBER" | "INT" | "COLS" | "WIDTH"
  | "LINES" | "DEPTH" ->
    "int"
  | _ -> "string"

(* escape a string for use inside nushell double-quoted string literals.
 * only double quotes and backslashes need escaping in nushell's syntax. 
*)
let escape_nu s =
  let needs_escape c = c = '"' || c = '\\' in
  (* common case: nothing to escape, hand back the input untouched *)
  if not (String.exists needs_escape s) then s
  else begin
    let out = Buffer.create (String.length s + 4) in
    String.iter (fun c ->
      if needs_escape c then Buffer.add_char out '\\';
      Buffer.add_char out c
    ) s;
    Buffer.contents out
  end

(* format a single flag entry as a nushell extern parameter line.
 * the line is the flag name (combined short+long is spelled "--long(-s)"),
 * then ": type" when the flag takes a value, then — if there is a
 * description — padding out to column 40 (at least one space) and a
 * "# " comment holding the description. *)
let format_flag entry =
  let spelled =
    match entry.switch with
    | Short s -> Printf.sprintf "-%c" s
    | Long l -> Printf.sprintf "--%s" l
    | Both (s, l) -> Printf.sprintf "--%s(-%c)" l s
  in
  let annotation =
    match entry.param with
    | None -> ""
    | Some (Mandatory p | Optional p) -> ": " ^ nushell_type_of_param p
  in
  let head = " " ^ spelled ^ annotation in
  match entry.desc with
  | "" -> head
  | text ->
    let gap = max 1 (40 - String.length head) in
    String.concat "" [ head; String.make gap ' '; "# "; text ]

(* format a positional argument as a nushell extern parameter line.
 * nushell syntax: "...name: type" for variadic, "name?: type" for optional
 * (a variadic positional never gets the "?"). hyphens in the name become
 * underscores; the type is looked up from the upper-cased original name. *)
let format_positional p =
  let ident = String.map (fun c -> if c = '-' then '_' else c) p.pos_name in
  let rest_marker = if p.variadic then "..." else "" in
  let opt_marker = if p.variadic || not p.optional then "" else "?" in
  let typ = nushell_type_of_param (String.uppercase_ascii p.pos_name) in
  Printf.sprintf " %s%s%s: %s" rest_marker ident opt_marker typ

(* enforce nushell's positional argument ordering rules:
 *   1. no required positional may follow an optional one
 *   2. 
at most one variadic ("rest") parameter is allowed
 *
 * if a required positional appears after an optional one, it's silently
 * promoted to optional. duplicate variadic params are dropped. *)
let fixup_positionals positionals =
  (* saw_opt: an optional or variadic positional has been emitted;
   * saw_rest: a variadic positional has been emitted *)
  let rec go ~saw_opt ~saw_rest kept = function
    | [] -> List.rev kept
    | p :: tl when p.variadic ->
      if saw_rest then go ~saw_opt ~saw_rest kept tl
      else go ~saw_opt:true ~saw_rest:true (p :: kept) tl
    | p :: tl when saw_opt ->
      go ~saw_opt:true ~saw_rest ({ p with optional = true } :: kept) tl
    | p :: tl -> go ~saw_opt:p.optional ~saw_rest (p :: kept) tl
  in
  go ~saw_opt:false ~saw_rest:false [] positionals

(* generate the full nushell extern block for a command.
 * produces output like:
 *   export extern "git add" [
 *     ...pathspec?: path
 *     --verbose(-v)    # be verbose
 *     --dry-run(-n)    # dry run
 *   ]
 *
 * positionals (after fixup_positionals) come first, then the deduplicated
 * flags. every subcommand listed in the result additionally gets a stub
 * extern whose only content is a comment with its description:
 *   export extern "git stash" [ # stash changes
 *   ]
 *)
let extern_of cmd_name result =
  let cmd = escape_nu cmd_name in
  (* one formatted item per line, concatenated *)
  let render fmt items =
    String.concat "" (List.map (fun item -> fmt item ^ "\n") items)
  in
  let positional_block =
    render format_positional (fixup_positionals result.positionals)
  in
  let flag_block = render format_flag (dedup_entries result.entries) in
  let header =
    Printf.sprintf "export extern \"%s\" [\n%s%s]\n"
      cmd positional_block flag_block
  in
  let stub (sc : subcommand) =
    Printf.sprintf "\nexport extern \"%s %s\" [ # %s\n]\n"
      cmd (escape_nu sc.name) (escape_nu sc.desc)
  in
  String.concat "" (header :: List.map stub result.subcommands)

(* public alias for extern_of *)
let generate_extern = extern_of

(* derive a nushell module name from a command name.
 * every character other than an ASCII letter, digit, '-' or '_' is replaced
 * with a hyphen, and "-completions" is appended.
 * e.g. "git" → "git-completions", "docker-compose" → "docker-compose-completions" *)
let module_name_of cmd_name =
  let keep_or_dash c =
    match c with
    | 'a' .. 'z' | 'A' .. 'Z' | '0' .. '9' | '-' | '_' -> c
    | _ -> '-'
  in
  String.map keep_or_dash cmd_name ^ "-completions"

(* generate a complete nushell module wrapping the extern.
 * output: "module git-completions { ... }\n\nuse git-completions *\n"
 * the "use" at the end makes the extern immediately available. *)
let generate_module cmd_name result =
  let modname = module_name_of cmd_name in
  let body = extern_of cmd_name result in
  Printf.sprintf "module %s {\n%s}\n\nuse %s *\n" modname body modname

(* convenience wrapper: generate an extern from just a list of entries
 * (no subcommands, positionals, or description). used when we only have
 * flag data and nothing else. *)
let generate_extern_from_entries cmd_name entries =
  let result =
    { entries; subcommands = []; positionals = []; description = "" }
  in
  generate_extern cmd_name result
diff --git a/lib/parser.ml b/lib/parser.ml new file mode 100644 index 0000000..704a332 --- /dev/null +++ b/lib/parser.ml @@ -0,0 +1,802 @@
(* parser.ml — parse --help output into structured flag/subcommand/positional data.
 *
 * this module is the core of inshellah's help-text understanding. it takes the
 * raw text that a cli tool prints when you run `cmd --help` and extracts:
 *   - flag entries (short/long switches with optional parameters and descriptions)
 *   - subcommand listings (name + description pairs)
 *   - positional arguments (from usage lines)
 *
 * the parser is built on angstrom (a monadic parser combinator library) for the
 * structured flag/subcommand extraction, with hand-rolled imperative parsers for
 * usage-line positional extraction (where the format is too varied for clean
 * combinator composition).
 *
 * key design decisions:
 *   - the angstrom parser runs in prefix-consume mode — it doesn't need to parse
 *     the entire input, just extract what it can recognize. 
unrecognized lines are
 *     skipped via skip_non_option_line.
 *   - multi-line descriptions are handled via indentation-based continuation:
 *     lines indented 8+ spaces that don't start with '-' are folded into the
 *     previous entry's description.
 *   - subcommand detection uses a heuristic: lines with a name followed by 2+
 *     spaces then a description, where the name is at least 2 chars. section
 *     headers (like "arguments:") toggle whether name-description pairs are
 *     treated as subcommands or positionals.
 *   - positional extraction has two paths: usage-line parsing (the common case)
 *     and cli11's explicit "positionals:" section format.
 *)

open Angstrom

(* strip ansi escape sequences and osc hyperlinks from --help output.
 * many modern cli tools emit colored/styled output even when piped,
 * so we need to clean this before parsing. handles:
 *   - csi sequences (esc [ ... final_byte) — colors, cursor movement, etc.
 *   - osc sequences (esc ] ... bel/st) — hyperlinks, window titles, etc.
 *   - other two-byte esc+char sequences
 *   - a lone esc as the very last byte (e.g. truncated output), which is
 *     dropped rather than copied through to the result
 * an unterminated csi/osc sequence swallows the rest of the input. *)
let strip_ansi s =
  let len = String.length s in
  let out = Buffer.create len in
  (* index just past the csi sequence whose body starts at [i]:
   * the sequence ends with the first byte in '@'..'~' *)
  let rec after_csi i =
    if i >= len then i
    else if s.[i] >= '@' && s.[i] <= '~' then i + 1
    else after_csi (i + 1)
  in
  (* index just past the osc sequence whose body starts at [i]:
   * terminated by BEL or by ESC \ *)
  let rec after_osc i =
    if i >= len then i
    else if s.[i] = '\x07' then i + 1
    else if s.[i] = '\x1b' && i + 1 < len && s.[i + 1] = '\\' then i + 2
    else after_osc (i + 1)
  in
  let rec copy i =
    if i < len then
      if s.[i] <> '\x1b' then begin
        Buffer.add_char out s.[i];
        copy (i + 1)
      end
      else if i + 1 >= len then ()  (* dangling ESC at end of input *)
      else
        match s.[i + 1] with
        | '[' -> copy (after_csi (i + 2))
        | ']' -> copy (after_osc (i + 2))
        | _ -> copy (i + 2)  (* other ESC sequence: skip ESC + one char *)
  in
  copy 0;
  Buffer.contents out

(* --- character class predicates --- *)
(* these are used throughout the angstrom parsers to classify characters.
 * they're separated out for readability and reuse. *)

let is_whitespace = function ' ' | '\t' -> true | _ -> false

let is_alphanumeric = function
  | 'A' .. 'Z' | 'a' .. 'z' | '0' .. '9' -> true
  | _ -> false

(* characters allowed inside parameter names like FILE, output-dir, etc. *)
let is_param_char = function
  | 'A' .. 'Z' | 'a' .. 'z' | '0' .. '9' | '_' | '-' -> true
  | _ -> false

(* used to detect all-caps parameter names like FILE, TIME_STYLE *)
let is_upper_or_underscore = function
  | 'A' .. 'Z' | '_' -> true
  | _ -> false

(* characters allowed in long flag names (--foo-bar, --enable-feature2) *)
let is_long_char = function
  | 'A' .. 'Z' | 'a' .. 'z' | '0' .. '9' | '-' -> true
  | _ -> false

(* --- core types ---
 * these types represent the structured output of parsing a help text.
 * they are shared across the entire codebase (nushell codegen, store, manpage parser).
 *
 * switch: a flag can be short-only (-v), long-only (--verbose), or both (-v, --verbose).
 * the both variant keeps the pair together so nushell can emit "--verbose(-v)".
 *
 * param: flags can take mandatory (--output FILE) or optional (--color[=WHEN]) values.
 *
 * entry: one complete flag definition — its switch form, optional parameter, and
 * the description text (potentially multi-line, already joined).
 *
 * help_result: the complete parsed output for a single command. 
*)
type switch =
  | Short of char
  | Long of string
  | Both of char * string

type param =
  | Mandatory of string
  | Optional of string

type entry = {
  switch : switch;
  param : param option;
  desc : string;
}

type subcommand = {
  name : string;
  desc : string;
}

type positional = {
  pos_name : string;
  optional : bool;
  variadic : bool;
}

type help_result = {
  entries : entry list;
  subcommands : subcommand list;
  positionals : positional list;
  description : string;
}

(* --- low-level angstrom combinators --- *)
(* small parsers that everything below is assembled from. *)

(* skip a (possibly empty) run of spaces and tabs; never crosses a newline *)
let inline_ws = skip_while is_whitespace

(* permissive end of line: a newline, or failing that the end of the input *)
let eol = end_of_line <|> end_of_input

(* strict end of line: only an actual newline will do. used where matching
 * at end of input would be wrong (see skip_non_option_line). *)
let eol_strict = end_of_line

(* --- switch and parameter parsers --- *)
(* the flag-name part of an option line, e.g. "-v" or "--verbose" *)

(* "-x": yields the single alphanumeric letter *)
let short_switch = char '-' *> satisfy is_alphanumeric

(* "--name": yields the name without the dashes *)
let long_switch = string "--" *> take_while1 is_long_char

(* a comma plus any inline whitespace after it *)
let comma = char ',' *> inline_ws

(* parameter parsers — these handle the various syntaxes tools use to indicate
 * that a flag takes a value. 
the formats are surprisingly diverse:
 *   --output=FILE    (eq_man_param — mandatory, common in gnu tools)
 *   --color[=WHEN]   (eq_opt_param — optional with = syntax)
 *   --depth DEPTH    (space_upper_param — space-separated ALL_CAPS)
 *   --file <FILE>    (space_angle_param — angle brackets)
 *   --file [<FILE>]  (space_opt_angle_param — optional angle brackets)
 *   --format string  (space_type_param — go/cobra lowercase type word)
 *)

(* "[=NAME]" — optional value attached with '=' *)
let eq_opt_param =
  lift (fun value -> Optional value)
    (string "[=" *> take_while1 is_param_char <* char ']')

(* "=NAME" — mandatory value attached with '=' *)
let eq_man_param =
  lift (fun value -> Mandatory value)
    (char '=' *> take_while1 is_param_char)

(* space-separated ALL_CAPS param: e.g. " FILE", " TIME_STYLE".
 * after the space we peek at the next char: unless it is an uppercase letter
 * or underscore we give up immediately. the whole word is then read and
 * must consist only of uppercase letters, underscores and digits (so
 * "SHA256" passes while "Do" or "Set" — ordinary description words — fail). *)
let space_upper_param =
  let is_caps_or_digit c =
    is_upper_or_underscore c || (c >= '0' && c <= '9')
  in
  char ' ' *> peek_char_fail >>= fun first ->
  if not (is_upper_or_underscore first) then fail "not an uppercase param"
  else
    take_while1 is_param_char >>= fun word ->
    if String.for_all is_caps_or_digit word then return (Mandatory word)
    else fail "not an all-caps param"

(* angle-bracket param: e.g. "<file>", "<name>" — yields the text between
 * the brackets as a mandatory value *)
let angle_param =
  lift (fun name -> Mandatory name)
    (char '<' *> take_while1 (fun c -> c <> '>') <* char '>')

(* space + angle bracket param *)
let space_angle_param =
  char ' ' *> angle_param

(* optional angle bracket param: "[<file>]" *)
let opt_angle_param =
  lift (fun name -> Optional name)
    (char '[' *> char '<' *> take_while1 (fun c -> c <> '>')
     <* char '>' <* char ']')

(* space + optional angle bracket param *)
let space_opt_angle_param =
  char ' ' *> opt_angle_param

(* go/cobra style: space + lowercase type word like "string", "list", "int".
 * peculiarity: capped at 10 chars to avoid consuming description words. 
* go's flag libraries commonly emit "--timeout duration" or "--name string"
 * where the type name is a short lowercase word. longer words are almost
 * certainly the start of a description, not a type annotation. *)
let space_type_param =
  let is_lower c = c >= 'a' && c <= 'z' in
  char ' ' *> peek_char_fail >>= fun first ->
  if not (is_lower first) then fail "not a lowercase type param"
  else
    (* read the whole run of lowercase letters, then apply the length cap *)
    take_while1 is_lower >>= fun word ->
    if String.length word > 10 then fail "too long for type param"
    else return (Mandatory word)

(* try each parameter format in order of specificity. the ordering matters:
 * eq_opt_param must come before eq_man_param because "[=WHEN]" would otherwise
 * partially match as "=WHEN" then fail on the trailing "]". similarly,
 * space_opt_angle_param before space_angle_param to catch "[<x>]" before "<x>".
 * yields None (consuming nothing) when no format matches. *)
let param_parser =
  let any_param =
    choice
      [ eq_opt_param; eq_man_param;
        space_opt_angle_param; space_angle_param;
        space_upper_param; space_type_param ]
  in
  option None (lift Option.some any_param)

(* switch parser — handles the various ways help text presents flag names.
 * formats handled (in order of attempt):
 *   -a, --all    (short + comma + long — gnu style)
 *   -a --all     (short + space + long — some tools omit the comma)
 *   --all / -a   (long + slash + short — rare but seen in some tools)
 *   -a           (short only)
 *   --all        (long only)
 *
 * peculiarity: the ordering is critical because angstrom's choice commits to
 * the first parser that makes progress. short_switch consumes "-a", so the
 * combined parsers must be tried before the short-only parser. 
*)
let switch_parser =
  let both short long = Both (short, long) in
  let slash = inline_ws *> char '/' *> inline_ws in
  choice
    [
      (* -a, --all *)
      lift2 both short_switch (comma *> long_switch);
      (* -a --all *)
      lift2 both short_switch (char ' ' *> long_switch);
      (* --all / -a *)
      lift2 (fun long short -> both short long)
        long_switch (slash *> short_switch);
      (* -a *)
      lift (fun short -> Short short) short_switch;
      (* --all *)
      lift (fun long -> Long long) long_switch;
    ]

(* --- description parsing with multi-line continuation ---
 * descriptions in help text often wrap across multiple lines. the convention
 * is that continuation lines are deeply indented (8+ spaces) and don't start
 * with '-' (which would indicate a new flag entry). we peek ahead to check
 * indentation without consuming, then decide whether to fold the line in. *)

(* everything up to, but not including, the next '\n' or '\r'; may be empty *)
let rest_of_line =
  take_till (function '\n' | '\r' -> true | _ -> false)

(* check if a line is a continuation line: deeply indented, doesn't start with '-'.
 * peculiarity: we count tabs as 8 spaces to match typical terminal rendering.
 * the 8-space threshold was chosen empirically — most help formatters indent
 * descriptions at least this much, while flag lines are indented 2-4 spaces. 
*)
let continuation_line =
  (* width of the leading run of spaces/tabs (a tab counts as 8) *)
  let indent_width s =
    let rec go i width =
      if i >= String.length s then width
      else
        match s.[i] with
        | ' ' -> go (i + 1) (width + 1)
        | '\t' -> go (i + 1) (width + 8)
        | _ -> width
    in
    go 0 0
  in
  (* peek_string 1 fails when no input is left, before we ask how much
   * is buffered *)
  peek_string 1 *> available >>= fun avail ->
  if avail = 0 then fail "eof"
  else
    (* look at up to 80 buffered chars without consuming them.
     * NOTE(review): the preview is not cut at the newline, and String.trim
     * also removes newlines, so for a whitespace-only line the '-' test is
     * made against the following line — confirm this is intended. *)
    peek_string (min avail 80) >>= fun preview ->
    let body = String.trim preview in
    let flag_like = String.length body > 0 && body.[0] = '-' in
    if indent_width preview >= 8 && not flag_like then
      (* a continuation line: consume indentation, text and line end *)
      inline_ws *> rest_of_line <* eol
    else
      fail "not a continuation line"

(* parse description text: first line (after switch+param) plus any continuation lines.
 * blank lines are dropped; the rest are trimmed and joined with single
 * spaces into one string. *)
let description =
  let squash lines =
    lines
    |> List.map String.trim
    |> List.filter (fun line -> line <> "")
    |> String.concat " "
  in
  lift2 (fun first rest -> squash (first :: rest))
    (inline_ws *> rest_of_line <* eol)
    (many continuation_line)

(* description that appears on a separate line below the flag.
 * this handles the clap (rust) "long" help format where flags and descriptions
 * are on separate lines:
 *   --verbose
 *       increase verbosity
 * here there's no inline description — just deeply-indented continuation lines. 
*)
+let description_below =
+  (* many1: at least one continuation line is required, otherwise this fails.
+   * blank lines are dropped, the rest are trimmed and joined with spaces. *)
+  many1 continuation_line >>| fun lines ->
+  let lines = List.filter (fun s -> String.length (String.trim s) > 0) lines in
+  String.concat " " (List.map String.trim lines)
+
+(* --- line classification for skipping ---
+ * the parser needs to skip lines it doesn't understand (section headers,
+ * blank lines, description paragraphs not attached to a flag, etc.)
+ * without consuming lines that ARE flag entries. *)
+
+(* peek ahead to check if the current line looks like a flag entry.
+ * an option line starts with whitespace then '-'.
+ * succeeds with () and consumes nothing; fails at end of input.
+ * note: the preview is the next 40 bytes (or fewer), not just the current
+ * line, and String.trim strips newlines as well as blanks — so a blank line
+ * directly followed by a flag line also passes this check. *)
+let at_option_line =
+  (* peek_string 1 fails when no input is left *)
+  peek_string 1 >>= fun _ ->
+  available >>= fun avail ->
+  if avail = 0 then fail "eof"
+  else
+    peek_string (min avail 40) >>= fun preview ->
+    let s = String.trim preview in
+    if String.length s > 0 && s.[0] = '-' then return ()
+    else fail "not an option line"
+
+(* skip a non-option line (section header, blank, description-only, etc.).
+ * peculiarity: uses eol_strict (not eol) so it won't match at eof — this
+ * prevents the parser from infinitely skipping at the end of input. if the
+ * line looks like an option line (at_option_line succeeds), we deliberately
+ * fail so that the entry parser gets a chance at it instead.
+ *
+ * NOTE(review): the left branch only peeks, so its failure commits no input.
+ * angstrom's <|> then runs the right branch anyway, which means an option
+ * line that no other parser accepted appears to be skipped here as well,
+ * rather than stopping the surrounding `many`. confirm whether that
+ * fall-through is the intended behaviour before relying on the guard. *)
+let skip_non_option_line =
+  (at_option_line *> fail "this is an option line")
+  <|> (rest_of_line *> eol_strict *> return ())
+
+(* --- entry parsing --- *)
+
+(* parse a single flag entry: leading whitespace, then switch+param, then description.
+ * the description can appear on the same line (inline) or on the next line (below).
+ * if there's no description at all, we accept an empty string.
+ * the (eol *> description_below) branch handles the clap long-help format.
*)
+let entry =
+  (* inline description first; otherwise end the line and look for a
+   * description below, settling for "" when there is none *)
+  let inline_or_below =
+    description <|> (eol *> (description_below <|> return ""))
+  in
+  inline_ws *> switch_parser >>= fun switch ->
+  param_parser >>= fun param ->
+  inline_or_below >>| fun desc ->
+  { switch; param; desc }
+
+(* --- subcommand parsing ---
+ * a subcommand line has the shape
+ *   "  name   description"
+ * with at least two spaces between the name and its description.
+ * argument placeholders may sit between the two:
+ *   "  start UNIT...       start one or more units"
+ *   "  list [PATTERN]      list matching units"
+ *)
+
+(* characters allowed in a subcommand name: ascii letters, digits, '-', '_' *)
+let is_subcommand_char c =
+  match c with
+  | 'a' .. 'z' | 'A' .. 'Z' | '0' .. '9' -> true
+  | '-' | '_' -> true
+  | _ -> false
+
+(* skip argument placeholders like UNIT..., [PATTERN...|PID...],
+ * that appear between the subcommand name and the description.
+ * only consumes single-space gaps — the two-space gap before the
+ * description is left for the main parser to use as the delimiter.
+ *
+ * peculiarity: this is a recursive (fix-point) parser that peeks ahead
+ * to distinguish single-space argument gaps from the double-space
+ * description separator. it accepts tokens that start with [, <, or
+ * are ALL_CAPS (with dots/pipes/commas for variadic syntax).
*)
+let skip_arg_placeholders =
+  (* every branch either returns () or consumes one placeholder and recurses,
+   * so this parser never fails; it just stops at the first gap that is not
+   * "one space + placeholder token". *)
+  fix (fun self ->
+    (* Peek ahead: single space followed by arg-like token *)
+    available >>= fun avail ->
+    if avail < 2 then return ()
+    else
+      peek_string (min avail 2) >>= fun s2 ->
+      if String.length s2 >= 2 && s2.[0] = ' ' && s2.[1] <> ' ' then
+        (* Single space — could be an arg placeholder *)
+        let next = s2.[1] in
+        if next = '[' || next = '<'
+           || (next >= 'A' && next <= 'Z') then
+          (* Peek the full token to check if it's ALL_CAPS/brackets.
+           * the window is at most 80 bytes, so a longer token is cut there. *)
+          peek_string (min avail 80) >>= fun preview ->
+          (* Extract the token after the single space: it runs up to the next
+           * space, '\n' or '\r' (so a bracketed group containing a space is
+           * split at that space) *)
+          let tok_start = 1 in
+          let tok_end = ref tok_start in
+          while !tok_end < String.length preview
+                && preview.[!tok_end] <> ' '
+                && preview.[!tok_end] <> '\n'
+                && preview.[!tok_end] <> '\r' do
+            incr tok_end
+          done;
+          (* tok is never empty here: preview.[1] is '[', '<' or an uppercase
+           * letter, none of which stops the scan above *)
+          let tok = String.sub preview tok_start (!tok_end - tok_start) in
+          (* Accept as placeholder if it starts with [ or < or is ALL_CAPS
+             (possibly with dots, pipes, dashes) *)
+          let is_placeholder =
+            tok.[0] = '[' || tok.[0] = '<'
+            || String.for_all (fun c ->
+              (c >= 'A' && c <= 'Z') || c = '_' || c = '-'
+              || c = '.' || c = '|' || c = ',' || (c >= '0' && c <= '9')
+            ) tok
+          in
+          if is_placeholder then
+            (* consume the single space plus the token, then look for another *)
+            advance (1 + String.length tok) *> self
+          else return ()
+        else return ()
+      else return ())
+
+(* parse a subcommand entry line.
+ * requires: name >= 2 chars, followed by 2+ spaces, then description.
+ * the name is lowercased for consistent lookup.
+ *
+ * peculiarity: if the description starts with "- " (a dash-space prefix),
+ * it's stripped. some tools format their subcommand lists as:
+ *   "  add  - add a new item"
+ * where the "- " is decorative, not part of the description.
*)
+let subcommand_entry =
+  (* drop a decorative leading "- " from an already-read description *)
+  let strip_dash_prefix raw =
+    let text = String.trim raw in
+    if String.starts_with ~prefix:"- " text then
+      String.trim (String.sub text 2 (String.length text - 2))
+    else text
+  in
+  (* optional placeholders, the mandatory two-space separator, then the text *)
+  let description_part =
+    skip_arg_placeholders *> char ' ' *> char ' ' *> inline_ws *> rest_of_line
+    <* eol
+  in
+  inline_ws *> take_while1 is_subcommand_char >>= fun raw_name ->
+  if String.length raw_name < 2 then fail "subcommand name too short"
+  else
+    description_part >>| fun raw_desc ->
+    { name = String.lowercase_ascii raw_name;
+      desc = strip_dash_prefix raw_desc }
+
+(* --- section header detection ---
+ * headers tell subcommand listings apart from positional-argument listings.
+ * "commands:" starts a block of subcommands; "arguments:" or "positionals:"
+ * starts a block that uses the same name+description layout but must NOT be
+ * read as subcommands. *)
+
+(* true when [s] names a positional-argument section. the comparison ignores
+ * case, surrounding whitespace and one trailing colon. *)
+let is_arg_section s =
+  let normalized = String.lowercase_ascii (String.trim s) in
+  let heading =
+    match String.ends_with ~suffix:":" normalized with
+    | true ->
+      String.trim (String.sub normalized 0 (String.length normalized - 1))
+    | false -> normalized
+  in
+  List.mem heading
+    [ "arguments"; "args"; "positionals"; "positional arguments" ]
+
+(* a section header: left-aligned (or lightly indented, <= 4 spaces) text
+ * ending with ':', not starting with '-'. must be consumed BEFORE
+ * subcommand_entry in the choice combinator, otherwise "commands:" would
+ * be parsed as a subcommand named "commands" with description ":".
+ *
+ * returns a bool indicating whether this is an argument section (true)
+ * or some other section (false). this drives the subcommand filtering logic
+ * in help_parser — entries under argument sections are excluded from the
+ * subcommand list.
*)
+let section_header =
+  (* number of leading blanks; spaces and tabs each count as one *)
+  let leading_blanks line =
+    let rec go i =
+      if i < String.length line && (line.[i] = ' ' || line.[i] = '\t') then
+        go (i + 1)
+      else i
+    in
+    go 0
+  in
+  (* at least two visible characters, ends with ':', does not start with
+   * '-', and is indented by no more than four blanks *)
+  let looks_like_header line =
+    let text = String.trim line in
+    let n = String.length text in
+    n >= 2 && text.[n - 1] = ':' && text.[0] <> '-' && leading_blanks line <= 4
+  in
+  available >>= function
+  | 0 -> fail "eof"
+  | avail ->
+    peek_string (min avail 80) >>= fun preview ->
+    (* only the first line of the preview is judged *)
+    let first_line =
+      match String.index_opt preview '\n' with
+      | Some stop -> String.sub preview 0 stop
+      | None -> preview
+    in
+    if looks_like_header first_line then
+      rest_of_line <* eol_strict >>| is_arg_section
+    else fail "not a section header"
+
+(* --- top-level parser ---
+ * the main help parser: walks through all lines, trying each line as one of:
+ *   1. a flag entry (starts with whitespace + '-')
+ *   2. a section header (left-aligned text ending with ':')
+ *   3. a subcommand line (name + 2+ spaces + description)
+ *   4. anything else → skip
+ *
+ * the choice ordering matters: entries are tried first (highest priority),
+ * then section headers (must beat subcommand_entry to avoid misparse),
+ * then subcommands, then skip as fallback.
+ *
+ * after collecting all items, two post-processing steps happen:
+ *   - subcommands under argument sections are excluded (tracked via
+ *     a running in_arg_sec boolean toggled by section headers)
+ *   - duplicate subcommand names are deduplicated, keeping the entry
+ *     with the longer description (heuristic: more info = better)
+ *
+ * peculiarity: positionals are NOT extracted here — they come from
+ * the usage line parser (extract_usage_positionals) or cli11's
+ * explicit section parser (extract_cli11_positionals), applied later
+ * in parse_help.
*)
+let help_parser =
+  let open Angstrom in
+  (* the fix-point argument is never used (hence _self); the body is a plain
+   * parser value *)
+  fix (fun _self ->
+    (* each line-level parser tags its result so one list can hold them all *)
+    let try_entry =
+      entry >>| fun e -> `Entry e
+    in
+    let try_section =
+      section_header >>| fun is_arg -> `Section is_arg
+    in
+    let try_subcommand =
+      subcommand_entry >>| fun sc -> `Subcommand sc
+    in
+    let try_skip =
+      skip_non_option_line >>| fun () -> `Skip
+    in
+    many (choice [ try_entry; try_section; try_subcommand; try_skip ]) >>| fun items ->
+    (* flags: every `Entry, in input order *)
+    let entries = List.filter_map (function `Entry e -> Some e | _ -> None) items in
+    let subcommands =
+      (* pass 1: walk the items in order, remembering whether the most recent
+       * section header was an argument section; subcommand lines seen while
+       * that flag is set are dropped *)
+      List.fold_left (fun (in_arg_sec, acc) item ->
+        match item with
+        | `Section is_arg -> (is_arg, acc)
+        | `Subcommand sc when not in_arg_sec -> (in_arg_sec, sc :: acc)
+        | _ -> (in_arg_sec, acc)
+      ) (false, []) items
+      |> snd |> List.rev
+      (* pass 2: dedupe by name. an earlier entry is kept when its description
+       * is at least as long; otherwise the newer entry replaces it and takes
+       * the newer position in the list *)
+      |> List.fold_left (fun acc sc ->
+        match List.assoc_opt sc.name acc with
+        | Some prev when String.length prev.desc >= String.length sc.desc -> acc
+        | _ -> (sc.name, sc) :: List.remove_assoc sc.name acc
+      ) []
+      (* acc was built newest-first; rev_map restores input order *)
+      |> List.rev_map snd
+    in
+    (* positionals and description are left empty at this stage *)
+    { entries; subcommands; positionals = []; description = "" })
+
+(* --- usage line parsing ---
+ * usage lines look like: "usage: git add [OPTIONS] [--] [<pathspec>...]"
+ * to extract positional arguments, we first need to skip past the command
+ * name prefix ("git add") to reach the argument portion.
+ *
+ * skip_command_prefix walks word-by-word, treating each space-separated
+ * token as part of the command name as long as it:
+ *   - is made of "word chars" (alphanumeric, hyphen, underscore, slash, dot)
+ *   - contains at least one lowercase letter (to distinguish from ALL_CAPS
+ *     positional names like FILE)
+ *   - doesn't start with [, <, (, {, or - (which indicate arguments, not
+ *     command name components)
+ *
+ * peculiarity: this is an imperative index-walking parser rather than using
+ * angstrom, because usage lines are a single string (not line-oriented)
+ * and the format is too varied for clean combinator composition.
*)
+let skip_command_prefix s =
+  (* returns the index in [s] where the argument portion starts *)
+  let len = String.length s in
+  let i = ref 0 in
+  let skip_ws () = while !i < len && (s.[!i] = ' ' || s.[!i] = '\t') do incr i done in
+  let is_word_char = function
+    | 'a'..'z' | 'A'..'Z' | '0'..'9' | '-' | '_' | '/' | '.' -> true
+    | _ -> false
+  in
+  let rec loop () =
+    skip_ws ();
+    if !i >= len then ()
+    (* an argument-like token starts here: stop (leading blanks already skipped) *)
+    else if s.[!i] = '[' || s.[!i] = '<' || s.[!i] = '(' || s.[!i] = '{' || s.[!i] = '-' then ()
+    else if is_word_char s.[!i] then begin
+      let start = !i in
+      while !i < len && is_word_char s.[!i] do incr i done;
+      let word = String.sub s start (!i - start) in
+      let has_lower = ref false in
+      String.iter (fun c -> if c >= 'a' && c <= 'z' then has_lower := true) word;
+      if not !has_lower then
+        (* no lowercase letter: treat the word as a positional such as FILE
+         * and rewind to its first character *)
+        i := start
+      else
+        loop ()
+    end
+    (* any other character ends the prefix at the current index *)
+  in
+  loop ();
+  !i
+
+(* parse the argument portion of a usage line into positional definitions.
+ * handles these syntactic forms:
+ *   <file>       - mandatory positional
+ *   [file]       - optional positional
+ *   FILE         - mandatory positional (ALL_CAPS convention)
+ *   <file>...    - variadic (also handles utf-8 ellipsis)
+ *   [file...]    - optional variadic
+ *   curly-brace alternatives - skipped, not a positional
+ *   -flag        - flags (skipped)
+ *
+ * peculiarity: certain all-caps names are skipped because they're not real
+ * positionals — "OPTIONS", "FLAGS", etc. are section labels that sometimes
+ * appear in usage lines for readability.
+ *
+ * names are lowercased in the result. deduplication at the end ensures we
+ * don't emit the same positional twice (can happen when usage lines are
+ * reformatted or repeated); the first occurrence wins. *)
+let parse_usage_args s =
+  let len = String.length s in
+  let i = ref 0 in
+  (* positionals found so far, newest first *)
+  let results = ref [] in
+  let skip_ws () =
+    while !i < len && (s.[!i] = ' ' || s.[!i] = '\t') do incr i done in
+  let is_pos_char c =
+    (c >= 'A' && c <= 'Z') || c = '_' || c = '-' || (c >= '0' && c <= '9') in
+  (* consume an ellipsis ("..." or the 3-byte utf-8 "…") if one follows,
+   * returning whether it did. blanks before it are skipped either way. *)
+  let read_dots () =
+    skip_ws ();
+    if !i + 2 < len && s.[!i] = '.' && s.[!i+1] = '.' && s.[!i+2] = '.' then
+      (i := !i + 3; true)
+    else if !i + 2 < len && s.[!i] = '\xe2' && s.[!i+1] = '\x80' && s.[!i+2] = '\xa6' then
+      (i := !i + 3; true) (* UTF-8 ellipsis … *)
+    else false
+  in
+  (* placeholder words that are labels, not positionals (case-insensitive) *)
+  let is_skip name =
+    let u = String.uppercase_ascii name in
+    u = "OPTIONS" || u = "OPTION" || u = "FLAGS" || u = "FLAG"
+  in
+  (* at least two characters, all of them ascii letters, digits, '_' or '-' *)
+  let is_clean_name name =
+    String.length name >= 2
+    && String.for_all (fun c ->
+      (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z')
+      || (c >= '0' && c <= '9') || c = '_' || c = '-') name
+  in
+  let is_letter c = (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') in
+  let skip_braces () =
+    (* Skip {A|c|d|...} alternative blocks (nesting-aware), plus a trailing
+     * ellipsis; returns true when a block was skipped *)
+    if !i < len && s.[!i] = '{' then begin
+      let depth = ref 1 in
+      incr i;
+      while !i < len && !depth > 0 do
+        if s.[!i] = '{' then incr depth
+        else if s.[!i] = '}' then decr depth;
+        incr i
+      done;
+      ignore (read_dots ());
+      true
+    end else false
+  in
+  while !i < len do
+    skip_ws ();
+    if !i >= len then ()
+    else if skip_braces () then ()
+    else match s.[!i] with
+      | '[' ->
+        (* optional group: find the matching ']' (nesting-aware) *)
+        incr i;
+        let start = !i in
+        let depth = ref 1 in
+        while !i < len && !depth > 0 do
+          if s.[!i] = '[' then incr depth
+          else if s.[!i] = ']' then decr depth;
+          incr i
+        done;
+        (* NOTE(review): this assumes the loop stopped just past a ']'. for an
+         * unclosed '[' the last character of the text is dropped — confirm
+         * whether that case matters. *)
+        let bracket_end = !i - 1 in
+        let inner = String.sub s start (max 0 (bracket_end - start)) |> String.trim in
+        let inner, has_inner_dots =
+          if String.ends_with ~suffix:"..." inner then
+            (String.sub inner 0 (String.length inner - 3) |> String.trim, true)
+          else (inner, false)
+        in
+        (* dots may sit inside ("[file...]") or after ("[file]...") *)
+        let variadic = has_inner_dots || read_dots () in
+        (* groups starting with '-' are optional flags, not positionals *)
+        if String.length inner > 0
+           && inner.[0] <> '-'
+           && (is_letter inner.[0] || inner.[0] = '<') then begin
+          let name =
+            if inner.[0] = '<' then
+              (* "[<file>]": take the text between '<' and '>' *)
+              let e = try String.index inner '>' with Not_found -> String.length inner in
+              String.sub inner 1 (e - 1)
+            else inner
+          in
+          if is_clean_name name && not (is_skip name) then
+            results := { pos_name = String.lowercase_ascii name;
+                         optional = true; variadic } :: !results
+        end
+      | '<' ->
+        (* mandatory "<name>" *)
+        incr i;
+        let start = !i in
+        while !i < len && s.[!i] <> '>' do incr i done;
+        let name = String.sub s start (!i - start) in
+        if !i < len then incr i;
+        let variadic = read_dots () in
+        if is_clean_name name && not (is_skip name) then
+          results := { pos_name = String.lowercase_ascii name;
+                       optional = false; variadic } :: !results
+      | '-' ->
+        (* a flag: skip to the next blank or ']' *)
+        while !i < len && s.[!i] <> ' ' && s.[!i] <> '\t' && s.[!i] <> ']' do incr i done
+      | c when c >= 'A' && c <= 'Z' ->
+        (* bare ALL_CAPS word: mandatory positional *)
+        let start = !i in
+        while !i < len && is_pos_char s.[!i] do incr i done;
+        let name = String.sub s start (!i - start) in
+        let variadic = read_dots () in
+        if String.length name >= 2
+           && String.for_all (fun c ->
+             (c >= 'A' && c <= 'Z') || c = '_' || c = '-' || (c >= '0' && c <= '9')
+           ) name
+           && not (is_skip name) then
+          results := { pos_name = String.lowercase_ascii name;
+                       optional = false; variadic } :: !results
+      | _ ->
+        (* anything else is skipped one character at a time *)
+        incr i
+  done;
+  (* restore input order, then keep only the first occurrence of each name *)
+  List.rev !results
+  |> List.fold_left (fun (seen, acc) p ->
+    if List.mem p.pos_name seen then (seen, acc)
+    else (p.pos_name :: seen, p :: acc)
+  ) ([], [])
+  |> snd |> List.rev
+
+(* find the "usage:" line in the help text and extract positionals from it.
+ * searches line-by-line for a line starting with "usage:" (case-insensitive).
+ * handles both inline usage ("usage: cmd [OPTIONS] FILE") and the clap style
+ * where the actual usage is on the next line:
+ *   USAGE:
+ *       cmd [OPTIONS] FILE
+ *
+ * also handles the bare "usage" header (no colon) followed by a next line. *)
+let extract_usage_positionals text =
+  (* the line right after the header, trimmed, unless it is blank or missing *)
+  let nonblank_next = function
+    | [] -> None
+    | next :: _ ->
+      (match String.trim next with
+       | "" -> None
+       | usage -> Some usage)
+  in
+  (* only the first usage header counts: once one is found the search ends,
+   * even when it yields nothing *)
+  let rec find_usage = function
+    | [] -> None
+    | line :: rest ->
+      let trimmed = String.trim line in
+      let lowered = String.lowercase_ascii trimmed in
+      if String.starts_with ~prefix:"usage:" lowered then
+        (match String.trim (String.sub trimmed 6 (String.length trimmed - 6)) with
+         | "" -> nonblank_next rest
+         | inline -> Some inline)
+      else if lowered = "usage" then nonblank_next rest
+      else find_usage rest
+  in
+  match find_usage (String.split_on_char '\n' text) with
+  | None -> []
+  | Some usage ->
+    (* drop the command-name prefix, parse what is left *)
+    let args_start = skip_command_prefix usage in
+    parse_usage_args
+      (String.sub usage args_start (String.length usage - args_start))
+
+(* extract positionals from cli11's explicit "POSITIONALS:" section.
+ * cli11 (a c++ arg parsing library) emits a dedicated section:
+ *   Positionals:
+ *     name TEXT    description here
+ *     count INT    another description
+ *
+ * this is preferred over usage-line extraction when present because it
+ * provides more accurate type information. the parser looks for the
+ * section header, then reads indented lines until a blank or unindented
+ * line signals the end. type words (TEXT, INT, FLOAT, etc.) between the
+ * name and description are skipped.
*)
+let extract_cli11_positionals text =
+  let lines = String.split_on_char '\n' text in
+  (* scan for the section header; only these two exact spellings are matched *)
+  let rec find_section = function
+    | [] -> []
+    | line :: rest ->
+      let t = String.trim line in
+      if t = "POSITIONALS:" || t = "Positionals:" then
+        parse_lines rest []
+      else
+        find_section rest
+  (* read the section body: stops at the first empty, unindented or
+   * whitespace-only line; lines parse_one rejects are skipped *)
+  and parse_lines lines acc =
+    match lines with
+    | [] -> List.rev acc
+    | line :: rest ->
+      let len = String.length line in
+      if len = 0 || (line.[0] <> ' ' && line.[0] <> '\t') then
+        List.rev acc
+      else
+        let t = String.trim line in
+        if String.length t = 0 then List.rev acc
+        else match parse_one t with
+          | Some p -> parse_lines rest (p :: acc)
+          | None -> parse_lines rest acc
+  (* parse one trimmed section line; None when the leading name is shorter
+   * than two characters *)
+  and parse_one s =
+    let len = String.length s in
+    let i = ref 0 in
+    let is_name_char c =
+      (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')
+      || (c >= '0' && c <= '9') || c = '_' || c = '-' in
+    while !i < len && is_name_char s.[!i] do incr i done;
+    if !i < 2 then None
+    else
+      let name = String.sub s 0 !i in
+      while !i < len && (s.[!i] = ' ' || s.[!i] = '\t') do incr i done;
+      (* skip type word: TEXT, INT, FLOAT, ENUM, BOOLEAN, etc.
+       * (any run of uppercase ascii letters is skipped, not a fixed list) *)
+      while !i < len && s.[!i] >= 'A' && s.[!i] <= 'Z' do incr i done;
+      while !i < len && (s.[!i] = ' ' || s.[!i] = '\t') do incr i done;
+      (* variadic only when "..." comes right after the type word *)
+      let variadic = !i + 2 < len && s.[!i] = '.' && s.[!i+1] = '.' && s.[!i+2] = '.' in
+      (* every positional from this section is reported as mandatory *)
+      Some { pos_name = String.lowercase_ascii name; optional = false; variadic }
+  in
+  find_section lines
+
+(* top-level entry point: parse a --help text string into a help_result.
+ * steps:
+ *   1. strip ansi escapes (colors, hyperlinks, etc.)
+ *   2. run the angstrom help_parser for flags and subcommands
+ *   3. extract positionals via cli11 format (preferred) or usage line (fallback)
+ *   4. merge positionals into the result
+ * uses angstrom's prefix-consume mode — we don't need to parse every byte.
*)
+let parse_help txt =
+  let clean = strip_ansi txt in
+  (* cli11's explicit section wins whenever it yields anything; the usage
+   * line is the fallback *)
+  let with_positionals parsed =
+    let from_cli11 = extract_cli11_positionals clean in
+    let from_usage = extract_usage_positionals clean in
+    let positionals =
+      match from_cli11 with
+      | [] -> from_usage
+      | _ :: _ -> from_cli11
+    in
+    { parsed with positionals }
+  in
+  (* a parse error is passed through unchanged *)
+  Angstrom.parse_string ~consume:Consume.Prefix help_parser clean
+  |> Result.map with_positionals
diff --git a/lib/store.ml b/lib/store.ml
new file mode 100644
index 0000000..13b5d1b
--- /dev/null
+++ b/lib/store.ml
@@ -0,0 +1,444 @@
+(* store.ml — filesystem-backed cache of parsed completion data.
+ *
+ * this module handles persistence of completion data to disk. each command's
+ * help_result is serialized to json and stored as a file in a cache directory
+ * (default: $XDG_CACHE_HOME/inshellah). commands with native nushell completions
+ * are stored as .nu files instead.
+ *
+ * the store also provides lookup, listing, and subcommand discovery by
+ * scanning filenames in the cache directory.
+ *
+ * file naming convention:
+ * - spaces in command names become underscores (e.g. "git add" → "git_add.json")
+ * - subcommands of a parent share the prefix (e.g. "git_add.json", "git_commit.json")
+ * - .json files contain serialized help_result
+ * - .nu files contain native nushell extern source code
+ *
+ * the module includes a minimal hand-rolled json parser/serializer because
+ * we only need to handle our own output format (no need for a full json library).
+ *)
+
+open Parser
+
+(* get the default store path: $XDG_CACHE_HOME/inshellah, falling back to
+ * ~/.cache/inshellah if XDG_CACHE_HOME is not set.
*)
+let default_store_path () =
+  (* an empty XDG_CACHE_HOME counts as unset (xdg base directory spec).
+   * without this check, Filename.concat "" "inshellah" yields the relative
+   * path "inshellah" and the cache lands in the current directory. *)
+  let cache =
+    match Sys.getenv_opt "XDG_CACHE_HOME" with
+    | Some dir when dir <> "" -> dir
+    | Some _ | None ->
+      (* raises Not_found when HOME is unset *)
+      Filename.concat (Sys.getenv "HOME") ".cache"
+  in
+  Filename.concat cache "inshellah"
+
+(* recursively create directories (equivalent to mkdir -p).
+ * raises Unix.Unix_error when a component cannot be created. *)
+let ensure_dir dir =
+  let rec mkdir_p d =
+    if Sys.file_exists d then ()
+    else begin
+      mkdir_p (Filename.dirname d);
+      (* another process may create the directory between the check above
+       * and this call; that is not an error for mkdir -p semantics *)
+      try Unix.mkdir d 0o755
+      with Unix.Unix_error (Unix.EEXIST, _, _) -> ()
+    end
+  in
+  mkdir_p dir
+
+(* convert command name to safe filename: spaces become underscores,
+ * letters, digits, '-', '_' and '.' are kept, anything else becomes a hyphen.
+ * e.g. "git add" → "git_add", "docker-compose" → "docker-compose" *)
+let filename_of_command cmd =
+  String.map (function
+    | ' ' -> '_'
+    | ('a'..'z' | 'A'..'Z' | '0'..'9' | '-' | '_' | '.') as c -> c
+    | _ -> '-') cmd
+
+(* inverse of filename_of_command: underscores back to spaces.
+ * peculiarity: this is lossy — original underscores in command names
+ * (e.g. "my_tool") would be converted to spaces. in practice this
+ * doesn't matter because tools with underscores in names are rare,
+ * and subcommands use space-separated naming. *)
+let command_of_filename base =
+  String.map (function '_' -> ' ' | c -> c) base
+
+(* --- json serialization of help_result ---
+ * hand-rolled json emitters. we don't use a json library because:
+ * 1. the schema is fixed and simple — we only serialize our own types
+ * 2. avoiding dependencies keeps the binary small
+ * 3. printf-style emission is fast and straightforward for our types *)
+
+(* escape a string for json: quotes, backslashes, and control characters.
+ * control chars below 0x20 are emitted as \u00XX unicode escapes.
*)
+let escape_json s =
+  let out = Buffer.create (String.length s + 4) in
+  let emit = Buffer.add_string out in
+  let escape_char = function
+    | '"' -> emit "\\\""
+    | '\\' -> emit "\\\\"
+    | '\n' -> emit "\\n"
+    | '\t' -> emit "\\t"
+    | '\r' -> emit "\\r"
+    | ch ->
+      (* remaining control characters get the generic \u00XX form *)
+      if Char.code ch < 0x20 then
+        emit (Printf.sprintf "\\u%04x" (Char.code ch))
+      else Buffer.add_char out ch
+  in
+  String.iter escape_char s;
+  Buffer.contents out
+
+(* a quoted, escaped json string literal *)
+let json_string s = Printf.sprintf "\"%s\"" (escape_json s)
+let json_null = "null"
+
+(* switch → {"type":..., "char":..., "name":...}; only the fields that apply *)
+let json_switch_of switch =
+  let one_char c = json_string (String.make 1 c) in
+  match switch with
+  | Short c ->
+    Printf.sprintf "{\"type\":\"short\",\"char\":%s}" (one_char c)
+  | Long name ->
+    Printf.sprintf "{\"type\":\"long\",\"name\":%s}" (json_string name)
+  | Both (c, name) ->
+    Printf.sprintf "{\"type\":\"both\",\"char\":%s,\"name\":%s}"
+      (one_char c) (json_string name)
+
+(* optional parameter → null, or {"kind":..., "name":...} *)
+let json_param_of param =
+  match param with
+  | Some (Mandatory name) ->
+    Printf.sprintf "{\"kind\":\"mandatory\",\"name\":%s}" (json_string name)
+  | Some (Optional name) ->
+    Printf.sprintf "{\"kind\":\"optional\",\"name\":%s}" (json_string name)
+  | None -> json_null
+
+let json_entry_of { switch; param; desc } =
+  Printf.sprintf "{\"switch\":%s,\"param\":%s,\"desc\":%s}"
+    (json_switch_of switch) (json_param_of param) (json_string desc)
+
+let json_subcommand_of { name; desc } =
+  Printf.sprintf "{\"name\":%s,\"desc\":%s}" (json_string name) (json_string desc)
+
+let json_positional_of { pos_name; optional; variadic } =
+  Printf.sprintf "{\"name\":%s,\"optional\":%b,\"variadic\":%b}"
+    (json_string pos_name) optional variadic
+
+(* json array whose elements are rendered by [f] *)
+let json_list f items =
+  let body = items |> List.map f |> String.concat "," in
+  "[" ^ body ^ "]"
+
+(* the whole help_result as one json object; [source] records where the data
+ * came from and defaults to "help" *)
+let json_of_help_result ?(source="help") r =
+  let { entries; subcommands; positionals; description } = r in
+  Printf.sprintf "{\"source\":%s,\"description\":%s,\"entries\":%s,\"subcommands\":%s,\"positionals\":%s}"
+    (json_string source)
+    (json_string description)
+    (json_list json_entry_of entries)
+    (json_list json_subcommand_of subcommands)
+    (json_list json_positional_of positionals)
+
+(* ---
json deserialization ---
+ * minimal hand-rolled recursive-descent json parser. only handles the subset
+ * we emit: strings, booleans, nulls, arrays, and objects. no number parsing
+ * (we don't emit numbers). this is intentionally minimal — we only read back
+ * our own serialized format, so robustness against arbitrary json is not needed.
+ *
+ * peculiarity: the \u escape handler does basic utf-8 encoding for code points
+ * up to 0xffff but doesn't handle surrogate pairs. this is fine for our use
+ * case since we only escape control characters below 0x20. *)
+
+type json =
+  | Jnull
+  | Jbool of bool
+  | Jstring of string
+  | Jarray of json list
+  | Jobject of (string * json) list
+
+(* accessors that never fail: a missing key or a value of the wrong shape
+ * yields a neutral default (Jnull, "", false, []) *)
+let json_get key json =
+  match json with
+  | Jobject pairs -> Option.value (List.assoc_opt key pairs) ~default:Jnull
+  | Jnull | Jbool _ | Jstring _ | Jarray _ -> Jnull
+
+let json_to_string json =
+  match json with
+  | Jstring s -> s
+  | Jnull | Jbool _ | Jarray _ | Jobject _ -> ""
+
+let json_to_bool json =
+  match json with
+  | Jbool b -> b
+  | Jnull | Jstring _ | Jarray _ | Jobject _ -> false
+
+let json_to_list json =
+  match json with
+  | Jarray items -> items
+  | Jnull | Jbool _ | Jstring _ | Jobject _ -> []
+
+exception Json_error of string
+
+(* imperative recursive-descent json parser.
+ * uses a mutable position ref to walk through the string.
+ * peculiarity: boolean/null parsing just advances a fixed number of chars
+ * without validating the actual characters — safe because we only read
+ * our own output, but would be incorrect for arbitrary json.
*)
+(* note: malformed input is reported through Json_error. an unterminated
+ * string, a truncated or non-hex \u escape and a misspelled null/true/false
+ * literal all raise Json_error, so a truncated cache file (for instance
+ * after an interrupted write) can no longer make the parser spin forever at
+ * end of input. anything after the first complete value is still ignored. *)
+let parse_json s =
+  let len = String.length s in
+  let pos = ref 0 in
+  let error fmt = Printf.ksprintf (fun msg -> raise (Json_error msg)) fmt in
+  (* '\x00' stands in for "end of input" *)
+  let peek () = if !pos < len then s.[!pos] else '\x00' in
+  let advance () = incr pos in
+  let skip_ws () =
+    while !pos < len && (s.[!pos] = ' ' || s.[!pos] = '\t'
+                         || s.[!pos] = '\n' || s.[!pos] = '\r') do
+      advance ()
+    done in
+  let expect c =
+    skip_ws ();
+    if peek () <> c then error "expected '%c' at %d" c !pos;
+    advance () in
+  (* consume the keyword [word] exactly, or fail *)
+  let literal word value =
+    let n = String.length word in
+    if !pos + n <= len && String.sub s !pos n = word then begin
+      pos := !pos + n;
+      value
+    end else error "invalid literal at %d" !pos in
+  (* read the four hex digits of a \u escape; pos is on the first digit *)
+  let read_hex4 () =
+    let is_hex = function
+      | '0'..'9' | 'a'..'f' | 'A'..'F' -> true
+      | _ -> false in
+    if !pos + 4 > len then error "truncated \\u escape at %d" !pos;
+    let hex = String.sub s !pos 4 in
+    if not (String.for_all is_hex hex) then
+      error "invalid \\u escape at %d" !pos;
+    pos := !pos + 4;
+    int_of_string ("0x" ^ hex) in
+  (* append a code point (<= 0xffff) as utf-8; surrogates are not paired *)
+  let add_code_point buf code =
+    if code < 128 then Buffer.add_char buf (Char.chr code)
+    else if code < 0x800 then begin
+      Buffer.add_char buf (Char.chr (0xc0 lor (code lsr 6)));
+      Buffer.add_char buf (Char.chr (0x80 lor (code land 0x3f)))
+    end else begin
+      Buffer.add_char buf (Char.chr (0xe0 lor (code lsr 12)));
+      Buffer.add_char buf (Char.chr (0x80 lor ((code lsr 6) land 0x3f)));
+      Buffer.add_char buf (Char.chr (0x80 lor (code land 0x3f)))
+    end in
+  let rec parse_value () =
+    skip_ws ();
+    match peek () with
+    | '"' -> Jstring (parse_string ())
+    | '{' -> parse_object ()
+    | '[' -> parse_array ()
+    | 'n' -> literal "null" Jnull
+    | 't' -> literal "true" (Jbool true)
+    | 'f' -> literal "false" (Jbool false)
+    | c -> error "unexpected '%c' at %d" c !pos
+  and parse_string () =
+    expect '"';
+    let buf = Buffer.create 32 in
+    let rec loop () =
+      (* bounds are checked explicitly: peek's '\x00' sentinel cannot tell
+       * end of input apart from a string that simply never closes *)
+      if !pos >= len then error "unterminated string at %d" !pos
+      else
+        match s.[!pos] with
+        | '"' -> advance () (* closing quote *)
+        | '\\' ->
+          advance ();
+          if !pos >= len then error "unterminated escape at %d" !pos;
+          (match s.[!pos] with
+           | 'n' -> Buffer.add_char buf '\n'; advance ()
+           | 't' -> Buffer.add_char buf '\t'; advance ()
+           | 'r' -> Buffer.add_char buf '\r'; advance ()
+           | 'b' -> Buffer.add_char buf '\b'; advance ()
+           | 'f' -> Buffer.add_char buf '\012'; advance ()
+           | 'u' -> advance (); add_code_point buf (read_hex4 ())
+           (* '"', '\\', '/' and anything unknown stand for themselves *)
+           | c -> Buffer.add_char buf c; advance ());
+          loop ()
+        | c -> Buffer.add_char buf c; advance (); loop ()
+    in
+    loop ();
+    Buffer.contents buf
+  and parse_object () =
+    expect '{';
+    skip_ws ();
+    if peek () = '}' then (advance (); Jobject [])
+    else begin
+      let pairs = ref [] in
+      let cont = ref true in
+      while !cont do
+        skip_ws ();
+        let key = parse_string () in
+        expect ':';
+        let value = parse_value () in
+        pairs := (key, value) :: !pairs;
+        skip_ws ();
+        if peek () = ',' then advance ()
+        else cont := false
+      done;
+      expect '}';
+      Jobject (List.rev !pairs)
+    end
+  and parse_array () =
+    expect '[';
+    skip_ws ();
+    if peek () = ']' then (advance (); Jarray [])
+    else begin
+      let items = ref [] in
+      let cont = ref true in
+      while !cont do
+        let v = parse_value () in
+        items := v :: !items;
+        skip_ws ();
+        if peek () = ',' then advance ()
+        else cont := false
+      done;
+      expect ']';
+      Jarray (List.rev !items)
+    end
+  in
+  parse_value ()
+
+(* --- json → ocaml type converters ---
+ * these reconstruct our parser types from their json representations.
+ * they mirror the json_*_of serializers above. *)
+
+(* unknown "type" values fall back to Long "?"; a missing "char" becomes '?' *)
+let switch_of_json j =
+  match json_to_string (json_get "type" j) with
+  | "short" ->
+    let c = json_to_string (json_get "char" j) in
+    Short (if String.length c > 0 then c.[0] else '?')
+  | "long" -> Long (json_to_string (json_get "name" j))
+  | "both" ->
+    let c = json_to_string (json_get "char" j) in
+    Both ((if String.length c > 0 then c.[0] else '?'),
+          json_to_string (json_get "name" j))
+  | _ -> Long "?"
+
+(* null → None; an object with an unknown "kind" is also treated as None *)
+let param_of_json = function
+  | Jnull -> None
+  | j ->
+    let name = json_to_string (json_get "name" j) in
+    (match json_to_string (json_get "kind" j) with
+     | "mandatory" -> Some (Mandatory name)
+     | "optional" -> Some (Optional name)
+     | _ -> None)
+
+let entry_of_json j =
+  { switch = switch_of_json (json_get "switch" j);
+    param = param_of_json (json_get "param" j);
+    desc = json_to_string (json_get "desc" j) }
+
+let subcommand_of_json j =
+  { name = json_to_string (json_get "name" j);
+    desc = json_to_string (json_get "desc" j) }
+
+let positional_of_json j =
+  { pos_name = json_to_string (json_get "name" j);
+    optional = json_to_bool (json_get "optional" j);
+    variadic = json_to_bool (json_get "variadic" j) }
+
+(* missing fields come back as empty lists / empty strings *)
+let help_result_of_json j =
+  { entries = List.map entry_of_json (json_to_list (json_get "entries" j));
+    subcommands = List.map subcommand_of_json (json_to_list (json_get "subcommands" j));
+    positionals = List.map positional_of_json (json_to_list (json_get "positionals" j));
+    description = json_to_string (json_get "description" j) }
+
+(* --- filesystem operations --- *)
+
+(* write [contents] to [path], replacing any existing file.
+ * raises Sys_error on failure. the channel is closed on every path, so a
+ * failed write does not leak a file descriptor; a failure while flushing on
+ * close is still reported. *)
+let write_file path contents =
+  let oc = open_out path in
+  match output_string oc contents; close_out oc with
+  | () -> ()
+  | exception e -> close_out_noerr oc; raise e
+
+(* read a whole file. returns None when the file cannot be opened or read.
+ * the channel is opened in binary mode so the length reported by
+ * in_channel_length matches the bytes actually read, and it is closed even
+ * when reading fails. only i/o errors are mapped to None. *)
+let read_file path =
+  match open_in_bin path with
+  | exception Sys_error _ -> None
+  | ic ->
+    Fun.protect
+      ~finally:(fun () -> close_in_noerr ic)
+      (fun () ->
+        match really_input_string ic (in_channel_length ic) with
+        | data -> Some data
+        | exception (Sys_error _ | End_of_file | Invalid_argument _) -> None)
+
+(* write a parsed help_result to the store as json *)
+let write_result ~dir ?(source="help") command result =
+  let path = Filename.concat dir (filename_of_command command ^ ".json") in
+  write_file path (json_of_help_result ~source result)
+
+(* write native nushell completion source to the store as a .nu file *)
+let write_native ~dir command data =
+  let path = Filename.concat dir (filename_of_command command ^ ".nu") in
+  write_file path data
+
+(* true when [path] exists and is a directory *)
+let is_dir path = Sys.file_exists path && Sys.is_directory path
+
+(* look for a command's data file across multiple store
directories.
+ * checks json first, then .nu. returns the first match found.
+ * directories are searched in order (user dir before system dirs). *)
+let find_file dirs command =
+  let base = filename_of_command command in
+  let candidate dir ext =
+    let path = Filename.concat dir (base ^ ext) in
+    if Sys.file_exists path then Some path else None
+  in
+  let in_dir dir =
+    match candidate dir ".json" with
+    | Some _ as hit -> hit
+    | None -> candidate dir ".nu"
+  in
+  List.find_map in_dir dirs
+
+(* load and decode a command's help_result. only .json files are considered
+ * (a .nu file cannot be turned back into a help_result). a directory whose
+ * file is missing, unreadable or fails to decode is passed over and the next
+ * directory is tried; None when no directory yields a result. *)
+let lookup dirs command =
+  let file = filename_of_command command ^ ".json" in
+  let load dir =
+    Option.bind
+      (read_file (Filename.concat dir file))
+      (fun data ->
+        try Some (help_result_of_json (parse_json data)) with _ -> None)
+  in
+  List.find_map load dirs
+
+(* the stored text for a command, unparsed: the .json contents if readable,
+ * otherwise the .nu contents, from the first directory that has either.
+ * used by the "query" command to dump stored data as-is. *)
+let lookup_raw dirs command =
+  let base = filename_of_command command in
+  let read dir ext = read_file (Filename.concat dir (base ^ ext)) in
+  let in_dir dir =
+    match read dir ".json" with
+    | Some _ as raw -> raw
+    | None -> read dir ".nu"
+  in
+  List.find_map in_dir dirs
+
+(* strip a ".json" or ".nu" suffix; None for any other file name *)
+let chop_extension f =
+  List.find_map
+    (fun ext ->
+      if Filename.check_suffix f ext then Some (Filename.chop_suffix f ext)
+      else None)
+    [ ".json"; ".nu" ]
+
+(* discover subcommands of a command by scanning filenames in the store.
+ * looks for files whose names start with the command's filename + "_"
+ * (e.g. for "git", finds "git_add.json", "git_commit.json", etc.)
+ *
+ * only returns immediate subcommands (no nested underscores beyond the prefix).
+ * tries to extract description from the json "description" field if available. + * + * peculiarity: this filesystem-based discovery is used as a fallback when the + * command's own help_result doesn't list subcommands. it enables completion + * for subcommands that were indexed from separate manpages or help runs. *) +let subcommands_of dirs command = + let prefix = filename_of_command command ^ "_" in + let plen = String.length prefix in + let module SMap = Map.Make(String) in + let subs = List.fold_left (fun subs dir -> + if is_dir dir then + Array.fold_left (fun subs f -> + if not (String.starts_with ~prefix f) then subs + else + let is_json = Filename.check_suffix f ".json" in + match chop_extension f with + | None -> subs + | Some b -> + let rest = String.sub b plen (String.length b - plen) in + if String.contains rest '_' || String.length rest = 0 then subs + else if SMap.mem rest subs then subs + else + let desc = if is_json then + match read_file (Filename.concat dir f) with + | Some data -> + (try json_to_string (json_get "description" (parse_json data)) + with _ -> "") + | None -> "" + else "" in + SMap.add rest { name = rest; desc } subs + ) subs (Sys.readdir dir) + else subs + ) SMap.empty dirs in + SMap.fold (fun _ sc acc -> sc :: acc) subs [] |> List.rev + +(* list all indexed commands across all store directories. + * returns a sorted, deduplicated list of command names. *) +let all_commands dirs = + let module SSet = Set.Make(String) in + List.fold_left (fun cmds dir -> + if is_dir dir then + Array.fold_left (fun cmds f -> + match chop_extension f with + | Some b -> SSet.add (command_of_filename b) cmds + | None -> cmds + ) cmds (Sys.readdir dir) + else cmds + ) SSet.empty dirs + |> SSet.elements + +(* determine how a command was indexed: "help", "manpage", "native", etc. + * for json files, reads the "source" field. for .nu files, returns "native". + * used by the "dump" command to show provenance. 
*)
+let file_type_of dirs command =
+  let base = filename_of_command command in
+  List.find_map (fun dir ->
+    let json_path = Filename.concat dir (base ^ ".json") in
+    if Sys.file_exists json_path then
+      (match read_file json_path with
+       | Some data ->
+         (try Some (json_to_string (json_get "source" (parse_json data)))
+          with _ -> Some "json") (* unparsable, or no usable "source" field *)
+       | None -> Some "json") (* file exists but could not be read *)
+    else
+      let nu_path = Filename.concat dir (base ^ ".nu") in
+      if Sys.file_exists nu_path then Some "native"
+      else None
+  ) dirs
diff --git a/nix/module.nix b/nix/module.nix
new file mode 100644
index 0000000..f407cd5
--- /dev/null
+++ b/nix/module.nix
@@ -0,0 +1,107 @@
+# NixOS module: automatic nushell completion indexing
+#
+# Indexes completions using three strategies in priority order:
+#   1. Native completion generators (e.g. CMD completions nushell)
+#   2. Manpage parsing
+#   3. --help output parsing
+#
+# Produces a directory of .json/.nu files at build time.
+# The `complete` command reads from this directory as a system overlay.
+#
+# Usage:
+#   { pkgs, ... }: {
+#     imports = [ ./path/to/inshellah/nix/module.nix ];
+#     programs.inshellah.enable = true;
+#   }
+
+{
+  config,
+  lib,
+  pkgs,
+  ...
+}:
+
+let
+  cfg = config.programs.inshellah;
+in
+{
+  options.programs.inshellah = {
+    enable = lib.mkEnableOption "nushell completion indexing via inshellah";
+
+    package = lib.mkOption { # no default here: must be set wherever the module is enabled
+      type = lib.types.package;
+      description = "package to use for indexing completions";
+    };
+
+    completionsPath = lib.mkOption {
+      type = lib.types.str;
+      default = "/share/inshellah";
+      description = ''
+        subdirectory within the system profile where completion files
+        are placed. used as --system-dir for the completer.
+      '';
+    };
+
+    ignoreCommands = lib.mkOption {
+      type = lib.types.listOf lib.types.str;
+      default = [ ];
+      example = [ "problematic-tool" ];
+      description = ''
+        list of command names to skip during completion indexing
+      '';
+    };
+
+    helpOnlyCommands = lib.mkOption {
+      type = lib.types.listOf lib.types.str;
+      default = [ ];
+      example = [ "nix" ];
+      description = ''
+        list of command names to skip manpage parsing for,
+        using --help scraping instead
+      '';
+    };
+
+    snippet = lib.mkOption { # NOTE(review): no description, and nothing in this module defines a value - confirm it is set elsewhere
+      type = lib.types.str;
+      readOnly = true;
+    };
+  };
+
+  config = lib.mkIf cfg.enable {
+    environment.systemPackages = [ config.programs.inshellah.package ]; # same value as cfg.package
+    environment.pathsToLink = [ "/share/nushell/autoload" ]; # the stub file written below lives under this path
+    environment.extraSetup =
+      let # NOTE(review): the script indexes only when both bin/ and share/man/ exist, and discards indexer errors (2>/dev/null || true)
+        inshellah = "${cfg.package}/bin/inshellah";
+        destDir = "$out${cfg.completionsPath}";
+        ignoreFile = pkgs.writeText "inshellah-ignore" (lib.concatStringsSep "\n" cfg.ignoreCommands);
+        ignoreFlag = lib.optionalString (cfg.ignoreCommands != [ ]) " --ignore ${ignoreFile}"; # empty when the list is empty
+        helpOnlyFile = pkgs.writeText "inshellah-help-only" (lib.concatStringsSep "\n" cfg.helpOnlyCommands);
+        helpOnlyFlag = lib.optionalString (cfg.helpOnlyCommands != [ ]) " --help-only ${helpOnlyFile}"; # empty when the list is empty
+      in # NOTE(review): the find below has no -mindepth, so it may also remove destDir itself when it ends up empty - confirm
+      ''
+        mkdir -p ${destDir}
+
+        if [ -d "$out/bin" ] && [ -d "$out/share/man" ]; then
+          ${inshellah} index "$out" --dir ${destDir}${ignoreFlag}${helpOnlyFlag} \
+            2>/dev/null || true
+        fi
+
+        find ${destDir} -maxdepth 1 -empty -delete
+
+        # nushell hardcodes sudo and doas to bypass the external completer,
+        # returning command-name completion instead of calling inshellah.
+        # these @complete external stubs override that so inshellah handles
+        # their flags and elevation stripping. placed in the nushell autoload
+        # dir so they are sourced automatically at shell startup.
+        mkdir -p $out/share/nushell/autoload
+        cat > $out/share/nushell/autoload/inshellah-elevation.nu << 'NUSHELL'
+        @complete external
+        extern "sudo" []
+
+        @complete external
+        extern "doas" []
+        NUSHELL
+      '';
+  };
+}
diff --git a/test/dune b/test/dune
new file mode 100644
index 0000000..d54a2fb
--- /dev/null
+++ b/test/dune
@@ -0,0 +1,3 @@
+(test
+ (name test_inshellah)
+ (libraries inshellah str))
diff --git a/test/test_inshellah.ml b/test/test_inshellah.ml
new file mode 100644
index 0000000..55567f3
--- /dev/null
+++ b/test/test_inshellah.ml
@@ -0,0 +1,492 @@
+open Inshellah.Parser
+open Inshellah.Manpage
+open Inshellah.Nushell
+
+let failures = ref 0
+let passes = ref 0
+
+let check name condition = (* tally one named check and print PASS or FAIL for it *)
+  if condition then begin
+    incr passes;
+    Printf.printf " PASS: %s\n" name
+  end else begin
+    incr failures;
+    Printf.printf " FAIL: %s\n" name
+  end
+
+let parse txt = (* unwrap parse_help; an Error result aborts the run via Failure *)
+  match parse_help txt with
+  | Ok r -> r
+  | Error msg -> failwith (Printf.sprintf "parse_help failed: %s" msg)
+
+(* --- Help parser tests --- *)
+
+let test_gnu_basic () =
+  Printf.printf "\n== GNU basic flags ==\n";
+  let r = parse " -a, --all do not ignore entries starting with .\n" in
+  check "one entry" (List.length r.entries = 1);
+  let e = List.hd r.entries in
+  check "both switch" (e.switch = Both ('a', "all"));
+  check "no param" (e.param = None);
+  check "desc" (String.length e.desc > 0)
+
+let test_gnu_eq_param () =
+  Printf.printf "\n== GNU = param ==\n";
+  let r = parse " --block-size=SIZE scale sizes by SIZE\n" in
+  check "one entry" (List.length r.entries = 1);
+  let e = List.hd r.entries in
+  check "long switch" (e.switch = Long "block-size");
+  check "mandatory param" (e.param = Some (Mandatory "SIZE"))
+
+let test_gnu_opt_param () =
+  Printf.printf "\n== GNU optional param ==\n";
+  let r = parse " --color[=WHEN] color the output WHEN\n" in
+  check "one entry" (List.length r.entries = 1);
+  let e = List.hd r.entries in
+  check
"optional param" (e.param = Some (Optional "WHEN"))
+
+let test_underscore_param () =
+  Printf.printf "\n== Underscore in param (TIME_STYLE) ==\n";
+  let r = parse " --time-style=TIME_STYLE time/date format\n" in
+  check "one entry" (List.length r.entries = 1);
+  let e = List.hd r.entries in
+  check "param with underscore" (e.param = Some (Mandatory "TIME_STYLE"))
+
+let test_short_only () =
+  Printf.printf "\n== Short-only flag ==\n";
+  let r = parse " -v verbose output\n" in
+  check "one entry" (List.length r.entries = 1);
+  check "short switch" ((List.hd r.entries).switch = Short 'v')
+
+let test_long_only () =
+  Printf.printf "\n== Long-only flag ==\n";
+  let r = parse " --help display help\n" in
+  check "one entry" (List.length r.entries = 1);
+  check "long switch" ((List.hd r.entries).switch = Long "help")
+
+let test_multiline_desc () =
+  Printf.printf "\n== Multi-line description ==\n";
+  let r = parse {| --block-size=SIZE with -l, scale sizes by SIZE when printing them;
+ e.g., '--block-size=M'; see SIZE format below
+|} in
+  check "one entry" (List.length r.entries = 1);
+  let e = List.hd r.entries in
+  check "desc includes continuation" (String.length e.desc > 50) (* longer than 50 only if the second fixture line was appended *)
+
+let test_multiple_entries () =
+  Printf.printf "\n== Multiple entries ==\n";
+  let r = parse {| -a, --all do not ignore entries starting with .
+ -A, --almost-all do not list implied . and ..
+ --author with -l, print the author of each file
+|} in
+  check "three entries" (List.length r.entries = 3)
+
+let test_clap_short_sections () =
+  Printf.printf "\n== Clap short with section headers ==\n";
+  let r = parse {|INPUT OPTIONS:
+ -e, --regexp=PATTERN A pattern to search for.
+ -f, --file=PATTERNFILE Search for patterns from the given file.
+SEARCH OPTIONS:
+ -s, --case-sensitive Search case sensitively.
+|} in
+  check "three entries" (List.length r.entries = 3);
+  let e = List.hd r.entries in
+  check "first is regexp" (e.switch = Both ('e', "regexp"));
+  check "first has param" (e.param = Some (Mandatory "PATTERN"))
+
+let test_clap_long_style () =
+  Printf.printf "\n== Clap long style (desc below flag) ==\n";
+  let r = parse {| -H, --hidden
+ Include hidden directories and files.
+
+ --no-ignore
+ Do not respect ignore files.
+|} in
+  check "two entries" (List.length r.entries = 2);
+  let e = List.hd r.entries in
+  check "hidden switch" (e.switch = Both ('H', "hidden"));
+  check "desc below" (String.length e.desc > 0)
+
+let test_clap_long_angle_param () = (* NOTE(review): the fixture as shown has no placeholder after the flag, yet "notation" is expected as its param - confirm the fixture text *)
+  Printf.printf "\n== Clap long angle bracket param ==\n";
+  let r = parse {| --nonprintable-notation
+ Set notation for non-printable characters.
+|} in
+  check "one entry" (List.length r.entries = 1);
+  let e = List.hd r.entries in
+  check "long switch" (e.switch = Long "nonprintable-notation");
+  check "angle param" (e.param = Some (Mandatory "notation"))
+
+let test_space_upper_param () =
+  Printf.printf "\n== Space-separated ALL_CAPS param ==\n";
+  let r = parse " -f, --foo FOO foo help\n" in
+  check "one entry" (List.length r.entries = 1);
+  let e = List.hd r.entries in
+  check "switch" (e.switch = Both ('f', "foo"));
+  check "space param" (e.param = Some (Mandatory "FOO"))
+
+let test_go_cobra_flags () =
+  Printf.printf "\n== Go/Cobra flags ==\n";
+  let r = parse {|Flags:
+ -D, --debug Enable debug mode
+ -H, --host string Daemon socket to connect to
+ -v, --version Print version information
+|} in
+  check "three flag entries" (List.length r.entries = 3);
+  (* Check the host flag has a type param *)
+  let host = List.nth r.entries 1 in
+  check "host switch" (host.switch = Both ('H', "host"));
+  check "host type param" (host.param = Some (Mandatory "string"))
+
+let test_go_cobra_subcommands () =
+  Printf.printf "\n== Go/Cobra subcommands ==\n";
+  let r = parse {|Common Commands:
+ run Create and run a new container from an image
+ exec Execute a command in a running container
+ build Build an image from a Dockerfile
+|} in
+  check "has subcommands" (List.length r.subcommands > 0) (* only asserts non-empty, not the exact names *)
+
+let test_busybox_tab () =
+  Printf.printf "\n== Busybox tab-indented ==\n";
+  let r = parse "\t-1\tOne column output\n\t-a\tInclude names starting with .\n" in
+  check "two entries" (List.length r.entries = 2);
+  check "first is -1" ((List.hd r.entries).switch = Short '1')
+
+let test_no_debug_prints () = (* always passes; it only marks that the run got this far *)
+  Printf.printf "\n== No debug side effects ==\n";
+  (* The old parser had print_endline at module load time.
+     If we got here without "opt param is running" on stdout, we're good. *)
+  check "no debug prints" true
+
+(* --- Manpage parser tests --- *)
+
+let test_manpage_tp_style () =
+  Printf.printf "\n== Manpage .TP style ==\n";
+  let groff = {|.SH OPTIONS
+.TP
+\fB\-a\fR, \fB\-\-all\fR
+do not ignore entries starting with .
+.TP
+\fB\-A\fR, \fB\-\-almost\-all\fR
+do not list implied . and ..
+.TP
+\fB\-\-block\-size\fR=\fISIZE\fR
+with \fB\-l\fR, scale sizes by SIZE
+.SH AUTHOR
+Written by someone.
+|} in
+  let result = parse_manpage_string groff in
+  check "three entries" (List.length result.entries = 3);
+  if List.length result.entries >= 1 then begin (* guard so List.hd cannot raise when parsing found nothing *)
+    let e = List.hd result.entries in
+    check "first is -a/--all" (e.switch = Both ('a', "all"));
+    check "first desc" (String.length e.desc > 0)
+  end;
+  if List.length result.entries >= 3 then begin
+    let e = List.nth result.entries 2 in
+    check "block-size switch" (e.switch = Long "block-size");
+    check "block-size param" (e.param = Some (Mandatory "SIZE"))
+  end
+
+let test_manpage_ip_style () =
+  Printf.printf "\n== Manpage .IP style ==\n";
+  let groff = {|.SH OPTIONS
+.IP "\fB\-k\fR, \fB\-\-insecure\fR"
+Allow insecure connections.
+.IP "\fB\-o\fR, \fB\-\-output\fR \fIfile\fR"
+Write output to file.
+.SH SEE ALSO
+|} in
+  let result = parse_manpage_string groff in
+  check "two entries" (List.length result.entries = 2);
+  if List.length result.entries >= 1 then begin
+    let e = List.hd result.entries in
+    check "first is -k/--insecure" (e.switch = Both ('k', "insecure"))
+  end
+
+let test_manpage_groff_stripping () =
+  Printf.printf "\n== Groff escape stripping ==\n";
+  let s = strip_groff_escapes {|\fB\-\-color\fR[=\fIWHEN\fR]|} in
+  check "font escapes removed" (not (String.contains s 'f' && String.contains s 'B')); (* weak: fails only if both 'f' and 'B' remain *)
+  check "dashes converted" (String.contains s '-');
+  let s2 = strip_groff_escapes {|\(aqhello\(aq|} in
+  check "aq -> quote" (String.contains s2 '\'')
+
+let test_manpage_empty_options () =
+  Printf.printf "\n== Manpage with no OPTIONS section ==\n";
+  let groff = {|.SH NAME
+foo \- does stuff
+.SH DESCRIPTION
+Does stuff.
+|} in
+  let result = parse_manpage_string groff in
+  check "no entries" (List.length result.entries = 0)
+
+let test_slash_switch_separator () = (* exercises the help-text parser, although it runs with the manpage group *)
+  Printf.printf "\n== Slash switch separator (--long / -s) ==\n";
+  let r = parse " --verbose / -v Increase verbosity\n" in
+  check "one entry" (List.length r.entries = 1);
+  let e = List.hd r.entries in
+  check "both switch" (e.switch = Both ('v', "verbose"));
+  check "no param" (e.param = None);
+  check "desc" (e.desc = "Increase verbosity")
+
+let test_manpage_nix3_style () =
+  Printf.printf "\n== Manpage nix3 style ==\n";
+  let groff = {|.SH Options
+.SS Logging-related options
+.IP "\(bu" 3
+.UR #opt-verbose
+\f(CR--verbose\fR
+.UE
+/ \f(CR-v\fR
+.IP
+Increase the logging verbosity level.
+.IP "\(bu" 3
+.UR #opt-quiet
+\f(CR--quiet\fR
+.UE
+.IP
+Decrease the logging verbosity level.
+.SH SEE ALSO
+|} in
+  let result = parse_manpage_string groff in
+  check "two entries" (List.length result.entries = 2);
+  if List.length result.entries >= 1 then begin
+    let e = List.hd result.entries in
+    check "verbose is Both" (e.switch = Both ('v', "verbose"));
+    check "verbose desc" (String.length e.desc > 0)
+  end;
+  if List.length result.entries >= 2 then begin
+    let e = List.nth result.entries 1 in
+    check "quiet is Long" (e.switch = Long "quiet");
+    check "quiet desc" (String.length e.desc > 0)
+  end
+
+let test_manpage_nix3_with_params () =
+  Printf.printf "\n== Manpage nix3 with params ==\n";
+  let groff = {|.SH Options
+.IP "\(bu" 3
+.UR #opt-arg
+\f(CR--arg\fR
+.UE
+\fIname\fR \fIexpr\fR
+.IP
+Pass the value as the argument name to Nix functions.
+.IP "\(bu" 3
+.UR #opt-include
+\f(CR--include\fR
+.UE
+/ \f(CR-I\fR \fIpath\fR
+.IP
+Add path to search path entries.
+.IP
+This option may be given multiple times.
+.SH SEE ALSO
+|} in
+  let result = parse_manpage_string groff in
+  check "two entries" (List.length result.entries = 2);
+  if List.length result.entries >= 1 then begin
+    let e = List.hd result.entries in
+    check "arg is Long" (e.switch = Long "arg");
+    check "arg has param" (e.param <> None) (* two placeholders follow --arg; only presence is asserted *)
+  end;
+  if List.length result.entries >= 2 then begin
+    let e = List.nth result.entries 1 in
+    check "include is Both" (e.switch = Both ('I', "include"));
+    check "include has path param" (e.param = Some (Mandatory "path"))
+  end
+
+let test_synopsis_subcommand () =
+  Printf.printf "\n== SYNOPSIS subcommand detection ==\n";
+  let groff = {|.SH "SYNOPSIS"
+.sp
+.nf
+\fBgit\fR \fBcommit\fR [\fB\-a\fR | \fB\-\-interactive\fR]
+.fi
+.SH "DESCRIPTION"
+|} in
+  let cmd = extract_synopsis_command groff in
+  check "detected git commit" (cmd = Some "git commit")
+
+let test_synopsis_standalone () =
+  Printf.printf "\n== SYNOPSIS standalone command ==\n";
+  let groff = {|.SH Synopsis
+.LP
+\f(CRnix-build\fR [\fIpaths\fR]
+.SH Description
+|} in
+  let cmd = extract_synopsis_command groff in
+  check "detected nix-build" (cmd = Some "nix-build")
+
+let test_synopsis_nix3 () =
+  Printf.printf "\n== SYNOPSIS nix3 subcommand ==\n";
+  let groff = {|.SH Synopsis
+.LP
+\f(CRnix run\fR [\fIoption\fR] \fIinstallable\fR
+.SH Description
+|} in
+  let cmd = extract_synopsis_command groff in
+  check "detected nix run" (cmd = Some "nix run")
+
+(* --- Nushell generation tests --- *)
+
+let contains s sub = (* literal substring test: true when sub occurs anywhere in s *)
+  try
+    let _ = Str.search_forward (Str.regexp_string sub) s 0 in true
+  with Not_found -> false
+
+let test_nushell_basic () =
+  Printf.printf "\n== Nushell basic extern ==\n";
+  let r = parse " -a, --all do not ignore entries starting with .\n" in
+  let nu = generate_extern "ls" r in
+  check "has extern" (contains nu "export extern \"ls\"");
+  check "has --all(-a)" (contains nu "--all(-a)");
+  check "has comment" (contains nu "# do not ignore")
+
+let test_nushell_param_types () =
+  Printf.printf "\n== Nushell param type mapping ==\n";
+  let r = parse {| -w, --width=COLS set output width
+ --block-size=SIZE scale sizes
+ -o, --output FILE output file
+|} in
+  let nu = generate_extern "ls" r in
+  check "COLS -> int" (contains nu "--width(-w): int");
+  check "SIZE -> string" (contains nu "--block-size: string");
+  check "FILE -> path" (contains nu "--output(-o): path")
+
+let test_nushell_subcommands () =
+  Printf.printf "\n== Nushell subcommands ==\n";
+  let r = parse {|Common Commands:
+ run Create and run a new container
+ exec Execute a command
+
+Flags:
+ -D, --debug Enable debug mode
+|} in
+  let nu = generate_extern "docker" r in
+  check "has main extern" (contains nu "export extern \"docker\"");
+  check "has --debug" (contains nu "--debug(-D)");
+  check "has run subcommand" (contains nu "export extern \"docker run\"");
+  check "has exec subcommand" (contains nu "export extern \"docker exec\"")
+
+let test_nushell_from_manpage () =
+  Printf.printf "\n== Nushell from manpage ==\n";
+  let groff = {|.SH OPTIONS
+.TP
+\fB\-a\fR, \fB\-\-all\fR
+do not ignore entries starting with .
+.TP
+\fB\-\-block\-size\fR=\fISIZE\fR
+scale sizes by SIZE
+.SH AUTHOR
+|} in
+  let result = parse_manpage_string groff in
+  let nu = generate_extern "ls" result in
+  check "has extern" (contains nu "export extern \"ls\"");
+  check "has --all(-a)" (contains nu "--all(-a)");
+  check "has --block-size" (contains nu "--block-size: string")
+
+let test_nushell_module () =
+  Printf.printf "\n== Nushell module wrapper ==\n";
+  let r = parse " -v, --verbose verbose output\n" in
+  let nu = generate_module "myapp" r in
+  check "has module" (contains nu "module myapp-completions");
+  check "has extern inside" (contains nu "export extern \"myapp\"");
+  check "has flag" (contains nu "--verbose(-v)")
+
+let test_dedup_entries () =
+  Printf.printf "\n== Deduplication ==\n";
+  let r = parse {| -v, --verbose verbose output
+ --verbose verbose mode
+ -v be verbose
+|} in
+  let nu = generate_extern "test" r in
+  (* Count occurrences of --verbose *)
+  let count =
+    let re = Str.regexp_string "--verbose" in
+    let n = ref 0 in
+    let i = ref 0 in
+    (try while true do (* the loop ends when search_forward raises Not_found *)
+      let _ = Str.search_forward re nu !i in
+      incr n; i := Str.match_end () (* resume after this match, so matches are counted without overlap *)
+    done with Not_found -> ());
+    !n
+  in
+  check "verbose appears once" (count = 1);
+  check "best version kept (Both)" (contains nu "--verbose(-v)")
+
+let test_dedup_manpage () =
+  Printf.printf "\n== Dedup from manpage ==\n";
+  let groff = {|.SH OPTIONS
+.TP
+\fB\-v\fR, \fB\-\-verbose\fR
+Be verbose.
+.SH DESCRIPTION
+Use \fB\-v\fR for verbose output.
+Use \fB\-\-verbose\fR to see more.
+|} in
+  let result = parse_manpage_string groff in
+  let nu = generate_extern "test" result in
+  check "has --verbose(-v)" (contains nu "--verbose(-v)");
+  (* Should not have standalone -v or duplicate --verbose *)
+  let lines = String.split_on_char '\n' nu in
+  let verbose_lines = List.filter (fun l -> contains l "verbose") lines in
+  check "only one verbose line" (List.length verbose_lines = 1)
+
+let test_font_boundary_spacing () =
+  Printf.printf "\n== Font boundary spacing ==\n";
+  (* \fB--max-results\fR\fIcount\fR should become "--max-results count" *)
+  let s = strip_groff_escapes {|\fB\-\-max\-results\fR\fIcount\fR|} in
+  check "has space before param" (contains s "--max-results count");
+  (* \fB--color\fR[=\fIWHEN\fR] should NOT insert space before = *)
+  let s2 = strip_groff_escapes {|\fB\-\-color\fR[=\fIWHEN\fR]|} in
+  check "no space before =" (contains s2 "--color[=WHEN]")
+
+let () = (* test driver: runs every test in order, then exits 1 if any check failed *)
+  Printf.printf "Running help parser tests...\n";
+  test_gnu_basic ();
+  test_gnu_eq_param ();
+  test_gnu_opt_param ();
+  test_underscore_param ();
+  test_short_only ();
+  test_long_only ();
+  test_multiline_desc ();
+  test_multiple_entries ();
+  test_clap_short_sections ();
+  test_clap_long_style ();
+  test_clap_long_angle_param ();
+  test_space_upper_param ();
+  test_go_cobra_flags ();
+  test_go_cobra_subcommands ();
+  test_busybox_tab ();
+  test_no_debug_prints ();
+
+  Printf.printf "\nRunning manpage parser tests...\n";
+  test_manpage_tp_style ();
+  test_manpage_ip_style ();
+  test_manpage_groff_stripping ();
+  test_manpage_empty_options ();
+  test_slash_switch_separator ();
+  test_manpage_nix3_style ();
+  test_manpage_nix3_with_params ();
+  test_synopsis_subcommand ();
+  test_synopsis_standalone ();
+  test_synopsis_nix3 ();
+
+  Printf.printf "\nRunning nushell generation tests...\n";
+  test_nushell_basic ();
+  test_nushell_param_types ();
+  test_nushell_subcommands ();
+  test_nushell_from_manpage ();
+  test_nushell_module ();
+
+  Printf.printf "\nRunning dedup and font tests...\n";
+  test_dedup_entries ();
+  test_dedup_manpage ();
+  test_font_boundary_spacing ();
+
+  Printf.printf "\n=== Results: %d passed, %d failed ===\n" !passes !failures;
+  if !failures > 0 then exit 1