From 0aa6ae9fbff56483cb0ce4f141ea343480f7d651 Mon Sep 17 00:00:00 2001 From: atagen Date: Tue, 19 May 2026 23:32:51 +1000 Subject: [PATCH] riir --- Cargo.lock | 285 ++++ Cargo.toml | 12 + README.md | 6 +- bin/.ocamlformat | 0 bin/dune | 4 - bin/main.ml | 1403 ------------------ doc/building.md | 145 +- doc/nixos.md | 217 ++- doc/nushell-integration.md | 254 ++-- doc/runtime-completions.md | 55 +- dune-project | 28 - flake.lock | 8 +- flake.nix | 308 ++-- inshellah.opam | 35 - lib/.ocamlformat | 0 lib/dune | 3 - lib/manpage.ml | 1145 --------------- lib/nushell.ml | 253 ---- lib/parser.ml | 814 ----------- lib/store.ml | 670 --------- nix/inshellah-completer.nu | 813 +++++++++++ nix/module.nix | 91 +- src/lib.rs | 4 + src/main.rs | 2241 +++++++++++++++++++++++++++++ src/parsers/help.rs | 187 +++ src/parsers/help/description.rs | 37 + src/parsers/help/helpers.rs | 105 ++ src/parsers/help/options.rs | 192 +++ src/parsers/help/positionals.rs | 400 +++++ src/parsers/help/subcommands.rs | 83 ++ src/parsers/manpage.rs | 335 +++++ src/parsers/manpage/commands.rs | 157 ++ src/parsers/manpage/groff.rs | 385 +++++ src/parsers/manpage/mdoc.rs | 237 +++ src/parsers/manpage/sections.rs | 851 +++++++++++ src/parsers/manpage/strategies.rs | 456 ++++++ src/parsers/mod.rs | 3 + src/parsers/nushell.rs | 475 ++++++ src/pool.rs | 233 +++ src/store.rs | 657 +++++++++ src/types.rs | 34 + test/dune | 3 - test/test_inshellah.ml | 610 -------- tests/git_clone_fix.rs | 78 + tests/manpage_cli.rs | 150 ++ tests/nushell-completer.nu | 128 ++ tests/ports.rs | 915 ++++++++++++ tests/runtime_complete.rs | 500 +++++++ tests/self_completions.rs | 31 + 49 files changed, 10554 insertions(+), 5482 deletions(-) create mode 100644 Cargo.lock create mode 100644 Cargo.toml delete mode 100644 bin/.ocamlformat delete mode 100644 bin/dune delete mode 100644 bin/main.ml delete mode 100644 dune-project delete mode 100644 inshellah.opam delete mode 100644 lib/.ocamlformat delete mode 100644 lib/dune delete mode 
100644 lib/manpage.ml delete mode 100644 lib/nushell.ml delete mode 100644 lib/parser.ml delete mode 100644 lib/store.ml create mode 100644 nix/inshellah-completer.nu create mode 100644 src/lib.rs create mode 100644 src/main.rs create mode 100644 src/parsers/help.rs create mode 100644 src/parsers/help/description.rs create mode 100644 src/parsers/help/helpers.rs create mode 100644 src/parsers/help/options.rs create mode 100644 src/parsers/help/positionals.rs create mode 100644 src/parsers/help/subcommands.rs create mode 100644 src/parsers/manpage.rs create mode 100644 src/parsers/manpage/commands.rs create mode 100644 src/parsers/manpage/groff.rs create mode 100644 src/parsers/manpage/mdoc.rs create mode 100644 src/parsers/manpage/sections.rs create mode 100644 src/parsers/manpage/strategies.rs create mode 100644 src/parsers/mod.rs create mode 100644 src/parsers/nushell.rs create mode 100644 src/pool.rs create mode 100644 src/store.rs create mode 100644 src/types.rs delete mode 100644 test/dune delete mode 100644 test/test_inshellah.ml create mode 100644 tests/git_clone_fix.rs create mode 100644 tests/manpage_cli.rs create mode 100644 tests/nushell-completer.nu create mode 100644 tests/ports.rs create mode 100644 tests/runtime_complete.rs create mode 100644 tests/self_completions.rs diff --git a/Cargo.lock b/Cargo.lock new file mode 100644 index 0000000..8e59a90 --- /dev/null +++ b/Cargo.lock @@ -0,0 +1,285 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. 
+version = 4 + +[[package]] +name = "adler2" +version = "2.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "320119579fcad9c21884f5c4861d16174d0e06250625266f50fe6898340abefa" + +[[package]] +name = "bitflags" +version = "2.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c4512299f36f043ab09a583e57bceb5a5aab7a73db1805848e8fef3c9e8c78b3" + +[[package]] +name = "cfg-if" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801" + +[[package]] +name = "crc32fast" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9481c1c90cbf2ac953f07c8d4a58aa3945c425b7185c9154d67a65e4230da511" +dependencies = [ + "cfg-if", +] + +[[package]] +name = "equivalent" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "877a4ace8713b0bcf2a4e7eec82529c029f1d0619886d18145fea96c3ffe5c0f" + +[[package]] +name = "fast-strip-ansi" +version = "0.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c3086ffd0a7160f58f988c74173a002e255da505a114e2f5425acb1eaab2b8ac" +dependencies = [ + "vt-push-parser", +] + +[[package]] +name = "flate2" +version = "1.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "843fba2746e448b37e26a819579957415c8cef339bf08564fe8b7ddbd959573c" +dependencies = [ + "crc32fast", + "miniz_oxide", +] + +[[package]] +name = "hashbrown" +version = "0.17.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ed5909b6e89a2db4456e54cd5f673791d7eca6732202bbf2a9cc504fe2f9b84a" + +[[package]] +name = "hex" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70" + +[[package]] +name = "indexmap" +version = "2.14.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "d466e9454f08e4a911e14806c24e16fba1b4c121d1ea474396f396069cf949d9" +dependencies = [ + "equivalent", + "hashbrown", +] + +[[package]] +name = "inshellah" +version = "0.1.1" +dependencies = [ + "fast-strip-ansi", + "flate2", + "libc", + "nom", + "parking_lot", + "serde_json", +] + +[[package]] +name = "itoa" +version = "1.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f42a60cbdf9a97f5d2305f08a87dc4e09308d1276d28c869c684d7777685682" + +[[package]] +name = "libc" +version = "0.2.186" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "68ab91017fe16c622486840e4c83c9a37afeff978bd239b5293d61ece587de66" + +[[package]] +name = "lock_api" +version = "0.4.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "224399e74b87b5f3557511d98dff8b14089b3dadafcab6bb93eab67d3aace965" +dependencies = [ + "scopeguard", +] + +[[package]] +name = "memchr" +version = "2.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f8ca58f447f06ed17d5fc4043ce1b10dd205e060fb3ce5b979b8ed8e59ff3f79" + +[[package]] +name = "miniz_oxide" +version = "0.8.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fa76a2c86f704bdb222d66965fb3d63269ce38518b83cb0575fca855ebb6316" +dependencies = [ + "adler2", + "simd-adler32", +] + +[[package]] +name = "nom" +version = "8.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df9761775871bdef83bee530e60050f7e54b1105350d6884eb0fb4f46c2f9405" +dependencies = [ + "memchr", +] + +[[package]] +name = "parking_lot" +version = "0.12.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "93857453250e3077bd71ff98b6a65ea6621a19bb0f559a85248955ac12c45a1a" +dependencies = [ + "lock_api", + "parking_lot_core", +] + +[[package]] +name = "parking_lot_core" +version = "0.9.12" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "2621685985a2ebf1c516881c026032ac7deafcda1a2c9b7850dc81e3dfcb64c1" +dependencies = [ + "cfg-if", + "libc", + "redox_syscall", + "smallvec", + "windows-link", +] + +[[package]] +name = "proc-macro2" +version = "1.0.106" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8fd00f0bb2e90d81d1044c2b32617f68fcb9fa3bb7640c23e9c748e53fb30934" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "quote" +version = "1.0.45" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "41f2619966050689382d2b44f664f4bc593e129785a36d6ee376ddf37259b924" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "redox_syscall" +version = "0.5.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ed2bf2547551a7053d6fdfafda3f938979645c44812fbfcda098faae3f1a362d" +dependencies = [ + "bitflags", +] + +[[package]] +name = "scopeguard" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" + +[[package]] +name = "serde" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a8e94ea7f378bd32cbbd37198a4a91436180c5bb472411e48b5ec2e2124ae9e" +dependencies = [ + "serde_core", +] + +[[package]] +name = "serde_core" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "41d385c7d4ca58e59fc732af25c3983b67ac852c1a25000afe1175de458b67ad" +dependencies = [ + "serde_derive", +] + +[[package]] +name = "serde_derive" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "serde_json" +version = "1.0.149" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "83fc039473c5595ace860d8c4fafa220ff474b3fc6bfdb4293327f1a37e94d86" +dependencies = [ + "indexmap", + "itoa", + "memchr", + "serde", + "serde_core", + "zmij", +] + +[[package]] +name = "simd-adler32" +version = "0.3.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e320a6c5ad31d271ad523dcf3ad13e2767ad8b1cb8f047f75a8aeaf8da139da2" + +[[package]] +name = "smallvec" +version = "1.15.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "67b1b7a3b5fe4f1376887184045fcf45c69e92af734b7aaddc05fb777b6fbd03" + +[[package]] +name = "syn" +version = "2.0.117" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e665b8803e7b1d2a727f4023456bbbbe74da67099c585258af0ad9c5013b9b99" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "unicode-ident" +version = "1.0.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6e4313cd5fcd3dad5cafa179702e2b244f760991f45397d14d4ebf38247da75" + +[[package]] +name = "vt-push-parser" +version = "0.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cdbf39d53c5a50cad8119d9cde929ecd208764e8d8d1626486b8929cbcd5f0e7" +dependencies = [ + "hex", + "smallvec", +] + +[[package]] +name = "windows-link" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5" + +[[package]] +name = "zmij" +version = "1.0.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b8848ee67ecc8aedbaf3e4122217aff892639231befc6a1b58d29fff4c2cabaa" diff --git a/Cargo.toml b/Cargo.toml new file mode 100644 index 0000000..1319992 --- /dev/null +++ b/Cargo.toml @@ -0,0 +1,12 @@ +[package] +name = "inshellah" +version = "0.1.1" +edition = "2024" + +[dependencies] +fast-strip-ansi = "0.13.1" +flate2 = 
"1.1.9" +libc = "0.2.186" +nom = "8.0.0" +parking_lot = "0.12.5" +serde_json = { version = "1.0.149", features = ["preserve_order"] } diff --git a/README.md b/README.md index 3d3e66d..1f3e779 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,7 @@ completer. see `doc/` for details: -- [building and installing](doc/building.md) — compilation, arch/debian/fedora, opam, nix -- [nushell integration](doc/nushell-integration.md) — setup, usage, examples -- [nixos module](doc/nixos.md) — automatic build-time indexing +- [building and installing](doc/building.md) — cargo, nix, post-install setup +- [nushell integration](doc/nushell-integration.md) — setup, the pipeline, the completer +- [nixos module](doc/nixos.md) — automatic build-time indexing + module options - [runtime completions](doc/runtime-completions.md) — on-the-fly caching via the completer diff --git a/bin/.ocamlformat b/bin/.ocamlformat deleted file mode 100644 index e69de29..0000000 diff --git a/bin/dune b/bin/dune deleted file mode 100644 index 4bb8309..0000000 --- a/bin/dune +++ /dev/null @@ -1,4 +0,0 @@ -(executable - (public_name inshellah) - (name main) - (libraries inshellah)) diff --git a/bin/main.ml b/bin/main.ml deleted file mode 100644 index b72a456..0000000 --- a/bin/main.ml +++ /dev/null @@ -1,1403 +0,0 @@ -(* main.ml — cli entry point for inshellah, a nushell completions engine. - * - * inshellah generates nushell "extern" definitions for external commands by - * parsing their manpages and --help output. it has two main modes: - * - * 1. indexing (batch): scan a prefix directory's bin/ and share/man/, - * extract completions for every binary, and write them to a cache dir. - * this is typically run once per nix profile or system update. - * - * 2. completing (interactive): given a command and its current arguments, - * look up the cached data and return JSON completion candidates for - * nushell's custom completer protocol. - * - * the indexing pipeline for each binary: - * a. 
classify the binary (skip? try --help? try native completions?) - * b. if the tool has native nushell completion support, run --help and - * discover subcommands containing "complet", then try them with "nushell" - * c. otherwise, run the tool with --help/-h and parse the output - * d. recursively resolve subcommands (depth-limited to 5) - * e. after binaries, parse manpages for any commands not yet covered - * - * parallelism: indexing forks per binary, and subcommand resolution forks - * per subcommand. results are marshaled back via pipes. this gives good - * throughput on multi-core systems while keeping the code simple (no threads, - * no async runtime — just unix fork/pipe/waitpid). - *) - -open Inshellah.Parser -open Inshellah.Manpage -open Inshellah.Nushell -open Inshellah.Store - -module SSet = Set.Make(String) - -(* print usage and exit. called when no valid subcommand is given. *) -let usage () = - Printf.eprintf - {|inshellah - nushell completions engine - -Usage: - inshellah index PREFIX... [--dir PATH] [--ignore FILE] [--help-only FILE] - Index completions into a directory of JSON/nu files. - PREFIX is a directory containing bin/ and share/man/. - Default dir: $XDG_CACHE_HOME/inshellah - --ignore FILE skip listed commands entirely - --help-only FILE skip manpages for listed commands, use --help instead - inshellah complete CMD [ARGS...] [--dir PATH[:PATH...]] - Nushell custom completer. Outputs JSON completion candidates. - Falls back to --help resolution if command is not indexed. - --dir takes colon-separated paths. The first path is the writable - user cache; additional paths are read-only system directories. - Manpages are found via sibling share/man of system dir paths. - inshellah query CMD [--dir PATH[:PATH...]] - Print stored completion data for CMD. - inshellah dump [--dir PATH[:PATH...]] - List indexed commands. 
- inshellah manpage FILE Parse a manpage and emit nushell extern - inshellah manpage-dir DIR Batch-process manpages under DIR - inshellah completions Generate nushell completions for inshellah - -|}; - exit 1 - -(* manpage sections that contain command documentation. - * section 1 = user commands, section 8 = system administration commands. *) -let command_sections = [1; 8] - -(* simple substring search using Str *) -let contains_str haystack needle = - try ignore (Str.search_forward (Str.regexp_string needle) haystack 0); true - with Not_found -> false - -(* heuristic to detect whether text is valid nushell source code. - * checks for common nushell declaration keywords. the length > 20 - * check avoids false positives on short error messages. *) -let is_nushell_source text = - String.length text > 20 - && (contains_str text "export extern" - || contains_str text "export def" - || (contains_str text "module " && contains_str text "export")) - -(* extract command name from a manpage filename. - * "ls.1.gz" -> strip .gz -> "ls.1" -> chop extension -> "ls" *) -let cmd_name_of_manpage path = - let base = Filename.basename path in - let base = - if Filename.check_suffix base ".gz" then Filename.chop_suffix base ".gz" - else base in - try Filename.chop_extension base with Invalid_argument _ -> base - -(* sanitized environment for child processes. - * strips display-related variables (DISPLAY, WAYLAND_DISPLAY, etc.) to prevent - * gui tools from trying to open windows when we run them with --help. - * without this, some tools would pop up dialogs or hang waiting for a - * display connection. *) -let safe_env = lazy ( - Array.of_list ( - List.filter (fun var -> - not (String.starts_with ~prefix:"DISPLAY=" var - || String.starts_with ~prefix:"WAYLAND_DISPLAY=" var - || String.starts_with ~prefix:"DBUS_SESSION_BUS_ADDRESS=" var - || String.starts_with ~prefix:"XAUTHORITY=" var)) - (Array.to_list (Unix.environment ())))) - -(* non-blocking drain of a pipe fd into a buffer. 
safe to call repeatedly; - * reads whatever is available without blocking. used by all fork-pipe sites - * to keep pipes drained so children never block on write. *) -let drain_fd rd buf = - let chunk = Bytes.create 8192 in - let continue = ref true in - while !continue do - match Unix.select [rd] [] [] 0.0 with - | (_ :: _, _, _) -> - (try - let bytes_read = Unix.read rd chunk 0 8192 in - if bytes_read = 0 then continue := false - else Buffer.add_subbytes buf chunk 0 bytes_read - with Unix.Unix_error _ -> continue := false) - | _ -> continue := false - done - -(* run a command with a timeout, capturing its stdout+stderr. - * forks a child process, redirects stdin from /dev/null, and merges - * stdout+stderr onto a pipe. reads from the pipe with select() polling - * until either the child exits or the deadline is reached. - * - * the child is run in /tmp to prevent tools that create side-effect files - * from polluting the user's working directory. we chdir to /tmp before - * fork and restore after. - * - * the select timeout is capped at 0.05s per iteration to ensure we check - * the deadline frequently even when no data is available. - * - * returns none if the process couldn't be started, produced no output, - * or was killed due to timeout. *) -let run_cmd args timeout_ms = - let (rd, wr) = Unix.pipe () in - let devnull = Unix.openfile "/dev/null" [Unix.O_RDONLY] 0 in - let argv = Array.of_list args in - (* run subprocesses in /tmp so commands that write side-effect files - * don't pollute the working directory *) - let saved_cwd = Sys.getcwd () in - Sys.chdir "/tmp"; - let pid = - try Unix.create_process_env (List.hd args) argv - (Lazy.force safe_env) devnull wr wr - with Unix.Unix_error _ -> - Unix.close rd; Unix.close wr; Unix.close devnull; -1 in - Sys.chdir saved_cwd; - Unix.close wr; Unix.close devnull; - if pid < 0 then (Unix.close rd; None) - else begin - let buf = Buffer.create 4096 in - let deadline = Unix.gettimeofday () +. 
(float_of_int timeout_ms /. 1000.0) in - let chunk = Bytes.create 8192 in - let alive = ref true in - (try while !alive do - let remaining = deadline -. Unix.gettimeofday () in - if remaining <= 0.0 then alive := false - else match Unix.select [rd] [] [] (min remaining 0.05) with - | (_ :: _, _, _) -> - let bytes_read = Unix.read rd chunk 0 8192 in - if bytes_read = 0 then raise Exit - else Buffer.add_subbytes buf chunk 0 bytes_read - | _ -> () - done with Exit -> ()); - Unix.close rd; - if not !alive then begin - (try Unix.kill pid Sys.sigkill with Unix.Unix_error _ -> ()); - ignore (Unix.waitpid [] pid) - end else - ignore (Unix.waitpid [] pid); - if Buffer.length buf > 0 then Some (Buffer.contents buf) else None - end - -(* check if a path is a regular file with at least one execute bit set *) -let is_executable path = - try let st = Unix.stat path in - st.st_kind = Unix.S_REG && st.st_perm land 0o111 <> 0 - with Unix.Unix_error _ -> false - -(* check if a file is a script by looking for a #! shebang. - * follows symlinks via realpath before reading. *) -let is_script path = - try - let real = Unix.realpath path in - let ic = open_in_bin real in - let has_shebang = - try let b = Bytes.create 2 in - really_input ic b 0 2; - Bytes.get b 0 = '#' && Bytes.get b 1 = '!' - with End_of_file -> false in - close_in ic; - has_shebang - with _ -> false - -(* scan an elf binary for string needles without loading the entire file. - * reads the file in 64kb chunks, searching each chunk for the needle strings. - * uses a sliding window (carry) of max_needle bytes between chunks to handle - * needles that span chunk boundaries. - * - * on read failure (e.g. if the path resolves to something unreadable), all - * needles are marked as found. this is a conservative fallback — we'd rather - * try --help on an unreadable binary than skip it. 
- * - * the inner loop is a manual byte-by-byte comparison rather than using - * String.contains or Str for performance — this runs on every binary - * in the prefix, so it needs to be fast. *) -let elf_scan path needles = - let found = Hashtbl.create 4 in - let remaining () = List.filter (fun needle -> not (Hashtbl.mem found needle)) needles in - (try - let real = Unix.realpath path in - let ic = open_in_bin real in - let magic = Bytes.create 4 in - really_input ic magic 0 4; - if Bytes.get magic 0 = '\x7f' && Bytes.get magic 1 = 'E' - && Bytes.get magic 2 = 'L' && Bytes.get magic 3 = 'F' then begin - let max_needle = List.fold_left (fun m needle -> max m (String.length needle)) 0 needles in - let chunk_size = 65536 in - let buf = Bytes.create (chunk_size + max_needle) in - let carry = ref 0 in - let eof = ref false in - while not !eof && remaining () <> [] do - let bytes_read = (try input ic buf !carry chunk_size with End_of_file -> 0) in - if bytes_read = 0 then eof := true - else begin - let total = !carry + bytes_read in - List.iter (fun needle -> - if not (Hashtbl.mem found needle) then begin - let nlen = String.length needle in - let pos = ref 0 in - while !pos <= total - nlen do - if Bytes.get buf !pos = needle.[0] then begin - let matched = ref true in - for j = 1 to nlen - 1 do - if Bytes.get buf (!pos + j) <> needle.[j] then matched := false - done; - if !matched then (Hashtbl.replace found needle true; pos := total) - else incr pos - end else incr pos - done - end - ) (remaining ()); - let new_carry = min max_needle total in - Bytes.blit buf (total - new_carry) buf 0 new_carry; - carry := new_carry - end - done - end; - close_in ic - with _ -> - List.iter (fun needle -> Hashtbl.replace found needle true) needles); - found - -(* detect nix-generated c wrapper scripts and extract the real binary path. - * nix's makeCWrapper creates small c programs that set up the environment - * and exec the real binary. 
these wrappers won't contain "-h" or "complet" - * in their own binary (they're just wrappers), so elf_scan would say "skip". - * this function reads the wrapper source to find the actual /nix/store/.../bin/... - * target path, so we can try --help on the real binary instead. - * - * caps the read at 64kb to avoid accidentally reading a large non-wrapper - * binary into memory. *) -let nix_wrapper_target path = - try - let real = Unix.realpath path in - let ic = open_in_bin real in - let size = in_channel_length ic in - if size > 65536 then (close_in ic; None) - else begin - let contents = Bytes.create size in - really_input ic contents 0 size; close_in ic; - let contents = Bytes.to_string contents in - if not (contains_str contents "makeCWrapper") then None - else - let re = Str.regexp "/nix/store/[a-z0-9]+-[^' \n\r\x00]+/bin/[a-zA-Z0-9._-]+" in - try ignore (Str.search_forward re contents 0); - let target = Str.matched_string contents in - if Sys.file_exists target then Some target else None - with Not_found -> None - end - with _ -> None - -(* detect nix bash/sh wrapper scripts that exec a real binary. - * nix sometimes generates small shell scripts (e.g. to set env vars like - * XDG_CONFIG_HOME) that exec the real binary. these look like: - * #!/nix/store/.../bash -e - * export FOO=... - * exec -a "$0" "/nix/store/.../bin/.foo-wrapped" "$@" - * we extract the exec target path and resolve through it. *) -let nix_script_wrapper_target path = - try - let real = Unix.realpath path in - let ic = open_in real in - let size = in_channel_length ic in - if size > 4096 then (close_in ic; None) - else begin - let contents = Bytes.create size in - really_input ic contents 0 size; close_in ic; - let contents = Bytes.to_string contents in - if not (contains_str contents "exec") then None - else - let re = Str.regexp "exec[ \t]+\\(-a[ \t]+\"\\$0\"[ \t]+\\)?\"?\\(/nix/store/[a-z0-9]+-[^\" \t\n]+/bin/[a-zA-Z0-9._-]+\\)\"?" 
in - try ignore (Str.search_forward re contents 0); - let target = Str.matched_group 2 contents in - let target = Unix.realpath target in - if Sys.file_exists target then Some target else None - with Not_found -> None - end - with _ -> None - -(* heuristic filter for binary names that should never be indexed. - * skips: empty names, "-", dotfiles, libraries (lib-prefix), daemon wrappers - * (suffixes -daemon, -wrapped), shared objects (.so suffix), and names with no - * alphanumeric characters (e.g. punctuation-only names). *) -let skip_name name = - String.length name = 0 || name = "-" || name.[0] = '.' - || String.starts_with ~prefix:"lib" name - || String.ends_with ~suffix:"-daemon" name - || String.ends_with ~suffix:"-wrapped" name - || String.ends_with ~suffix:".so" name - || not (String.exists (fun c -> (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9')) name) - -(* classification result for a binary. - * Skip — don't index this binary at all - * Try_help — only try --help (scripts, binaries without "completion" string) - * Try_native_and_help — try native nushell completion first, fall back to --help *) -type bin_class = Skip | Try_help | Try_native_and_help - -(* classify an elf binary path for indexing. *) -let classify_elf path = - let scan = elf_scan path ["-h"; "complet"] in - if Hashtbl.mem scan "complet" then Try_native_and_help - else if Hashtbl.mem scan "-h" then Try_help - else Skip - -(* classify a binary to decide the indexing strategy. - * decision tree: - * 1. nushell builtin or bad name -> Skip - * 2. not executable -> Skip - * 3. script (has shebang) -> resolve through nix script wrapper if possible, - * otherwise Try_help - * 4. elf binary containing "complet" -> Try_native_and_help - * 5. elf binary containing "-h" -> Try_help - * 6. nix c wrapper -> Try_help (the wrapper itself is just an exec shim) - * 7. 
otherwise -> Skip (binary has no help infrastructure) *) -let classify_binary bindir name = - if is_nushell_builtin name || skip_name name then Skip - else - let path = Filename.concat bindir name in - if not (is_executable path) then Skip - else if is_script path then - match nix_script_wrapper_target path with - | Some target -> - let cls = classify_elf target in - if cls <> Skip then cls else Try_help - | None -> Try_help - else - let cls = classify_elf path in - if cls <> Skip then cls - else if nix_wrapper_target path <> None then Try_help - else Skip - -(* detect available cpu cores by counting "processor" lines in /proc/cpuinfo. - * falls back to 4 if /proc/cpuinfo can't be read (e.g. on non-linux). *) -let num_cores () = - try - let ic = open_in "/proc/cpuinfo" in - let count = ref 0 in - (try while true do - if String.starts_with ~prefix:"processor" (input_line ic) then incr count - done with End_of_file -> ()); - close_in ic; max 1 !count - with _ -> 4 - -(* extract words from text that contain any of the given substrings. - * words are sequences of [a-zA-Z0-9_-] optionally prefixed with --. - * returns a deduplicated list. 
*) -let extract_matching_words text needles = - let len = String.length text in - let module SSet = Set.Make(String) in - let words = ref SSet.empty in - let i = ref 0 in - while !i < len do - while !i < len && not (text.[!i] >= 'a' && text.[!i] <= 'z' - || text.[!i] >= 'A' && text.[!i] <= 'Z' - || text.[!i] = '-') do - incr i - done; - let start = !i in - while !i < len && (text.[!i] >= 'a' && text.[!i] <= 'z' - || text.[!i] >= 'A' && text.[!i] <= 'Z' - || text.[!i] >= '0' && text.[!i] <= '9' - || text.[!i] = '-' || text.[!i] = '_') do - incr i - done; - if !i > start then begin - let word = String.sub text start (!i - start) in - let lower = String.lowercase_ascii word in - if List.exists (fun needle -> - try ignore (Str.search_forward (Str.regexp_string needle) lower 0); true - with Not_found -> false - ) needles then - words := SSet.add word !words - end - done; - SSet.elements !words - -(* try to get native nushell completions from a binary. - * runs --help, scans the output for words containing completion-related - * substrings ("complet"), then tries each match as a subcommand or flag - * with "nushell" as the argument. - * - * this catches arbitrary patterns (completions, generate-completions, - * shell-completion, gen-completions, etc.) without maintaining a hardcoded - * list. the worst case is a few failed attempts before falling back to - * manpage/--help parsing. 
*) -let try_native_completion bin_path = - let help_text = match run_cmd [bin_path; "--help"] 500 with - | Some t -> t | None -> "" in - if help_text = "" then None - else - let candidates = extract_matching_words help_text ["complet"] in - List.find_map (fun word -> - let attempts = - if String.starts_with ~prefix:"--" word then - [[bin_path; word; "nushell"]] - else - [[bin_path; word; "nushell"]; - [bin_path; "--" ^ word; "nushell"]] - in - List.find_map (fun args -> - match run_cmd args 500 with - | Some text when is_nushell_source text -> Some text - | _ -> None - ) attempts - ) candidates - -(* parse a manpage file, extracting the command name, its flags/subcommands, - * and any clap-style per-subcommand sections. - * returns none for nushell builtins or failed parses. *) -let parse_manpage_for_command file = - let contents = read_manpage_file file in - let fallback = cmd_name_of_manpage file in - (* the filename encodes the command boundary: "git-stash" = 2 words. - * use this to clamp the synopsis-extracted name, which can be too greedy - * when the synopsis lists subcommand variants. 
*) - let max_words = List.length (String.split_on_char '-' fallback) in - let clamp_cmd name = - let words = String.split_on_char ' ' name in - if List.length words > max_words then - String.concat " " (List.filteri (fun i _ -> i < max_words) words) - else name in - let cmd = match extract_synopsis_command contents with - | Some name -> clamp_cmd name | None -> fallback in - if is_nushell_builtin cmd then None - else - let result = parse_manpage_string contents in - let sub_sections = extract_subcommand_sections contents in - let result = if sub_sections <> [] then - { result with subcommands = List.map (fun (name, desc, _) -> - { name; desc }) sub_sections } - else result in - let subs = List.map (fun (name, _desc, r) -> - (cmd ^ " " ^ name, r)) sub_sections in - Some (cmd, result, subs) - -(* "inshellah manpage FILE" — parse one manpage and print the nushell extern *) -let cmd_manpage file = - match parse_manpage_for_command file with - | Some (cmd, result, _) when result.entries <> [] -> - print_string (generate_extern cmd result) - | _ -> () - -(* "inshellah manpage-dir DIR" — batch-process all manpages under a directory *) -let cmd_manpage_dir dir = - List.iter (fun section -> - let subdir = Filename.concat dir (Printf.sprintf "man%d" section) in - if is_dir subdir then - Array.iter (fun file -> - (try cmd_manpage (Filename.concat subdir file) with _ -> ()) - ) (Sys.readdir subdir) - ) command_sections - -(* detect rendered manpage output — when --help delegates to man(1), the - * output starts with a header line like "GIT-STASH(1) ... GIT-STASH(1)". - * we check if the first non-blank line matches that pattern. 
*) -let is_rendered_manpage text = - let lines = String.split_on_char '\n' text in - let first_line = List.find_opt (fun l -> String.trim l <> "") lines in - match first_line with - | None -> false - | Some line -> - let trimmed = String.trim line in - (* look for WORD(DIGIT) at the start of the line *) - try - let paren = String.index trimmed '(' in - paren > 0 - && paren + 2 < String.length trimmed - && trimmed.[paren + 1] >= '0' && trimmed.[paren + 1] <= '9' - && trimmed.[paren + 2] = ')' - with Not_found -> false - -(* find the raw manpage file for a hyphenated command name like "git-stash". - * first checks the provided man directories directly, then falls back to - * man -w for on-the-fly resolution when no man dirs are known. *) -let find_manpage_path mandirs hyphenated_name = - let try_dirs () = - List.find_map (fun mandir -> - List.find_map (fun section -> - let subdir = Filename.concat mandir (Printf.sprintf "man%d" section) in - List.find_map (fun ext -> - let path = Filename.concat subdir - (Printf.sprintf "%s.%d%s" hyphenated_name section ext) in - if Sys.file_exists path then Some path else None - ) [""; ".gz"] - ) command_sections - ) mandirs in - match try_dirs () with - | Some _ as found -> found - | None -> - (* fallback to man -w when no man dirs provided or file not found *) - match run_cmd ["man"; "-w"; hyphenated_name] 200 with - | Some raw -> - let path = String.trim raw in - if Sys.file_exists path then Some path else None - | None -> None - -(* when --help output is a rendered manpage, find and parse the raw manpage - * source instead. returns the main result plus any sub-section results - * (e.g. "git stash push" flags parsed from the git-stash manpage). 
*) -let try_manpage_fallback mandirs cmd_name = - match find_manpage_path mandirs cmd_name with - | None -> None - | Some path -> - match parse_manpage_for_command path with - | None -> None - | Some (_, result, subs) when result.entries = [] && subs = [] -> None - | Some (_, result, subs) -> Some (result, subs) - -(* safety limit: don't accumulate more than 500 subcommand resolution results - * per binary. prevents runaway recursion on tools with enormous subcommand trees. *) -let max_resolve_results = 500 - -(* safe wrapper around parse_manpage_for_command that catches all exceptions *) -let process_manpage file = - try - match parse_manpage_for_command file with - | Some (cmd, result, subs) when result.entries <> [] || subs <> [] -> - Some (cmd, result, subs) - | _ -> None - with _ -> None - -(* collect the set of command names that have manpages in a given man directory. - * used during indexing to skip --help for commands that will be handled by - * the manpage parsing phase instead (manpages are more reliable than --help). *) -let manpaged_commands mandir = - List.fold_left (fun acc section -> - let subdir = Filename.concat mandir (Printf.sprintf "man%d" section) in - if is_dir subdir then - Array.fold_left (fun acc f -> SSet.add (cmd_name_of_manpage f) acc) - acc (Sys.readdir subdir) - else acc - ) SSet.empty command_sections - -(* parallel structured help resolver — recursively resolves a command and - * all its subcommands by running --help on each, forking a child process - * per subcommand for parallelism. - * - * the resolver works as a breadth-first queue: - * 1. start with the root command in the queue - * 2. fork a child for each queued item (up to num_cores concurrent) - * 3. the child runs --help, parses the output, marshals the result via pipe - * 4. the parent collects results and enqueues discovered subcommands - * 5. 
repeat until queue is empty and all children have finished - * - * depth is limited to 5 levels and total results to max_resolve_results - * to prevent runaway recursion on pathological command trees. - * - * the child process detects "self-listing" — when a subcommand's --help - * lists itself as a subcommand (e.g. "git help" listing "help" as a - * subcommand of itself). this would cause infinite recursion, so such - * results are discarded. - * - * children close all pipe fds from other pending children immediately - * after fork to prevent fd leaks. the parent drains pipes regularly to - * prevent children from blocking on full pipe buffers. *) -let help_resolve_par ?(timeout=200) ?(mandirs=[]) cmd rest name = - let max_jobs = num_cores () in - let queue = Queue.create () in - Queue.push (rest, name, 0) queue; - let results = ref [] in - (* pending: (pid, rd, buf, cmd_args, cmd_name, depth) *) - let pending = ref [] in - let collect rd buf cmd_args cmd_name depth = - drain_fd rd buf; - (try Unix.close rd with _ -> ()); - let data = Buffer.contents buf in - let result : (help_result * subcommand list * (string * help_result) list) option = - if String.length data > 0 then - try Marshal.from_string data 0 with _ -> None - else None in - match result with - | None -> () - | Some (r, subs, extras) -> - let at_limit = depth >= 5 || List.length !results >= max_resolve_results in - results := (cmd_name, r) :: !results; - (* extras are fully-parsed sub-results from manpage sub-sections — - * add them directly without enqueueing for further resolution *) - List.iter (fun (sub_name, sub_r) -> - if not (List.exists (fun (existing, _) -> existing = sub_name) !results) then - results := (sub_name, sub_r) :: !results - ) extras; - if not at_limit then - (* only enqueue subcommands that weren't already covered by extras *) - let extra_names = List.map fst extras in - List.iter (fun (sc : subcommand) -> - let full = cmd_name ^ " " ^ sc.name in - if not (List.exists (fun 
existing -> existing = full) extra_names) then - Queue.push (cmd_args @ [sc.name], full, depth + 1) queue - ) subs in - let reap () = - pending := List.filter (fun (pid, rd, buf, cmd_args, cmd_name, depth) -> - drain_fd rd buf; - match Unix.waitpid [Unix.WNOHANG] pid with - | (0, _) -> true - | _ -> collect rd buf cmd_args cmd_name depth; false - | exception Unix.Unix_error (Unix.ECHILD, _, _) -> - (try Unix.close rd with _ -> ()); false - ) !pending in - let wait_for_slot () = - while List.length !pending >= max_jobs do - reap (); - if List.length !pending >= max_jobs then begin - let fds = List.map (fun (_, rd, _, _, _, _) -> rd) !pending in - ignore (Unix.select fds [] [] 0.05) - end - done in - while not (Queue.is_empty queue) || !pending <> [] do - while not (Queue.is_empty queue) do - let (cmd_args, cmd_name, depth) = Queue.pop queue in - wait_for_slot (); - let (rd, wr) = Unix.pipe () in - let pid = Unix.fork () in - if pid = 0 then begin - Unix.close rd; - List.iter (fun (_, prd, _, _, _, _) -> - try Unix.close prd with _ -> ()) !pending; - let result = - let text = match run_cmd (cmd :: cmd_args @ ["--help"]) timeout with - | Some _ as r -> r - | None -> run_cmd (cmd :: cmd_args @ ["-h"]) timeout in - match text with - | None -> None - | Some text -> - (* check for rendered manpage first — when --help delegates to - * man(1), the raw groff source has richer structure than the - * rendered text. parse_help would partially succeed on rendered - * manpage output (extracting flags from OPTIONS) but miss - * subcommands from the COMMANDS section. 
*) - if is_rendered_manpage text then - let base = Filename.basename cmd in - let hyphenated = String.concat "-" (base :: cmd_args) in - match try_manpage_fallback mandirs hyphenated with - | Some (r, subs) -> - let at_limit = depth >= 5 in - let extra = List.map (fun (sub_name, sub_r) -> - (cmd_name ^ " " ^ sub_name, sub_r)) subs in - let enqueue_subs = if at_limit then [] else r.subcommands in - Some (r, enqueue_subs, extra) - | None -> - (* manpage file not found — fall back to parsing rendered text *) - (match parse_help text with - | Error _ -> None - | Ok r when r.entries = [] && r.subcommands = [] && r.positionals = [] -> None - | Ok r -> - let self_listed = match cmd_args with - | [] -> false - | _ -> - let leaf = List.nth cmd_args (List.length cmd_args - 1) in - List.exists (fun (sc : subcommand) -> sc.name = leaf) r.subcommands in - if self_listed then - Some ({ entries = []; subcommands = []; positionals = []; - description = "" }, [], []) - else - let at_limit = depth >= 5 in - let subs = if at_limit then [] else r.subcommands in - Some (r, subs, [])) - else - match parse_help text with - | Error _ -> None - | Ok r when r.entries = [] && r.subcommands = [] && r.positionals = [] -> None - | Ok r -> - let self_listed = match cmd_args with - | [] -> false - | _ -> - let leaf = List.nth cmd_args (List.length cmd_args - 1) in - List.exists (fun (sc : subcommand) -> sc.name = leaf) r.subcommands in - if self_listed then - (* the subcommand's --help returned the parent's help text - * (it lists itself as a subcommand). cache a leaf stub so the - * completer knows this is a leaf node, not a parent with - * further subcommands. 
*) - Some ({ entries = []; subcommands = []; positionals = []; - description = "" }, [], []) - else - let at_limit = depth >= 5 in - let subs = if at_limit then [] else r.subcommands in - Some (r, subs, []) in - let oc = Unix.out_channel_of_descr wr in - Marshal.to_channel oc (result : (help_result * subcommand list * (string * help_result) list) option) []; - close_out oc; - exit 0 - end else begin - Unix.close wr; - pending := (pid, rd, Buffer.create 4096, cmd_args, cmd_name, depth) :: !pending - end - done; - if !pending <> [] then begin - reap (); - if !pending <> [] && Queue.is_empty queue then begin - let fds = List.map (fun (_, rd, _, _, _, _) -> rd) !pending in - ignore (Unix.select fds [] [] 0.05) - end - end - done; - List.rev !results - -(* "inshellah index" — the main indexing command. - * processes all binaries and manpages in the given prefix directories, - * writing completion data to the cache dir. - * - * the pipeline has two phases: - * - * phase 1 (binaries): fork one child per binary. each child: - * - tries native nushell completions (if classified as Try_native_and_help) - * - falls back to help_resolve_par (which itself forks per subcommand) - * - marshals the result back via pipe as a tagged variant: - * `Native of string — raw nushell source - * `Parsed of (string * help_result) list — parsed flag data - * `None — nothing useful extracted - * - * phase 2 (manpages): sequentially parse manpages for commands not yet - * covered by phase 1. manpages are more reliable than --help for many - * gnu tools, but slower to process. - * - * commands on the ignorelist are skipped entirely. commands on the - * help_only list skip manpage parsing and only use --help. commands - * with manpages skip --help in phase 1 (they'll be handled in phase 2). - * - * the done_cmds set tracks which commands have already been indexed to - * prevent duplicates across phases and across multiple prefix directories. 
*) - -(* known privilege-escalation wrappers — defined here (before cmd_index and - * cmd_complete) because both need the list: cmd_index writes @complete - * external stubs, and cmd_complete strips the wrapper to find the real command. *) -let elevation_commands = - ["sudo"; "run0"; "doas"; "pkexec"; "su"; "calife"; "sux"; "sudoedit"; - "please"; "super"; "priv"] - -let cmd_index bindirs mandirs ignorelist help_only dir = - ensure_dir dir; - let done_cmds = ref SSet.empty in - let result_count = ref 0 in - let index_bindir bindir mandir = - if not (is_dir bindir) then - Printf.eprintf "skipping %s (not found)\n" bindir - else begin - let bins = Sys.readdir bindir in - Array.sort String.compare bins; - let manpaged = if is_dir mandir - then manpaged_commands mandir else SSet.empty in - let max_jobs = num_cores () in - let classified = Array.map (fun name -> - if SSet.mem name ignorelist then (name, Skip) - else if SSet.mem name help_only then (name, classify_binary bindir name) - else if SSet.mem name manpaged then (name, Skip) - else (name, classify_binary bindir name) - ) bins in - let pending = ref [] in - let process_result name rd buf = - drain_fd rd buf; - (try Unix.close rd with _ -> ()); - let data = Buffer.contents buf in - if String.length data > 0 then begin - let result : [`Native of string | `Parsed of (string * help_result) list | `None] = - try Marshal.from_string data 0 with _ -> `None in - (match result with - | `Native src -> - write_native ~dir name src; - incr result_count - | `Parsed pairs -> - List.iter (fun (cmd_name, r) -> - if not (SSet.mem cmd_name !done_cmds) then begin - write_result ~dir ~source:"help" cmd_name r; - done_cmds := SSet.add cmd_name !done_cmds; - incr result_count - end - ) pairs - | `None -> ()) - end; - done_cmds := SSet.add name !done_cmds in - let reap () = - pending := List.filter (fun (pid, rd, buf, name) -> - drain_fd rd buf; - match Unix.waitpid [Unix.WNOHANG] pid with - | (0, _) -> true - | _ -> - process_result 
name rd buf; - false - | exception Unix.Unix_error (Unix.ECHILD, _, _) -> - (try Unix.close rd with _ -> ()); false - ) !pending in - let wait_for_slot () = - while List.length !pending >= max_jobs do - reap (); - if List.length !pending >= max_jobs then begin - let fds = List.map (fun (_, rd, _, _) -> rd) !pending in - ignore (Unix.select fds [] [] 0.05) - end - done in - Array.iter (fun (name, classification) -> - match classification with - | Skip -> () - | Try_help | Try_native_and_help -> - wait_for_slot (); - let (rd, wr) = Unix.pipe () in - let pid = Unix.fork () in - if pid = 0 then begin - Unix.close rd; - List.iter (fun (_, prd, _, _) -> - try Unix.close prd with _ -> ()) !pending; - let result = - try - let path = Filename.concat bindir name in - let native = match classification with - | Try_native_and_help -> - (match try_native_completion path with - | Some src -> Some src | None -> None) - | _ -> None in - match native with - | Some src -> `Native src - | None -> - let pairs = help_resolve_par ~timeout:200 ~mandirs path [] name in - if pairs <> [] then `Parsed pairs else `None - with _ -> `None in - let oc = Unix.out_channel_of_descr wr in - Marshal.to_channel oc - (result : [`Native of string | `Parsed of (string * help_result) list | `None]) []; - close_out oc; - exit 0 - end else begin - Unix.close wr; - pending := (pid, rd, Buffer.create 4096, name) :: !pending - end - ) classified; - while !pending <> [] do - reap (); - if !pending <> [] then begin - let fds = List.map (fun (_, rd, _, _) -> rd) !pending in - ignore (Unix.select fds [] [] 0.05) - end - done; - (* phase 2: manpages *) - if is_dir mandir then - List.iter (fun section -> - let subdir = Filename.concat mandir (Printf.sprintf "man%d" section) in - if is_dir subdir then begin - let files = Sys.readdir subdir in - (* sort by filename length first, then alphabetically. - * this ensures parent manpages (e.g. 
nix-env.1.gz) are - * processed before subpage manpages (nix-env-install.1.gz) - * so the parent's data isn't overwritten by a subpage - * whose synopsis also extracts the parent command name. *) - Array.sort (fun a b -> - let la = String.length a and lb = String.length b in - if la <> lb then compare la lb - else String.compare a b) files; - Array.iter (fun file -> - let base_cmd = cmd_name_of_manpage file in - if SSet.mem base_cmd help_only then () - else match process_manpage (Filename.concat subdir file) with - | None -> () - | Some (cmd, result, subs) -> - if not (SSet.mem cmd !done_cmds) then begin - write_result ~dir ~source:"manpage" cmd result; - done_cmds := SSet.add cmd !done_cmds; - incr result_count - end else if cmd <> base_cmd then - (* a subpage manpage (e.g. nix-env-install.1) extracted - * a command name that was already indexed (e.g. "nix-env"). - * warn so the user can investigate. *) - Printf.eprintf "warning: %s extracted cmd \"%s\" (already indexed), skipping\n" - file cmd; - List.iter (fun (sub_cmd, sub_result) -> - if not (SSet.mem sub_cmd !done_cmds) then begin - write_result ~dir ~source:"manpage" sub_cmd sub_result; - done_cmds := SSet.add sub_cmd !done_cmds; - incr result_count - end - ) subs; - (* for COMMANDS section subcommands (e.g. systemctl start/stop), - * write leaf stubs so the completer treats them as leaf nodes - * rather than falling back to the parent's flags/subcommands. - * only when there are no clap-style sub-sections (subs = []), - * meaning the subcommands came from the COMMANDS section. - * deliberately not added to done_cmds — if a per-subcommand - * manpage exists (e.g. docker-start.1), it will overwrite the stub. 
*) - if subs = [] then - List.iter (fun (sc : subcommand) -> - let sub_cmd = cmd ^ " " ^ sc.name in - if not (SSet.mem sub_cmd !done_cmds) then - write_result ~dir ~source:"manpage" sub_cmd - { entries = []; subcommands = []; positionals = []; - description = sc.desc } - ) result.subcommands - ) files - end - ) command_sections - end in - List.iter2 index_bindir bindirs mandirs; - (* write @complete external stubs for elevation commands (sudo, doas, etc.) - * so nushell routes their completions through the external completer. - * without this, nushell hardcodes sudo/doas to show command-name completion - * and never calls the external completer for their own flags. *) - List.iter (fun cmd -> - let json_path = Filename.concat dir (filename_of_command cmd ^ ".json") in - if Sys.file_exists json_path then - write_native ~dir cmd - (Printf.sprintf "@complete external\nextern \"%s\" []\n" cmd) - ) elevation_commands; - Printf.printf "indexed %d commands into %s\n" !result_count dir - -(* "inshellah dump" — list all indexed commands with their source type *) -let cmd_dump dirs = - let cmds = all_commands dirs in - Printf.printf "%d commands\n" (List.length cmds); - List.iter (fun cmd -> - let src = match file_type_of dirs cmd with - | Some label -> label | None -> "?" in - Printf.printf " %-40s [%s]\n" cmd src - ) cmds - -(* search $PATH for an executable with the given name. - * used during completion to find binaries for on-the-fly resolution. *) -let find_in_path name = - try - Sys.getenv "PATH" - |> String.split_on_char ':' - |> List.find_map (fun dir -> - let p = Filename.concat dir name in - if is_executable p then Some p else None) - with Not_found -> None - -(* resolve a command's completions on-the-fly and cache the results. - * called during "complete" when a command isn't in the index. - * runs help_resolve_par and writes results to the user's cache dir. 
*) -let resolve_and_cache ~dir ~mandirs name path = - let pairs = help_resolve_par ~timeout:200 ~mandirs path [] name in - if pairs <> [] then begin - ensure_dir dir; - List.iter (fun (cmd_name, r) -> write_result ~dir cmd_name r) pairs; - Some pairs - end else None - -(* format a single completion candidate as JSON for nushell's completer protocol *) -let completion_json value desc = - Printf.sprintf "{\"value\":\"%s\",\"description\":\"%s\"}" - (escape_json value) (escape_json desc) - -(* fuzzy matching: returns a score > 0 if needle is a subsequence of haystack. - * higher scores = better match. scoring tiers: - * - exact match: 1000 - * - prefix match: 900 + length bonus (how much of the haystack is covered) - * - subsequence: base 10 per char + bonuses for: - * - word boundary alignment (50): matching at '-', '_', or camelCase transitions - * - consecutive matches (20): matching adjacent characters - * - * this drives the completion candidate ranking. users typing "ser" should see - * "--server" ranked above "--preserve" even though both contain "ser" as a - * subsequence. the word-boundary bonus achieves this. 
*) -let fuzzy_score needle haystack = - let needle_len = String.length needle and haystack_len = String.length haystack in - if needle_len = 0 then 1 - else if needle_len > haystack_len then 0 - else if needle = haystack then 1000 - else - let needle_lc = String.lowercase_ascii needle - and haystack_lc = String.lowercase_ascii haystack in - if String.starts_with ~prefix:needle_lc haystack_lc then - 900 + (needle_len * 100 / haystack_len) - else - let is_boundary hay_idx = - hay_idx = 0 || haystack.[hay_idx - 1] = '-' || haystack.[hay_idx - 1] = '_' - || (haystack.[hay_idx - 1] >= 'a' && haystack.[hay_idx - 1] <= 'z' - && haystack.[hay_idx] >= 'A' && haystack.[hay_idx] <= 'Z') in - (* walk haystack matching needle chars as a subsequence *) - let needle_idx, score, _, _ = - String.fold_left (fun (needle_idx, score, hay_idx, prev_match) c -> - if needle_idx >= needle_len then (needle_idx, score, hay_idx + 1, prev_match) - else if c = needle_lc.[needle_idx] then - let bonus = (if is_boundary hay_idx then 50 else 10) - + (if prev_match = hay_idx - 1 then 20 else 0) in - (needle_idx + 1, score + bonus, hay_idx + 1, hay_idx) - else (needle_idx, score, hay_idx + 1, prev_match) - ) (0, 0, 0, -1) haystack_lc in - if needle_idx = needle_len then score else 0 - -(* scan past the elevation command's flags and arguments to find the real - * command. is_command checks whether a token names a known command. - * returns Some (real_cmd :: args) or None if no command was found. *) -let find_real_command is_command args = - let rec scan = function - | [] -> None - | "--" :: rest -> Some rest - | arg :: rest when String.length arg > 0 && arg.[0] = '-' -> - scan rest - | arg :: _ as cmd_and_rest when is_command arg -> - Some cmd_and_rest - | _ :: rest -> scan rest - in - scan args - -(* "inshellah complete CMD [ARGS...]" — the nushell custom completer. - * this is the hot path — called every time the user presses tab in nushell. - * - * the completion logic: - * 1. 
try to find the command (or longest subcommand prefix) in the store - * 2. if not found, try on-the-fly resolution (find in $PATH, run --help, cache) - * 3. score all candidate completions against the partial input using fuzzy_score - * 4. output scored candidates as a JSON array - * - * subcommand resolution: the lookup tries longest prefix first. - * for "git add --", it first looks for "git add", then "git". - * this ensures subcommand-specific flags are shown. - * - * nushell sends a trailing empty token when the cursor is after a space - * ("git add "). in this case all_tokens includes the empty string. - * when the last token is non-empty, the user is still typing it, so we use - * it as the fuzzy filter. when empty, we show all candidates. - * - * if only a parent command matched (e.g. "git" matched but not "git add"), - * we suppress subcommand suggestions and only show flags. this prevents - * showing sibling subcommands when the user has already committed to a - * specific subcommand path. - * - * file completions: nushell's external completer protocol is either/or — - * you either return custom candidates or fall back to native file completions - * (via null), but can't mix both. we return null (triggering nushell's native - * file completer with colors, sorting, quoting) when: - * - the user is at a leaf command (no subcommands) and not mid-flag - * - or we have no candidates at all - * this ensures file completions appear with full nushell UX. when the user - * IS typing a flag (partial starts with "-"), we return our flag candidates. *) -let cmd_complete spans user_dir system_dirs mandirs = - (* system dirs are searched first — they're built at index time from - * manpages and are authoritative. user dir is an on-the-fly cache - * that should only be used as fallback for commands not in any system dir. 
*) - let dirs = system_dirs @ [user_dir] in - (* if the command line starts with a privilege-escalation wrapper, scan past - * it to find the real command. we identify the command by checking the store - * and $PATH — this avoids needing per-command option tables which are fragile - * across different implementations. if no real command is found, fall back to - * completing the elevation command itself. *) - let spans = match spans with - | cmd :: rest when List.mem cmd elevation_commands -> - let is_command name = - name <> "" && (lookup dirs name <> None || find_in_path name <> None) - in - (match find_real_command is_command rest with - | Some (_ :: _ as real_spans) -> real_spans - | _ -> spans) - | _ -> spans in - match spans with - | [] -> print_string "null\n" - | cmd_name :: rest -> - (* try longest prefix match: "git add" before "git" *) - let find_result tokens = - let num_tokens = List.length tokens in - List.init num_tokens Fun.id |> List.find_map (fun drop -> - let prefix = List.filteri (fun i _ -> i < num_tokens - drop) tokens in - match prefix with - | [] -> None - | _ -> - let try_name = String.concat " " prefix in - match lookup dirs try_name with - | Some r -> Some (try_name, r, List.length prefix) - | None -> None) in - (* strip flag tokens (--user, -a, etc.) from intermediate positions. - * flags are not part of the subcommand path and should not affect - * lookup. e.g. "systemctl --user start" should look up "systemctl start". - * the last token (partial) is NOT stripped — it may be a flag the - * user is typing (e.g. "--u") which needs fuzzy matching. 
*) - let strip_intermediate_flags tokens = - match List.rev tokens with - | last :: rev_rest -> - List.filter (fun t -> - String.length t = 0 || t.[0] <> '-') (List.rev rev_rest) - @ [last] - | [] -> [] in - let all_tokens = strip_intermediate_flags (cmd_name :: rest) in - let last_token = match rest with - | [] -> "" | _ -> List.nth rest (List.length rest - 1) in - (* only treat the last token as a completed subcommand when nushell - * sends a trailing empty token (cursor is after a space). - * otherwise the user is still typing and we treat it as partial. *) - let lookup_tokens = if last_token = "" then all_tokens - else match all_tokens with - | _ :: _ -> List.rev (List.tl (List.rev all_tokens)) - | _ -> [cmd_name] in - let resolve tokens partial = - match find_result tokens with - | Some _ as found -> (found, partial) - | None -> (None, partial) in - let found, partial = resolve lookup_tokens last_token in - (* try on-the-fly resolution when no match or only a parent matched *) - let lookup_depth = List.length lookup_tokens in - let result, partial = match found with - | Some (_, _, depth) when depth >= lookup_depth - 1 -> - (* exact or near-exact match — use it *) - (found, partial) - | _ -> - (* no match, or only a parent matched — try on-the-fly resolution *) - (match find_in_path cmd_name with - | Some path -> - (* derive sibling share/man from the binary's location. - * e.g. /nix/store/.../bin/foo → /nix/store/.../share/man - * this lets on-the-fly resolution find manpages for commands - * not in the indexed prefixes. also resolves through nix - * wrappers to find the real binary's manpage location. *) - let mandir_of_bin p = - let bindir = Filename.dirname p in - let prefix = Filename.dirname bindir in - Filename.concat (Filename.concat prefix "share") "man" in - let bin_mandirs = - let direct = mandir_of_bin path in - (* also check the canonical path after resolving symlinks. - * e.g. 
/run/current-system/sw/bin/foo is a symlink to - * /nix/store/xxx/bin/foo — check /nix/store/xxx/share/man *) - let via_realpath = - try let real = Unix.realpath path in - if real <> path then [mandir_of_bin real] else [] - with Unix.Unix_error _ -> [] in - let via_wrapper = - match nix_script_wrapper_target path with - | Some target -> [mandir_of_bin target] - | None -> - match nix_wrapper_target path with - | Some target -> [mandir_of_bin target] - | None -> [] in - List.filter is_dir (direct :: via_realpath @ via_wrapper) in - let all_mandirs = bin_mandirs @ mandirs in - (match resolve_and_cache ~dir:user_dir ~mandirs:all_mandirs cmd_name path with - | Some _pairs -> resolve lookup_tokens last_token - | None -> (found, partial)) - | None -> (found, partial)) in - let candidates = match result with - | None -> [] - | Some (_matched_name, r, depth) -> - (* when the match is shallower than requested, the user already - * typed a subcommand beyond the matched level — don't show - * sibling subcommands, only flags *) - let sub_candidates = if depth < lookup_depth - 1 then [] else - let subs = match r.subcommands with - | _ :: _ -> r.subcommands - | [] -> subcommands_of dirs _matched_name in - List.filter_map (fun (subcommand : subcommand) -> - let score = fuzzy_score partial subcommand.name in - if score > 0 then Some (score, completion_json subcommand.name subcommand.desc) else None - ) subs in - (* build flag completion candidates from the entry list. 
- * for flags with both short and long forms (Both), we pick which form - * to display based on what the user is currently typing: - * - if the partial input matches the short flag better, show the short - * flag as the value and note the long form in the description - * - otherwise (including empty partial), prefer the long flag and note - * the short form in the description - * - * parameter names are appended to descriptions in angle brackets for - * mandatory params and square brackets for optional ones, matching the - * conventions users expect from cli help text. *) - let flag_candidates = List.filter_map (fun (entry : entry) -> - let base_desc = match entry.param with - | Some (Mandatory p) -> if entry.desc <> "" then entry.desc ^ " <" ^ p ^ ">" else "<" ^ p ^ ">" - | Some (Optional p) -> if entry.desc <> "" then entry.desc ^ " [" ^ p ^ "]" else "[" ^ p ^ "]" - | None -> entry.desc in - let flag, desc = match entry.switch with - | Long l -> ("--" ^ l, base_desc) - | Short c -> (Printf.sprintf "-%c" c, base_desc) - | Both (c, l) -> - (* score the partial against both forms to decide which to present. - * e.g. typing "-s" scores higher against "-s" than "--squeeze-blank", - * so we show "-s (aka --squeeze-blank)". when the partial is empty or - * matches the long form better, we default to the long form. 
*) - let long_flag = "--" ^ l in - let short_flag = Printf.sprintf "-%c" c in - let long_score = fuzzy_score partial long_flag in - let short_score = fuzzy_score partial short_flag in - if short_score > long_score then - (short_flag, Printf.sprintf "(aka %s) %s" long_flag base_desc) - else - (long_flag, Printf.sprintf "(aka %s) %s" short_flag base_desc) in - let score = fuzzy_score partial flag in - if score > 0 then Some (score, completion_json flag desc) else None - ) r.entries in - let scored = sub_candidates @ flag_candidates in - List.sort (fun (a, _) (b, _) -> compare b a) scored - |> List.map snd in - (* determine whether to return our candidates or fall back to nushell's - * native file completer (via null). nushell's protocol is either/or: - * returning candidates suppresses file completions, returning null - * enables them with full nushell UX (colors, sorting, quoting). - * - * we return null when: - * - we have no candidates at all (unknown command, no match) - * - the user is at a leaf command and not typing a flag — this is - * the position where file arguments are expected, so hand off to - * nushell's native file completer for the best experience *) - let typing_flag = String.length partial > 0 && partial.[0] = '-' in - let has_subcommands = match result with - | Some (matched_name, r, _) -> - r.subcommands <> [] || subcommands_of dirs matched_name <> [] - | None -> false in - let want_files = (not typing_flag) && (not has_subcommands) in - if want_files then print_string "null\n" - else if candidates = [] then print_string "null\n" - else Printf.printf "[%s]\n" (String.concat "," candidates) - -(* "inshellah query CMD" — print the raw stored data for a command *) -let cmd_query cmd dirs = - match lookup_raw dirs cmd with - | None -> - Printf.eprintf "not found: %s\n" cmd; exit 1 - | Some data -> - print_string data; print_newline () - -(* load a newline-separated list of command names to ignore. 
- * blank lines and lines starting with '#' are skipped. *) -let load_ignorelist path = - try - In_channel.with_open_text path In_channel.input_all - |> String.split_on_char '\n' - |> List.filter_map (fun line -> - let line = String.trim line in - if String.length line > 0 && line.[0] <> '#' then Some line else None) - |> SSet.of_list - with _ -> SSet.empty - -(* parse "index" subcommand arguments: prefix dirs + optional --dir, --ignore, --help-only. - * uses a fold over the argument list, accumulating prefixes and option values. *) -let parse_index_args args = - let (prefixes, dir, ignore, help_only, _) = - List.fold_left (fun (prefixes, dir, ignore, help_only, pending) arg -> - match pending with - | Some "--dir" -> (prefixes, arg, ignore, help_only, None) - | Some "--ignore" -> (prefixes, dir, SSet.union ignore (load_ignorelist arg), help_only, None) - | Some "--help-only" -> (prefixes, dir, ignore, SSet.union help_only (load_ignorelist arg), None) - | Some _ -> (prefixes, dir, ignore, help_only, None) - | None -> - match arg with - | "--dir" | "--ignore" | "--help-only" -> (prefixes, dir, ignore, help_only, Some arg) - | _ -> (arg :: prefixes, dir, ignore, help_only, None) - ) ([], default_store_path (), SSet.empty, SSet.empty, None) args in - (List.rev prefixes, dir, ignore, help_only) - -(* derive the sibling man directory from a store directory path. - * e.g. "/run/current-system/sw/share/inshellah" -> "/run/current-system/sw/share/man" *) -let man_dir_of_system_dir path = - Filename.concat (Filename.dirname path) "man" - -(* parse common --dir arguments for complete/query/dump commands. - * --dir takes a colon-separated list of paths. the first path is the writable - * user cache dir; additional paths are read-only system directories. - * man directories are derived from system dir paths as siblings - * (share/inshellah -> share/man). uses a fold over the argument list. 
*) -let parse_dir_args args = - let (dir_value, rest_args, _) = - List.fold_left (fun (dir_value, rest_args, pending) arg -> - match pending with - | Some "--dir" -> (Some arg, rest_args, None) - | Some _ -> (dir_value, rest_args, None) - | None -> - match arg with - | "--dir" -> (dir_value, rest_args, Some arg) - | _ -> (dir_value, arg :: rest_args, None) - ) (None, [], None) args in - let (user_dir, system_dirs) = match dir_value with - | None -> (default_store_path (), []) - | Some v -> - match String.split_on_char ':' v with - | [] -> (default_store_path (), []) - | first :: rest -> (first, rest) in - (user_dir, system_dirs, List.rev rest_args) - -(* "inshellah completions nushell" — emit native nushell extern for inshellah itself *) -let cmd_completions_nushell () = - let result = { - entries = []; - subcommands = []; - positionals = []; - description = "nushell completions engine"; - } in - let index_result = { - entries = [ - { switch = Long "dir"; param = Some (Mandatory "PATH"); desc = "output directory for cached completions" }; - { switch = Long "ignore"; param = Some (Mandatory "FILE"); desc = "skip listed commands entirely" }; - { switch = Long "help-only"; param = Some (Mandatory "FILE"); desc = "skip manpages for listed commands, use --help instead" }; - ]; - subcommands = []; - positionals = [ - { pos_name = "prefix"; optional = false; variadic = true }; - ]; - description = "index completions from prefix directories"; - } in - let complete_result = { - entries = [ - { switch = Long "dir"; param = Some (Mandatory "PATH"); desc = "colon-separated cache paths" }; - ]; - subcommands = []; - positionals = [ - { pos_name = "cmd"; optional = false; variadic = false }; - { pos_name = "args"; optional = true; variadic = true }; - ]; - description = "nushell custom completer, outputs JSON candidates"; - } in - let query_result = { - entries = [ - { switch = Long "dir"; param = Some (Mandatory "PATH"); desc = "colon-separated cache paths" }; - ]; - 
subcommands = []; - positionals = [ - { pos_name = "cmd"; optional = false; variadic = false }; - ]; - description = "print stored completion data for a command"; - } in - let dump_result = { - entries = [ - { switch = Long "dir"; param = Some (Mandatory "PATH"); desc = "colon-separated cache paths" }; - ]; - subcommands = []; - positionals = []; - description = "list indexed commands"; - } in - let manpage_result = { - entries = []; - subcommands = []; - positionals = [ - { pos_name = "file"; optional = false; variadic = false }; - ]; - description = "parse a manpage and emit nushell extern"; - } in - let manpage_dir_result = { - entries = []; - subcommands = []; - positionals = [ - { pos_name = "dir"; optional = false; variadic = false }; - ]; - description = "batch-process manpages under a directory"; - } in - let completions_result = { - entries = []; - subcommands = []; - positionals = []; - description = "generate nushell completions for inshellah"; - } in - print_string (generate_extern "inshellah" result); - print_string (generate_extern "inshellah index" index_result); - print_string (generate_extern "inshellah complete" complete_result); - print_string (generate_extern "inshellah query" query_result); - print_string (generate_extern "inshellah dump" dump_result); - print_string (generate_extern "inshellah manpage" manpage_result); - print_string (generate_extern "inshellah manpage-dir" manpage_dir_result); - print_string (generate_extern "inshellah completions" completions_result) - -(* --- entry point --- - * dispatch on the first argument to the appropriate subcommand handler. 
*) -let () = - match Array.to_list Sys.argv |> List.tl with - | "index" :: rest -> - let (prefixes, dir, ignorelist, help_only) = parse_index_args rest in - if prefixes = [] then (Printf.eprintf "error: index requires at least one prefix dir\n"; exit 1); - let bindirs = List.map (fun p -> Filename.concat p "bin") prefixes in - let mandirs = List.map (fun p -> Filename.concat p "share/man") prefixes in - cmd_index bindirs mandirs ignorelist help_only dir - | "complete" :: rest -> - let (user_dir, system_dirs, spans) = parse_dir_args rest in - let man_dirs = List.filter_map (fun d -> - let m = man_dir_of_system_dir d in - if is_dir m then Some m else None) system_dirs in - cmd_complete spans user_dir system_dirs man_dirs - | "query" :: rest -> - let (user_dir, system_dirs, args) = parse_dir_args rest in - (match args with - | [cmd] -> cmd_query cmd (user_dir :: system_dirs) - | _ -> Printf.eprintf "error: query CMD [--dir PATH[:PATH...]]\n"; exit 1) - | "dump" :: rest -> - let (user_dir, system_dirs, _) = parse_dir_args rest in - cmd_dump (user_dir :: system_dirs) - | ["manpage"; file] -> cmd_manpage file - | ["manpage-dir"; dir] -> cmd_manpage_dir dir - | ["completions"] -> cmd_completions_nushell () - | _ -> usage () diff --git a/doc/building.md b/doc/building.md index de685d7..0a2598d 100644 --- a/doc/building.md +++ b/doc/building.md @@ -1,141 +1,77 @@ # building and installing -## dependencies +inshellah is a rust crate. it builds with stock cargo on any platform +rust supports. -inshellah is written in OCaml and uses dune as its build system. 
- -build dependencies: -- **OCaml** >= 5.0 -- **dune** >= 3.20 -- **angstrom** — parser combinator library -- **angstrom-unix** — unix extensions for angstrom -- **camlzip** — gzip decompression for reading compressed manpages -- **str** — regular expressions (ships with OCaml) -- **unix** — process/file operations (ships with OCaml) - -runtime dependencies: -- **man** (optional) — used as a fallback to locate manpages during - on-the-fly completion resolution. not needed if system directories - are provided via `--dir` (manpages are found via sibling `share/man`). - -## building with nix (recommended) - -if you have nix installed: +## with nix ```sh nix build ``` -the binary is at `./result/bin/inshellah`. +binary is at `./result/bin/inshellah`. -for development with a shell containing all dependencies: +development shell: ```sh nix develop -dune build -dune test +cargo build --release +cargo test ``` -## building from source with opam +## with cargo -install dependencies via opam: +requires rust >= 1.85 (edition 2024). ```sh -opam install dune angstrom angstrom-unix camlzip -``` - -build and test: - -```sh -dune build -dune test -``` - -install into the opam switch: - -```sh -dune install -``` - -## building from source without opam - -if your distribution packages the OCaml libraries directly, install -them through your package manager, then build with dune: - -```sh -dune build -``` - -the binary is at `_build/default/bin/main.exe`. 
copy it to your -`$PATH`: - -```sh -install -Dm755 _build/default/bin/main.exe /usr/local/bin/inshellah +cargo build --release +cargo test +sudo install -Dm755 target/release/inshellah /usr/local/bin/inshellah ``` ## arch linux -install OCaml and dune from the official repos, and the remaining -libraries from the AUR or via opam: - ```sh -# system packages -sudo pacman -S ocaml dune - -# ocaml libraries (via opam) -opam init # if not already initialized -eval $(opam env) -opam install angstrom angstrom-unix camlzip - -# build -dune build -dune test - -# install -sudo install -Dm755 _build/default/bin/main.exe /usr/local/bin/inshellah +sudo pacman -S rust +cargo build --release +sudo install -Dm755 target/release/inshellah /usr/local/bin/inshellah ``` ## debian / ubuntu ```sh -sudo apt install ocaml opam -opam init -eval $(opam env) -opam install dune angstrom angstrom-unix camlzip - -dune build -sudo install -Dm755 _build/default/bin/main.exe /usr/local/bin/inshellah +sudo apt install cargo rustc +# or: rustup install stable +cargo build --release +sudo install -Dm755 target/release/inshellah /usr/local/bin/inshellah ``` ## fedora ```sh -sudo dnf install ocaml opam -opam init -eval $(opam env) -opam install dune angstrom angstrom-unix camlzip - -dune build -sudo install -Dm755 _build/default/bin/main.exe /usr/local/bin/inshellah +sudo dnf install cargo rust +cargo build --release +sudo install -Dm755 target/release/inshellah /usr/local/bin/inshellah ``` ## post-install setup -after installing the binary, index completions from your system -prefix(es): +index completions from your system prefix(es): ```sh # typical linux system inshellah index /usr /usr/local +# more workers / different timeout +inshellah index /usr /usr/local --workers 16 --timeout-ms 500 + # check what was indexed inshellah dump ``` -then wire up the nushell completer: +wire up the nushell completer in `~/.config/nushell/config.nu`: ```nu -# ~/.config/nushell/config.nu 
$env.config.completions.external = { enable: true completer: {|spans| @@ -145,19 +81,28 @@ $env.config.completions.external = { } ``` -see [nushell-integration.md](nushell-integration.md) for full details -on the completer, and [runtime-completions.md](runtime-completions.md) -for on-the-fly resolution of commands not covered by the index. +see [nushell-integration.md](nushell-integration.md) for full +completer details and [runtime-completions.md](runtime-completions.md) +for on-the-fly resolution of commands not covered by the upfront +index. ## re-indexing after package changes -the index is a static cache — it doesn't update automatically when you -install or remove packages. re-run `inshellah index` after significant -package changes: - ```sh inshellah index /usr /usr/local ``` -on nixos, the system index regenerates on every `nixos-rebuild` -automatically. see [nixos.md](nixos.md) for details. +on nixos, the system index regenerates on every `nixos-rebuild`. see +[nixos.md](nixos.md). + +## development + +```sh +cargo build # debug build, faster compile +cargo test # full test suite +cargo clippy --release +``` + +a `man` binary is useful at runtime as a fallback for locating +manpages outside the indexed prefixes — not required for indexing +itself. diff --git a/doc/nixos.md b/doc/nixos.md index 5d74690..d50ada6 100644 --- a/doc/nixos.md +++ b/doc/nixos.md @@ -1,105 +1,51 @@ # nixos integration -inshellah provides a nixos module that automatically indexes nushell -completions for all installed packages at system build time. +inshellah provides a nixos module that indexes nushell completions for +every installed package at system build time, and a wrapped binary +that knows where to find the result. 
## enabling ```nix -# in your flake.nix outputs: +# flake.nix outputs: { nixosConfigurations.myhost = nixpkgs.lib.nixosSystem { modules = [ inshellah.nixosModules.default - { - programs.inshellah.enable = true; - } + { programs.inshellah.enable = true; } ]; }; } ``` -or if importing the module directly: +or importing directly: ```nix # configuration.nix { pkgs, ... }: { - imports = [ ./path/to/inshellah/nix/module.nix ]; - programs.inshellah = { - enable = true; - package = pkgs.inshellah; # or your local build - }; + imports = [ ./path/to/inshellah-rs/nix/module.nix ]; + programs.inshellah.enable = true; } ``` -## what happens at build time +after rebuilding, completions are immediately available through the +autoloaded nushell shim. -the module hooks into `environment.extraSetup`, which runs during the -system profile build (the `buildEnv` that creates `/run/current-system/sw`). -at that point, all system packages are merged, so `$out/bin` contains every -executable and `$out/share/man` contains every manpage. +## what the module does -inshellah runs a single command: - -``` -inshellah index "$out" --dir $out/share/inshellah -``` - -this executes a three-phase pipeline: - -### phase 1: native completion detection (parallel) - -for each executable, inshellah scans the elf binary for the string -`completion`. if found, it probes common patterns like -`CMD completions nushell` to see if the program can generate its own -nushell completions. native output is used verbatim — these are always -higher quality than parsed completions. - -programs like `niri`, and any clap/cobra tool with nushell support, -are handled this way. - -### phase 2: manpage parsing (sequential) - -for commands not covered by phase 1, inshellah parses manpages from -man1 (user commands) and man8 (sysadmin commands). 
it handles: - -- gnu `.TP` style (coreutils, help2man) -- `.IP` style (curl, hand-written) -- `.PP`+`.RS`/`.RE` style (git, docbook) -- nix3 bullet+hyperlink style (`nix run`, `nix build`, etc.) -- mdoc (bsd) format -- deroff fallback for unusual formats - -synopsis sections are parsed to detect subcommands: `git-commit.1` -generates `export extern "git commit"`, not `export extern "git-commit"`. - -### phase 3: --help fallback (parallel) - -remaining executables without manpages get `--help` (or `-h`) called -with a 200ms timeout. elf binaries are pre-scanned for the `-h` string -to skip those that don't support help flags. shell scripts are run -directly (they're fast). execution is parallelized to available cores. - -when `--help` produces rendered manpage output instead of plain help -text (e.g. `git stash --help` delegates to `man`), the raw manpage -source is located and parsed with the groff parser for richer results. - -### output - -each command gets its own file in `/share/inshellah` under the system -profile. native generators produce `.nu` files; parsed results produce -`.json` files. the `complete` command reads both formats. - -nushell built-in commands (ls, cd, cp, mv, etc.) are excluded since -nushell provides its own completions. - -### performance - -on a typical nixos system (~950 executables, ~1600 manpages): -- total time: ~4-10 seconds -- native gzip decompression (camlzip, no process spawning) -- parallel --help with core-scaled forking -- elf string scanning to skip ~15% of binaries +- installs the inshellah binary, wrapped so the system completion path + is found automatically. +- runs `inshellah index "$out"` during the system profile build, + producing one file per command under `$out/share/inshellah/`. +- drops the full nushell external-completer shim into + `/share/nushell/vendor/autoload/`, including sudo/doas overrides so + elevated commands still complete through inshellah. 
+- emits lightweight command-name stubs for dynamic-completion backends + that are present in the system profile, so tools like `git` and `jj` + appear in nushell's command list while inshellah still supplies their + argument completions lazily. +- exposes the same shim as a read-only `snippet` option for users who + want to source or inspect it manually. ## module options @@ -110,12 +56,11 @@ programs.inshellah = { # the inshellah package (set automatically by the flake module) package = pkgs.inshellah; - # where to place indexed completion files under the system profile + # subdirectory of the system profile holding the index files # default: "/share/inshellah" completionsPath = "/share/inshellah"; # additional read-only completion directories to search - # these are appended to the --dir path alongside the system completions extraDirs = [ "/etc/profiles/per-user/alice/share/inshellah" ]; # commands to skip entirely during indexing @@ -123,41 +68,68 @@ programs.inshellah = { # commands to skip manpage parsing for (uses --help instead) helpOnlyCommands = [ "nix" ]; + + # per-subprocess timeout in ms during indexing (null = built-in + # default of 200ms) + timeoutMs = null; + + # worker-thread count for the parallel scrape + workers = null; }; ``` ## using the completer -the flake module sets a read-only `snippet` option containing the nushell -config needed to wire up the completer. you can access it via -`config.programs.inshellah.snippet` and paste it into your nushell config, -or source it from a file generated by your nixos config. +the module installs the completer under nushell's vendor autoload path, +so no hand-written nushell config is needed for the normal NixOS case. -the snippet sets up the external completer. the wrapper installed by -the module has the system completion paths hardcoded, so no flags are -needed: +the read-only `snippet` option still holds the complete +external-completer config. 
to manage sourcing yourself instead of using +autoload, write it to a file: -```nu -let inshellah_complete = {|spans| - inshellah complete ...$spans | from json -} -$env.config.completions.external = { - enable: true - max_results: 100 - completer: $inshellah_complete -} +```nix +# generate a config file from the snippet +environment.etc."nushell/inshellah.nu".text = config.programs.inshellah.snippet; ``` -## home manager and other user-level package managers +then source that file from your nushell config: -the nixos module only indexes packages installed at the system level -(those that end up in `/run/current-system/sw`). if you use home-manager, -nix-env, or another user-level package manager, those binaries and -manpages live elsewhere — typically under `/etc/profiles/per-user/` -or `~/.nix-profile`. +```nu +source /etc/nushell/inshellah.nu +``` -to get completions for user-installed packages, run `inshellah index` -against those prefixes separately: +or copy the snippet directly into `~/.config/nushell/config.nu`: + +```nu +# (the snippet is many lines — copy it from `nix eval` of the option, +# or use the environment.etc approach above) +$env.config.completions.external = { ... 
} +``` + +the snippet provides both static lookups against the system index and +runtime fallbacks for cases the static index can't cover: + +| command | dynamic source | +|---|---| +| `nix` | flake refs via `NIX_GET_COMPLETIONS`, with optional `meta.description` | +| `systemctl` / `journalctl` | unit names from `list-units` | +| `coredumpctl` | units + pids | +| `loginctl` | users / sessions | +| `machinectl` / `networkctl` | machines / links | +| `ssh` / `scp` / `sftp` | hostnames from ssh config + known_hosts | +| `docker` / `podman` | containers + image refs by subcommand | +| `kubectl` | resource names from the live cluster | +| `git` | refs + worktree paths | +| `npm` / `pnpm` / `yarn` | scripts from package.json | +| `make` / `just` | targets / recipes | +| `cargo` | workspace targets behind `--bin` / `--example` / etc. | +| `kill` / `pkill` | pid+comm pairs | + +## home manager and user-level package managers + +the system module only indexes packages installed system-wide. for +home-manager or per-user nix profiles, run `inshellah index` against +those prefixes separately: ```sh # home-manager / per-user profile @@ -167,35 +139,34 @@ inshellah index /etc/profiles/per-user/$USER inshellah index ~/.nix-profile ``` -this indexes into the default user cache (`$XDG_CACHE_HOME/inshellah`), -which the completer searches automatically. you can re-run this after -installing new packages, or add it to a home-manager activation script. - -if you want to automate this in home-manager: +this indexes into `$XDG_CACHE_HOME/inshellah`, which the completer +searches automatically. to automate via home-manager: ```nix -# home.nix home.activation.inshellah-index = lib.hm.dag.entryAfter [ "writeBoundary" ] '' ${pkgs.inshellah}/bin/inshellah index /etc/profiles/per-user/$USER 2>/dev/null || true ''; ``` -the completer will then search both the system index and the user -cache, so completions from both sources are available. 
- ## troubleshooting -**completions not appearing**: ensure the completer is configured in -your nushell config (see above). check that the system index exists: -`ls /run/current-system/sw/share/inshellah/`. +**completions not appearing**: check that the system index exists +(`ls /run/current-system/sw/share/inshellah/`) and that the completer +is configured. **missing completions for a specific command**: check if it's a nushell -built-in (`help commands | where name == "thecommand"`). built-ins are -excluded because nushell serves its own completions for them. +built-in (`help commands | where name == "thecommand"`) — built-ins +are excluded. -**stale completions after update**: completions regenerate on every -`nixos-rebuild`. if a command changed its flags, rebuild to pick up -the changes. +**command name missing but arguments complete after typing it**: the +command may be installed only in a user profile. the system module can +only generate command-name stubs for binaries linked into the system +profile, though the external completer can still complete arguments +once the command word has been typed. -**build-time errors**: indexing failures are non-fatal (`|| true`). -check `journalctl` for the build log if completions are missing. +**stale completions after update**: the index regenerates on every +`nixos-rebuild`. if a command changed its flags, rebuild. + +**build-time errors**: indexing failures are non-fatal. check +`journalctl` for the build log if completions are missing for a +specific command. diff --git a/doc/nushell-integration.md b/doc/nushell-integration.md index 68ea5f8..773533d 100644 --- a/doc/nushell-integration.md +++ b/doc/nushell-integration.md @@ -1,150 +1,28 @@ # using inshellah completions in nushell -inshellah indexes completions from three sources (in priority order): -1. **native generators** — programs that can emit nushell completions directly -2. **manpages** — groff/troff/mdoc manpage parsing -3. 
**`--help` output** — parsing help text as a fallback - -indexed data is stored as `.json` and `.nu` files in a directory that the -`complete` command reads from at tab-completion time. +inshellah indexes completions for the commands in your `$PATH` and +serves them to nushell's external completer. indexed data is stored as +`.json` and `.nu` files that the `complete` command reads at +tab-completion time. ## quick start index completions from a system prefix: ```sh -# index from a prefix containing bin/ and share/man/ +# from a prefix containing bin/ and share/man/ inshellah index /usr -# index from multiple prefixes +# multiple prefixes inshellah index /usr /usr/local -# store in a custom directory +# custom directory inshellah index /usr --dir ~/my-completions ``` -parse a single manpage: - -```sh -inshellah manpage /usr/share/man/man1/git.1.gz -``` - -batch-process all manpages under a directory (man1 and man8): - -```sh -inshellah manpage-dir /usr/share/man -``` - -## commands - -``` -inshellah index PREFIX... [--dir PATH] [--ignore FILE] [--help-only FILE] - index completions into a directory of json/nu files. - PREFIX is a directory containing bin/ and share/man/. - default dir: $XDG_CACHE_HOME/inshellah - --ignore FILE skip listed commands entirely - --help-only FILE skip manpages for listed commands, use --help instead - -inshellah complete CMD [ARGS...] [--dir PATH[:PATH...]] - nushell custom completer. outputs json completion candidates. - falls back to --help resolution if command is not indexed. - --dir takes colon-separated paths. the first path is the writable - user cache; additional paths are read-only system directories. - manpages are found via sibling share/man of system dir paths. - -inshellah query CMD [--dir PATH[:PATH...]] - print stored completion data for CMD. - -inshellah dump [--dir PATH[:PATH...]] - list indexed commands. - -inshellah manpage FILE - parse a manpage and emit nushell extern block. 
- -inshellah manpage-dir DIR - batch-process manpages under DIR (man1 and man8 sections). -``` - -## the index pipeline - -the `index` command runs a three-phase pipeline over all executables -in each `PREFIX/bin`: - -### phase 1: native completion detection (parallel) - -for each executable, inshellah scans the elf binary for the string -`completion`. if found, it probes common patterns like -`CMD completions nushell` to see if the program can generate its own -nushell completions. native output is used verbatim — these are always -higher quality than parsed completions. - -programs like `niri`, and any clap/cobra tool with nushell support, -are handled this way. - -### phase 2: manpage parsing (sequential) - -for commands not covered by phase 1, inshellah parses manpages from -man1 (user commands) and man8 (sysadmin commands). it handles: - -- gnu `.TP` style (coreutils, help2man) -- `.IP` style (curl, hand-written) -- `.PP`+`.RS`/`.RE` style (git, docbook) -- nix3 bullet+hyperlink style (`nix run`, `nix build`, etc.) -- mdoc (bsd) format -- deroff fallback for unusual formats - -synopsis sections are parsed to detect subcommands: `git-commit.1` -generates `export extern "git commit"`, not `export extern "git-commit"`. - -### phase 3: --help fallback (parallel) - -remaining executables without manpages get `--help` (or `-h`) called -with a 200ms timeout. elf binaries are pre-scanned for the `-h` string -to skip those that don't support help flags. shell scripts are run -directly (they're fast). execution is parallelized to available cores. - -subcommands are recursively resolved — if `--help` output lists -subcommands, inshellah runs `CMD SUBCMD --help` for each. - -when a `--help` invocation produces rendered manpage output (some -commands like `git stash` delegate `--help` to `man`), inshellah -detects this and locates the raw manpage source to parse with the -groff parser instead. 
this yields richer results (subcommands, -structured flag sections) than parsing the rendered text. - -### output - -each command gets its own file in the index directory. native generators -produce `.nu` files; parsed results produce `.json` files. the `complete` -command reads both formats. - -nushell built-in commands (ls, cd, cp, mv, etc.) are excluded since -nushell provides its own completions. - -### performance - -on a typical nixos system (~950 executables, ~1600 manpages): -- total time: ~4-10 seconds -- native gzip decompression (camlzip, no process spawning) -- parallel --help with core-scaled forking -- elf string scanning to skip ~15% of binaries - -## the completer - -the `complete` command is designed to be wired into nushell as an -external completer. it reads from the directories specified via `--dir` -(colon-separated), performs fuzzy matching, and outputs json completion -candidates. the first path is the writable user cache; additional paths -are read-only system directories. - -if a command is not indexed, `complete` falls back to on-the-fly -`--help` resolution — it runs the command's help, caches the result -in the user directory, and returns completions immediately. - -### setting up the completer +then wire up the completer in `~/.config/nushell/config.nu`: ```nu -# ~/.config/nushell/config.nu $env.config.completions.external = { enable: true completer: {|spans| @@ -154,27 +32,62 @@ $env.config.completions.external = { } ``` -with the nixos module, use the provided `snippet` option value (see -[nixos.md](nixos.md)) which points at the system index automatically. +that's it. tab-completion now works for every command indexed. -## nixos module +## commands -enable automatic completion indexing at system build time: +``` +inshellah index PREFIX... [--dir PATH] [--ignore FILE] [--help-only FILE] + [--workers N] [--timeout-ms N] + index completions into a directory of json/nu files. + PREFIX is a directory containing bin/ and share/man/. 
+ default dir: $XDG_CACHE_HOME/inshellah + --ignore FILE skip listed commands entirely + --help-only FILE skip manpages for listed commands, use --help instead + --workers N worker-thread count + --timeout-ms N per-subprocess timeout in ms (default: 200) -```nix -{ - imports = [ ./path/to/inshellah/nix/module.nix ]; - programs.inshellah.enable = true; -} +inshellah complete CMD [ARGS...] [--dir PATH[:PATH...]] [--timeout-ms N] + nushell custom completer. outputs JSON completion candidates. + falls back to on-the-fly --help resolution if a command isn't + indexed yet — the result is cached and subsequent presses are + instant. + --dir takes colon-separated paths. the first path is the writable + user cache; additional paths are read-only system directories. + +inshellah query CMD [--dir PATH[:PATH...]] + print stored completion data for CMD. + +inshellah dump [--dir PATH[:PATH...]] + list indexed commands. + +inshellah manpage FILE + parse a manpage and emit a nushell extern block. + +inshellah manpage-dir DIR + batch-process manpages under DIR (man1 and man8 sections). ``` -this runs `inshellah index` during the system profile build. see -[nixos.md](nixos.md) for full details. +## what gets handled -## what gets generated +- **sources**: native nushell completion generators (clap/cobra tools + that can emit completions themselves), manpages in section 1 and 8, + `--help` and `-h` output. +- **groff styles**: gnu `.TP` (coreutils, help2man), `.IP` (curl, + hand-written), `.PP`+`.RS`/`.RE` (git, docbook), nix3 bullet + (`nix run`, `nix build`), mdoc (BSD), plus a deroff fallback. +- **subcommand naming**: `git-commit.1` produces `git commit`, not + `git-commit`. clap-style per-subcommand manpages get one file each. +- **synopsis-only flags**: flags declared in a manpage SYNOPSIS but + missing from the body (e.g. nix-env's `--profile`, most of sed's + interface) are picked up too. 
+- **elevation wrappers**: `sudo`, `doas`, `pkexec`, `su`, `run0` are + stripped before lookup, including when the real target is given as + an absolute path. +- **exclusions**: nushell built-ins (ls, cd, mv, etc.) are skipped — + nushell serves its own completions for those. -the `manpage` and `manpage-dir` commands emit nushell `extern` blocks -with flags, parameter types, and descriptions: +## extern blocks (manpage / manpage-dir) ```nu export extern "rg" [ @@ -186,9 +99,52 @@ export extern "rg" [ ] ``` -subcommand manpages (e.g. `git-commit.1`) are detected via synopsis -parsing and generate the correct nushell name (`git commit` not -`git-commit`). +these are produced by `inshellah manpage` / `inshellah manpage-dir` and +can be source'd directly in your nushell config if you prefer that to +the json completer flow. -nushell built-in commands (ls, cd, mv, etc.) are excluded since nushell -provides its own completions for these. +## native completions and file completion + +when a tool ships its own nushell completion generator (clap, cobra, etc.), +inshellah caches its output verbatim as a `.nu` file under the autoload +dir. nushell loads the `extern` declarations and uses its built-in +completer for that command — the external completer (inshellah's `complete` +subcommand) is only consulted as a fallback. + +at the `extern` layer, positional/flag types drive what nushell offers: + +- `: path` triggers nushell's built-in file/path completion for that slot. +- `: string@my_completer` runs a user-defined closure. +- bare `: string` / `: int` provides no candidates of its own. + +so when a native `.nu` declares `--file: path`, you'll see file completions +intermixed with whatever else is in scope. that's intrinsic to the type, +not something inshellah injects. 
+ +a few things worth knowing: + +- nushell ≤ 0.69 had a bug + ([#6407](https://github.com/nushell/nushell/issues/6407)) where file + completion superseded the external completer when the prefix was empty + or matched a real path. upgrade if you see this. +- [PR #14781](https://github.com/nushell/nushell/pull/14781) tightened the + contract: an external completer that returns a non-null list now + suppresses file fallback; only an explicit `null` opts back in. inshellah + already follows this — `null` for "hand off to nu", `[...]` to override. +- if you want different ranking, the relevant settings are + `$env.config.completions.{algorithm, sort, partial, case_sensitive}`. + none of them disables file completion for `: path` parameters — that + behavior is tied to the type itself. + +if a particular native completion bothers you, the workaround is to drop +that one `.nu` file from the autoload directory. nushell falls back to the +external completer for unknown commands, and inshellah's `complete` +subcommand returns candidates directly as JSON — bypassing the `extern` +type layer entirely, so no `: path` slot triggers nu's built-in file +completer. + +## nixos + +`programs.inshellah.enable = true` will index at system build time and +ship a richer completer with runtime fallbacks (live cluster queries, +git/ssh/docker/k8s lookups, etc.). see [nixos.md](nixos.md). diff --git a/doc/runtime-completions.md b/doc/runtime-completions.md index 7b58e48..3e0ee84 100644 --- a/doc/runtime-completions.md +++ b/doc/runtime-completions.md @@ -1,30 +1,31 @@ # runtime completion resolution -the `complete` command has built-in on-the-fly resolution: when a command -is not found in the index, it falls back to running `--help`, caches the -result, and returns completions immediately. this means commands installed -outside the system profile (via cargo, pip, npm, go, etc.) get completions -on first tab-press with no manual setup. 
+when a command isn't in the static index yet, `inshellah complete` +runs `--help` (or `-h`) on the binary, caches the result in the user +directory, and returns completions immediately. tab-completion just +works for tools installed outside the indexed prefixes — via cargo, +pip, npm, go, etc. ## how it works -when you type `docker compose up --`: +typing `docker compose up --`: 1. nushell calls `inshellah complete docker compose up --` -2. inshellah looks up the index for the longest matching prefix +2. inshellah looks up the longest matching prefix in the index 3. if found, it fuzzy-matches flags and subcommands against the partial input 4. if not found, it locates the binary in `$PATH`, runs `--help`, recursively resolves subcommands, caches the results in the user - directory (`$XDG_CACHE_HOME/inshellah`), and returns completions. - if `--help` produces rendered manpage output, the raw manpage source - is located and parsed instead for richer results + directory (`$XDG_CACHE_HOME/inshellah`), and returns completions -all subsequent completions for that command are instant (served from cache). +all subsequent completions for that command are served from cache. + +elevation wrappers (`sudo`, `doas`, `pkexec`, `su`, `run0`) are +stripped before lookup: `sudo docker compose up --` resolves against +`docker`, not `sudo`. absolute paths after the wrapper are recognised +too. ## setup -the completer works with no extra configuration beyond the basic setup: - ```nu # ~/.config/nushell/config.nu $env.config.completions.external = { @@ -36,18 +37,8 @@ $env.config.completions.external = { } ``` -with the nixos module, the installed wrapper has the system paths -hardcoded — no extra flags needed. 
the same snippet works: - -```nu -$env.config.completions.external = { - enable: true - completer: {|spans| - inshellah complete ...$spans - | from json - } -} -``` +with the nixos module, no extra config is needed beyond enabling the +module — the wrapper has the system paths baked in. to manually specify system dirs, use colon-separated `--dir`: @@ -61,25 +52,15 @@ $env.config.completions.external = { } ``` -system directories (paths after the first in `--dir`) enable -manpage-based fallback: when a command's `--help` delegates to `man`, -the completer looks for the raw manpage in the sibling `share/man` -directory (e.g. `share/inshellah` → `share/man`). if no system dirs -are given, it falls back to `man -w` to locate the manpage. - -or use the `snippet` option provided by the flake module (see -[nixos.md](nixos.md)). +paths after the first in `--dir` are read-only system dirs. ## cache management -the user cache lives at `$XDG_CACHE_HOME/inshellah` (typically -`~/.cache/inshellah`). 
- ```sh # list cached commands inshellah dump -# view cached data for a command +# view stored data for a command inshellah query docker # clear cache diff --git a/dune-project b/dune-project deleted file mode 100644 index 4d29412..0000000 --- a/dune-project +++ /dev/null @@ -1,28 +0,0 @@ -(lang dune 3.20) - -(name inshellah) - -(generate_opam_files true) - -(source - (github username/reponame)) - -(authors "atagen ") - -(maintainers "atagen ") - -(license GPL-3.0-or-later) - -(package - (name inshellah) - (synopsis "Nushell completions generator") - (description - "Inshellah parses manpages and --help switches to generate completions for nushell.") - (depends - ocaml - dune - angstrom - angstrom-unix - camlzip) - (tags - (shell completions nushell parser angstrom))) diff --git a/flake.lock b/flake.lock index 3adb309..8c7ac0c 100644 --- a/flake.lock +++ b/flake.lock @@ -2,16 +2,16 @@ "nodes": { "nixpkgs": { "locked": { - "lastModified": 1773385838, - "narHash": "sha256-ylF2AGl08seexxlLvMqj3jd+yZq56W9zicwe51mp0Pw=", + "lastModified": 1773821835, + "narHash": "sha256-TJ3lSQtW0E2JrznGVm8hOQGVpXjJyXY2guAxku2O9A4=", "owner": "nixos", "repo": "nixpkgs", - "rev": "fef542e7a88eec2b698389e6279464fd479926b6", + "rev": "b40629efe5d6ec48dd1efba650c797ddbd39ace0", "type": "github" }, "original": { "owner": "nixos", - "ref": "nixpkgs-unstable", + "ref": "nixos-unstable", "repo": "nixpkgs", "type": "github" } diff --git a/flake.nix b/flake.nix index 6b05775..dd20c6b 100644 --- a/flake.nix +++ b/flake.nix @@ -1,111 +1,251 @@ { - inputs.nixpkgs.url = "github:nixos/nixpkgs/nixpkgs-unstable"; + + inputs.nixpkgs.url = "github:nixos/nixpkgs?ref=nixos-unstable"; outputs = { self, nixpkgs }: let forAllSystems = - f: - nixpkgs.lib.genAttrs [ "x86_64-linux" "aarch64-linux" ] ( - system: f (import nixpkgs { inherit system; }) - ); + f: nixpkgs.lib.genAttrs nixpkgs.lib.systems.flakeExposed (sys: f nixpkgs.legacyPackages.${sys}); in { devShells = forAllSystems (pkgs: { default = pkgs.mkShell 
{ - packages = with pkgs.ocamlPackages; [ - dune_3 - ocaml - angstrom - angstrom-unix - camlzip - ppx_inline_test - ocaml-lsp - ocamlformat - ocamlformat-rpc-lib - utop + packages = with pkgs; [ + rustc + cargo + rustfmt + rust-analyzer + clippy ]; }; }); packages = forAllSystems (pkgs: { - default = pkgs.ocamlPackages.buildDunePackage { + default = pkgs.rustPlatform.buildRustPackage { pname = "inshellah"; - version = "0.1"; + version = "0.1.1"; src = pkgs.lib.cleanSource ./.; - nativeBuildInputs = [ pkgs.git ]; - buildInputs = with pkgs.ocamlPackages; [ - dune_3 - ocaml - angstrom - angstrom-unix - camlzip - ]; - - meta.mainProgram = "inshellah"; + cargoLock.lockFile = ./Cargo.lock; + meta = { + description = "nushell completion indexer"; + mainProgram = "inshellah"; + }; }; }); - nixosModules.default = + checks = forAllSystems ( + pkgs: + let + checkSrc = pkgs.lib.cleanSourceWith { + src = ./.; + filter = + path: type: + let + base = baseNameOf path; + in + !(type == "directory" && (base == ".git" || base == "target")); + }; + cargoDeps = pkgs.rustPlatform.importCargoLock { lockFile = ./Cargo.lock; }; + rustInputs = with pkgs; [ + cargo + clippy + stdenv.cc + rustc + ]; + fakeInshellah = pkgs.writeShellScriptBin "inshellah" '' + if [ "''${1:-}" = complete ]; then + if [ -n "''${INSHELLAH_STATIC_FILE:-}" ] && [ -s "$INSHELLAH_STATIC_FILE" ]; then + cat "$INSHELLAH_STATIC_FILE" + printf '\n' + else + printf 'null\n' + fi + else + printf 'null\n' + fi + ''; + fakeNix = pkgs.writeShellScriptBin "nix" '' + if [ "''${1:-}" = eval ]; then + printf 'raw package description\n' + else + printf 'header\nbuild\nflake#pkg\n' + fi + ''; + fakeSystemctl = pkgs.writeShellScriptBin "systemctl" '' + case "$*" in + *"g*"*) + printf 'greetd.service loaded active running Greeter\n' + ;; + *) + printf 'demo.service loaded active running Demo Unit\n' + ;; + esac + ''; + fakeKubectl = pkgs.writeShellScriptBin "kubectl" '' + printf '%s\n' "$*" > "$KUBECTL_ARGS_FILE" + if [ "''${1:-}" = 
get ] && [ "''${2:-}" = deployment ]; then + printf 'deploy-a\n' + elif [ "''${1:-}" = get ]; then + printf 'pod-a\n' + fi + ''; + fakeCargo = pkgs.writeShellScriptBin "cargo" '' + cat <<'JSON' + {"packages":[{"name":"app-lib","version":"0.1.0","targets":[{"name":"app-lib","kind":["lib"]},{"name":"app-cli","kind":["bin"]},{"name":"app-integration","kind":["test"]}]},{"name":"helper-lib","version":"0.2.0","targets":[{"name":"helper-lib","kind":["lib"]}]}]} + JSON + ''; + fakeGit = pkgs.writeShellScriptBin "git" '' + case "''${1:-}" in + remote) + printf 'origin\nupstream\n' + ;; + for-each-ref) + case "$*" in + *"refs/heads refs/remotes refs/tags"*) + printf 'main\tcommit\tMain branch\norigin/main\tcommit\tRemote main\nv1.0\tcommit\tRelease 1\n' + ;; + *"refs/heads"*) + printf 'main\tMain branch\nfeature\tFeature branch\n' + ;; + *"refs/tags"*) + printf 'v1.0\tRelease 1\nv2.0\tRelease 2\n' + ;; + esac + ;; + stash) + if [ "''${2:-}" = list ]; then + printf 'stash@{0}: WIP on main: demo stash\n' + fi + ;; + status) + printf ' M src/main.rs\n?? 
new-file.txt\nR old.txt -> renamed.txt\n' + ;; + ls-files) + printf 'src/main.rs\nREADME.md\n' + ;; + config) + printf 'submodule.demo.path deps/demo\n' + ;; + worktree) + if [ "''${2:-}" = list ]; then + printf 'worktree /repo/linked\n' + fi + ;; + esac + ''; + fakeJj = pkgs.writeShellScriptBin "jj" '' + case "''${1:-}" in + log) + printf 'k\tworking change\nm\tmain change\n' + ;; + bookmark) + if [ "''${2:-}" = list ]; then + printf 'main\nfeature\norigin/main\n' + fi + ;; + tag) + if [ "''${2:-}" = list ]; then + printf 'v1.0\nv2.0\n' + fi + ;; + git) + if [ "''${2:-}" = remote ] && [ "''${3:-}" = list ]; then + printf 'origin https://example.com/repo.git\nupstream https://example.com/upstream.git\n' + fi + ;; + op|operation) + if [ "''${2:-}" = log ]; then + printf 'abc123\tcheckout working copy\n' + fi + ;; + file) + if [ "''${2:-}" = list ]; then + printf 'src/main.rs\nREADME.md\n' + fi + ;; + workspace) + if [ "''${2:-}" = list ]; then + printf 'default\nlinked\n' + fi + ;; + esac + ''; + fakeCompletionBackends = pkgs.symlinkJoin { + name = "inshellah-fake-completion-backends"; + paths = [ + fakeInshellah + fakeNix + fakeSystemctl + fakeKubectl + fakeCargo + fakeGit + fakeJj + ]; + }; + rustCheckPhase = '' + echo "running rust checks" + rm -rf source-rust + cp -R ${checkSrc} source-rust + chmod -R u+w source-rust + pushd source-rust + export CARGO_HOME="$TMPDIR/cargo-home" + export CARGO_TARGET_DIR="$TMPDIR/cargo-target" + mkdir -p .cargo "$CARGO_HOME" + cat > .cargo/config.toml < "$INSHELLAH_STATIC_FILE" + nu --no-config-file -c 'source ${./nix/inshellah-completer.nu}; source ${./tests/nushell-completer.nu}' + cat > "$TMPDIR/config-load.nu" <<'EOF' + source ${./nix/inshellah-completer.nu} + + def activate [p: path] { + sudo nix-env --set -p /nix/var/nix/profiles/system $p + sudo $"($p)/bin/switch-to-configuration" switch + doas nix-env --set -p /nix/var/nix/profiles/system $p + } + EOF + nu --env-config /dev/null --config "$TMPDIR/config-load.nu" -c 'print 
ok' + ''; + mkShellCheck = + name: inputs: phase: + pkgs.runCommand name { nativeBuildInputs = inputs; } '' + ${phase} + touch $out + ''; + in { - pkgs, - lib, - config, - ... - }: + rust = mkShellCheck "inshellah-rust-check" rustInputs rustCheckPhase; + nushell = mkShellCheck "inshellah-nushell-check" [ pkgs.nushell ] nushellCheckPhase; + default = mkShellCheck "inshellah-check" (rustInputs ++ [ pkgs.nushell ]) '' + ${rustCheckPhase} + ${nushellCheckPhase} + ''; + } + ); + + nixosModules.default = + { pkgs, ... }: { imports = [ ./nix/module.nix ]; programs.inshellah.package = self.packages.${pkgs.stdenv.hostPlatform.system}.default; - programs.inshellah.snippet = '' - let inshellah_complete = { |spans| - let completions = (^inshellah complete ...$spans) | from json - # dynamic completions - let additional = if ($completions == null and ($spans | length) > 0) { - match $spans.0 { - "nix" => { - $env.NIX_GET_COMPLETIONS = ($spans | length) - 1 - let nix_output = $spans | run-external $in | split row -r '\n' | str trim | skip 1 - let entries = if (($nix_output | length) < 6 and - ($spans | last) =~ "[a-zA-Z][a-zA-Z0-9_-]*#[a-zA-Z][a-zA-Z0-9_-]*") { - hide-env NIX_GET_COMPLETIONS - $env.NIX_ALLOW_UNFREE = 1 - $env.NIX_ALLOW_BROKEN = 1 - $nix_output | par-each { |e| - try { - { value: $e, description: (^nix eval --impure $e --apply "f: f.meta.description" err> /dev/null) } - } catch { - { value: $e, description: "" } - } - } - } else { - $nix_output | each { |e| - { value: $e, description: "" } - } - } - $entries - } - "systemctl" => { - if ($spans | length) < 3 { null } else { - let kw = $spans | last - let scope = if ("--user" in $spans) { [--user] } else { [] } - ^systemctl ...$scope list-units --all --no-pager --plain --full --no-legend $"($kw)*" - | lines - | each { |l| - let parsed = $l | parse -r '(?P\S+)\s+\S+\s+\S+\s+\S+\s+(?P.*)' - if ($parsed | length) > 0 { - {value: $parsed.0.unit, description: ($parsed.0.desc | str trim)} - } - } | compact - } - } - _ => 
{ null } - } - } else { null } - let result = ($completions | default []) | append ($additional | default []) | compact - if ($result | is-empty) { null } else { $result } - } - $env.config.completions.external = {enable: true, max_results: 200, completer: $inshellah_complete} - ''; }; }; } diff --git a/inshellah.opam b/inshellah.opam deleted file mode 100644 index 9888aa7..0000000 --- a/inshellah.opam +++ /dev/null @@ -1,35 +0,0 @@ -# This file is generated by dune, edit dune-project instead -opam-version: "2.0" -synopsis: "Nushell completions generator" -description: - "Inshellah parses manpages and --help switches to generate completions for nushell." -maintainer: ["atagen "] -authors: ["atagen "] -license: "GPL-3.0-or-later" -tags: ["shell" "completions" "nushell" "parser" "angstrom"] -homepage: "https://github.com/username/reponame" -bug-reports: "https://github.com/username/reponame/issues" -depends: [ - "ocaml" - "dune" {>= "3.20"} - "angstrom" - "angstrom-unix" - "camlzip" - "odoc" {with-doc} -] -build: [ - ["dune" "subst"] {dev} - [ - "dune" - "build" - "-p" - name - "-j" - jobs - "@install" - "@runtest" {with-test} - "@doc" {with-doc} - ] -] -dev-repo: "git+https://github.com/username/reponame.git" -x-maintenance-intent: ["(latest)"] diff --git a/lib/.ocamlformat b/lib/.ocamlformat deleted file mode 100644 index e69de29..0000000 diff --git a/lib/dune b/lib/dune deleted file mode 100644 index 38defe1..0000000 --- a/lib/dune +++ /dev/null @@ -1,3 +0,0 @@ -(library - (name inshellah) - (libraries angstrom angstrom-unix camlzip str unix)) diff --git a/lib/manpage.ml b/lib/manpage.ml deleted file mode 100644 index 5415fac..0000000 --- a/lib/manpage.ml +++ /dev/null @@ -1,1145 +0,0 @@ -(* manpage.ml — parse unix manpages (groff/mdoc format) into help_result. - * - * manpages are written in roff/groff markup — a decades-old typesetting language - * used by man(1). 
this module strips the formatting and extracts structured data - * (flags, subcommands, positionals) from the raw groff source. - * - * there are two major manpage macro packages: - * - man (groff) — used by gnu/linux tools. uses macros like .SH, .TP, .IP, .PP - * - mdoc (bsd) — used by bsd tools. uses .Sh, .Fl, .Ar, .Op, .It, .Bl/.El - * - * this module handles both, auto-detecting the format by checking for .Sh macros. - * - * for groff manpages, flag extraction uses multiple "strategies" that target - * different common formatting patterns: - * - strategy_tp: .TP tagged paragraphs (gnu coreutils, help2man) - * - strategy_ip: .IP indented paragraphs (curl, hand-written) - * - strategy_pp_rs: .PP + .RS/.RE blocks (git, docbook) - * - strategy_nix: nix3-style bullet .IP with .UR/.UE hyperlinks - * - strategy_deroff: fallback — strip all groff, feed to help text parser - * - * the module tries all applicable strategies and picks the one that extracts - * the most flag entries, on the theory that more results = better match. - * - * key peculiarities: - * - groff has an enormous escape syntax (font changes, named characters, - * size changes, color, string variables, etc.) — strip_groff_escapes - * handles the common cases but is not exhaustive - * - font escapes like \fI (italic) need to insert spaces at word boundaries - * to prevent flag names from fusing with their parameter names - * - the strategies share the angstrom-based switch_parser from parser.ml - * for parsing the actual flag syntax out of the stripped text - *) - -open Parser - -(* --- shared helpers for imperative string scanning --- - * many groff parsing routines use an imperative cursor (ref int) walking - * through a string. these helpers factor out common scanning patterns. *) - -(* advance pos past all characters until the delimiter is found. - * leaves pos pointing at the delimiter character, or at len if not found. 
*) -let skip_to_char source len pos delim = - while !pos < len && source.[!pos] <> delim do incr pos done - -(* translate a groff named character escape to its text equivalent. - * groff uses two-letter codes like "aq" for apostrophe, "lq"/"rq" for - * left/right quotes, "em"/"en" for dashes. returns None for unknown names. *) -let named_char_of = function - | "aq" -> Some '\'' - | "lq" | "Lq" -> Some '\x22' (* left double quote *) - | "rq" | "Rq" -> Some '\x22' (* right double quote *) - | "em" | "en" -> Some '-' - | _ -> None - -(* skip a groff reference that uses one of three sub-forms: - * single char — e.g. \*X or \nX - * ( + 2 chars — e.g. \*(XX or \n(XX - * [ to ] — e.g. \*[name] or \n[name] - * used for \* (string variable) and \n (number register) escapes. - * advances pos past the consumed characters. *) -let skip_groff_reference source len pos = - if !pos < len then begin - if source.[!pos] = '(' then - pos := !pos + 3 (* skip past '(' + two-character name *) - else if source.[!pos] = '[' then begin - incr pos; - skip_to_char source len pos ']'; - if !pos < len then incr pos - end else - incr pos - end - -(* --- groff escape/formatting stripper --- - * groff escapes start with backslash and use various continuation syntaxes. - * this function strips them, replacing named characters (like \(aq for - * apostrophe) with their text equivalents and discarding formatting directives. 
*) - -let strip_groff_escapes source = - let buffer = Buffer.create (String.length source) in - let len = String.length source in - let pos = ref 0 in - let prev_char = ref '\000' in - (* emit a character into the output buffer and track it as previous *) - let put char_val = Buffer.add_char buffer char_val; prev_char := char_val in - let is_alnum char_val = - (char_val >= 'a' && char_val <= 'z') - || (char_val >= 'A' && char_val <= 'Z') - || (char_val >= '0' && char_val <= '9') - in - while !pos < len do - if source.[!pos] = '\\' && !pos + 1 < len then begin - let next = source.[!pos + 1] in - match next with - | 'f' -> - (* font escape: \fB, \fI, \fP, \fR, \f(XX, \f[...] *) - if !pos + 2 < len then begin - let font_char = source.[!pos + 2] in - (* insert space before italic font to preserve word boundaries - e.g. \fB--max-results\fR\fIcount\fR -> "--max-results count" *) - if font_char = 'I' && is_alnum !prev_char then put ' '; - if font_char = '(' then - pos := !pos + 5 (* \f(XX — two-character font name *) - else if font_char = '[' then begin - pos := !pos + 3; - skip_to_char source len pos ']'; - if !pos < len then incr pos - end else - pos := !pos + 3 (* \fX — single-character font selector *) - end else - pos := !pos + 2 - | '-' -> - (* escaped hyphen-minus — emit a plain hyphen *) - put '-'; - pos := !pos + 2 - | '&' | '/' | ',' -> - (* zero-width characters — discard without output *) - pos := !pos + 2 - | '(' -> - (* two-char named character: \(aq, \(lq, \(rq, etc. *) - if !pos + 3 < len then begin - let name = String.sub source (!pos + 2) 2 in - (match named_char_of name with - | Some char_val -> put char_val - | None -> ()); - pos := !pos + 4 - end else - pos := !pos + 2 - | '[' -> - (* bracketed named character: \[aq], \[lq], etc. 
*) - pos := !pos + 2; - let start = !pos in - skip_to_char source len pos ']'; - if !pos < len then begin - let name = String.sub source start (!pos - start) in - (match named_char_of name with - | Some char_val -> put char_val - | None -> ()); - incr pos - end - | 's' -> - (* size escape: \sN, \s+N, \s-N — skip the numeric argument *) - pos := !pos + 2; - if !pos < len && (source.[!pos] = '+' || source.[!pos] = '-') then incr pos; - if !pos < len && source.[!pos] >= '0' && source.[!pos] <= '9' then incr pos; - if !pos < len && source.[!pos] >= '0' && source.[!pos] <= '9' then incr pos - | 'm' -> - (* color escape: \m[...] — skip the bracketed color name *) - pos := !pos + 2; - if !pos < len && source.[!pos] = '[' then begin - incr pos; - skip_to_char source len pos ']'; - if !pos < len then incr pos - end - | 'X' -> - (* device control: \X'...' — skip the single-quoted payload *) - pos := !pos + 2; - if !pos < len && source.[!pos] = '\'' then begin - incr pos; - skip_to_char source len pos '\''; - if !pos < len then incr pos - end - | '*' -> - (* string variable: \*X or \*(XX or \*[...] — skip the reference *) - pos := !pos + 2; - skip_groff_reference source len pos - | 'n' -> - (* number register: \nX or \n(XX or \n[...] — skip the reference *) - pos := !pos + 2; - skip_groff_reference source len pos - | 'e' -> - (* escaped backslash literal *) - put '\\'; - pos := !pos + 2 - | '\\' -> - (* double backslash — emit one *) - put '\\'; - pos := !pos + 2 - | ' ' -> - (* escaped space — emit a regular space *) - put ' '; - pos := !pos + 2 - | _ -> - (* unknown escape — skip the two-character sequence *) - pos := !pos + 2 - end else begin - put source.[!pos]; - incr pos - end - done; - Buffer.contents buffer - -(* strip inline macro formatting: .BI, .BR, .IR, etc. - * these macros alternate between fonts for their arguments, e.g.: - * .BI "--output " "FILE" - * becomes "--outputFILE" (arguments concatenated without spaces). 
- * - * quoted strings are kept together (quotes stripped), but unquoted spaces - * are consumed. this matches groff's actual rendering of these macros, - * where alternating-font arguments are concatenated. *) -let strip_inline_macro_args text = - let buffer = Buffer.create (String.length text) in - let len = String.length text in - let pos = ref 0 in - while !pos < len do - if text.[!pos] = '"' then begin - (* quoted argument — copy characters up to the closing quote *) - incr pos; - while !pos < len && text.[!pos] <> '"' do - Buffer.add_char buffer text.[!pos]; - incr pos - done; - if !pos < len then incr pos - end else if text.[!pos] = ' ' || text.[!pos] = '\t' then begin - (* unquoted whitespace — skip (arguments are concatenated) *) - incr pos - end else begin - (* regular character — copy to output *) - Buffer.add_char buffer text.[!pos]; - incr pos - end - done; - Buffer.contents buffer - -(* convenience: strip escapes and trim whitespace *) -let strip_groff line = - let text = strip_groff_escapes line in - String.trim text - -(* --- line classification --- - * every line in a manpage is classified as one of four types. - * this classification drives all subsequent parsing — strategies - * pattern-match on sequences of classified lines. *) - -type groff_line = - | Macro of string * string (* macro name + args, e.g. ("SH", "OPTIONS") or ("TP", "") *) - | Text of string (* plain text after groff stripping *) - | Blank (* empty line *) - | Comment (* groff comment: .backslash-quote or backslash-quote *) - -(* classify a single line of manpage source. - * macro lines start with '.' or '\'' (groff alternate control char). - * the macro name is split from its arguments at the first space/tab. - * arguments wrapped in double quotes are unquoted. *) -let classify_line line = - let len = String.length line in - if len = 0 then Blank - else if len >= 2 && line.[0] = '.' 
&& line.[1] = '\\' && (len < 3 || line.[2] = '"') then - Comment - else if len >= 3 && line.[0] = '\\' && line.[1] = '"' then - Comment - else if line.[0] = '.' || line.[0] = '\'' then begin - (* macro line — extract macro name and arguments *) - let rest = String.sub line 1 (len - 1) in - let rest = String.trim rest in - (* split into macro name and arguments at the first whitespace *) - let space_pos = - try Some (String.index rest ' ') - with Not_found -> - try Some (String.index rest '\t') - with Not_found -> None - in - match space_pos with - | Some split_at -> - let name = String.sub rest 0 split_at in - let args = String.trim (String.sub rest (split_at + 1) (String.length rest - split_at - 1)) in - (* strip surrounding quotes from arguments *) - let args = - let alen = String.length args in - if alen >= 2 && args.[0] = '"' && args.[alen - 1] = '"' then - String.sub args 1 (alen - 2) - else args - in - Macro (name, args) - | None -> - Macro (rest, "") - end else begin - let stripped = strip_groff line in - if String.length stripped = 0 then Blank - else Text stripped - end - -(* refined comment detection — the base classify_line may miss some comment - * forms, so this wrapper checks more carefully before falling through to - * the general classifier. *) -let is_comment_line line = - let len = String.length line in - (len >= 3 && line.[0] = '.' && line.[1] = '\\' && line.[2] = '"') - || (len >= 2 && line.[0] = '\\' && line.[1] = '"') - -let classify_line line = - if is_comment_line line then Comment - else classify_line line - -(* --- section extraction --- - * manpages are divided into sections by .SH macros. the OPTIONS section - * contains the flag definitions we want. if there's no OPTIONS section, - * we fall back to DESCRIPTION (some simple tools put flags there). - * - * old-style nix manpages (nix-build, nix-env-install, etc.) split flags - * across multiple .SH sections with option-like names: e.g. 
"Options" for - * command-specific flags and "Common Options" for flags shared by all nix - * commands. collecting only the first such section misses the majority of - * flags, so we collect and concatenate all option-like sections. *) - -let extract_options_section lines = - let classified = List.map classify_line lines in - (* collect lines until the next .SH header, returning (content, rest) - * where rest starts at the .SH line (or is empty if at end of file). *) - let rec collect_section lines acc = - match lines with - | [] -> (List.rev acc, []) - | Macro ("SH", _) :: _ -> (List.rev acc, lines) - | line :: rest -> collect_section rest (line :: acc) - in - (* test whether a section name looks like an options section. - * matches "OPTIONS", "COMMON OPTIONS", "GLOBAL OPTIONS", etc. *) - let is_options_section name = - let upper = String.uppercase_ascii (String.trim name) in - upper = "OPTIONS" - || (String.length upper > 0 && - try let _ = Str.search_forward (Str.regexp_string "OPTION") upper 0 in true - with Not_found -> false) - in - (* collect from all option-like .SH sections and concatenate them. - * handles the common nix pattern where "Options" and "Common Options" - * are separate .SH sections but both contain relevant flags. - * - * a synthetic Macro("SH","") separator is inserted between sections so - * that collect_desc_text (which stops on SH/SS) does not let a description - * from the last entry in one section bleed into the intro text of the next. 
*) - let rec find_all_options lines acc = - match lines with - | [] -> acc - | Macro ("SH", args) :: rest when is_options_section args -> - let (section, remaining) = collect_section rest [] in - let sep = if acc = [] then [] else [Macro ("SH", "")] in - find_all_options remaining (acc @ sep @ section) - | _ :: rest -> find_all_options rest acc - in - (* fallback: DESCRIPTION section for simple tools that put flags there *) - let rec find_description = function - | [] -> [] - | Macro ("SH", args) :: rest - when String.uppercase_ascii (String.trim args) = "DESCRIPTION" -> - fst (collect_section rest []) - | _ :: rest -> find_description rest - in - match find_all_options classified [] with - | [] -> find_description classified - | sections -> sections - -(* --- strategy-based entry extraction --- - * rather than a single monolithic parser, we use multiple "strategies" that - * each target a specific groff formatting pattern. this is necessary because - * manpage authors use very different macro combinations for the same purpose. - * - * the shared building blocks: - * - collect_text_lines: gather consecutive Text lines into one description string - * - parse_tag_to_entry: run the angstrom switch parser on a tag string to - * extract the flag definition. this reuses the same parser that handles - * --help output, giving consistent extraction across both sources. - * - tag_of_macro: extract the "tag" text from formatting macros like .B, .BI, etc. - *) - -(* collect consecutive text lines, joining them with spaces *) -let rec collect_text_lines lines acc = - match lines with - | Text text :: rest -> collect_text_lines rest (text :: acc) - | _ -> (String.concat " " (List.rev acc), lines) - -(* attempt to parse a tag string (e.g. "-v, --verbose FILE") into an entry. - * uses the angstrom switch_parser + param_parser from parser.ml. - * returns None if the tag doesn't look like a flag definition. 
*) -let parse_tag_to_entry tag desc = - let tag = strip_groff_escapes tag in - let tag = String.trim tag in - match Angstrom.parse_string ~consume:Angstrom.Consume.Prefix - (Angstrom.lift2 (fun sw p -> (sw, p)) switch_parser param_parser) tag with - | Ok (switch, param) -> Some { switch; param; desc } - | Error _ -> None - -(* extract tag text from a macro line. - * .B and .I preserve spaces (single argument); .BI, .BR, .IR alternate - * fonts and concatenate arguments. *) -let tag_of_macro name args = - match name with - | "B" | "I" -> strip_groff_escapes args |> String.trim - | _ -> strip_inline_macro_args args |> strip_groff_escapes |> String.trim - -(* strategy a: .TP style (most common — gnu coreutils, help2man). - * .TP introduces a tagged paragraph: the next line is the "tag" (flag name) - * and subsequent text lines are the description. the tag can be plain text - * or wrapped in a formatting macro (.B, .BI, etc.). - * - * example groff: - * .TP - * \fB\-v\fR, \fB\-\-verbose\fR - * increase verbosity *) -let strategy_tp lines = - let rec walk lines acc = - match lines with - | [] -> List.rev acc - | Macro ("TP", _) :: rest -> - (* next line is the tag — could be Text or a formatting macro *) - begin match rest with - | Text tag :: rest2 -> - let (desc, rest3) = collect_text_lines rest2 [] in - let entry = parse_tag_to_entry tag desc in - walk rest3 (match entry with Some e -> e :: acc | None -> acc) - | Macro (("B" | "I" | "BI" | "BR" | "IR") as macro_name, args) :: rest2 -> - let tag = tag_of_macro macro_name args in - let (desc, rest3) = collect_text_lines rest2 [] in - let entry = parse_tag_to_entry tag desc in - walk rest3 (match entry with Some e -> e :: acc | None -> acc) - | _ -> walk rest acc - end - | _ :: rest -> walk rest acc - in - walk lines [] - -(* strategy b: .IP style (curl, hand-written manpages). - * .IP takes an inline tag argument: .IP "-v, --verbose" - * the description follows as text lines. 
simpler than .TP because - * the tag is on the macro line itself. *) -let strategy_ip lines = - let rec walk lines acc = - match lines with - | [] -> List.rev acc - | Macro ("IP", tag) :: rest -> - let tag = strip_groff_escapes tag in - let (desc, rest2) = collect_text_lines rest [] in - let entry = parse_tag_to_entry tag desc in - walk rest2 (match entry with Some e -> e :: acc | None -> acc) - | _ :: rest -> walk rest acc - in - walk lines [] - -(* strategy c: .PP + .RS/.RE style (git, docbook-generated manpages). - * flag entries are introduced by .PP (paragraph), with the flag name as - * plain text, followed by a .RS (indent) block containing the description, - * closed by .RE (de-indent). this is common in docbook-to-manpage toolchains. *) -let strategy_pp_rs lines = - let rec walk lines acc = - match lines with - | [] -> List.rev acc - | Macro ("PP", _) :: rest -> - begin match rest with - | Text tag :: rest2 -> - (* look for .RS ... text ... .RE *) - let rec collect_rs lines desc_acc = - match lines with - | Macro ("RS", _) :: rest3 -> - collect_in_rs rest3 desc_acc - | Text text :: rest3 -> - (* sometimes description follows directly *) - collect_rs rest3 (text :: desc_acc) - | _ -> (String.concat " " (List.rev desc_acc), lines) - and collect_in_rs lines desc_acc = - match lines with - | Macro ("RE", _) :: rest3 -> - (String.concat " " (List.rev desc_acc), rest3) - | Text text :: rest3 -> - collect_in_rs rest3 (text :: desc_acc) - | Macro ("PP", _) :: _ | Macro ("SH", _) :: _ -> - (String.concat " " (List.rev desc_acc), lines) - | _ :: rest3 -> collect_in_rs rest3 desc_acc - | [] -> (String.concat " " (List.rev desc_acc), []) - in - let (desc, rest3) = collect_rs rest2 [] in - let entry = parse_tag_to_entry tag desc in - walk rest3 (match entry with Some e -> e :: acc | None -> acc) - | _ -> walk rest acc - end - | _ :: rest -> walk rest acc - in - walk lines [] - -(* strategy d: deroff fallback — strip all groff markup, then feed the - * resulting plain 
text through the --help parser from parser.ml. - * this is the last resort when no structured macro pattern is recognized. - * it works surprisingly well for simple manpages but may miss entries - * in heavily formatted ones. *) -let strategy_deroff_lines lines = - let buffer = Buffer.create 256 in - List.iter (fun line -> - match line with - | Text text -> - Buffer.add_string buffer text; - Buffer.add_char buffer '\n' - | Macro (("BI" | "BR" | "IR" | "B" | "I"), args) -> - let text = strip_inline_macro_args args in - let text = strip_groff_escapes text in - Buffer.add_string buffer text; - Buffer.add_char buffer '\n' - | Blank -> Buffer.add_char buffer '\n' - | _ -> () - ) lines; - let text = Buffer.contents buffer in - match parse_help text with - | Ok result -> result.entries - | Error _ -> [] - -(* strategy e: nix3-style bullet .IP with .UR/.UE hyperlinks. - * nix's manpages use .IP with bullet markers for flag entries, interleaved - * with .UR/.UE hyperlink macros. the flag tag is in text lines after the - * bullet .IP, and the description follows a non-bullet .IP marker. - * - * nix manpages nest .RS/.RE blocks inside descriptions for sub-examples. - * the skip_rs helper tracks nesting depth to skip these without losing - * the rest of the description. 
*) -let strategy_nix lines = - (* a bullet .IP has non-empty args (the bullet marker) *) - let is_bullet_ip args = - String.length (String.trim args) > 0 - in - let rec walk lines acc = - match lines with - | [] -> List.rev acc - | Macro ("IP", args) :: rest when is_bullet_ip args -> - (* collect tag: skip .UR/.UE macros, collect Text lines *) - let rec collect_tag lines parts = - match lines with - | Macro ("UR", _) :: rest2 -> collect_tag rest2 parts - | Macro ("UE", _) :: rest2 -> collect_tag rest2 parts - | Text text :: rest2 -> collect_tag rest2 (text :: parts) - | _ -> (String.concat " " (List.rev parts), lines) - in - let (tag, rest2) = collect_tag rest [] in - (* collect description after the description .IP marker *) - let rec collect_desc lines parts = - match lines with - | Macro ("IP", dargs) :: rest3 when not (is_bullet_ip dargs) -> - collect_desc_text rest3 parts - | _ -> (String.concat " " (List.rev parts), lines) - and collect_desc_text lines parts = - match lines with - | Text text :: rest3 -> collect_desc_text rest3 (text :: parts) - | Macro ("IP", args2) :: _ when is_bullet_ip args2 -> - (* next bullet entry — stop collecting *) - (String.concat " " (List.rev parts), lines) - | Macro (("SS" | "SH"), _) :: _ -> - (* section boundary — stop collecting *) - (String.concat " " (List.rev parts), lines) - | Macro ("RS", _) :: rest3 -> - skip_rs rest3 parts 1 - | Macro ("IP", _) :: rest3 -> - (* non-bullet .IP = continuation paragraph *) - collect_desc_text rest3 parts - | Macro _ :: rest3 -> collect_desc_text rest3 parts - | Blank :: rest3 -> collect_desc_text rest3 parts - | Comment :: rest3 -> collect_desc_text rest3 parts - | [] -> (String.concat " " (List.rev parts), []) - and skip_rs lines parts depth = - match lines with - | Macro ("RE", _) :: rest3 -> - if depth <= 1 then collect_desc_text rest3 parts - else skip_rs rest3 parts (depth - 1) - | Macro ("RS", _) :: rest3 -> skip_rs rest3 parts (depth + 1) - | _ :: rest3 -> skip_rs rest3 parts depth 
- | [] -> (String.concat " " (List.rev parts), []) - in - let (desc, rest3) = collect_desc rest2 [] in - let entry = parse_tag_to_entry tag desc in - walk rest3 (match entry with Some e -> e :: acc | None -> acc) - | _ :: rest -> walk rest acc - in - walk lines [] - -(* count occurrences of a specific macro in the section. - * used by extract_entries to decide which strategies are worth trying. *) -let count_macro name lines = - List.fold_left (fun count line -> - match line with Macro (macro_name, _) when macro_name = name -> count + 1 | _ -> count - ) 0 lines - -(* auto-detect and try strategies, return the one with most entries. - * first counts macros to determine which strategies are applicable, - * then runs all applicable ones and picks the winner by entry count. - * if no specialized strategy produces results, falls back to deroff. - * - * this "try everything, pick the best" approach is intentional. - * manpage formatting is too varied and inconsistent to reliably detect the - * format from macro counts alone. running multiple strategies and comparing - * results is more robust. 
*) -let extract_entries lines = - let tp = count_macro "TP" lines - and ip = count_macro "IP" lines - and pp = count_macro "PP" lines - and rs = count_macro "RS" lines - and ur = count_macro "UR" lines in - (* build a list of (label, entries) for each applicable strategy *) - let specialized = List.filter_map Fun.id [ - (if tp > 0 then Some ("TP", strategy_tp lines) else None); - (if ip > 0 then Some ("IP", strategy_ip lines) else None); - (if pp > 0 && rs > 0 then Some ("PP+RS", strategy_pp_rs lines) else None); - (if ur > 0 && ip > 0 then Some ("nix", strategy_nix lines) else None); - ] in - (* filter to strategies that found at least one entry, fall back to deroff *) - let candidates = match List.filter (fun (_, entries) -> entries <> []) specialized with - | [] -> [("deroff", strategy_deroff_lines lines)] - | filtered -> filtered - in - (* pick the strategy with the most entries *) - List.fold_left (fun (_, best) (name, entries) -> - if List.length entries >= List.length best then (name, entries) - else (name, best) - ) ("none", []) candidates |> snd - -(* --- NAME section description extraction --- - * the NAME section in manpages follows the convention: - * "command \- short description" - * we extract the part after "\-" as the command's description. - * handles both "\-" (groff) and " - " (plain text) separators. 
*) - -let extract_name_description contents = - let lines = String.split_on_char '\n' contents in - let classified = List.map classify_line lines in - let rec find = function - | [] -> None - | Macro ("SH", args) :: rest - when String.uppercase_ascii (String.trim args) = "NAME" -> - collect rest [] - | _ :: rest -> find rest - and collect lines acc = - match lines with - | Macro ("SH", _) :: _ | [] -> finish acc - | Text text :: rest -> collect rest (text :: acc) - | Macro (("B" | "BI" | "BR" | "I" | "IR"), args) :: rest -> - let text = strip_inline_macro_args args |> strip_groff_escapes |> String.trim in - collect rest (if String.length text > 0 then text :: acc else acc) - | Macro ("Nm", args) :: rest -> - let text = strip_groff_escapes args |> String.trim in - collect rest (if String.length text > 0 then text :: acc else acc) - | Macro ("Nd", args) :: rest -> - let text = strip_groff_escapes args |> String.trim in - collect rest (if String.length text > 0 then ("\\- " ^ text) :: acc else acc) - | _ :: rest -> collect rest acc - and finish acc = - let full = String.concat " " (List.rev acc) |> String.trim in - (* NAME lines look like: "git-add \- Add file contents to the index" *) - let sep = Str.regexp {| *\\- *\| +- +|} in - match Str.bounded_split sep full 2 with - | [_; desc] -> Some (String.trim desc) - | _ -> None - in - find classified - -(* --- SYNOPSIS command name extraction --- - * the SYNOPSIS section shows how to invoke the command: - * .SH SYNOPSIS - * .B git add - * [\fIOPTIONS\fR] [\fB\-\-\fR] [\fI\fR...] - * - * we extract the command name by taking consecutive "word" tokens until - * we hit something that looks like an argument (starts with [, <, -, etc.). *) - -let extract_synopsis_command_lines lines = - (* replace italic text (\fI...\fR) with angle-bracketed placeholders - * before classification strips the font info. italic in groff indicates - * a parameter/placeholder (e.g. \fIoperation\fR), not a command word. 
- * the angle brackets cause extract_cmd to stop at these tokens since - * '<' is in its stop set. without this, "nix-env \fIoperation\fR" - * would be parsed as command "nix-env operation" instead of "nix-env". *) - let lines = List.map (fun line -> - Str.global_replace (Str.regexp {|\\fI\([^\\]*\)\\f[RP]|}) {|<\1>|} line - ) lines in - let classified = List.map classify_line lines in - let is_synopsis name = - String.uppercase_ascii (String.trim name) = "SYNOPSIS" - in - (* extract the command name from a line by taking leading word tokens *) - let extract_cmd line = - let words = String.split_on_char ' ' (String.trim line) in - let words = List.filter (fun word -> String.length word > 0) words in - let is_cmd_char = function - | 'a'..'z' | 'A'..'Z' | '0'..'9' | '-' | '_' | '.' -> true - | _ -> false - in - (* take words that look like command name parts, stop at arguments *) - let rec take = function - | [] -> [] - | word :: rest -> - if String.length word > 0 - && (word.[0] = '[' || word.[0] = '-' || word.[0] = '<' - || word.[0] = '(' || word.[0] = '{') - then [] - else if String.for_all is_cmd_char word then - word :: take rest - else [] - in - match take words with - | [] -> None - | cmd -> Some (String.concat " " cmd) - in - let rec find = function - | [] -> None - | Macro ("SH", args) :: rest when is_synopsis args -> collect rest - | _ :: rest -> find rest - and collect = function - | [] -> None - | Macro ("SH", _) :: _ -> None - | Text text :: _ -> - let text = String.trim text in - if String.length text > 0 then extract_cmd text else None - | Macro (("B" | "BI" | "BR"), args) :: _ -> - let text = strip_inline_macro_args args |> strip_groff_escapes |> String.trim in - if String.length text > 0 then extract_cmd text else None - | _ :: rest -> collect rest - in - find classified - -let extract_synopsis_command contents = - let lines = String.split_on_char '\n' contents in - extract_synopsis_command_lines lines - -(* --- SYNOPSIS positional extraction --- - * 
extract positional arguments from the SYNOPSIS section by collecting - * all text/formatting macro lines, joining them, skipping the command - * name prefix, then running parse_usage_args from parser.ml on the remainder. *) - -let extract_synopsis_positionals_lines lines = - let classified = List.map classify_line lines in - let is_synopsis name = - String.uppercase_ascii (String.trim name) = "SYNOPSIS" - in - let rec find = function - | [] -> [] - | Macro ("SH", args) :: rest when is_synopsis args -> collect rest [] - | _ :: rest -> find rest - and collect lines acc = - match lines with - | [] -> finish acc - | Macro ("SH", _) :: _ -> finish acc - | Macro ("SS", _) :: _ -> finish acc - | Macro ("br", _) :: _ -> finish acc - | Text text :: rest -> - let text = strip_groff_escapes text |> String.trim in - collect rest (if String.length text > 0 then text :: acc else acc) - | Macro (("B" | "BI" | "BR" | "I" | "IR" | "IB" | "RB" | "RI"), args) :: rest -> - let text = strip_inline_macro_args args |> strip_groff_escapes |> String.trim in - collect rest (if String.length text > 0 then text :: acc else acc) - | _ :: rest -> collect rest acc - and finish acc = - let parts = List.rev acc in - let full = String.concat " " parts |> String.trim in - if String.length full = 0 then [] - else - let cmd_end = skip_command_prefix full in - let args = String.sub full cmd_end (String.length full - cmd_end) in - parse_usage_args args - in - find classified - -(* --- mdoc (bsd) format support --- - * mdoc is the bsd manpage macro package. it uses semantic macros rather than - * presentation macros: - * .Fl v -> flag: -v - * .Ar file -> argument: file - * .Op ... -> optional: [...] - * .Bl/.It/.El -> list begin/item/end - * .Sh -> section header (note lowercase 'h', vs groff's .SH) - * - * the parser walks through classified lines looking for .Bl (list begin) - * blocks containing .It (items) with .Fl (flag) entries. 
*) - -let is_mdoc lines = - List.exists (fun line -> - match classify_line line with Macro ("Sh", _) -> true | _ -> false - ) lines - -(* extract renderable text from an mdoc line, skipping structural macros *) -let mdoc_text_of line = - match line with - | Text text -> Some (strip_groff_escapes text) - | Macro (macro_name, args) -> - (match macro_name with - | "Pp" | "Bl" | "El" | "Sh" | "Ss" | "Os" | "Dd" | "Dt" - | "Oo" | "Oc" | "Op" -> None - | _ -> - let text = strip_groff_escapes args |> String.trim in - if text = "" then None else Some text) - | _ -> None - -(* parse an mdoc .It (list item) line that contains flag definitions. - * mdoc .It lines look like: ".It Fl v Ar file" - * where Fl = flag, Ar = argument. we extract the flag name and parameter. - * - * only handles single-char short flags and long flags starting with '-'. - * mdoc's .Fl macro automatically prepends '-', so "Fl v" means "-v" - * and "Fl -verbose" means "--verbose". *) -let parse_mdoc_it args = - let words = String.split_on_char ' ' args - |> List.filter (fun word -> word <> "" && word <> "Ns") in - let param = match words with - | _ :: _ :: "Ar" :: param_name :: _ -> Some (Mandatory param_name) - | _ -> None - in - match words with - | "Fl" :: char_str :: _ when String.length char_str = 1 && is_alphanumeric char_str.[0] -> - Some { switch = Short char_str.[0]; param; desc = "" } - | "Fl" :: name :: _ when String.length name > 1 && name.[0] = '-' -> - Some { switch = Long (String.sub name 1 (String.length name - 1)); param; desc = "" } - | _ -> None - -(* extract a positional argument from an mdoc line (.Ar or .Op Ar) *) -let positional_of_mdoc_line optional args = - let words = String.split_on_char ' ' args - |> List.filter (fun word -> word <> "") in - match words with - | name :: _ when String.length name >= 2 -> - Some { pos_name = String.lowercase_ascii name; - optional; variadic = List.mem "..." words } - | _ -> None - -(* parse an entire mdoc-format manpage. 
- * walks through all classified lines looking for: - * 1. .Bl/.It/.El list blocks containing flag definitions - * 2. .Sh SYNOPSIS sections containing positional arguments (.Ar, .Op Ar) - * - * the scan function handles nested .Bl blocks — if the first .It in a .Bl - * starts with .Fl (a flag), the entire list is parsed as options. otherwise - * the list is skipped (it might be an example list or a description list). *) -let parse_mdoc_lines lines = - let classified = List.map classify_line lines in - (* skip lines until the matching .El closing tag *) - let rec skip_to_el = function - | [] -> [] - | Macro ("El", _) :: rest -> rest - | _ :: rest -> skip_to_el rest - in - (* collect description text lines until the next structural macro *) - let rec collect_desc acc = function - | [] -> (acc, []) - | (Macro ("It", _) | Macro ("El", _) - | Macro ("Sh", _) | Macro ("Ss", _)) :: _ as rest -> (acc, rest) - | line :: rest -> - collect_desc (match mdoc_text_of line with Some text -> text :: acc | None -> acc) rest - in - (* convenience: collect desc and join into a trimmed string *) - let desc_of rest = - let parts, rest = collect_desc [] rest in - (String.concat " " (List.rev parts) |> String.trim, rest) - in - (* parse a single .It entry: extract flag, collect description *) - let parse_it args rest entries = - let desc, rest = desc_of rest in - let entries = match parse_mdoc_it args with - | Some entry -> { entry with desc } :: entries - | None -> entries - in - (entries, rest) - in - (* parse all .It entries within a .Bl/.El option list *) - let rec parse_option_list entries = function - | [] -> (entries, []) - | Macro ("El", _) :: rest -> (entries, rest) - | Macro ("It", args) :: rest -> - let entries, rest = parse_it args rest entries in - parse_option_list entries rest - | _ :: rest -> parse_option_list entries rest - in - (* main scan: walk through all lines, collecting flags and positionals *) - let rec scan entries positionals = function - | [] -> (entries, 
positionals) - | Macro ("Bl", _) :: Macro ("It", it_args) :: rest -> - (* peek at first .It to decide if this is a flag list *) - let words = String.split_on_char ' ' it_args - |> List.filter (fun word -> word <> "") in - if (match words with "Fl" :: _ -> true | _ -> false) then - let entries, rest = parse_it it_args rest entries in - let entries, rest = parse_option_list entries rest in - scan entries positionals rest - else - scan entries positionals (skip_to_el rest) - | Macro ("Bl", _) :: rest -> scan entries positionals (skip_to_el rest) - | Macro ("Sh", args) :: rest - when String.uppercase_ascii (String.trim args) = "SYNOPSIS" -> - let positionals, rest = parse_synopsis positionals rest in - scan entries positionals rest - | _ :: rest -> scan entries positionals rest - and parse_synopsis positionals = function - | [] -> (positionals, []) - | Macro ("Sh", _) :: _ as rest -> (positionals, rest) - | Macro ("Ar", args) :: rest -> - let positionals = match positional_of_mdoc_line false args with - | Some p -> p :: positionals | None -> positionals in - parse_synopsis positionals rest - | Macro ("Op", args) :: rest -> - let words = String.split_on_char ' ' args - |> List.filter (fun word -> word <> "") in - let positionals = match words with - | "Ar" :: _ -> - (match positional_of_mdoc_line true args with - | Some p -> p :: positionals | None -> positionals) - | _ -> positionals in - parse_synopsis positionals rest - | _ :: rest -> parse_synopsis positionals rest - in - let entries, positionals = scan [] [] classified in - (* deduplicate positionals by name, preserving order *) - let positionals = - List.rev positionals - |> List.fold_left (fun (seen, acc) p -> - if List.mem p.pos_name seen then (seen, acc) - else (p.pos_name :: seen, p :: acc) - ) ([], []) - |> snd |> List.rev - in - { entries = List.rev entries; subcommands = []; positionals; description = "" } - -(* --- COMMANDS section subcommand extraction --- - * some manpages (notably systemctl) have a 
dedicated COMMANDS section - * listing subcommands with descriptions. these use .PP + bold name + - * .RS/.RE blocks: - * .PP - * \fBstart\fR \fIUNIT\fR... - * .RS 4 - * Start (activate) one or more units. - * .RE - * - * we extract the bold command name and first sentence of description. *) - -let extract_commands_section lines = - let classified = List.map classify_line lines in - (* collect all lines from the current position until the next .SH *) - let rec collect_until_next_sh lines acc = - match lines with - | [] -> List.rev acc - | Macro ("SH", _) :: _ -> List.rev acc - | line :: rest -> collect_until_next_sh rest (line :: acc) - in - let is_commands_section name = - let upper = String.uppercase_ascii (String.trim name) in - upper = "COMMANDS" || upper = "COMMAND" - in - (* find all COMMANDS/.COMMAND sections and collect their lines *) - let rec find_commands acc = function - | [] -> List.rev acc - | Macro ("SH", args) :: rest when is_commands_section args -> - find_commands (collect_until_next_sh rest [] :: acc) rest - | _ :: rest -> find_commands acc rest - in - let sections = find_commands [] classified in - List.concat sections - -(* extract subcommand name from a bold groff text like - * "\fBlist\-units\fR [\fIPATTERN\fR...]" -> "list-units" - * - * validates that the extracted name looks like a subcommand: lowercase, - * at least 2 chars, no leading dash. falls back to stripping all groff - * and taking the first word if no \fB...\fR wrapper is found. 
*) -let extract_bold_command_name text = - let trimmed = String.trim text in - (* check whether a string looks like a valid subcommand name *) - let is_valid_subcmd name = - String.length name >= 2 - && name.[0] <> '-' - && String.for_all (fun char_val -> - (char_val >= 'a' && char_val <= 'z') - || (char_val >= '0' && char_val <= '9') - || char_val = '-' || char_val = '_' - ) name - in - (* look for \fB...\fR at the start *) - if String.length trimmed >= 4 - && trimmed.[0] = '\\' && trimmed.[1] = 'f' && trimmed.[2] = 'B' then - let start = 3 in - let end_marker = "\\fR" in - match String.split_on_char '\\' (String.sub trimmed start (String.length trimmed - start)) with - | name_part :: _ -> - let name = strip_groff_escapes ("\\fB" ^ name_part ^ end_marker) |> String.trim in - if is_valid_subcmd name then Some name else None - | [] -> None - else - (* try already-stripped text — take the first word *) - let stripped = strip_groff_escapes trimmed in - let first_word = match String.split_on_char ' ' stripped with - | word :: _ -> word | [] -> "" in - if is_valid_subcmd first_word then Some first_word else None - -(* walk through commands section lines, extracting subcommand name+description - * pairs from .PP + Text + .RS/.RE blocks *) -let extract_subcommands_from_commands lines = - let rec walk lines acc = - match lines with - | [] -> List.rev acc - | Macro ("PP", _) :: rest -> - begin match rest with - | Text tag :: rest2 -> - (* check if this is a subcommand (bold name, not a flag) *) - begin match extract_bold_command_name tag with - | Some name -> - (* collect description from .RS/.RE block *) - let rec collect_desc lines desc_acc = - match lines with - | Macro ("RS", _) :: rest3 -> - collect_in_rs rest3 desc_acc - | Text text :: rest3 -> - collect_desc rest3 (text :: desc_acc) - | _ -> (String.concat " " (List.rev desc_acc), lines) - and collect_in_rs lines desc_acc = - match lines with - | Macro ("RE", _) :: rest3 -> - (String.concat " " (List.rev desc_acc), 
rest3) - | Text text :: rest3 -> - collect_in_rs rest3 (text :: desc_acc) - | Macro ("PP", _) :: _ | Macro ("SH", _) :: _ | Macro ("SS", _) :: _ -> - (String.concat " " (List.rev desc_acc), lines) - | _ :: rest3 -> collect_in_rs rest3 desc_acc - | [] -> (String.concat " " (List.rev desc_acc), []) - in - let (desc, rest3) = collect_desc rest2 [] in - let desc = String.trim desc in - (* take first sentence as description *) - let short_desc = match String.split_on_char '.' desc with - | first :: _ when String.length first > 0 -> String.trim first - | _ -> desc in - let sc : subcommand = { name; desc = short_desc } in - walk rest3 (sc :: acc) - | None -> walk rest2 acc - end - | _ -> walk rest acc - end - | _ :: rest -> walk rest acc - in - walk lines [] - -(* --- top-level api --- *) - -(* parse a manpage from its classified lines. - * auto-detects mdoc vs groff format. for groff, runs the multi-strategy - * extraction pipeline: extract OPTIONS section -> try all strategies -> - * pick best -> extract SYNOPSIS positionals -> extract COMMANDS subcommands. *) -let parse_manpage_lines lines = - if is_mdoc lines then - parse_mdoc_lines lines - else begin - let options_section = extract_options_section lines in - let entries = extract_entries options_section in - let positionals = extract_synopsis_positionals_lines lines in - let commands_section = extract_commands_section lines in - let subcommands = extract_subcommands_from_commands commands_section in - { entries; subcommands; positionals; description = "" } - end - -(* parse a manpage from its raw string contents. - * splits into lines, parses, then extracts the NAME section description. 
*) -let parse_manpage_string contents = - let lines = String.split_on_char '\n' contents in - let result = parse_manpage_lines lines in - let description = match extract_name_description contents with - | Some desc -> desc | None -> "" in - { result with description } - -(* --- clap-style SUBCOMMAND section extraction --- - * manpages generated by clap (rust's cli arg parser) put each subcommand - * under its own .SH SUBCOMMAND header with a Usage: line giving the name. - * this is unusual — most tools list subcommands under a single COMMANDS section. - * - * we collect all .SH SUBCOMMAND/SUBCOMMANDS sections, find the Usage: line - * in each to get the subcommand name, then extract flag entries from the - * section body. returns triples of (name, description, help_result). *) -let extract_subcommand_sections contents = - let lines = String.split_on_char '\n' contents in - let classified = List.map classify_line lines in - (* split into sections at .SH boundaries, keeping only SUBCOMMAND(S) sections *) - let rec collect_sections acc current_name current_lines = function - | [] -> - let acc = match current_name with - | Some section_name -> (section_name, List.rev current_lines) :: acc - | None -> acc in - List.rev acc - | Macro ("SH", args) :: rest -> - let acc = match current_name with - | Some section_name -> (section_name, List.rev current_lines) :: acc - | None -> acc in - let name = String.uppercase_ascii (String.trim args) in - if name = "SUBCOMMAND" || name = "SUBCOMMANDS" then - collect_sections acc (Some name) [] rest - else - collect_sections acc None [] rest - | line :: rest -> - collect_sections acc current_name (line :: current_lines) rest - in - let sections = collect_sections [] None [] classified in - (* for each SUBCOMMAND section, extract name from Usage: line and parse entries *) - let usage_re = Str.regexp {|Usage: \([a-zA-Z0-9_-]+\)|} in - let matches_usage text = - try ignore (Str.search_forward usage_re text 0); Some (Str.matched_group 1 
text) - with Not_found -> None in - List.filter_map (fun (_header, section_lines) -> - (* scan section lines for the Usage: line to get the subcommand name *) - let name, desc_lines = - List.fold_left (fun (name, desc_lines) line -> - match name with - | Some _ -> (name, desc_lines) - | None -> - match line with - | Text text -> - (match matches_usage text with - | Some _ as found -> (found, desc_lines) - | None -> (None, text :: desc_lines)) - | Macro (("TP" | "B" | "BI" | "BR"), args) -> - let text = strip_inline_macro_args args |> strip_groff_escapes |> String.trim in - (matches_usage text, desc_lines) - | _ -> (None, desc_lines) - ) (None, []) section_lines in - match name with - | None -> None - | Some subcmd_name -> - let entries = extract_entries section_lines in - let desc = String.concat " " (List.rev desc_lines) - |> strip_groff_escapes |> String.trim in - (* strip backtick-quoted words *) - let desc = Str.global_replace (Str.regexp "`\\([^`]*\\)`") "\\1" desc in - Some (subcmd_name, desc, { entries; subcommands = []; positionals = []; description = desc }) - ) sections - -(* read a manpage file from disk. handles .gz compressed files (the common - * case — most installed manpages are gzipped) using the Gzip library. - * plain text files are read directly. 
*) -let read_manpage_file path = - if Filename.check_suffix path ".gz" then begin - let ic = Gzip.open_in path in - let buffer = Buffer.create 8192 in - let chunk = Bytes.create 8192 in - (try while true do - let bytes_read = Gzip.input ic chunk 0 8192 in - if bytes_read = 0 then raise Exit - else Buffer.add_subbytes buffer chunk 0 bytes_read - done with Exit | End_of_file -> ()); - Gzip.close_in ic; - Buffer.contents buffer - end else begin - let ic = open_in path in - let size = in_channel_length ic in - let bytes = Bytes.create size in - really_input ic bytes 0 size; - close_in ic; - Bytes.to_string bytes - end - -(* convenience: read + parse a manpage file in one step *) -let parse_manpage_file path = - read_manpage_file path |> parse_manpage_string diff --git a/lib/nushell.ml b/lib/nushell.ml deleted file mode 100644 index b5e4d4f..0000000 --- a/lib/nushell.ml +++ /dev/null @@ -1,253 +0,0 @@ -(* nushell.ml — generate nushell extern definitions from parsed help data. - * - * this module is the code generation backend. it takes a help_result (from - * the parser or manpage modules) and produces nushell source code that - * defines `extern` declarations — nushell's mechanism for teaching the shell - * about external commands' flags and subcommands so it can offer completions. - * - * it also maintains a list of nushell's built-in commands to avoid generating - * extern definitions that would shadow them. 
- * - * key responsibilities: - * - deduplicating flag entries (same flag from multiple help sources) - * - mapping parameter names to nushell types (path, int, string) - * - formatting flags in nushell syntax: --flag(-f): type # description - * - handling positional arguments with nushell's ordering constraints - * - escaping special characters for nushell string literals - *) - -open Parser - -module SSet = Set.Make(String) -module SMap = Map.Make(String) -module CSet = Set.Make(Char) - -(* nushell built-in commands and keywords — we must never generate `extern` - * definitions for these because it would shadow nushell's own implementations. - * this list is maintained manually and should be updated with new nushell releases. *) -let nushell_builtins = [ - "alias"; "all"; "ansi"; "any"; "append"; "ast"; "attr"; - "bits"; "break"; "bytes"; - "cal"; "cd"; "char"; "chunk-by"; "chunks"; "clear"; "collect"; - "columns"; "commandline"; "compact"; "complete"; "config"; "const"; - "continue"; "cp"; - "date"; "debug"; "decode"; "def"; "default"; "describe"; "detect"; - "do"; "drop"; "du"; - "each"; "echo"; "encode"; "enumerate"; "error"; "every"; "exec"; - "exit"; "explain"; "explore"; "export"; "export-env"; "extern"; - "fill"; "filter"; "find"; "first"; "flatten"; "for"; "format"; "from"; - "generate"; "get"; "glob"; "grid"; "group-by"; - "hash"; "headers"; "help"; "hide"; "hide-env"; "histogram"; - "history"; "http"; - "if"; "ignore"; "input"; "insert"; "inspect"; "interleave"; "into"; - "is-admin"; "is-empty"; "is-not-empty"; "is-terminal"; "items"; - "job"; "join"; - "keybindings"; "kill"; - "last"; "length"; "let"; "let-env"; "lines"; "load-env"; "loop"; "ls"; - "match"; "math"; "merge"; "metadata"; "mkdir"; "mktemp"; "module"; - "move"; "mut"; "mv"; - "nu-check"; "nu-highlight"; - "open"; "overlay"; - "panic"; "par-each"; "parse"; "path"; "plugin"; "port"; "prepend"; "print"; "ps"; - "query"; - "random"; "reduce"; "reject"; "rename"; "return"; "reverse"; "rm"; - 
"roll"; "rotate"; "run-external"; - "save"; "schema"; "scope"; "select"; "seq"; "shuffle"; "skip"; "sleep"; - "slice"; "sort"; "sort-by"; "source"; "source-env"; "split"; "start"; - "stor"; "str"; "sys"; - "table"; "take"; "tee"; "term"; "timeit"; "to"; "touch"; "transpose"; - "try"; "tutor"; - "ulimit"; "umask"; "uname"; "uniq"; "uniq-by"; "unlet"; "update"; - "upsert"; "url"; "use"; - "values"; "version"; "view"; - "watch"; "where"; "which"; "while"; "whoami"; "window"; "with-env"; "wrap"; - "zip"; -] - -(* lazily constructed set for fast membership checks against builtins *) -let builtin_set = lazy (SSet.of_list nushell_builtins) - -(* returns true if the given command name collides with a nushell built-in *) -let is_nushell_builtin cmd = - SSet.mem cmd (Lazy.force builtin_set) - -(* deduplicate flag entries that refer to the same flag. - * when the same flag appears multiple times (e.g. from overlapping manpage - * sections or repeated help text), we keep the "best" version using a score: - * - both short+long form present: +10 (most informative) - * - has a parameter: +5 - * - description length bonus: up to +5 - * - * after deduplication by long name, we also remove standalone short flags - * whose letter is already covered by a Both(short, long) entry. this prevents - * emitting both "-v" and "--verbose(-v)" which nushell would reject as a - * duplicate. the filtering preserves original ordering from the help text. 
*) -let dedup_entries entries = - (* produce a canonical key for each entry based on its switch form *) - let key_of entry = - match entry.switch with - | Short c -> Printf.sprintf "-%c" c - | Long l | Both (_, l) -> Printf.sprintf "--%s" l - in - (* compute a quality score for ranking duplicate entries *) - let score entry = - let switch_bonus = match entry.switch with Both _ -> 10 | _ -> 0 in - let param_bonus = match entry.param with Some _ -> 5 | None -> 0 in - let desc_bonus = min 5 (String.length entry.desc / 10) in - switch_bonus + param_bonus + desc_bonus - in - (* fold over entries, keeping only the highest-scored entry per key *) - let best = List.fold_left (fun acc entry -> - let key = key_of entry in - match SMap.find_opt key acc with - | Some prev when score prev >= score entry -> acc - | _ -> SMap.add key entry acc - ) SMap.empty entries in - (* collect all short-flag characters that are already part of a Both entry, - * so we can suppress standalone Short entries for the same character *) - let covered = SMap.fold (fun _ entry acc -> - match entry.switch with - | Both (c, _) -> CSet.add c acc - | _ -> acc - ) best CSet.empty in - (* emit entries in original order, skipping duplicates and covered shorts *) - List.fold_left (fun (seen, acc) entry -> - let key = key_of entry in - if SSet.mem key seen then (seen, acc) - else match entry.switch with - | Short c when CSet.mem c covered -> (seen, acc) - | _ -> (SSet.add key seen, SMap.find key best :: acc) - ) (SSet.empty, []) entries |> snd |> List.rev - -(* map parameter names to nushell types. - * nushell's `extern` declarations use typed parameters, so we infer the type - * from the parameter name. file/path-related names become "path" (enables - * path completion), numeric names become "int", everything else is "string". 
*) -let nushell_type_of_param = function - | "FILE" | "file" | "PATH" | "path" | "DIR" | "dir" | "DIRECTORY" - | "FILENAME" | "PATTERNFILE" -> "path" - | "NUM" | "N" | "COUNT" | "NUMBER" | "int" | "INT" | "COLS" | "WIDTH" - | "LINES" | "DEPTH" | "depth" -> "int" - | _ -> "string" - -(* escape a string for use inside nushell double-quoted string literals. - * only double quotes and backslashes need escaping in nushell's syntax. *) -let escape_nu s = - if not (String.contains s '"') && not (String.contains s '\\') then s - else begin - let buf = Buffer.create (String.length s + 4) in - String.iter (fun c -> match c with - | '"' -> Buffer.add_string buf "\\\"" - | '\\' -> Buffer.add_string buf "\\\\" - | _ -> Buffer.add_char buf c - ) s; - Buffer.contents buf - end - -(* format a single flag entry as a nushell `extern` parameter line. - * output examples: - * " --verbose(-v) # increase verbosity" - * " --output(-o): path # write output to file" - * " -n: int # number of results" - * - * the description is right-padded to column 40 with a "# " comment prefix. - * nushell's syntax for combined short+long is "--long(-s)". *) -let format_flag entry = - let name = match entry.switch with - | Both (short_char, l) -> Printf.sprintf "--%s(-%c)" l short_char - | Long l -> Printf.sprintf "--%s" l - | Short short_char -> Printf.sprintf "-%c" short_char - in - let typed = match entry.param with - | Some (Mandatory p) | Some (Optional p) -> ": " ^ nushell_type_of_param p - | None -> "" - in - let flag = " " ^ name ^ typed in - if String.length entry.desc = 0 then flag - else - let pad_len = max 1 (40 - String.length flag) in - flag ^ String.make pad_len ' ' ^ "# " ^ entry.desc - -(* format a positional argument as a nushell `extern` parameter line. - * nushell syntax: "...name: type" for variadic, "name?: type" for optional. - * hyphens in names are converted to underscores since nushell identifiers - * cannot contain hyphens. 
*) -let format_positional positional = - let name = String.map (function '-' -> '_' | c -> c) positional.pos_name in - let prefix = if positional.variadic then "..." else "" in - let suffix = if positional.optional && not positional.variadic then "?" else "" in - let typ = nushell_type_of_param (String.uppercase_ascii positional.pos_name) in - Printf.sprintf " %s%s%s: %s" prefix name suffix typ - -(* enforce nushell's positional argument ordering rules: - * 1. no required positional may follow an optional one - * 2. at most one variadic ("rest") parameter is allowed - * - * if a required positional appears after an optional one, it is silently - * promoted to optional. duplicate variadic params are dropped. - * uses a fold to track the state across the list in one pass. *) -let fixup_positionals positionals = - List.fold_left (fun (seen_optional, seen_variadic, acc) positional -> - if positional.variadic then - (* only allow the first variadic parameter *) - if seen_variadic then (seen_optional, seen_variadic, acc) - else (true, true, positional :: acc) - else if seen_optional then - (* once we've seen an optional, all subsequent must be optional too *) - (true, seen_variadic, { positional with optional = true } :: acc) - else - (positional.optional, seen_variadic, positional :: acc) - ) (false, false, []) positionals - |> fun (_, _, acc) -> List.rev acc - -(* generate the full nushell `extern` block for a command. 
- * produces output like: - * export extern "git add" [ - * ...pathspec?: path - * --verbose(-v) # be verbose - * --dry-run(-n) # dry run - * ] - * - * subcommands that weren't resolved into their own full definitions get - * stub `extern` blocks with just a comment containing their description: - * export extern "git stash" [ # stash changes - * ] - *) -let extern_of cmd_name result = - let entries = dedup_entries result.entries in - let escaped_name = escape_nu cmd_name in - let positionals = fixup_positionals result.positionals in - (* format all positional and flag lines, each terminated with a newline *) - let pos_lines = List.map (fun positional -> format_positional positional ^ "\n") positionals in - let flags = List.map (fun entry -> format_flag entry ^ "\n") entries in - let main = Printf.sprintf "export extern \"%s\" [\n%s%s]\n" escaped_name (String.concat "" pos_lines) (String.concat "" flags) in - (* generate stub extern blocks for unresolved subcommands *) - let subs = List.map (fun (subcommand : subcommand) -> - Printf.sprintf "\nexport extern \"%s %s\" [ # %s\n]\n" - escaped_name (escape_nu subcommand.name) (escape_nu subcommand.desc) - ) result.subcommands in - String.concat "" (main :: subs) - -(* public alias for extern_of — this is the main entry point for callers *) -let generate_extern = extern_of - -(* derive a nushell `module` name from a command name. - * replaces non-alphanumeric characters with hyphens and appends "-completions". - * e.g. "git" becomes "git-completions", "docker-compose" stays "docker-compose-completions" *) -let module_name_of cmd_name = - let s = String.map (function - | ('a'..'z' | 'A'..'Z' | '0'..'9' | '-' | '_') as c -> c | _ -> '-') cmd_name in - s ^ "-completions" - -(* generate a complete nushell `module` wrapping the `extern`. - * output: "module git-completions { ... }\n\nuse git-completions *\n" - * the `use` at the end makes the `extern` immediately available in scope. 
*) -let generate_module cmd_name result = - let mod_name = module_name_of cmd_name in - Printf.sprintf "module %s {\n%s}\n\nuse %s *\n" mod_name (extern_of cmd_name result) mod_name - -(* convenience wrapper: generate an `extern` from just a list of entries - * (no subcommands, positionals, or description). used when we only have - * flag data and nothing else. *) -let generate_extern_from_entries cmd_name entries = - generate_extern cmd_name { entries; subcommands = []; positionals = []; description = "" } diff --git a/lib/parser.ml b/lib/parser.ml deleted file mode 100644 index f20aae5..0000000 --- a/lib/parser.ml +++ /dev/null @@ -1,814 +0,0 @@ -(* parser.ml — parse --help output into structured flag/subcommand/positional data. - * - * this module is the core of inshellah's help-text understanding. it takes the - * raw text that a cli tool prints when you run `cmd --help` and extracts: - * - flag entries (short/long switches with optional parameters and descriptions) - * - subcommand listings (name + description pairs) - * - positional arguments (from usage lines) - * - * the parser is built on Angstrom (a monadic parser combinator library) for the - * structured flag/subcommand extraction, with hand-rolled imperative parsers for - * usage-line positional extraction (where the format is too varied for clean - * combinator composition). - * - * key design decisions: - * - the Angstrom parser runs in prefix-consume mode — it doesn't need to parse - * the entire input, just extract what it can recognize. unrecognized lines are - * skipped via skip_non_option_line. - * - multi-line descriptions are handled via indentation-based continuation: - * lines indented 8+ spaces that don't start with '-' are folded into the - * previous entry's description. - * - subcommand detection uses a heuristic: lines with a name followed by 2+ - * spaces then a description, where the name is at least 2 chars. 
section - * headers (like "arguments:") toggle whether name-description pairs are - * treated as subcommands or positionals. - * - positional extraction has two paths: usage-line parsing (the common case) - * and CLI11's explicit "positionals:" section format. - *) - -open Angstrom - -(* strip ansi escape sequences and osc hyperlinks from --help output. - * many modern cli tools emit colored/styled output even when piped, - * so we need to clean this before parsing. handles: - * - csi sequences (esc [ ... final_byte) — colors, cursor movement, etc. - * - osc sequences (esc ] ... bel/st) — hyperlinks, window titles, etc. - * - other two-byte esc+char sequences *) -let strip_ansi s = - let buf = Buffer.create (String.length s) in - let len = String.length s in - let pos = ref 0 in - while !pos < len do - if !pos + 1 < len && Char.code s.[!pos] = 0x1b then begin - let next = s.[!pos + 1] in - if next = '[' then begin - (* csi sequence: esc [ ... final_byte *) - pos := !pos + 2; - while !pos < len && not (s.[!pos] >= '@' && s.[!pos] <= '~') do incr pos done; - if !pos < len then incr pos - end else if next = ']' then begin - (* osc sequence: esc ] ... (terminated by bel or esc \) *) - pos := !pos + 2; - let terminated = ref false in - while !pos < len && not !terminated do - if s.[!pos] = '\x07' then - (incr pos; terminated := true) - else if !pos + 1 < len && Char.code s.[!pos] = 0x1b && s.[!pos + 1] = '\\' then - (pos := !pos + 2; terminated := true) - else - incr pos - done - end else begin - (* other esc sequence, skip esc + one char *) - pos := !pos + 2 - end - end else begin - Buffer.add_char buf s.[!pos]; - incr pos - end - done; - Buffer.contents buf - -(* --- character class predicates --- - * used throughout the Angstrom parsers to classify characters. - * separated out for readability and reuse. *) - -let is_whitespace = function ' ' | '\t' -> true | _ -> false - -let is_alphanumeric = function - | 'A' .. 'Z' | 'a' .. 'z' | '0' .. 
'9' -> true - | _ -> false - -(* characters allowed inside parameter names like FILE, output-dir, etc. *) -let is_param_char = function - | 'A' .. 'Z' | 'a' .. 'z' | '0' .. '9' | '_' | '-' -> true - | _ -> false - -(* used to detect ALL_CAPS parameter names like FILE, TIME_STYLE *) -let is_upper_or_underscore = function - | 'A' .. 'Z' | '_' -> true - | _ -> false - -(* characters allowed in long flag names (--foo-bar, --enable-feature2) *) -let is_long_char = function - | 'A' .. 'Z' | 'a' .. 'z' | '0' .. '9' | '-' -> true - | _ -> false - -(* --- core types --- - * these types represent the structured output of parsing a help text. - * they are shared across the entire codebase (nushell codegen, store, manpage parser). - * - * switch: a flag can be short-only (-v), long-only (--verbose), or both (-v, --verbose). - * the both variant keeps the pair together so nushell can emit "--verbose(-v)". - * - * param: flags can take mandatory (--output FILE) or optional (--color[=WHEN]) values. - * - * entry: one complete flag definition — its switch form, optional parameter, and - * the description text (potentially multi-line, already joined). - * - * help_result: the complete parsed output for a single command. *) -type switch = Short of char | Long of string | Both of char * string -type param = Mandatory of string | Optional of string -type entry = { switch : switch; param : param option; desc : string } -type subcommand = { name : string; desc : string } -type positional = { pos_name : string; optional : bool; variadic : bool } -type help_result = { entries : entry list; subcommands : subcommand list; positionals : positional list; description : string } - -(* --- low-level Angstrom combinators --- - * building blocks for all the parsers below. *) - -(* consume horizontal whitespace (spaces and tabs) without crossing lines *) -let inline_ws = skip_while (function ' ' | '\t' -> true | _ -> false) -(* end of line — matches either a newline or end of input. 
- * this is the permissive version used in most places. *) -let eol = end_of_line <|> end_of_input -(* strict end of line — must consume an actual newline character. - * used in skip_non_option_line so we don't accidentally match eof - * and consume it when we shouldn't. *) -let eol_strict = end_of_line - -(* --- switch and parameter parsers --- - * parse the flag name portion of an option line, e.g. "-v", "--verbose" *) - -let short_switch = char '-' *> satisfy is_alphanumeric -let long_switch = string "--" *> take_while1 is_long_char -let comma = char ',' *> inline_ws - -(* parameter parsers — handle the various syntaxes tools use to indicate - * that a flag takes a value. the formats are surprisingly diverse: - * --output=FILE (eq_man_param — mandatory, common in gnu tools) - * --color[=WHEN] (eq_opt_param — optional with = syntax) - * --depth DEPTH (space_upper_param — space-separated ALL_CAPS) - * --file (space_angle_param — angle brackets) - * --file [] (space_opt_angle_param — optional angle brackets) - * --format string (space_type_param — go/cobra lowercase type word) - *) -let eq_opt_param = - string "[=" *> take_while1 is_param_char <* char ']' >>| fun a -> Optional a - -let eq_man_param = - char '=' *> take_while1 is_param_char >>| fun a -> Mandatory a - -(* space-separated ALL_CAPS param: e.g. " FILE", " TIME_STYLE". - * peek ahead and check the first char is uppercase, then validate - * the entire word is ALL_CAPS. prevents false positives where a - * description word like "Do" or "Set" immediately follows the flag name. - * digits are allowed (e.g. "SHA256") but lowercase chars disqualify. 
*) -let space_upper_param = - char ' ' *> peek_char_fail >>= fun c -> - if is_upper_or_underscore c then - take_while1 is_param_char >>= fun name -> - if String.length name >= 1 && String.for_all (fun c -> is_upper_or_underscore c || c >= '0' && c <= '9') name then - return (Mandatory name) - else - fail "not an all-caps param" - else - fail "not an uppercase param" - -(* angle-bracket param: e.g. "", "" *) -let angle_param = - char '<' *> take_while1 (fun c -> c <> '>') <* char '>' >>| fun name -> - Mandatory name - -(* space + angle bracket param *) -let space_angle_param = - char ' ' *> angle_param - -(* optional angle bracket param: [] *) -let opt_angle_param = - char '[' *> char '<' *> take_while1 (fun c -> c <> '>') <* char '>' <* char ']' - >>| fun name -> Optional name - -let space_opt_angle_param = - char ' ' *> opt_angle_param - -(* go/cobra style: space + lowercase type word like "string", "list", "int". - * capped at 10 chars to avoid consuming description words. - * go's flag libraries commonly emit "--timeout duration" or "--name string" - * where the type name is a short lowercase word. longer words are almost - * certainly the start of a description, not a type annotation. *) -let space_type_param = - char ' ' *> peek_char_fail >>= fun c -> - if c >= 'a' && c <= 'z' then - take_while1 (fun c -> c >= 'a' && c <= 'z') >>= fun name -> - if String.length name <= 10 then - return (Mandatory name) - else - fail "too long for type param" - else - fail "not a lowercase type param" - -(* try each parameter format in order of specificity. the ordering matters: - * eq_opt_param must come before eq_man_param because "[=WHEN]" would otherwise - * partially match as "=WHEN" then fail on the trailing "]". similarly, - * space_opt_angle_param before space_angle_param to catch "[]" before "". 
*) -let param_parser = - option None - (choice - [ eq_opt_param; eq_man_param; - space_opt_angle_param; space_angle_param; - space_upper_param; space_type_param ] - >>| fun a -> Some a) - -(* switch parser — handles the various ways help text presents flag names. - * formats handled (in order of attempt): - * -a, --all (short + comma + long — gnu style) - * -a --all (short + space + long — some tools omit the comma) - * --all / -a (long + slash + short — rare but seen in some tools) - * -a (short only) - * --all (long only) - * - * the ordering is critical because Angstrom's choice commits to - * the first parser that makes progress. short_switch consumes "-a", so the - * combined parsers must be tried before the short-only parser. *) -let switch_parser = - choice - [ - (short_switch >>= fun s -> - comma *> long_switch >>| fun l -> Both (s, l)); - (short_switch >>= fun s -> - char ' ' *> long_switch >>| fun l -> Both (s, l)); - (long_switch >>= fun l -> - inline_ws *> char '/' *> inline_ws *> - short_switch >>| fun s -> Both (s, l)); - (short_switch >>| fun s -> Short s); - (long_switch >>| fun l -> Long l); - ] - -(* --- description parsing with multi-line continuation --- - * descriptions in help text often wrap across multiple lines. the convention - * is that continuation lines are deeply indented (8+ spaces) and don't start - * with '-' (which would indicate a new flag entry). we peek ahead to check - * indentation without consuming, then decide whether to fold the line in. *) - -(* take the rest of the line as text (does not consume the newline itself) *) -let rest_of_line = take_till (fun c -> c = '\n' || c = '\r') - -(* check if a line is a continuation line: deeply indented, doesn't start with '-'. - * tabs count as 8 spaces to match typical terminal rendering. - * the 8-space threshold was chosen empirically — most help formatters indent - * descriptions at least this much, while flag lines are indented 2-4 spaces. 
*) -let continuation_line = - peek_string 1 >>= fun _ -> - (* must start with significant whitespace (8+ spaces or tab) *) - let count_indent s = - let indent = ref 0 in - let pos = ref 0 in - while !pos < String.length s do - (match s.[!pos] with - | ' ' -> incr indent - | '\t' -> indent := !indent + 8 - | _ -> pos := String.length s); - incr pos - done; - !indent - in - available >>= fun avail -> - if avail = 0 then fail "eof" - else - (* peek ahead to see indentation level *) - peek_string (min avail 80) >>= fun preview -> - let indent = count_indent preview in - let trimmed = String.trim preview in - let starts_with_dash = - String.length trimmed > 0 && trimmed.[0] = '-' - in - if indent >= 8 && not starts_with_dash then - (* this is a continuation line — consume whitespace + text *) - inline_ws *> rest_of_line <* eol - else - fail "not a continuation line" - -(* parse description text: first line (after switch+param) plus any continuation lines. - * blank continuation lines are filtered out, and all lines are trimmed and joined - * with spaces into a single string. *) -let description = - inline_ws *> rest_of_line <* eol >>= fun first_line -> - many continuation_line >>| fun cont_lines -> - let all = first_line :: cont_lines in - let all = List.filter (fun s -> String.length (String.trim s) > 0) all in - String.concat " " (List.map String.trim all) - -(* description that appears on a separate line below the flag. - * this handles the clap (rust) "long" help format where flags and descriptions - * are on separate lines: - * --verbose - * increase verbosity - * here there's no inline description — just deeply-indented continuation lines. 
*) -let description_below = - many1 continuation_line >>| fun lines -> - let lines = List.filter (fun s -> String.length (String.trim s) > 0) lines in - String.concat " " (List.map String.trim lines) - -(* --- line classification for skipping --- - * the parser needs to skip lines it doesn't understand (section headers, - * blank lines, description paragraphs not attached to a flag, etc.) - * without consuming lines that are flag entries. *) - -(* peek ahead to check if the current line looks like a flag entry. - * an option line starts with whitespace then '-'. *) -let at_option_line = - peek_string 1 >>= fun _ -> - available >>= fun avail -> - if avail = 0 then fail "eof" - else - peek_string (min avail 40) >>= fun preview -> - let s = String.trim preview in - if String.length s > 0 && s.[0] = '-' then return () - else fail "not an option line" - -(* skip a non-option line (section header, blank, description-only, etc.). - * uses eol_strict (not eol) so it won't match at eof — this prevents the - * parser from infinitely skipping at the end of input. if the line looks - * like an option line (at_option_line succeeds), we deliberately fail so - * that the entry parser gets a chance at it instead. *) -let skip_non_option_line = - (at_option_line *> fail "this is an option line") - <|> (rest_of_line *> eol_strict *> return ()) - -(* --- entry parsing --- *) - -(* parse a single flag entry: leading whitespace, then switch+param, then description. - * the description can appear on the same line (inline) or on the next line (below). - * if there's no description at all, we accept an empty string. - * the (eol *> description_below) branch handles the clap long-help format. 
*) -let entry = - inline_ws *> - lift2 (fun (sw, param) desc -> { switch = sw; param; desc }) - (lift2 (fun a b -> (a, b)) switch_parser param_parser) - (description <|> (eol *> (description_below <|> return ""))) - -(* --- subcommand parsing --- - * subcommand lines in help text follow the pattern: - * " name description" - * where the name and description are separated by 2+ spaces. - * some tools also include argument placeholders between name and description: - * " start UNIT... start one or more units" - * " list [PATTERN] list matching units" - *) - -let is_subcommand_char = function - | 'a' .. 'z' | 'A' .. 'Z' | '0' .. '9' | '-' | '_' -> true - | _ -> false - -(* skip argument placeholders like UNIT..., [PATTERN...|PID...], - * that appear between the subcommand name and the description. - * only consumes single-space gaps — the two-space gap before the - * description is left for the main parser to use as the delimiter. - * - * this is a recursive (fix-point) parser that peeks ahead to distinguish - * single-space argument gaps from the double-space description separator. - * it accepts tokens that start with [, <, or are ALL_CAPS (with dots/pipes/ - * commas for variadic syntax). 
*) -let skip_arg_placeholders = - fix (fun self -> - (* peek ahead: single space followed by arg-like token *) - available >>= fun avail -> - if avail < 2 then return () - else - peek_string (min avail 2) >>= fun peek_two -> - if String.length peek_two >= 2 && peek_two.[0] = ' ' && peek_two.[1] <> ' ' then - (* single space — could be an arg placeholder *) - let next = peek_two.[1] in - if next = '[' || next = '<' - || (next >= 'A' && next <= 'Z') then - (* peek the full token to check if it's ALL_CAPS/brackets *) - peek_string (min avail 80) >>= fun preview -> - (* extract the token after the single space *) - let tok_start = 1 in - let token_end = ref tok_start in - while !token_end < String.length preview - && preview.[!token_end] <> ' ' - && preview.[!token_end] <> '\n' - && preview.[!token_end] <> '\r' do - incr token_end - done; - let tok = String.sub preview tok_start (!token_end - tok_start) in - (* accept as placeholder if it starts with [ or < or is ALL_CAPS - (possibly with dots, pipes, dashes) *) - let is_placeholder = - tok.[0] = '[' || tok.[0] = '<' - || String.for_all (fun c -> - (c >= 'A' && c <= 'Z') || c = '_' || c = '-' - || c = '.' || c = '|' || c = ',' || (c >= '0' && c <= '9') - ) tok - in - if is_placeholder then - advance (1 + String.length tok) *> self - else return () - else return () - else return ()) - -(* parse a subcommand entry line. - * requires: name >= 2 chars, followed by 2+ spaces, then description. - * the name is lowercased for consistent lookup. - * - * if the description starts with "- " (a dash-space prefix), it's stripped. - * some tools format their subcommand lists as: - * " add - add a new item" - * where the "- " is decorative, not part of the description. 
*) -let subcommand_entry = - inline_ws *> - take_while1 is_subcommand_char >>= fun name -> - if String.length name < 2 then fail "subcommand name too short" - else - skip_arg_placeholders *> - char ' ' *> char ' ' *> inline_ws *> - rest_of_line <* eol >>| fun desc -> - { name = String.lowercase_ascii name; - desc = let trimmed = String.trim desc in - if String.length trimmed >= 2 && trimmed.[0] = '-' && trimmed.[1] = ' ' then - String.trim (String.sub trimmed 2 (String.length trimmed - 2)) - else trimmed } - -(* --- section header detection --- - * section headers are critical for disambiguating subcommands from positional - * arguments. lines like "commands:" introduce subcommand sections, while - * "arguments:" or "positionals:" introduce argument sections where the same - * name+description format should not be treated as subcommands. *) - -(* detect section names that introduce positional argument listings. - * the check is case-insensitive and strips trailing colons. *) -let is_arg_section s = - let lc = String.lowercase_ascii (String.trim s) in - let base = if String.ends_with ~suffix:":" lc - then String.sub lc 0 (String.length lc - 1) |> String.trim - else lc in - base = "arguments" || base = "args" || base = "positionals" - || base = "positional arguments" - -(* a section header: left-aligned (or lightly indented, <= 4 spaces) text - * ending with ':', not starting with '-'. must be consumed before - * subcommand_entry in the choice combinator, otherwise "commands:" would - * be parsed as a subcommand named "commands" with description ":". - * - * returns a bool indicating whether this is an argument section (true) - * or some other section (false). this drives the subcommand filtering logic - * in help_parser — entries under argument sections are excluded from the - * subcommand list. 
*) -let section_header = - available >>= fun avail -> - if avail = 0 then fail "eof" - else - peek_string (min avail 80) >>= fun preview -> - (* extract just the first line from the preview *) - let first_line = match String.index_opt preview '\n' with - | Some pos -> String.sub preview 0 pos - | None -> preview in - let trimmed = String.trim first_line in - let len = String.length trimmed in - let indent = let pos = ref 0 in - while !pos < String.length first_line && (first_line.[!pos] = ' ' || first_line.[!pos] = '\t') do incr pos done; - !pos in - if len >= 2 && trimmed.[len - 1] = ':' && trimmed.[0] <> '-' && indent <= 4 then - rest_of_line <* eol_strict >>| fun line -> is_arg_section line - else fail "not a section header" - -(* --- top-level parser --- - * the main help parser: walks through all lines, trying each line as one of: - * 1. a flag entry (starts with whitespace + '-') - * 2. a section header (left-aligned text ending with ':') - * 3. a subcommand line (name + 2+ spaces + description) - * 4. anything else — skip - * - * the choice ordering matters: entries are tried first (highest priority), - * then section headers (must beat subcommand_entry to avoid misparse), - * then subcommands, then skip as fallback. - * - * after collecting all items, two post-processing steps happen: - * - subcommands under argument sections are excluded (tracked via - * a running in_arg_sec boolean toggled by section headers) - * - duplicate subcommand names are deduplicated, keeping the entry - * with the longer description (heuristic: more info = better) - * - * positionals are not extracted here — they come from the usage line - * parser (extract_usage_positionals) or CLI11's explicit section parser - * (extract_cli11_positionals), applied later in parse_help. 
*) -let help_parser = - let open Angstrom in - fix (fun _self -> - let try_entry = - entry >>| fun e -> `Entry e - in - let try_section = - section_header >>| fun is_arg -> `Section is_arg - in - let try_subcommand = - subcommand_entry >>| fun sc -> `Subcommand sc - in - let try_skip = - skip_non_option_line >>| fun () -> `Skip - in - many (choice [ try_entry; try_section; try_subcommand; try_skip ]) >>| fun items -> - let entries = List.filter_map (function `Entry e -> Some e | _ -> None) items in - let subcommands = - List.fold_left (fun (in_arg_sec, acc) item -> - match item with - | `Section is_arg -> (is_arg, acc) - | `Subcommand sc when not in_arg_sec -> (in_arg_sec, sc :: acc) - | _ -> (in_arg_sec, acc) - ) (false, []) items - |> snd |> List.rev - |> List.fold_left (fun acc sc -> - match List.assoc_opt sc.name acc with - | Some prev when String.length prev.desc >= String.length sc.desc -> acc - | _ -> (sc.name, sc) :: List.remove_assoc sc.name acc - ) [] - |> List.rev_map snd - in - { entries; subcommands; positionals = []; description = "" }) - -(* --- usage line parsing --- - * usage lines look like: "usage: git add [OPTIONS] [--] [...]" - * to extract positional arguments, we first need to skip past the command - * name prefix ("git add") to reach the argument portion. - * - * skip_command_prefix walks word-by-word, treating each space-separated - * token as part of the command name as long as it: - * - is made of "word chars" (alphanumeric, hyphen, underscore, slash, dot) - * - contains at least one lowercase letter (to distinguish from ALL_CAPS - * positional names like FILE) - * - doesn't start with [, <, (, {, or - (which indicate arguments, not - * command name components) - * - * this is an imperative index-walking parser rather than using Angstrom, - * because usage lines are a single string (not line-oriented) and the format - * is too varied for clean combinator composition. 
*) -let skip_command_prefix s = - let len = String.length s in - let pos = ref 0 in - let skip_ws () = while !pos < len && (s.[!pos] = ' ' || s.[!pos] = '\t') do incr pos done in - let is_word_char = function - | 'a'..'z' | 'A'..'Z' | '0'..'9' | '-' | '_' | '/' | '.' -> true - | _ -> false - in - let rec loop () = - skip_ws (); - if !pos >= len then () - else if s.[!pos] = '[' || s.[!pos] = '<' || s.[!pos] = '(' || s.[!pos] = '{' || s.[!pos] = '-' then () - else if is_word_char s.[!pos] then begin - let start = !pos in - while !pos < len && is_word_char s.[!pos] do incr pos done; - let word = String.sub s start (!pos - start) in - let has_lower = ref false in - String.iter (fun c -> if c >= 'a' && c <= 'z' then has_lower := true) word; - if not !has_lower then - pos := start - else - loop () - end - in - loop (); - !pos - -(* parse the argument portion of a usage line into positional definitions. - * handles these syntactic forms: - * - mandatory positional - * [file] - optional positional - * FILE - mandatory positional (ALL_CAPS convention) - * ... - variadic (also handles utf-8 ellipsis) - * [file...] - optional variadic - * curly-brace alternatives - skipped, not a positional - * -flag - flags (skipped) - * - * certain ALL_CAPS names are skipped because they're not real positionals — - * "OPTIONS", "FLAGS", etc. are section labels that sometimes appear in usage - * lines for readability. - * - * deduplication at the end ensures we don't emit the same positional twice - * (can happen when usage lines are reformatted or repeated). 
*) -let parse_usage_args s = - let len = String.length s in - let pos = ref 0 in - let positionals = ref [] in - let skip_ws () = - while !pos < len && (s.[!pos] = ' ' || s.[!pos] = '\t') do incr pos done in - let is_pos_char c = - (c >= 'A' && c <= 'Z') || c = '_' || c = '-' || (c >= '0' && c <= '9') in - (* detect trailing dots or utf-8 ellipsis indicating variadic args *) - let read_dots () = - skip_ws (); - if !pos + 2 < len && s.[!pos] = '.' && s.[!pos+1] = '.' && s.[!pos+2] = '.' then - (pos := !pos + 3; true) - else if !pos + 2 < len && s.[!pos] = '\xe2' && s.[!pos+1] = '\x80' && s.[!pos+2] = '\xa6' then - (pos := !pos + 3; true) (* utf-8 ellipsis *) - else false - in - (* names that are section labels, not actual positional arguments *) - let is_skip name = - let u = String.uppercase_ascii name in - u = "OPTIONS" || u = "OPTION" || u = "FLAGS" || u = "FLAG" - in - (* validate that a name contains only alphanumeric, underscore, hyphen chars *) - let is_clean_name name = - String.length name >= 2 - && String.for_all (fun c -> - (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') - || (c >= '0' && c <= '9') || c = '_' || c = '-') name - in - let is_letter c = (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') in - (* skip {A|c|d|...} alternative blocks — not positional arguments *) - let skip_braces () = - if !pos < len && s.[!pos] = '{' then begin - let depth = ref 1 in - incr pos; - while !pos < len && !depth > 0 do - if s.[!pos] = '{' then incr depth - else if s.[!pos] = '}' then decr depth; - incr pos - done; - ignore (read_dots ()); - true - end else false - in - while !pos < len do - skip_ws (); - if !pos >= len then () - else if skip_braces () then () - else match s.[!pos] with - | '[' -> - (* optional positional: [name] or [] or [name...] 
*) - incr pos; - let start = !pos in - let depth = ref 1 in - while !pos < len && !depth > 0 do - if s.[!pos] = '[' then incr depth - else if s.[!pos] = ']' then decr depth; - incr pos - done; - let bracket_end = !pos - 1 in - let inner = String.sub s start (max 0 (bracket_end - start)) |> String.trim in - let inner, has_inner_dots = - if String.ends_with ~suffix:"..." inner then - (String.sub inner 0 (String.length inner - 3) |> String.trim, true) - else (inner, false) - in - let variadic = has_inner_dots || read_dots () in - if String.length inner > 0 - && inner.[0] <> '-' - && (is_letter inner.[0] || inner.[0] = '<') then begin - let name = - if inner.[0] = '<' then - let e = try String.index inner '>' with Not_found -> String.length inner in - String.sub inner 1 (e - 1) - else inner - in - if is_clean_name name && not (is_skip name) then - positionals := { pos_name = String.lowercase_ascii name; - optional = true; variadic } :: !positionals - end - | '<' -> - (* mandatory positional in angle brackets: *) - incr pos; - let start = !pos in - while !pos < len && s.[!pos] <> '>' do incr pos done; - let name = String.sub s start (!pos - start) in - if !pos < len then incr pos; - let variadic = read_dots () in - if is_clean_name name && not (is_skip name) then - positionals := { pos_name = String.lowercase_ascii name; - optional = false; variadic } :: !positionals - | '-' -> - (* flag — skip entirely, not a positional *) - while !pos < len && s.[!pos] <> ' ' && s.[!pos] <> '\t' && s.[!pos] <> ']' do incr pos done - | c when c >= 'A' && c <= 'Z' -> - (* ALL_CAPS positional name *) - let start = !pos in - while !pos < len && is_pos_char s.[!pos] do incr pos done; - let name = String.sub s start (!pos - start) in - let variadic = read_dots () in - if String.length name >= 2 - && String.for_all (fun c -> - (c >= 'A' && c <= 'Z') || c = '_' || c = '-' || (c >= '0' && c <= '9') - ) name - && not (is_skip name) then - positionals := { pos_name = String.lowercase_ascii name; 
- optional = false; variadic } :: !positionals - | _ -> - incr pos - done; - (* deduplicate positionals by name, keeping the first occurrence *) - List.rev !positionals - |> List.fold_left (fun (seen, acc) p -> - if List.mem p.pos_name seen then (seen, acc) - else (p.pos_name :: seen, p :: acc) - ) ([], []) - |> snd |> List.rev - -(* find the "usage:" line in the help text and extract positionals from it. - * searches line-by-line for a line starting with "usage:" (case-insensitive). - * handles both inline usage ("usage: cmd [OPTIONS] FILE") and the clap style - * where the actual usage is on the next line: - * USAGE: - * cmd [OPTIONS] FILE - * - * also handles the bare "usage" header (no colon) followed by a next line. *) -let extract_usage_positionals text = - let lines = String.split_on_char '\n' text in - let lines_arr = Array.of_list lines in - let len = Array.length lines_arr in - (* search through lines for the first usage header and return the usage content *) - let find_usage_line () = - let check_line idx = - let trimmed = String.trim lines_arr.(idx) in - let trimmed_len = String.length trimmed in - let lc = String.lowercase_ascii trimmed in - if trimmed_len >= 6 && String.sub lc 0 6 = "usage:" then begin - let after = String.sub trimmed 6 (trimmed_len - 6) |> String.trim in - if String.length after > 0 then Some after - else if idx + 1 < len then - (* clap style: USAGE:\n cmd [OPTIONS] PATTERN *) - let next = String.trim lines_arr.(idx + 1) in - if String.length next > 0 then Some next else None - else None - end else if lc = "usage" then begin - if idx + 1 < len then - let next = String.trim lines_arr.(idx + 1) in - if String.length next > 0 then Some next else None - else None - end else None - in - (* use List.find_map over the index range to find the first matching line *) - List.find_map check_line (List.init len Fun.id) - in - match find_usage_line () with - | None -> [] - | Some usage -> - let cmd_end = skip_command_prefix usage in - let args = 
String.sub usage cmd_end (String.length usage - cmd_end) in - parse_usage_args args - -(* extract positionals from CLI11's explicit "POSITIONALS:" section. - * CLI11 (a c++ arg parsing library) emits a dedicated section: - * Positionals: - * name TEXT description here - * count INT another description - * - * this is preferred over usage-line extraction when present because it - * provides more accurate type information. the parser looks for the - * section header, then reads indented lines until a blank or unindented - * line signals the end. type words (TEXT, INT, FLOAT, etc.) between the - * name and description are skipped. *) -let extract_cli11_positionals text = - let lines = String.split_on_char '\n' text in - (* parse a single indented positional line into a positional record *) - let parse_one s = - let len = String.length s in - let pos = ref 0 in - let is_name_char c = - (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') - || (c >= '0' && c <= '9') || c = '_' || c = '-' in - while !pos < len && is_name_char s.[!pos] do incr pos done; - if !pos < 2 then None - else - let name = String.sub s 0 !pos in - while !pos < len && (s.[!pos] = ' ' || s.[!pos] = '\t') do incr pos done; - (* skip type word: TEXT, INT, FLOAT, ENUM, BOOLEAN, etc. *) - while !pos < len && s.[!pos] >= 'A' && s.[!pos] <= 'Z' do incr pos done; - while !pos < len && (s.[!pos] = ' ' || s.[!pos] = '\t') do incr pos done; - let variadic = !pos + 2 < len && s.[!pos] = '.' && s.[!pos+1] = '.' && s.[!pos+2] = '.' 
in - Some { pos_name = String.lowercase_ascii name; optional = false; variadic } - in - (* parse consecutive indented lines under the section header *) - let rec parse_lines lines acc = - match lines with - | [] -> List.rev acc - | line :: rest -> - let len = String.length line in - if len = 0 || (line.[0] <> ' ' && line.[0] <> '\t') then - List.rev acc - else - let trimmed = String.trim line in - if String.length trimmed = 0 then List.rev acc - else match parse_one trimmed with - | Some p -> parse_lines rest (p :: acc) - | None -> parse_lines rest acc - in - (* scan lines for the positionals section header, then parse the body *) - let rec find_section = function - | [] -> [] - | line :: rest -> - let trimmed = String.trim line in - if trimmed = "POSITIONALS:" || trimmed = "Positionals:" then - parse_lines rest [] - else - find_section rest - in - find_section lines - -(* top-level entry point: parse a --help text string into a help_result. - * steps: - * 1. strip ansi escapes (colors, hyperlinks, etc.) - * 2. run the Angstrom help_parser for flags and subcommands - * 3. extract positionals via CLI11 format (preferred) or usage line (fallback) - * 4. merge positionals into the result - * uses Angstrom's prefix-consume mode — we don't need to parse every byte. *) -let parse_help txt = - let clean = strip_ansi txt in - match Angstrom.parse_string ~consume:Consume.Prefix help_parser clean with - | Ok result -> - let cli11 = extract_cli11_positionals clean in - let usage = extract_usage_positionals clean in - let positionals = if cli11 <> [] then cli11 else usage in - Ok { result with positionals } - | Error msg -> Error msg diff --git a/lib/store.ml b/lib/store.ml deleted file mode 100644 index 2466c81..0000000 --- a/lib/store.ml +++ /dev/null @@ -1,670 +0,0 @@ -(* store.ml — filesystem-backed cache of parsed completion data. - * - * this module handles persistence of completion data to disk. 
each command's - * help_result is serialized to JSON and stored as a file in a cache directory - * (default: $XDG_CACHE_HOME/inshellah). commands with native nushell completions - * are stored as .nu files instead. - * - * the store also provides lookup, listing, and subcommand discovery by - * scanning filenames in the cache directory. - * - * file naming convention: - * - spaces in command names become underscores (e.g. "git add" -> "git_add.json") - * - subcommands of a parent share the prefix (e.g. "git_add.json", "git_commit.json") - * - .json files contain serialized help_result - * - .nu files contain native nushell extern source code - * - * the module includes a minimal hand-rolled JSON parser/serializer because - * we only need to handle our own output format (no need for a full JSON library). - *) - -open Parser - -(* get the default store path: $XDG_CACHE_HOME/inshellah, falling back to - * ~/.cache/inshellah if XDG_CACHE_HOME is not set. *) -let default_store_path () = - let cache = try Sys.getenv "XDG_CACHE_HOME" - with Not_found -> Filename.concat (Sys.getenv "HOME") ".cache" in - Filename.concat cache "inshellah" - -(* recursively create directories along a path (equivalent to mkdir -p). - * splits the path into components and folds over them, accumulating - * the current directory prefix and creating each level if missing. 
*) -let ensure_dir dir = - let sep = Filename.dir_sep in - let parts = String.split_on_char sep.[0] dir in - (* determine the starting prefix: absolute paths begin with "/" *) - let start = if String.length dir > 0 && dir.[0] = sep.[0] then sep else "" in - let _final = - List.fold_left (fun current part -> - if part = "" then current - else begin - let next = if current = sep then sep ^ part - else if current = "" then part - else current ^ sep ^ part in - (if not (Sys.file_exists next) then Unix.mkdir next 0o755); - next - end - ) start parts - in - () - -(* convert command name to safe filename: spaces become underscores, - * non-alphanumeric chars become hyphens. - * e.g. "git add" -> "git_add", "docker-compose" -> "docker-compose" *) -let filename_of_command cmd = - String.map (function - | ' ' -> '_' - | ('a'..'z' | 'A'..'Z' | '0'..'9' | '-' | '_' | '.') as char_val -> char_val - | _ -> '-') cmd - -(* inverse of filename_of_command: underscores back to spaces. - * note: this is lossy — original underscores in command names - * (e.g. "my_tool") would be converted to spaces. in practice this - * doesn't matter because tools with underscores in names are rare, - * and subcommands use space-separated naming. *) -let command_of_filename base_name = - String.map (function '_' -> ' ' | char_val -> char_val) base_name - -(* --- JSON serialization of help_result --- - * hand-rolled JSON emitters. we don't use a JSON library because: - * 1. the schema is fixed and simple — we only serialize our own types - * 2. avoiding dependencies keeps the binary small - * 3. printf-style emission is fast and straightforward for our types *) - -(* escape a string for JSON: quotes, backslashes, and control characters. - * control chars below 0x20 are emitted as \uXXXX unicode escapes. 
*) -let escape_json contents = - let buf = Buffer.create (String.length contents + 4) in - String.iter (fun char_val -> match char_val with - | '"' -> Buffer.add_string buf "\\\"" - | '\\' -> Buffer.add_string buf "\\\\" - | '\n' -> Buffer.add_string buf "\\n" - | '\t' -> Buffer.add_string buf "\\t" - | '\r' -> Buffer.add_string buf "\\r" - | c when Char.code c < 0x20 -> - Buffer.add_string buf (Printf.sprintf "\\u%04x" (Char.code c)) - | c -> Buffer.add_char buf c - ) contents; - Buffer.contents buf - -(* wrap a string in quotes after escaping for JSON *) -let json_string text = Printf.sprintf "\"%s\"" (escape_json text) - -(* the literal null value for JSON output *) -let json_null = "null" - -(* serialize a switch (short flag, long flag, or both) to JSON *) -let json_switch_of = function - | Short char_val -> - Printf.sprintf "{\"type\":\"short\",\"char\":%s}" (json_string (String.make 1 char_val)) - | Long name -> - Printf.sprintf "{\"type\":\"long\",\"name\":%s}" (json_string name) - | Both (char_val, name) -> - Printf.sprintf "{\"type\":\"both\",\"char\":%s,\"name\":%s}" - (json_string (String.make 1 char_val)) (json_string name) - -(* serialize a parameter spec (mandatory, optional, or absent) to JSON *) -let json_param_of = function - | None -> json_null - | Some (Mandatory name) -> - Printf.sprintf "{\"kind\":\"mandatory\",\"name\":%s}" (json_string name) - | Some (Optional name) -> - Printf.sprintf "{\"kind\":\"optional\",\"name\":%s}" (json_string name) - -(* serialize a single flag entry (switch + param + description) to JSON *) -let json_entry_of entry = - Printf.sprintf "{\"switch\":%s,\"param\":%s,\"desc\":%s}" - (json_switch_of entry.switch) (json_param_of entry.param) (json_string entry.desc) - -(* serialize a subcommand (name + description) to JSON *) -let json_subcommand_of sc = - Printf.sprintf "{\"name\":%s,\"desc\":%s}" (json_string sc.name) (json_string sc.desc) - -(* serialize a positional argument to JSON *) -let json_positional_of p = - 
Printf.sprintf "{\"name\":%s,\"optional\":%b,\"variadic\":%b}" - (json_string p.pos_name) p.optional p.variadic - -(* serialize a list of items to a JSON array using the given formatter *) -let json_list formatter items = - "[" ^ String.concat "," (List.map formatter items) ^ "]" - -(* serialize an entire help_result to a JSON object string *) -let json_of_help_result ?(source="help") result = - Printf.sprintf "{\"source\":%s,\"description\":%s,\"entries\":%s,\"subcommands\":%s,\"positionals\":%s}" - (json_string source) - (json_string result.description) - (json_list json_entry_of result.entries) - (json_list json_subcommand_of result.subcommands) - (json_list json_positional_of result.positionals) - -(* --- JSON deserialization --- - * minimal hand-rolled recursive-descent JSON parser. only handles the subset - * we emit: strings, booleans, nulls, arrays, and objects. no number parsing - * (we don't emit numbers). this is intentionally minimal — we only read back - * our own serialized format, so robustness against arbitrary JSON is not needed. - * - * note: the \u escape handler does basic UTF-8 encoding for code points - * up to 0xFFFF but doesn't handle surrogate pairs. this is fine for our use - * case since we only escape control characters below 0x20. 
*) - -type json = - | Jnull - | Jbool of bool - | Jstring of string - | Jarray of json list - | Jobject of (string * json) list - -(* JSON accessor helpers — return sensible defaults for missing/wrong types *) -let json_get key = function - | Jobject pairs -> (try List.assoc key pairs with Not_found -> Jnull) - | _ -> Jnull - -(* extract a string from a JSON value, defaulting to empty string *) -let json_to_string = function Jstring text -> text | _ -> "" - -(* extract a boolean from a JSON value, defaulting to false *) -let json_to_bool = function Jbool value -> value | _ -> false - -(* extract a list from a JSON array value, defaulting to empty list *) -let json_to_list = function Jarray items -> items | _ -> [] - -exception Json_error of string - -(* imperative recursive-descent JSON parser. - * uses a mutable position ref to walk through the string. - * note: boolean/null parsing just advances a fixed number of chars - * without validating the actual characters — safe because we only read - * our own output, but would be incorrect for arbitrary JSON. 
*) -let parse_json contents = - let len = String.length contents in - let pos = ref 0 in - (* peek at the current character without consuming it *) - let peek () = if !pos < len then contents.[!pos] else '\x00' in - (* advance the position by one character *) - let advance () = incr pos in - (* skip over any whitespace characters at current position *) - let skip_ws () = - while !pos < len && (contents.[!pos] = ' ' || contents.[!pos] = '\t' - || contents.[!pos] = '\n' || contents.[!pos] = '\r') do - advance () - done in - (* skip whitespace then consume the expected character, or raise *) - let expect char_val = - skip_ws (); - if peek () <> char_val then - raise (Json_error (Printf.sprintf "expected '%c' at %d" char_val !pos)); - advance () in - (* mutually recursive parsers for each JSON value type *) - let rec parse_value () = - skip_ws (); - match peek () with - | '"' -> Jstring (parse_string ()) - | '{' -> parse_object () - | '[' -> parse_array () - | 'n' -> advance (); advance (); advance (); advance (); Jnull - | 't' -> advance (); advance (); advance (); advance (); Jbool true - | 'f' -> - advance (); advance (); advance (); advance (); advance (); Jbool false - | char_val -> - raise (Json_error (Printf.sprintf "unexpected '%c' at %d" char_val !pos)) - (* parse a quoted string value, handling escape sequences *) - and parse_string () = - expect '"'; - let buf = Buffer.create 32 in - while peek () <> '"' do - if peek () = '\\' then begin - advance (); - (match peek () with - | '"' -> Buffer.add_char buf '"' - | '\\' -> Buffer.add_char buf '\\' - | 'n' -> Buffer.add_char buf '\n' - | 't' -> Buffer.add_char buf '\t' - | 'r' -> Buffer.add_char buf '\r' - | 'u' -> - (* handle \uXXXX unicode escapes with basic UTF-8 encoding *) - advance (); - let hex = String.sub contents !pos 4 in - pos := !pos + 3; - let code = int_of_string ("0x" ^ hex) in - if code < 128 then Buffer.add_char buf (Char.chr code) - else begin - if code < 0x800 then begin - Buffer.add_char buf 
(Char.chr (0xc0 lor (code lsr 6))); - Buffer.add_char buf (Char.chr (0x80 lor (code land 0x3f))) - end else begin - Buffer.add_char buf (Char.chr (0xe0 lor (code lsr 12))); - Buffer.add_char buf (Char.chr (0x80 lor ((code lsr 6) land 0x3f))); - Buffer.add_char buf (Char.chr (0x80 lor (code land 0x3f))) - end - end - | char_val -> Buffer.add_char buf char_val); - advance () - end else begin - Buffer.add_char buf (peek ()); - advance () - end - done; - advance (); (* consume closing quote *) - Buffer.contents buf - (* parse a JSON object: { "key": value, ... } *) - and parse_object () = - expect '{'; - skip_ws (); - if peek () = '}' then (advance (); Jobject []) - else begin - let pairs = ref [] in - let more = ref true in - while !more do - skip_ws (); - let key = parse_string () in - expect ':'; - let value = parse_value () in - pairs := (key, value) :: !pairs; - skip_ws (); - if peek () = ',' then advance () - else more := false - done; - expect '}'; - Jobject (List.rev !pairs) - end - (* parse a JSON array: [ value, value, ... ] *) - and parse_array () = - expect '['; - skip_ws (); - if peek () = ']' then (advance (); Jarray []) - else begin - let items = ref [] in - let more = ref true in - while !more do - let value = parse_value () in - items := value :: !items; - skip_ws (); - if peek () = ',' then advance () - else more := false - done; - expect ']'; - Jarray (List.rev !items) - end - in - parse_value () - -(* --- JSON to OCaml type converters --- - * these reconstruct our parser types from their JSON representations. - * they mirror the json_*_of serializers above. 
*) - -(* reconstruct a switch value from its JSON representation *) -let switch_of_json json_node = - match json_to_string (json_get "type" json_node) with - | "short" -> - let char_str = json_to_string (json_get "char" json_node) in - Short (if String.length char_str > 0 then char_str.[0] else '?') - | "long" -> Long (json_to_string (json_get "name" json_node)) - | "both" -> - let char_str = json_to_string (json_get "char" json_node) in - Both ((if String.length char_str > 0 then char_str.[0] else '?'), - json_to_string (json_get "name" json_node)) - | _ -> Long "?" - -(* reconstruct a parameter spec from its JSON representation *) -let param_of_json = function - | Jnull -> None - | json_node -> - let name = json_to_string (json_get "name" json_node) in - (match json_to_string (json_get "kind" json_node) with - | "mandatory" -> Some (Mandatory name) - | "optional" -> Some (Optional name) - | _ -> None) - -(* reconstruct a flag entry from its JSON representation *) -let entry_of_json json_node = - { switch = switch_of_json (json_get "switch" json_node); - param = param_of_json (json_get "param" json_node); - desc = json_to_string (json_get "desc" json_node) } - -(* reconstruct a subcommand from its JSON representation *) -let subcommand_of_json json_node = - { name = json_to_string (json_get "name" json_node); - desc = json_to_string (json_get "desc" json_node) } - -(* reconstruct a positional argument from its JSON representation *) -let positional_of_json json_node = - { pos_name = json_to_string (json_get "name" json_node); - optional = json_to_bool (json_get "optional" json_node); - variadic = json_to_bool (json_get "variadic" json_node) } - -(* reconstruct a full help_result from its JSON representation *) -let help_result_of_json json_node = - { entries = List.map entry_of_json (json_to_list (json_get "entries" json_node)); - subcommands = List.map subcommand_of_json (json_to_list (json_get "subcommands" json_node)); - positionals = List.map 
positional_of_json (json_to_list (json_get "positionals" json_node)); - description = json_to_string (json_get "description" json_node) } - -(* --- filesystem operations --- *) - -(* write a string to a file, overwriting any existing content *) -let write_file path contents = - let oc = open_out path in - output_string oc contents; - close_out oc - -(* read an entire file into a string, returning None on any error *) -let read_file path = - try - let ic = open_in path in - let size = in_channel_length ic in - let contents = Bytes.create size in - really_input ic contents 0 size; - close_in ic; - Some (Bytes.to_string contents) - with _ -> None - -(* write a parsed help_result to the store as JSON *) -let write_result ~dir ?(source="help") command result = - let path = Filename.concat dir (filename_of_command command ^ ".json") in - write_file path (json_of_help_result ~source result) - -(* write native nushell completion source to the store as a .nu file *) -let write_native ~dir command data = - let path = Filename.concat dir (filename_of_command command ^ ".nu") in - write_file path data - -(* check whether a path exists and is a directory *) -let is_dir path = Sys.file_exists path && Sys.is_directory path - -(* look for a command's data file across multiple store directories. - * checks JSON first, then .nu. returns the first match found. - * directories are searched in order (user dir before system dirs). *) -let find_file dirs command = - let base_name = filename_of_command command in - List.find_map (fun directory -> - let json_path = Filename.concat directory (base_name ^ ".json") in - if Sys.file_exists json_path then Some json_path - else - let nu_path = Filename.concat directory (base_name ^ ".nu") in - if Sys.file_exists nu_path then Some nu_path - else None - ) dirs - -(* parse a nushell .nu file to extract a help_result for a specific command. - * .nu files contain `export extern "cmd" [ ... ]` blocks with flag definitions. 
- * this parser extracts flags, positionals, subcommands, and descriptions - * from the nushell extern syntax so the completer can use native completions. - * - * nushell extern parameter syntax: - * --flag(-s): type # description → Both(s, "flag") with param - * --flag: type # description → Long "flag" with param - * --flag # description → Long "flag" no param - * -s # description → Short 's' - * name: type # description → positional - * name?: type → optional positional - * ...name: type → variadic positional - *) -let parse_nu_completions target_cmd contents = - let lines = String.split_on_char '\n' contents in - (* extract the description comment preceding an export extern block *) - let current_desc = ref "" in - (* collect all extern blocks: (cmd_name, entries, positionals, description) *) - let blocks = ref [] in - let in_block = ref false in - let block_cmd = ref "" in - let block_entries = ref [] in - let block_positionals = ref [] in - let block_desc = ref "" in - let finish_block () = - if !in_block then begin - blocks := (!block_cmd, List.rev !block_entries, - List.rev !block_positionals, !block_desc) :: !blocks; - in_block := false - end in - List.iter (fun line -> - let trimmed = String.trim line in - if not !in_block then begin - (* look for description comments and export extern lines *) - if String.length trimmed > 2 && trimmed.[0] = '#' && trimmed.[1] = ' ' then - current_desc := String.trim (String.sub trimmed 2 (String.length trimmed - 2)) - else if String.length trimmed > 15 - && (try ignore (Str.search_forward - (Str.regexp_string "export extern") trimmed 0); true - with Not_found -> false) then begin - (* extract command name from: export extern "cmd name" [ or export extern cmd [ *) - let re_quoted = Str.regexp {|export extern "\([^"]*\)"|} in - let re_bare = Str.regexp {|export extern \([a-zA-Z0-9_-]+\)|} in - let cmd_opt = - if try ignore (Str.search_forward re_quoted trimmed 0); true - with Not_found -> false - then Some 
(Str.matched_group 1 trimmed) - else if try ignore (Str.search_forward re_bare trimmed 0); true - with Not_found -> false - then Some (Str.matched_group 1 trimmed) - else None in - if cmd_opt <> None then begin - let cmd = match cmd_opt with Some c -> c | None -> "" in - in_block := true; - block_cmd := cmd; - block_entries := []; - block_positionals := []; - block_desc := !current_desc; - current_desc := "" - end - end else - current_desc := "" - end else begin - (* inside an extern block — parse flag/positional lines *) - if String.length trimmed > 0 && trimmed.[0] = ']' then - finish_block () - else begin - (* extract description from # comment *) - let param_part, desc = - match String.split_on_char '#' trimmed with - | before :: rest -> - (String.trim before, - String.trim (String.concat "#" rest)) - | _ -> (trimmed, "") - in - if String.length param_part > 1 then begin - if param_part.[0] = '-' && param_part.[1] = '-' then begin - (* long flag: --flag(-s): type or --flag: type or --flag *) - let re_both = Str.regexp {|--\([a-zA-Z0-9-]+\)(-\([a-zA-Z0-9]\))\(: *\([a-zA-Z]+\)\)?|} in - let re_long = Str.regexp {|--\([a-zA-Z0-9-]+\)\(: *\([a-zA-Z]+\)\)?|} in - if try ignore (Str.search_forward re_both param_part 0); true - with Not_found -> false then begin - let long = Str.matched_group 1 param_part in - let short = (Str.matched_group 2 param_part).[0] in - let param = try Some (Mandatory (Str.matched_group 4 param_part)) - with Not_found | Invalid_argument _ -> None in - block_entries := { switch = Both (short, long); param; desc } :: !block_entries - end else if try ignore (Str.search_forward re_long param_part 0); true - with Not_found -> false then begin - let long = Str.matched_group 1 param_part in - let param = try Some (Mandatory (Str.matched_group 3 param_part)) - with Not_found | Invalid_argument _ -> None in - block_entries := { switch = Long long; param; desc } :: !block_entries - end - end else if param_part.[0] = '-' then begin - (* short flag: -s 
*) - if String.length param_part >= 2 then - let c = param_part.[1] in - if (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') then - block_entries := { switch = Short c; param = None; desc } :: !block_entries - end else begin - (* positional: name: type or name?: type or ...name: type *) - let variadic = String.starts_with ~prefix:"..." param_part in - let part = if variadic then String.sub param_part 3 (String.length param_part - 3) - else param_part in - let optional = try let q = String.index part '?' in q > 0 - with Not_found -> false in - let name = match String.index_opt part ':' with - | Some i -> String.trim (String.sub part 0 i) - | None -> match String.index_opt part '?' with - | Some i -> String.trim (String.sub part 0 i) - | None -> String.trim part in - let name = String.map (function '-' -> '_' | c -> c) name in - if String.length name > 0 && name.[0] <> '-' then - block_positionals := { pos_name = name; optional = optional || variadic; - variadic } :: !block_positionals - end - end - end - end - ) lines; - finish_block (); - let blocks = List.rev !blocks in - (* find the block matching the target command *) - let target = target_cmd in - match List.find_opt (fun (cmd, _, _, _) -> cmd = target) blocks with - | Some (_, entries, positionals, description) -> - (* collect subcommands from other blocks that are children of this command *) - let prefix = target ^ " " in - let subcommands = List.filter_map (fun (cmd, _, _, desc) -> - if String.starts_with ~prefix cmd then - let sub_name = String.sub cmd (String.length prefix) - (String.length cmd - String.length prefix) in - (* only immediate subcommands (no further spaces) *) - if not (String.contains sub_name ' ') && String.length sub_name > 0 - then Some { name = sub_name; desc } - else None - else None - ) blocks in - { entries; subcommands; positionals; description } - | None -> - (* target not found — return empty result *) - { entries = []; subcommands = []; positionals = []; description = "" } - -(* 
look up a command and deserialize its help_result. - * searches for .json files first, then falls back to .nu files - * (parsing the nushell extern syntax to extract completion data). - * for subcommands like "rbw get", also checks the parent's .nu file - * (e.g. rbw.nu) since clap-generated .nu files contain all extern - * blocks in a single file. *) -let lookup dirs command = - let base_name = filename_of_command command in - (* also try the root command's .nu file for subcommand lookups. - * "rbw get" -> try rbw.nu and look for the "rbw get" extern block. *) - let parent_base = match String.index_opt command ' ' with - | Some i -> Some (filename_of_command (String.sub command 0 i)) - | None -> None in - List.find_map (fun directory -> - let json_path = Filename.concat directory (base_name ^ ".json") in - match read_file json_path with - | Some data -> - (try Some (help_result_of_json (parse_json data)) - with _ -> None) - | None -> - let nu_path = Filename.concat directory (base_name ^ ".nu") in - (match read_file nu_path with - | Some data -> - (try Some (parse_nu_completions command data) - with _ -> None) - | None -> - (* try parent's .nu file for subcommand blocks *) - match parent_base with - | Some pb -> - let parent_nu = Filename.concat directory (pb ^ ".nu") in - (match read_file parent_nu with - | Some data -> - (try - let r = parse_nu_completions command data in - if r.entries <> [] || r.subcommands <> [] || r.positionals <> [] - then Some r else None - with _ -> None) - | None -> None) - | None -> None) - ) dirs - -(* look up a command's raw data (JSON or .nu source) without parsing. - * used by the "query" command to dump stored data as-is. 
*) -let lookup_raw dirs command = - let base_name = filename_of_command command in - List.find_map (fun directory -> - let json_path = Filename.concat directory (base_name ^ ".json") in - match read_file json_path with - | Some _ as result -> result - | None -> - let nu_path = Filename.concat directory (base_name ^ ".nu") in - read_file nu_path - ) dirs - -(* strip known extensions (.json or .nu) from a filename, returning None - * if the filename has neither extension *) -let chop_extension filename = - if Filename.check_suffix filename ".json" then Some (Filename.chop_suffix filename ".json") - else if Filename.check_suffix filename ".nu" then Some (Filename.chop_suffix filename ".nu") - else None - -(* discover subcommands of a command by scanning filenames in the store. - * looks for files whose names start with the command's filename + "_" - * (e.g. for "git", finds "git_add.json", "git_commit.json", etc.) - * - * only returns immediate subcommands (no nested underscores beyond the prefix). - * tries to extract description from the JSON "description" field if available. - * - * note: this filesystem-based discovery is used as a fallback when the - * command's own help_result doesn't list subcommands. it enables completion - * for subcommands that were indexed from separate manpages or help runs. 
*) -let subcommands_of dirs command = - let prefix = filename_of_command command ^ "_" in - let prefix_len = String.length prefix in - let module SMap = Map.Make(String) in - let subs = List.fold_left (fun subs directory -> - if is_dir directory then - Array.fold_left (fun subs filename -> - if not (String.starts_with ~prefix filename) then subs - else - let is_json = Filename.check_suffix filename ".json" in - match chop_extension filename with - | None -> subs - | Some base_name -> - let rest = String.sub base_name prefix_len (String.length base_name - prefix_len) in - (* skip nested subcommands and empty names *) - if String.contains rest '_' || String.length rest = 0 then subs - else if SMap.mem rest subs then subs - else - (* try to read the description from the JSON file *) - let desc = if is_json then - match read_file (Filename.concat directory filename) with - | Some data -> - (try json_to_string (json_get "description" (parse_json data)) - with _ -> "") - | None -> "" - else "" in - SMap.add rest { name = rest; desc } subs - ) subs (Sys.readdir directory) - else subs - ) SMap.empty dirs in - SMap.fold (fun _ sc acc -> sc :: acc) subs [] |> List.rev - -(* list all indexed commands across all store directories. - * returns a sorted, deduplicated list of command names. *) -let all_commands dirs = - let module SSet = Set.Make(String) in - List.fold_left (fun cmds directory -> - if is_dir directory then - Array.fold_left (fun cmds filename -> - match chop_extension filename with - | Some base_name -> SSet.add (command_of_filename base_name) cmds - | None -> cmds - ) cmds (Sys.readdir directory) - else cmds - ) SSet.empty dirs - |> SSet.elements - -(* determine how a command was indexed: "help", "manpage", "native", etc. - * for JSON files, reads the "source" field. for .nu files, returns "native". - * used by the "dump" command to show provenance. 
*) -let file_type_of dirs command = - let base_name = filename_of_command command in - List.find_map (fun directory -> - let json_path = Filename.concat directory (base_name ^ ".json") in - if Sys.file_exists json_path then - (match read_file json_path with - | Some data -> - (try Some (json_to_string (json_get "source" (parse_json data))) - with _ -> Some "json") - | None -> Some "json") - else - let nu_path = Filename.concat directory (base_name ^ ".nu") in - if Sys.file_exists nu_path then Some "native" - else None - ) dirs diff --git a/nix/inshellah-completer.nu b/nix/inshellah-completer.nu new file mode 100644 index 0000000..bee5363 --- /dev/null +++ b/nix/inshellah-completer.nu @@ -0,0 +1,813 @@ +@complete external +def --wrapped sudo [...args] { + ^sudo ...$args +} + +@complete external +def --wrapped doas [...args] { + ^doas ...$args +} + +let inshellah_nonempty = { |items| + let result = ($items | default [] | compact) + if ($result | is-empty) { null } else { $result } +} + +let inshellah_fuzzy_score = { |needle, haystack| + let needle = $needle | default "" | into string + let haystack = $haystack | default "" | into string + let needle_len = ($needle | str length) + let haystack_len = ($haystack | str length) + + if $needle_len == 0 { + 1 + } else if $needle_len > $haystack_len { + 0 + } else if $needle == $haystack { + 1000 + } else { + let needle_lc = $needle | str downcase + let haystack_lc = $haystack | str downcase + if ($haystack_lc | str starts-with $needle_lc) { + 900 + (($needle_len * 100) // $haystack_len) + } else { + let needle_chars = $needle_lc | split chars + let haystack_chars = $haystack | split chars + let haystack_lc_chars = $haystack_lc | split chars + let scored = ( + $haystack_lc_chars + | enumerate + | reduce --fold {needle_idx: 0, score: 0, prev_match: -2} { |it, acc| + if $acc.needle_idx >= $needle_len { + $acc + } else if $it.item == ($needle_chars | get $acc.needle_idx) { + let idx = $it.index + let prev = if $idx == 0 { "" } 
else { $haystack_chars | get ($idx - 1) } + let current = $haystack_chars | get $idx + let boundary = ( + ($idx == 0) + or ($prev == "-") + or ($prev == "_") + or (($prev =~ '^[a-z]$') and ($current =~ '^[A-Z]$')) + ) + let base = if $boundary { 50 } else { 10 } + let consecutive = if $acc.prev_match == ($idx - 1) { 20 } else { 0 } + { + needle_idx: ($acc.needle_idx + 1) + score: ($acc.score + $base + $consecutive) + prev_match: $idx + } + } else { + $acc + } + } + ) + if $scored.needle_idx == $needle_len { $scored.score } else { 0 } + } + } +} + +let inshellah_filter_candidates = { |items, prefix| + let result = do $inshellah_nonempty $items + if $result == null { + null + } else if ($prefix | is-empty) { + $result + } else { + let needle = $prefix | into string + let filtered = ( + $result + | enumerate + | each { |row| $row.item | insert __idx $row.index } + | insert __score { |item| do $inshellah_fuzzy_score $needle $item.value } + | where { |item| + let value = ($item.value | into string) + let desc = ($item.description? 
| default "" | into string | str downcase) + let exact_command = ($value == $needle) and (($desc | str contains "subcommand") or $desc == "external command") + ($item.__score > 0) and not $exact_command + } + | insert __rank { |item| 0 - $item.__score } + | sort-by __rank __idx + | reject __idx __score __rank + ) + do $inshellah_nonempty $filtered + } +} + +let inshellah_static_complete = { |spans| + try { + let completed = (^inshellah complete ...$spans | complete) + if $completed.exit_code != 0 { + null + } else { + let parsed = (try { $completed.stdout | from json } catch { null }) + let parsed_type = ($parsed | describe) + if $parsed == null { + null + } else if (($parsed_type | str starts-with "list") or ($parsed_type | str starts-with "table")) { + do $inshellah_nonempty $parsed + } else { + null + } + } + } catch { + null + } +} + +let inshellah_unit_candidates = { |scope, prefix| + try { + ^systemctl ...$scope list-units --all --no-pager --plain --full --no-legend $"($prefix)*" + | lines + | each { |l| + let parsed = $l | parse -r '(?P<unit>\S+)\s+\S+\s+\S+\s+\S+\s+(?P<desc>.*)' + if ($parsed | length) > 0 { + {value: $parsed.0.unit, description: ($parsed.0.desc | str trim)} + } + } | compact + } catch { null } +} + +let inshellah_kubectl_scope = { |spans| + let all_namespaces = ("-A" in $spans) or ("--all-namespaces" in $spans) + let namespace_eq = ($spans | where { |s| $s =~ '^--namespace=' } | get 0? | default "") + let namespace_arg = ( + $spans + | enumerate + | where { |it| $it.item == "-n" or $it.item == "--namespace" } + | reverse + | get 0? 
+ | default null + ) + let namespace = if not ($namespace_eq | is-empty) { + $namespace_eq | str replace --regex '^--namespace=' '' + } else if $namespace_arg != null and (($namespace_arg.index + 1) < ($spans | length)) { + $spans | get ($namespace_arg.index + 1) + } else { + "" + } + + if $all_namespaces { + {args: [--all-namespaces], all: true} + } else if not ($namespace | is-empty) { + {args: [-n $namespace], all: false} + } else { + {args: [], all: false} + } +} + +let inshellah_kubectl_names = { |kind, spans| + if ($kind | is-empty) or ($kind | str starts-with "-") { + null + } else { + let scope = do $inshellah_kubectl_scope $spans + let columns = if $scope.all { + "custom-columns=NAMESPACE:.metadata.namespace,NAME:.metadata.name" + } else { + "custom-columns=NAME:.metadata.name" + } + try { + let rows = ( + ^kubectl get $kind ...$scope.args --no-headers -o $columns + | lines + | str trim + | where { |n| not ($n | is-empty) } + ) + if $scope.all { + $rows | each { |row| + let parts = $row | split row -r '\s+' + if ($parts | length) >= 2 { + {value: ($parts | get 1), description: $"($kind) in ($parts | get 0)"} + } + } | compact + } else { + $rows | each { |n| {value: $n, description: $kind} } + } + } catch { null } + } +} + +let inshellah_git_refs = { || + try { + ^git for-each-ref --format='%(refname:short)%09%(objecttype)%09%(contents:subject)' refs/heads refs/remotes refs/tags + | lines + | each { |l| + let p = $l | split row "\t" + if ($p | length) >= 3 { {value: $p.0, description: $p.2} } + } | compact + } catch { null } +} + +let inshellah_git_branches = { || + try { + ^git for-each-ref --format='%(refname:short)%09%(contents:subject)' refs/heads + | lines + | each { |l| + let p = $l | split row "\t" + if ($p | length) >= 2 { {value: $p.0, description: $p.1} } + } | compact + } catch { null } +} + +let inshellah_git_tags = { || + try { + ^git for-each-ref --format='%(refname:short)%09%(contents:subject)' refs/tags + | lines + | each { |l| + let p = $l 
| split row "\t" + if ($p | length) >= 2 { {value: $p.0, description: $p.1} } + } | compact + } catch { null } +} + +let inshellah_git_remotes = { || + try { + ^git remote + | lines + | str trim + | where { |r| not ($r | is-empty) } + | each { |r| {value: $r, description: "remote"} } + } catch { null } +} + +let inshellah_git_stashes = { || + try { + ^git stash list + | lines + | each { |l| + let m = $l | parse -r '^(?P<stash>stash@\{[0-9]+\}):\s*(?P<desc>.*)$' + if ($m | length) > 0 { {value: $m.0.stash, description: $m.0.desc} } + } | compact + } catch { null } +} + +let inshellah_git_status_paths = { || + try { + ^git status --porcelain -uall + | lines + | each { |l| + let m = $l | parse -r '^.. (?P<path>.+)$' + if ($m | length) > 0 { + let raw = $m.0.path + let path = if ($raw | str contains " -> ") { $raw | split row " -> " | last } else { $raw } + {value: $path, description: "changed path"} + } + } | compact + } catch { null } +} + +let inshellah_git_tracked_paths = { || + try { + ^git ls-files + | lines + | where { |p| not ($p | is-empty) } + | each { |p| {value: $p, description: "tracked file"} } + } catch { null } +} + +let inshellah_git_submodules = { || + try { + ^git config --file .gitmodules --get-regexp '^submodule\..*\.path$' + | lines + | each { |l| + let p = $l | split row -r '\s+' + if ($p | length) >= 2 { {value: $p.1, description: "submodule"} } + } | compact + } catch { null } +} + +let inshellah_git_worktrees = { || + try { + ^git worktree list --porcelain + | lines + | each { |l| + let m = $l | parse -r '^worktree\s+(?P<p>.+)$' + if ($m | length) > 0 { {value: $m.0.p, description: ""} } + } | compact + } catch { null } +} + +let inshellah_jj_revs = { || + try { + ^jj log --ignore-working-copy --no-graph -r 'all()' -T 'change_id.shortest() ++ "\t" ++ description.first_line() ++ "\n"' err> /dev/null + | lines + | each { |l| + let p = $l | split row "\t" + if ($p | length) >= 2 { {value: $p.0, description: $p.1} } + } | compact + } catch { null } +} + +let inshellah_jj_bookmarks = { || + try { + ^jj bookmark list --all-remotes -T 'name ++ "\n"' err> /dev/null + | lines + | str trim + | where { |b| not ($b | is-empty) } + | each { |b| {value: $b, description: "bookmark"} } + } catch { null } +} + +let inshellah_jj_tags = { || + try { + ^jj tag list --all-remotes -T 'name ++ "\n"' err> /dev/null + | lines + | str trim + | where { |t| not ($t | is-empty) } + | each { |t| {value: $t, description: "tag"} } + } catch { null } +} + +let inshellah_jj_remotes = { || + try { + ^jj git remote list err> /dev/null + | lines + | each { |l| + let p = $l | str trim | split row -r '\s+' + if ($p | length) >= 1 { {value: $p.0, description: ($p | get 1? 
| default "remote")} } + } | compact + } catch { null } +} + +let inshellah_jj_ops = { || + try { + ^jj op log --ignore-working-copy --no-graph -T 'id.short() ++ "\t" ++ description.first_line() ++ "\n"' err> /dev/null + | lines + | each { |l| + let p = $l | split row "\t" + if ($p | length) >= 2 { {value: $p.0, description: $p.1} } + } | compact + } catch { null } +} + +let inshellah_jj_files = { || + try { + ^jj file list --ignore-working-copy err> /dev/null + | lines + | str trim + | where { |p| not ($p | is-empty) } + | each { |p| {value: $p, description: "repo file"} } + } catch { null } +} + +let inshellah_jj_workspaces = { || + try { + ^jj workspace list -T 'name ++ "\n"' err> /dev/null + | lines + | str trim + | where { |w| not ($w | is-empty) } + | each { |w| {value: $w, description: "workspace"} } + } catch { null } +} + +let inshellah_complete = { |spans| + let completions = do $inshellah_static_complete $spans + let span_len = ($spans | length) + let last_span = if $span_len > 0 { $spans | last } else { "" } + let prev_span = if $span_len >= 2 { $spans | get ($span_len - 2) } else { "" } + let sub = if $span_len >= 2 { $spans | get 1 } else { "" } + + let additional = if ($completions == null and $span_len > 0) { + match $spans.0 { + "nix" => { + if $span_len < 2 { + null + } else { + try { + let nix_output = ( + with-env { NIX_GET_COMPLETIONS: ($span_len - 1) } { + $spans | run-external $in + } + | split row -r '\n' + | str trim + | skip 1 + | where { |e| not ($e | is-empty) } + ) + if (($nix_output | length) < 6 and + $last_span =~ "[a-zA-Z][a-zA-Z0-9_-]*#[a-zA-Z][a-zA-Z0-9_-]*") { + with-env { NIX_ALLOW_UNFREE: "1" NIX_ALLOW_BROKEN: "1" } { + $nix_output | par-each { |e| + try { + {value: $e, description: (^nix eval --raw --impure $e --apply "f: f.meta.description" err> /dev/null)} + } catch { + {value: $e, description: ""} + } + } + } + } else { + $nix_output | each { |e| {value: $e, description: ""} } + } + } catch { null } + } + } + "systemctl" => 
{ + let unit_verbs = [ + "status" "show" "cat" "help" "start" "stop" "restart" "reload" "try-restart" + "reload-or-restart" "reload-or-try-restart" "isolate" "kill" "reset-failed" + "enable" "disable" "reenable" "preset" "mask" "unmask" "is-active" "is-failed" + "is-enabled" "edit" + ] + let args = $spans | skip 1 | where { |s| not ($s | str starts-with "-") } + let verb = $args | get 0? | default "" + if (($verb in $unit_verbs) and $span_len >= 3) { + let scope = if ("--user" in $spans) { [--user] } else { [] } + do $inshellah_unit_candidates $scope $last_span + } else { null } + } + "journalctl" => { + if ($prev_span == "--unit" or $prev_span == "-u") { + let scope = if ("--user-unit" in $spans or "--user" in $spans) { [--user] } else { [] } + do $inshellah_unit_candidates $scope $last_span + } else { null } + } + "coredumpctl" => { + let unit_verbs = ["dump" "info" "debug" "list"] + if (($sub in $unit_verbs) and $span_len >= 3) { + let units = (do $inshellah_unit_candidates [] $last_span | default []) + let pids = (try { + ^coredumpctl list --no-pager --no-legend + | lines + | each { |l| + let p = $l | split row -r '\s+' + if ($p | length) >= 5 { {value: $p.4, description: $"PID ($p.4) ($p | get 9? 
| default "")"} } + } | compact + } catch { [] }) + $units | append $pids + } else { null } + } + "loginctl" => { + let user_verbs = ["user-status" "show-user" "enable-linger" "disable-linger" "kill-user" "terminate-user"] + let session_verbs = ["session-status" "show-session" "activate" "lock-session" "unlock-session" "terminate-session" "kill-session"] + if (($sub in $user_verbs) and $span_len >= 3) { + try { + ^loginctl list-users --no-pager --no-legend + | lines | each { |l| + let p = $l | str trim | split row -r '\s+' + if ($p | length) >= 2 { {value: $p.1, description: $"UID ($p.0)"} } + } | compact + } catch { null } + } else if (($sub in $session_verbs) and $span_len >= 3) { + try { + ^loginctl list-sessions --no-pager --no-legend + | lines | each { |l| + let p = $l | str trim | split row -r '\s+' + if ($p | length) >= 3 { {value: $p.0, description: $"user ($p.2)"} } + } | compact + } catch { null } + } else { null } + } + "machinectl" => { + let machine_verbs = ["status" "show" "start" "login" "shell" "enable" "disable" "poweroff" "reboot" "terminate" "kill" "bind" "copy-to" "copy-from"] + if (($sub in $machine_verbs) and $span_len >= 3) { + try { + ^machinectl list --no-pager --no-legend + | lines | each { |l| + let p = $l | str trim | split row -r '\s+' + if ($p | length) >= 1 { {value: $p.0, description: ($p | get 1? 
| default "")} } + } | compact + } catch { null } + } else { null } + } + "networkctl" => { + let link_verbs = ["status" "show" "up" "down" "renew" "forcerenew" "reconfigure" "delete"] + if (($sub in $link_verbs) and $span_len >= 3) { + try { + ^networkctl list --no-pager --no-legend + | lines | each { |l| + let p = $l | str trim | split row -r '\s+' + if ($p | length) >= 4 { {value: $p.1, description: $"($p.2) ($p.3)"} } + } | compact + } catch { null } + } else { null } + } + "hostnamectl" | "timedatectl" | "localectl" => { + null + } + "ssh" | "scp" | "sftp" => { + let cfg_hosts = (try { + open ~/.ssh/config | lines | each { |l| + let m = $l | parse -r '(?i)^\s*Host\s+(?P<h>.+)$' + if ($m | length) > 0 { $m.0.h | split row -r '\s+' } else { [] } + } | flatten | where { |h| not ($h | str contains '*') and not ($h | is-empty) } + } catch { [] }) + let known = (try { + open ~/.ssh/known_hosts | lines | each { |l| + ($l | split row -r '\s+' | get 0? | default "") | split row ',' + } | flatten | where { |h| (not ($h | is-empty)) and (not ($h | str starts-with '|')) and (not ($h | str starts-with '[')) } + } catch { [] }) + $cfg_hosts | append $known | uniq | each { |h| {value: $h, description: ""} } + } + "docker" | "podman" => { + let need_container = ["exec" "logs" "inspect" "start" "stop" "restart" "rm" "kill" "attach" "cp" "top" "wait" "pause" "unpause" "port" "commit" "diff" "export"] + let need_image = ["run" "rmi" "tag" "push" "pull" "history" "save" "create"] + if ($sub in $need_container) { + try { + ^($spans.0) ps -a --format '{{.Names}}\t{{.Image}}' + | lines | each { |l| + let p = $l | split row "\t" + if ($p | length) >= 2 { {value: $p.0, description: $p.1} } + } | compact + } catch { null } + } else if ($sub in $need_image) { + try { + ^($spans.0) images --format '{{.Repository}}:{{.Tag}}\t{{.Size}}' + | lines | each { |l| + let p = $l | split row "\t" + if (($p | length) >= 2) and (not ($p.0 | str ends-with ':')) { + {value: $p.0, description: $p.1} + } + 
} | compact + } catch { null } + } else { null } + } + "kubectl" => { + let resource_verbs = ["get" "describe" "delete" "edit" "scale" "annotate" "label"] + if (($sub in $resource_verbs) and $span_len >= 4) { + let kind = $spans | get 2? | default "" + do $inshellah_kubectl_names $kind $spans + } else if (($sub == "logs" or $sub == "exec" or $sub == "port-forward") and $span_len >= 3) { + do $inshellah_kubectl_names "pods" $spans + } else if ($sub == "rollout" and $span_len >= 5) { + let action = $spans | get 2? | default "" + let kind = $spans | get 3? | default "" + if ($action in ["history" "pause" "restart" "resume" "status" "undo"]) { + do $inshellah_kubectl_names $kind $spans + } else { null } + } else { null } + } + "git" => { + let git_verbs = [ + "add" "bisect" "branch" "checkout" "cherry-pick" "clone" "commit" "diff" + "fetch" "grep" "init" "log" "merge" "mv" "pull" "push" "rebase" "reflog" + "remote" "reset" "restore" "revert" "rm" "show" "stash" "status" + "submodule" "switch" "tag" "worktree" + ] + let ref_verbs = ["checkout" "merge" "rebase" "log" "diff" "show" "reset" "cherry-pick" "revert" "tag" "blame" "bisect"] + let branch_verbs = ["switch" "branch"] + let remote_verbs = ["add" "rename" "remove" "rm" "set-head" "set-branches" "get-url" "set-url" "show" "prune" "update"] + let stash_verbs = ["push" "save" "list" "show" "drop" "pop" "apply" "branch" "clear" "create" "store"] + let submodule_verbs = ["add" "status" "init" "deinit" "update" "set-branch" "set-url" "summary" "foreach" "sync" "absorbgitdirs"] + let bisect_verbs = ["start" "bad" "good" "new" "old" "terms" "skip" "next" "reset" "visualize" "view" "replay" "log" "run"] + let git_args = $spans | skip 2 | where { |s| not ($s | is-empty) and not ($s | str starts-with "-") } + if $span_len <= 2 { + $git_verbs | each { |v| {value: $v, description: "git subcommand"} } + } else if ($sub == "worktree") { + let worktree_verb = $spans | get 2? 
| default "" + if $span_len <= 3 { + ["add" "list" "lock" "move" "prune" "remove" "repair" "unlock"] | each { |v| {value: $v, description: "worktree subcommand"} } + } else if ($worktree_verb in ["remove" "move" "lock" "unlock" "repair"]) { + do $inshellah_git_worktrees + } else if ($worktree_verb == "add" and $span_len >= 5) { + do $inshellah_git_refs + } else { null } + } else if ($sub == "remote" and $span_len >= 3) { + let remote_verb = $spans | get 2? | default "" + if $span_len <= 3 { + $remote_verbs | each { |v| {value: $v, description: "remote subcommand"} } + } else if ($remote_verb in ["rename" "remove" "rm" "set-head" "set-branches" "get-url" "set-url" "show" "prune" "update"]) { + do $inshellah_git_remotes + } else { null } + } else if (($sub in ["fetch" "push" "pull"]) and $span_len >= 3) { + if ($git_args | is-empty) { + do $inshellah_git_remotes + } else { + do $inshellah_git_refs + } + } else if ($sub == "stash" and $span_len >= 3) { + let stash_verb = $spans | get 2? | default "" + if $span_len <= 3 { + $stash_verbs | each { |v| {value: $v, description: "stash subcommand"} } + } else if ($stash_verb in ["show" "drop" "pop" "apply" "store"]) { + do $inshellah_git_stashes + } else if ($stash_verb == "branch" and ($git_args | length) >= 2) { + do $inshellah_git_stashes + } else { null } + } else if ($sub == "submodule" and $span_len >= 3) { + let submodule_verb = $spans | get 2? | default "" + if $span_len <= 3 { + $submodule_verbs | each { |v| {value: $v, description: "submodule subcommand"} } + } else if ($submodule_verb in ["status" "init" "deinit" "update" "set-branch" "set-url" "summary" "foreach" "sync"]) { + do $inshellah_git_submodules + } else { null } + } else if ($sub == "bisect" and $span_len >= 3) { + let bisect_verb = $spans | get 2? 
| default "" + if $span_len <= 3 { + $bisect_verbs | each { |v| {value: $v, description: "bisect subcommand"} } + } else if ($bisect_verb in ["bad" "good" "new" "old" "skip" "reset" "start"]) { + do $inshellah_git_refs + } else { null } + } else if ($sub == "tag" and $span_len >= 3) { + if (["-d" "--delete" "-v" "--verify"] | any { |f| $f in $spans }) { + do $inshellah_git_tags + } else if ($span_len >= 4) { + do $inshellah_git_refs + } else { + do $inshellah_git_tags + } + } else if ($sub == "add" and $span_len >= 3) { + do $inshellah_git_status_paths + } else if ($sub == "restore" and $span_len >= 3) { + if ($prev_span == "--source" or $prev_span == "-s") { + do $inshellah_git_refs + } else { + do $inshellah_git_status_paths + } + } else if ($sub == "rm" and $span_len >= 3) { + do $inshellah_git_tracked_paths + } else if ($sub == "mv" and $span_len >= 3) { + if ($git_args | is-empty) { do $inshellah_git_tracked_paths } else { null } + } else if ($sub == "checkout" and $span_len >= 3) { + if ($prev_span in ["-b" "-B" "--orphan"]) { null } else { do $inshellah_git_refs } + } else if ($sub == "switch" and $span_len >= 3) { + if ($prev_span in ["-c" "-C" "--create" "--force-create" "--orphan"]) { null } else { do $inshellah_git_branches } + } else if (($sub in $branch_verbs) and $span_len >= 3) { + do $inshellah_git_branches + } else if (($sub in $ref_verbs) and $span_len >= 3) { + do $inshellah_git_refs + } else { null } + } + "jj" => { + let jj_verbs = [ + "abandon" "absorb" "bookmark" "commit" "describe" "diff" "diffedit" + "duplicate" "edit" "evolog" "file" "git" "interdiff" "log" "new" + "operation" "op" "rebase" "resolve" "restore" "revert" "show" "sparse" + "split" "squash" "status" "tag" "undo" "workspace" "b" "ci" "desc" "st" + ] + let rev_flags = [ + "-r" "--revision" "--revisions" "--from" "--to" "-s" "--source" + "-d" "--destination" "--insert-after" "--insert-before" "--before" + "--after" "--onto" "--change" + ] + let rev_verbs = [ + "abandon" "absorb" 
"describe" "diff" "diffedit" "duplicate" "edit" + "evolog" "interdiff" "log" "metaedit" "new" "parallelize" "rebase" + "restore" "revert" "show" "sign" "simplify-parents" "split" "squash" + "unsign" + ] + let bookmark_verbs = ["advance" "create" "delete" "forget" "list" "move" "rename" "set" "track" "untrack"] + let jj_git_verbs = ["clone" "colocation" "export" "fetch" "import" "init" "push" "remote" "root"] + let jj_remote_verbs = ["add" "list" "remove" "rename" "set-url"] + let op_verbs = ["abandon" "diff" "integrate" "log" "restore" "revert" "show"] + let file_verbs = ["annotate" "chmod" "list" "search" "show" "track" "untrack"] + let workspace_verbs = ["add" "forget" "list" "rename" "root" "update-stale"] + let sparse_verbs = ["edit" "list" "reset" "set"] + let jj_args = $spans | skip 2 | where { |s| not ($s | is-empty) and not ($s | str starts-with "-") } + if ($prev_span in $rev_flags) { + do $inshellah_jj_revs + } else if ($prev_span == "--remote") { + do $inshellah_jj_remotes + } else if ($prev_span == "--at-operation" or $prev_span == "--at-op") { + do $inshellah_jj_ops + } else if $span_len <= 2 { + $jj_verbs | each { |v| {value: $v, description: "jj subcommand"} } + } else if ($sub == "bookmark" or $sub == "b") { + let verb = $spans | get 2? | default "" + if $span_len <= 3 { + $bookmark_verbs | each { |v| {value: $v, description: "bookmark subcommand"} } + } else if ($verb in ["delete" "forget" "move" "rename" "set" "track" "untrack" "advance"]) { + do $inshellah_jj_bookmarks + } else { null } + } else if ($sub == "tag") { + let verb = $spans | get 2? | default "" + if $span_len <= 3 { + ["delete" "list" "set"] | each { |v| {value: $v, description: "tag subcommand"} } + } else if ($verb in ["delete" "set"]) { + do $inshellah_jj_tags + } else { null } + } else if ($sub == "git") { + let git_verb = $spans | get 2? | default "" + let remote_verb = $spans | get 3? 
| default "" + if $span_len <= 3 { + $jj_git_verbs | each { |v| {value: $v, description: "jj git subcommand"} } + } else if ($git_verb == "remote") { + if $span_len <= 4 { + $jj_remote_verbs | each { |v| {value: $v, description: "remote subcommand"} } + } else if ($remote_verb in ["remove" "rename" "set-url"]) { + do $inshellah_jj_remotes + } else { null } + } else if ($git_verb in ["fetch" "push"]) { + do $inshellah_jj_remotes + } else { null } + } else if ($sub == "operation" or $sub == "op") { + let verb = $spans | get 2? | default "" + if $span_len <= 3 { + $op_verbs | each { |v| {value: $v, description: "operation subcommand"} } + } else if ($verb in ["abandon" "diff" "integrate" "restore" "revert" "show"]) { + do $inshellah_jj_ops + } else { null } + } else if ($sub == "file") { + let verb = $spans | get 2? | default "" + if $span_len <= 3 { + $file_verbs | each { |v| {value: $v, description: "file subcommand"} } + } else if ($verb in ["annotate" "chmod" "list" "search" "show" "untrack"]) { + do $inshellah_jj_files + } else { null } + } else if ($sub == "workspace") { + let verb = $spans | get 2? | default "" + if $span_len <= 3 { + $workspace_verbs | each { |v| {value: $v, description: "workspace subcommand"} } + } else if ($verb in ["forget" "update-stale"]) { + do $inshellah_jj_workspaces + } else { null } + } else if ($sub == "sparse") { + if $span_len <= 3 { + $sparse_verbs | each { |v| {value: $v, description: "sparse subcommand"} } + } else { null } + } else if ($sub in ["diff" "log"] and ($jj_args | is-empty)) { + do $inshellah_jj_files + } else if ($sub in $rev_verbs and $span_len >= 3) { + do $inshellah_jj_revs + } else { null } + } + "npm" | "pnpm" | "yarn" => { + let wants = ( + (($spans.0 == "yarn") and $span_len == 2) + or (($sub == "run" or $sub == "run-script") and $span_len == 3) + ) + if $wants { + try { + open package.json | get scripts? 
| default {} | transpose name cmd + | each { |row| {value: $row.name, description: $row.cmd} } + } catch { null } + } else { null } + } + "make" => { + if $span_len <= 2 { + try { + open Makefile | lines + | each { |l| + let m = $l | parse -r '^(?P<t>[A-Za-z0-9_./-]+)\s*:' + if (($m | length) > 0) and (not ($m.0.t | str starts-with '.')) { + {value: $m.0.t, description: ""} + } + } | compact | uniq-by value + } catch { null } + } else { null } + } + "just" => { + if $span_len <= 2 { + try { + ^just --list --unsorted + | lines | skip 1 + | each { |l| + let m = $l | parse -r '^\s+(?P<t>[A-Za-z0-9_-]+)(?:\s+\S.*)?(?:\s*#\s*(?P<d>.*))?$' + if ($m | length) > 0 { + {value: $m.0.t, description: ($m.0.d? | default "")} + } + } | compact + } catch { null } + } else { null } + } + "cargo" => { + let target_flags = ["--bin" "--example" "--test" "--bench"] + if ($prev_span == "-p" or $prev_span == "--package") { + try { + ^cargo metadata --no-deps --format-version 1 + | from json + | get packages + | each { |pkg| {value: $pkg.name, description: ($pkg.version? 
| default "")} } + | uniq-by value + } catch { null } + } else if ($prev_span in $target_flags) { + let kind = $prev_span | str replace "--" "" + try { + ^cargo metadata --no-deps --format-version 1 + | from json + | get packages + | each { |pkg| + $pkg.targets + | where { |t| $kind in $t.kind } + | each { |t| {value: $t.name, description: ($t.kind | str join ",")} } + } + | flatten + | uniq-by value + } catch { null } + } else { null } + } + "kill" | "pkill" => { + try { + ^ps -eo pid,comm --no-headers + | lines + | each { |l| + let parts = $l | str trim | split row -r '\s+' + if ($parts | length) >= 2 { + let pid = $parts | get 0 + let comm = $parts | skip 1 | str join " " + if ($spans.0 == "kill") { {value: $pid, description: $comm} } + else { {value: $comm, description: $pid} } + } + } | compact + } catch { null } + } + _ => { null } + } + } else { null } + + if $completions == null { + do $inshellah_filter_candidates $additional $last_span + } else { + $completions + } +} + +$env.config.completions.external = {enable: true, max_results: 200, completer: $inshellah_complete} diff --git a/nix/module.nix b/nix/module.nix index 04ae7f2..95289f8 100644 --- a/nix/module.nix +++ b/nix/module.nix @@ -10,7 +10,7 @@ # # Usage: # { pkgs, ... 
}: { -# imports = [ ./path/to/inshellah/nix/module.nix ]; +# imports = [ ./path/to/inshellah-rs/nix/module.nix ]; # programs.inshellah.enable = true; # } @@ -23,6 +23,34 @@ let cfg = config.programs.inshellah; + completerSnippet = ./inshellah-completer.nu; + dynamicStubCommands = [ + "systemctl" + "journalctl" + "coredumpctl" + "loginctl" + "machinectl" + "networkctl" + "hostnamectl" + "timedatectl" + "localectl" + "ssh" + "scp" + "sftp" + "docker" + "podman" + "kubectl" + "git" + "jj" + "npm" + "pnpm" + "yarn" + "make" + "just" + "cargo" + "pkill" + ]; + dynamicStubCommandArgs = lib.escapeShellArgs dynamicStubCommands; in { options.programs.inshellah = { @@ -72,9 +100,33 @@ in ''; }; + timeoutMs = lib.mkOption { + type = lib.types.nullOr lib.types.int; + default = null; + example = 200; + description = '' + per-subprocess timeout in milliseconds. when null the binary's + compiled-in default is used (currently 200ms). + ''; + }; + + workers = lib.mkOption { + type = lib.types.nullOr lib.types.int; + default = null; + example = 8; + description = '' + worker thread count for the parallel scrape pool. when null, + `std::thread::available_parallelism` is used. + ''; + }; + snippet = lib.mkOption { type = lib.types.str; readOnly = true; + default = builtins.readFile completerSnippet; + description = '' + nushell external completer snippet installed by the module. 
+ ''; }; }; @@ -98,7 +150,10 @@ in (lib.hiPrio wrapped) cfg.package ]; - environment.pathsToLink = [ "/share/nushell/autoload" ]; + environment.pathsToLink = [ + "/share/nushell/autoload" + "/share/nushell/vendor/autoload" + ]; environment.extraSetup = let inshellah = "${cfg.package}/bin/inshellah"; @@ -109,30 +164,38 @@ in lib.concatStringsSep "\n" cfg.helpOnlyCommands ); helpOnlyFlag = lib.optionalString (cfg.helpOnlyCommands != [ ]) " --help-only ${helpOnlyFile}"; + timeoutFlag = lib.optionalString (cfg.timeoutMs != null) " --timeout-ms ${toString cfg.timeoutMs}"; + workersFlag = lib.optionalString (cfg.workers != null) " --workers ${toString cfg.workers}"; + snippetFile = pkgs.writeText "inshellah-completer.nu" cfg.snippet; in '' mkdir -p ${destDir} if [ -d "$out/bin" ] && [ -d "$out/share/man" ]; then - ${inshellah} index "$out" --dir ${destDir}${ignoreFlag}${helpOnlyFlag} \ + ${inshellah} index "$out" --dir ${destDir}${ignoreFlag}${helpOnlyFlag}${timeoutFlag}${workersFlag} \ 2>/dev/null || true fi find ${destDir} -maxdepth 1 -empty -delete - # nushell hardcodes sudo and doas to bypass the external completer, - # returning command-name completion instead of calling inshellah. - # these @complete external stubs override that so inshellah handles - # their flags and elevation stripping. placed in the nushell autoload - # dir so they are sourced automatically at shell startup. + # Install the full nushell completer plus sudo/doas wrapped commands. + # Nushell otherwise hardcodes sudo/doas to bypass external completers. mkdir -p $out/share/nushell/vendor/autoload - cat > $out/share/nushell/vendor/autoload/inshellah-elevation.nu << 'NUSHELL' - @complete external - extern "sudo" [] + cp ${snippetFile} $out/share/nushell/vendor/autoload/inshellah.nu - @complete external - extern "doas" [] - NUSHELL + # Register command names for dynamic backends that are actually present + # in the linked profile. 
The externs keep Nu's command list aware of + # these commands while the external completer still supplies arguments. + stubFile=$out/share/nushell/vendor/autoload/inshellah-command-stubs.nu + : > "$stubFile" + for cmd in ${dynamicStubCommandArgs}; do + if [ -x "$out/bin/$cmd" ]; then + printf '@complete external\nextern "%s" [...args]\n\n' "$cmd" >> "$stubFile" + fi + done + if [ ! -s "$stubFile" ]; then + rm -f "$stubFile" + fi ''; }; } diff --git a/src/lib.rs b/src/lib.rs new file mode 100644 index 0000000..2256bee --- /dev/null +++ b/src/lib.rs @@ -0,0 +1,4 @@ +pub mod parsers; +pub mod pool; +pub mod store; +pub mod types; diff --git a/src/main.rs b/src/main.rs new file mode 100644 index 0000000..df1f49b --- /dev/null +++ b/src/main.rs @@ -0,0 +1,2241 @@ +//! inshellah CLI. +//! +//! subcommands: +//! index PREFIX... scan PREFIX/bin and PREFIX/share/man, write JSON cache +//! manpage FILE parse a single manpage, emit nushell extern +//! manpage-dir DIR batch-process manpages under DIR +//! complete CMD ARG... nushell external completer; reads the cache, +//! falls back to on-the-fly --help if uncached +//! query CMD print stored data for CMD +//! dump list indexed commands +//! 
completions emit nushell completion definitions for inshellah itself + +use std::collections::HashSet; +use std::fs; +use std::io::Read; +use std::path::{Path, PathBuf}; +use std::process::{Command, Stdio}; +use std::sync::Arc; +use std::time::{Duration, Instant}; + +use parking_lot::Mutex; + +use inshellah::parsers::help::help_parser; +use inshellah::parsers::manpage::{ + ManpageEntry, ManpageResult, ManpageSubcommand, OwnedParam, OwnedSwitch, + extract_synopsis_command, parse_manpage_string, parse_manpage_with_subs, read_manpage_file, +}; +use inshellah::parsers::nushell::{generate_extern, generate_module, is_nushell_builtin}; +use inshellah::pool::{ScrapePool, Submitter}; +use inshellah::store::{ + all_commands, default_store_path, ensure_dir, file_type_of, filename_of_command, lookup, + lookup_raw, parse_nu_completions, subcommands_of, write_native, write_result, +}; + +const COMMAND_SECTIONS: &[u8] = &[1, 8]; + +/// per-subprocess timeout default when --timeout-ms isn't passed. +/// empirically tuned so that a slow-to-print binary doesn't block the +/// pool, while fast-responding ones (the vast majority) print their +/// --help well inside the window. with `n` parallel workers a 200ms +/// ceiling means the worst-case waste from an unresponsive binary is +/// `200ms / n_workers` of wall time. +const DEFAULT_TIMEOUT_MS: u64 = 200; + +fn usage() { + eprintln!( + "inshellah - nushell completions engine + +Usage: + inshellah index PREFIX... [--dir PATH] [--ignore FILE] [--help-only FILE] + [--timeout-ms N] [--workers N] + Index completions into a directory of JSON/nu files. + PREFIX is a directory containing bin/ and share/man/. + Default dir: $XDG_CACHE_HOME/inshellah + --ignore FILE skip listed commands entirely + --help-only FILE skip manpages for listed commands, use --help instead + --timeout-ms N per-subprocess timeout in milliseconds (default 200) + --workers N parallel scrape workers (default: cpu count) + inshellah complete CMD [ARGS...] 
[--dir PATH[:PATH...]] [--timeout-ms N] + Nushell custom completer. Outputs JSON completion candidates. + Falls back to --help resolution if command is not indexed. + --dir takes colon-separated paths. The first path is the writable + user cache; additional paths are read-only system directories. + inshellah query CMD [--dir PATH[:PATH...]] + Print stored completion data for CMD. + inshellah dump [--dir PATH[:PATH...]] + List indexed commands. + inshellah manpage FILE Parse a manpage and emit nushell extern + inshellah manpage-dir DIR Batch-process manpages under DIR + inshellah completions Generate nushell completions for inshellah +" + ); +} + +// --- subprocess management --- + +/// sanitized env: strip display-related variables to prevent gui tools from +/// popping up windows when run with --help. cached once per process — +/// `vars_os` walks the whole env every call, which adds up across thousands +/// of spawns. +fn safe_env_vars() -> &'static [(std::ffi::OsString, std::ffi::OsString)] { + static CACHE: std::sync::OnceLock<Vec<(std::ffi::OsString, std::ffi::OsString)>> = + std::sync::OnceLock::new(); + CACHE.get_or_init(|| { + std::env::vars_os() + .filter(|(k, _)| { + let s = k.to_string_lossy(); + !(s == "DISPLAY" + || s == "WAYLAND_DISPLAY" + || s == "DBUS_SESSION_BUS_ADDRESS" + || s == "XAUTHORITY") + }) + .collect() + }) +} + +/// run a command with a timeout, capturing stdout+stderr merged. +/// returns None if the process couldn't be started, produced no output, +/// or was killed due to timeout. +/// +/// uses `poll(2)` on the pipe fds directly from the calling thread — no +/// reader threads, no try_wait polling loop. we block in the kernel for +/// either data (POLLIN), peer-close (POLLHUP), or the timeout deadline, +/// so the cost per subprocess is roughly one syscall per data chunk +/// plus the spawn itself. 
+/// +/// unix process groups still apply: the child is its own pgid leader, so +/// on timeout we killpg(pgid, SIGKILL) and the whole tree (wrapper +/// scripts, forked grandchildren) dies, closing the pipe writers and +/// letting our reads finish cleanly. +fn run_cmd(args: &[String], timeout_ms: u64) -> Option { + use std::io::Read; + use std::os::fd::AsRawFd; + use std::os::unix::process::CommandExt; + + if args.is_empty() { + return None; + } + let mut cmd = Command::new(&args[0]); + cmd.args(&args[1..]); + cmd.stdin(Stdio::null()); + cmd.stdout(Stdio::piped()); + cmd.stderr(Stdio::piped()); + cmd.env_clear(); + for (k, v) in safe_env_vars() { + cmd.env(k, v); + } + cmd.current_dir("/tmp"); + cmd.process_group(0); + + let mut child = cmd.spawn().ok()?; + let pgid = child.id() as i32; + let mut stdout = child.stdout.take()?; + let mut stderr = child.stderr.take()?; + let stdout_fd = stdout.as_raw_fd(); + let stderr_fd = stderr.as_raw_fd(); + + // both pipe fds must be non-blocking so poll-then-read can drain + // everything available without blocking on the next chunk. 
+ unsafe { + for fd in [stdout_fd, stderr_fd] { + let flags = libc::fcntl(fd, libc::F_GETFL); + libc::fcntl(fd, libc::F_SETFL, flags | libc::O_NONBLOCK); + } + } + + let deadline = Instant::now() + Duration::from_millis(timeout_ms); + let mut buf: Vec = Vec::with_capacity(4096); + let mut chunk = [0u8; 4096]; + let mut stdout_open = true; + let mut stderr_open = true; + let mut timed_out = false; + + while stdout_open || stderr_open { + let now = Instant::now(); + if now >= deadline { + timed_out = true; + break; + } + let remaining_ms = (deadline - now).as_millis().min(i32::MAX as u128) as i32; + + let mut fds = [ + libc::pollfd { + fd: if stdout_open { stdout_fd } else { -1 }, + events: libc::POLLIN, + revents: 0, + }, + libc::pollfd { + fd: if stderr_open { stderr_fd } else { -1 }, + events: libc::POLLIN, + revents: 0, + }, + ]; + let n = unsafe { libc::poll(fds.as_mut_ptr(), fds.len() as libc::nfds_t, remaining_ms) }; + if n < 0 { + // EINTR — retry. anything else: bail and let the child reap below. + if std::io::Error::last_os_error().kind() == std::io::ErrorKind::Interrupted { + continue; + } + break; + } + if n == 0 { + // poll itself returned without events — deadline check at top + // of next iter will catch it. + continue; + } + + // drain whichever fds are ready until EAGAIN or EOF. 
+ for (i, pfd) in fds.iter().enumerate() { + if pfd.revents == 0 { + continue; + } + let (reader, open): (&mut dyn Read, &mut bool) = if i == 0 { + (&mut stdout as &mut dyn Read, &mut stdout_open) + } else { + (&mut stderr as &mut dyn Read, &mut stderr_open) + }; + loop { + match reader.read(&mut chunk) { + Ok(0) => { + *open = false; + break; + } + Ok(read) => buf.extend_from_slice(&chunk[..read]), + Err(e) if e.kind() == std::io::ErrorKind::WouldBlock => break, + Err(_) => { + *open = false; + break; + } + } + } + if pfd.revents & (libc::POLLHUP | libc::POLLERR) != 0 { + *open = false; + } + } + } + + if timed_out { + unsafe { + libc::killpg(pgid, libc::SIGKILL); + } + } + let _ = child.wait(); + + if buf.is_empty() { + None + } else { + Some(String::from_utf8_lossy(&buf).into_owned()) + } +} + +// --- file classification --- + +fn is_executable(path: &Path) -> bool { + use std::os::unix::fs::PermissionsExt; + fs::metadata(path) + .map(|m| m.is_file() && (m.permissions().mode() & 0o111) != 0) + .unwrap_or(false) +} + +fn is_script(path: &Path) -> bool { + let real = match fs::canonicalize(path) { + Ok(p) => p, + Err(_) => return false, + }; + let Ok(mut f) = fs::File::open(&real) else { + return false; + }; + let mut buf = [0u8; 2]; + f.read_exact(&mut buf) + .map(|_| &buf == b"#!") + .unwrap_or(false) +} + +/// skip filenames that aren't real commands (e.g. doc/locale paths). +fn skip_name(name: &str) -> bool { + name.starts_with('.') + || name.ends_with(".so") + || name.ends_with(".a") + || name.ends_with(".la") + || name.contains('/') +} + +// --- ELF scanning --- + +/// scan an ELF binary (or any file) for string needles. returns the set of +/// needles that appeared. on read failure all needles are reported found +/// (conservative — we'd rather try --help than skip). 
+fn elf_scan(path: &Path, needles: &[&str]) -> HashSet { + let mut found: HashSet = HashSet::new(); + let real = match fs::canonicalize(path) { + Ok(p) => p, + Err(_) => { + for n in needles { + found.insert((*n).to_string()); + } + return found; + } + }; + let Ok(mut f) = fs::File::open(&real) else { + for n in needles { + found.insert((*n).to_string()); + } + return found; + }; + let mut magic = [0u8; 4]; + if f.read_exact(&mut magic).is_err() { + return found; + } + if magic != [0x7f, b'E', b'L', b'F'] { + // not ELF — return empty so caller decides + return found; + } + let max_needle = needles.iter().map(|s| s.len()).max().unwrap_or(0); + let chunk_size = 65536usize; + let mut buf = vec![0u8; chunk_size + max_needle]; + let mut carry = 0usize; + let needles_b: Vec<&[u8]> = needles.iter().map(|s| s.as_bytes()).collect(); + loop { + let n: usize = f + .read(&mut buf[carry..carry + chunk_size]) + .unwrap_or_default(); + if n == 0 { + break; + } + let total = carry + n; + for (i, needle) in needles_b.iter().enumerate() { + let key = needles[i]; + if found.contains(key) { + continue; + } + if needle.len() > total { + continue; + } + let win = &buf[..total]; + if win.windows(needle.len()).any(|w| w == *needle) { + found.insert(key.to_string()); + } + } + if found.len() == needles.len() { + break; + } + let new_carry = max_needle.min(total); + buf.copy_within(total - new_carry..total, 0); + carry = new_carry; + } + found +} + +// --- nix wrapper detection --- + +fn read_to_string_capped(path: &Path, cap: usize) -> Option { + let real = fs::canonicalize(path).ok()?; + let md = fs::metadata(&real).ok()?; + if md.len() as usize > cap { + return None; + } + fs::read_to_string(&real).ok() +} + +/// detect nix-generated c wrappers; return the real binary path. 
+fn nix_wrapper_target(path: &Path) -> Option { + let contents = read_to_string_capped(path, 65536)?; + if !contents.contains("makeCWrapper") { + return None; + } + // pattern: /nix/store/-/bin/ + extract_nix_bin_path(&contents) +} + +/// detect nix-generated bash/sh wrappers. +fn nix_script_wrapper_target(path: &Path) -> Option { + let contents = read_to_string_capped(path, 4096)?; + if !contents.starts_with("#!") { + return None; + } + if !contents.contains("/nix/store/") { + return None; + } + if !(contents.contains("exec ") || contents.contains("exec\t")) { + return None; + } + extract_nix_bin_path(&contents) +} + +fn extract_nix_bin_path(contents: &str) -> Option { + let needle = "/nix/store/"; + let bytes = contents.as_bytes(); + let mut idx = 0; + while let Some(rel) = contents[idx..].find(needle) { + let start = idx + rel; + // find end of the path (whitespace, quote, or null) + let mut end = start + needle.len(); + while end < bytes.len() { + let b = bytes[end]; + if b == b' ' + || b == b'\t' + || b == b'\n' + || b == b'\r' + || b == b'"' + || b == b'\'' + || b == 0 + { + break; + } + end += 1; + } + let candidate = &contents[start..end]; + if candidate.contains("/bin/") { + let path = PathBuf::from(candidate); + if path.exists() { + return Some(path); + } + } + idx = end; + } + None +} + +// --- binary classification --- + +#[derive(Debug, Clone, PartialEq, Eq)] +enum Classify { + /// can try --help + TryHelp, + /// the tool likely speaks the "nushell" completion subcommand + HasNativeCompletions, + /// skip — doesn't look like a CLI we can extract from + Skip, +} + +/// classify an ELF binary by scanning for help/completion needles. 
+fn classify_elf(path: &Path) -> Classify { + let found = elf_scan(path, &["-h", "--help", "complet"]); + if found.contains("complet") { + Classify::HasNativeCompletions + } else if found.contains("-h") || found.contains("--help") { + Classify::TryHelp + } else { + Classify::Skip + } +} + +/// classify a binary by its actual nature: script, ELF, or nix wrapper. +fn classify_binary(_bindir: &Path, full: &Path) -> Classify { + if is_script(full) { + return Classify::TryHelp; + } + if let Some(target) = nix_wrapper_target(full) { + return classify_elf(&target); + } + if let Some(target) = nix_script_wrapper_target(full) { + return classify_elf(&target); + } + classify_elf(full) +} + +// --- help text extraction --- + +/// try `--help`, then `-h`, returning the first non-empty output (with +/// ANSI escapes stripped). each attempt gets the same per-call timeout. +/// we deliberately skip the third historical `help`-subcommand variant: +/// if neither flag yielded usable text, a positional `help` is unlikely +/// to do anything different and the extra spawn dominates indexing cost. +fn try_help(bin: &Path, timeout_ms: u64) -> Option { + let bin_s = bin.to_string_lossy().to_string(); + for variant in [&["--help"][..], &["-h"][..]] { + let mut args = vec![bin_s.clone()]; + args.extend(variant.iter().map(|s| s.to_string())); + if let Some(out) = run_cmd(&args, timeout_ms) { + let cleaned = fast_strip_ansi::strip_ansi_string(&out); + if !cleaned.trim().is_empty() { + return Some(cleaned.to_string()); + } + } + } + None +} + +fn is_nushell_source(text: &str) -> bool { + text.len() > 20 + && (text.contains("export extern") + || text.contains("export def") + || (text.contains("module ") && text.contains("export"))) +} + +/// look for words that contain a known needle within the text (used to +/// find subcommand names that might be a native-completion command). 
+fn extract_matching_words(text: &str, needles: &[&str]) -> Vec { + let mut out: Vec = Vec::new(); + let mut seen: HashSet = HashSet::new(); + for token in text.split(|c: char| c.is_whitespace() || c == ',' || c == '|') { + let word = token.trim_matches(|c: char| !c.is_alphanumeric() && c != '-' && c != '_'); + if word.len() < 2 || word.starts_with('-') { + continue; + } + for needle in needles { + if word.contains(needle) && !seen.contains(word) { + seen.insert(word.to_string()); + out.push(word.to_string()); + break; + } + } + } + out +} + +/// try to get native nushell completions from a binary that supports them. +fn try_native_completion(bin: &Path, timeout_ms: u64) -> Option { + let help_text = try_help(bin, timeout_ms)?; + // look for words like "completion", "completions" — typical subcommand + let candidates = extract_matching_words(&help_text, &["complet"]); + let bin_s = bin.to_string_lossy().to_string(); + for sub in &candidates { + for args_form in [ + vec![bin_s.clone(), sub.clone(), "nushell".to_string()], + vec![ + bin_s.clone(), + sub.clone(), + "--shell".to_string(), + "nushell".to_string(), + ], + vec![bin_s.clone(), sub.clone(), "--shell=nushell".to_string()], + ] { + if let Some(out) = run_cmd(&args_form, timeout_ms) { + let cleaned = fast_strip_ansi::strip_ansi_string(&out); + if is_nushell_source(&cleaned) { + return Some(cleaned.to_string()); + } + } + } + } + None +} + +// --- subcommand recursion --- + +const MAX_RESOLVE_RESULTS: usize = 500; +const MAX_RECURSE_DEPTH: u32 = 5; + +fn parse_help_text(text: &str) -> ManpageResult { + let cleaned: String = fast_strip_ansi::strip_ansi_string(text).into_owned(); + match help_parser(&cleaned) { + Ok((_, r)) => (&r).into(), + Err(_) => ManpageResult::default(), + } +} + +/// recursively resolve subcommands, returning a vec of (cmd_path, result) +/// where cmd_path is the full "git stash apply" form. 
used by the +/// dynamic-resolve path in `cmd_complete`; the batch indexer uses the +/// pool instead, which expresses this same BFS shape with workers. +fn help_resolve( + bin: &Path, + cmd: &str, + depth: u32, + timeout_ms: u64, + acc: &mut Vec<(String, ManpageResult)>, +) { + if acc.len() >= MAX_RESOLVE_RESULTS { + return; + } + let Some(help_text) = try_help(bin, timeout_ms) else { + return; + }; + let result = parse_help_text(&help_text); + acc.push((cmd.to_string(), result)); + let initial_subs: Vec = acc + .last() + .map(|(_, r)| { + r.subcommands + .iter() + .map(|sc| sc.name.clone()) + .filter(|n| n.len() >= 2 && !n.starts_with('-')) + .collect() + }) + .unwrap_or_default(); + let bin_s = bin.to_string_lossy().to_string(); + for sub in initial_subs { + recurse_subcommand( + &bin_s, + cmd, + std::slice::from_ref(&sub), + depth + 1, + timeout_ms, + acc, + ); + } +} + +fn recurse_subcommand( + bin_s: &str, + base_cmd: &str, + sub_args: &[String], + depth: u32, + timeout_ms: u64, + acc: &mut Vec<(String, ManpageResult)>, +) { + if acc.len() >= MAX_RESOLVE_RESULTS || depth > MAX_RECURSE_DEPTH { + return; + } + let full_cmd = format!("{base_cmd} {}", sub_args.join(" ")); + let Some(text) = try_help_args(bin_s, sub_args, timeout_ms) else { + return; + }; + let result = parse_help_text(&text); + if result.entries.is_empty() && result.subcommands.is_empty() && result.positionals.is_empty() { + return; + } + if let Some(leaf) = sub_args.last() { + let self_listed = result + .subcommands + .iter() + .any(|sc| sc.name.eq_ignore_ascii_case(leaf)); + if self_listed { + return; + } + } + let inner_subs: Vec = result + .subcommands + .iter() + .map(|sc| sc.name.clone()) + .filter(|n| n.len() >= 2 && !n.starts_with('-') && n != "help") + .collect(); + acc.push((full_cmd, result)); + for sub in inner_subs { + if acc.len() >= MAX_RESOLVE_RESULTS { + break; + } + let mut next = sub_args.to_vec(); + next.push(sub); + recurse_subcommand(bin_s, base_cmd, &next, depth + 1, 
timeout_ms, acc); + } +} + +/// try `bin sub_path... --help` first, then `... -h` if --help came back +/// empty or "No manual entry…". used by deep subcommand recursion. +fn try_help_args(bin_s: &str, sub_args: &[String], timeout_ms: u64) -> Option { + let mut primary_args: Vec = vec![bin_s.to_string()]; + primary_args.extend(sub_args.iter().cloned()); + primary_args.push("--help".to_string()); + let primary = run_cmd(&primary_args, timeout_ms); + let primary_text = primary + .as_deref() + .map(|s| fast_strip_ansi::strip_ansi_string(s).into_owned()); + let primary_useful = primary_text + .as_ref() + .map(|t| { + let trimmed = t.trim(); + !trimmed.is_empty() + && !trimmed.starts_with("No manual entry") + && !trimmed.starts_with("man:") + }) + .unwrap_or(false); + if primary_useful { + return primary_text; + } + let mut fallback_args: Vec = vec![bin_s.to_string()]; + fallback_args.extend(sub_args.iter().cloned()); + fallback_args.push("-h".to_string()); + if let Some(out) = run_cmd(&fallback_args, timeout_ms) { + let cleaned = fast_strip_ansi::strip_ansi_string(&out).into_owned(); + if !cleaned.trim().is_empty() { + return Some(cleaned); + } + } + primary_text +} + +// --- manpage handling --- + +fn cmd_name_of_manpage(path: &Path) -> String { + let mut base = path + .file_name() + .and_then(|s| s.to_str()) + .unwrap_or("") + .to_string(); + if base.ends_with(".gz") { + base.truncate(base.len() - 3); + } + // strip section suffix: "ls.1" -> "ls" + if let Some(dot) = base.rfind('.') { + base.truncate(dot); + } + base +} + +fn find_manpage_path(mandirs: &[PathBuf], hyphenated: &str) -> Option { + for mandir in mandirs { + for section in COMMAND_SECTIONS { + let secdir = mandir.join(format!("man{section}")); + for ext in ["", ".gz"] { + let path = secdir.join(format!("{hyphenated}.{section}{ext}")); + if path.is_file() { + return Some(path); + } + } + } + } + None +} + +/// derive the command name a manpage documents. 
the SYNOPSIS section +/// is authoritative because manpage filenames are ambiguous — +/// "btrfs-check.8" could mean either a standalone binary `btrfs-check` +/// or the subcommand `btrfs check`. we clamp to the number of +/// hyphen-separated parts in the filename to prevent synopsis lines +/// like "btrfs check [options] " from absorbing the device +/// placeholder into the command name. +fn resolve_manpage_cmd_name(file: &Path, contents: &str) -> String { + let fallback = cmd_name_of_manpage(file); + let max_words = fallback.matches('-').count() + 1; + match extract_synopsis_command(contents) { + Some(name) => { + let words: Vec<&str> = name.split(' ').filter(|w| !w.is_empty()).collect(); + if words.len() > max_words { + words[..max_words].join(" ") + } else { + name + } + } + None => fallback, + } +} + +type NamedManpageResult = (String, ManpageResult); +type ProcessedManpage = (String, ManpageResult, Vec); + +/// process a manpage and return (cmd_name, main_result, per-subcommand results). +/// the sub_results come from clap-style `.SH SUBCOMMAND` sections — each is +/// a self-contained command with its own flags. +fn process_manpage(file: &Path) -> Option { + let contents = read_manpage_file(file).ok()?; + let (mut result, sub_sections) = parse_manpage_with_subs(&contents); + if result.entries.is_empty() && result.subcommands.is_empty() && sub_sections.is_empty() { + return None; + } + let name = resolve_manpage_cmd_name(file, &contents); + if name.is_empty() { + return None; + } + strip_manpage_subcmd_prefixes(&mut result, file, &name); + // namespace the sub-section names under the resolved cmd name: + // e.g. nh's SUBCOMMAND "os" becomes the stored command "nh os". 
+ let subs: Vec<(String, ManpageResult)> = sub_sections + .into_iter() + .map(|(sub_name, sub_result)| (format!("{name} {sub_name}"), sub_result)) + .collect(); + Some((name, result, subs)) +} + +fn list_manpages(mandirs: &[PathBuf]) -> Vec { + let mut out = Vec::new(); + for mandir in mandirs { + for section in COMMAND_SECTIONS { + let secdir = mandir.join(format!("man{section}")); + if let Ok(entries) = fs::read_dir(&secdir) { + for entry in entries.flatten() { + out.push(entry.path()); + } + } + } + } + out +} + +// --- index command --- + +fn load_ignorelist(path: &Path) -> HashSet { + let mut out = HashSet::new(); + if let Ok(contents) = fs::read_to_string(path) { + for line in contents.lines() { + let line = line.trim(); + if !line.is_empty() && !line.starts_with('#') { + out.insert(line.to_string()); + } + } + } + out +} + +fn list_binaries(bindirs: &[PathBuf]) -> Vec<(String, PathBuf)> { + let mut all: Vec<(String, PathBuf)> = Vec::new(); + let mut seen: HashSet = HashSet::new(); + for bd in bindirs { + let Ok(entries) = fs::read_dir(bd) else { + continue; + }; + for entry in entries.flatten() { + let path = entry.path(); + let Some(name) = path.file_name().and_then(|s| s.to_str()) else { + continue; + }; + if skip_name(name) || is_nushell_builtin(name) { + continue; + } + if !is_executable(&path) { + continue; + } + if seen.insert(name.to_string()) { + all.push((name.to_string(), path)); + } + } + } + all.sort_by(|a, b| a.0.cmp(&b.0)); + all +} + +fn manpage_name_has_installed_command(name: &str, binary_names: &HashSet) -> bool { + if binary_names.contains(name) { + return true; + } + name.split_once(' ') + .map(|(parent, _)| binary_names.contains(parent)) + .unwrap_or(false) +} + +#[cfg(test)] +mod main_tests { + use super::*; + + #[test] + fn manpage_names_must_match_installed_binary_or_subcommand_parent() { + let binary_names = HashSet::from(["git".to_string(), "getent".to_string()]); + + assert!(manpage_name_has_installed_command("git", 
&binary_names)); + assert!(manpage_name_has_installed_command("git add", &binary_names)); + assert!(manpage_name_has_installed_command( + "getent passwd", + &binary_names + )); + assert!(!manpage_name_has_installed_command("ld.so", &binary_names)); + assert!(!manpage_name_has_installed_command( + "git-add", + &binary_names + )); + } +} + +/// shared state passed to every pool worker. nothing inside mutates +/// except `indexed`, which is wrapped in a parking_lot::Mutex. +struct ScrapeCtx { + cache_dir: PathBuf, + mandirs: Vec, + help_only: HashSet, + indexed: Mutex>, + timeout_ms: u64, +} + +#[derive(Debug)] +struct PoolJob { + bin_path: PathBuf, + /// the binary's basename — e.g. "git". stays constant across the + /// whole recursion tree for this binary. + base_cmd: String, + /// chain of subcommand tokens past the base. empty for the + /// top-level scrape, ["clone"] for `git clone`, ["stash","apply"] + /// for `git stash apply`. + sub_args: Vec, + depth: u32, +} + +impl PoolJob { + fn full_cmd(&self) -> String { + if self.sub_args.is_empty() { + self.base_cmd.clone() + } else { + format!("{} {}", self.base_cmd, self.sub_args.join(" ")) + } + } +} + +/// hyphenated form used to look up a manpage for a (possibly nested) +/// command — "git" for top-level, "git-remote" for `git remote`, +/// "git-stash-apply" for `git stash apply`. +fn hyphenated_cmd(job: &PoolJob) -> String { + if job.sub_args.is_empty() { + job.base_cmd.clone() + } else { + format!("{}-{}", job.base_cmd, job.sub_args.join("-")) + } +} + +/// some manpages list subcommands with the parent's name as a prefix — +/// git.1 has \fBgit-add\fR(1), \fBgit-remote-ext\fR(1), etc. downstream +/// expects bare subcommand names ("add", "remote-ext") so they dispatch +/// as `git add` / `git remote-ext`. strips a leading "{base}-" wherever +/// present; a no-op when the manpage already uses bare names. 
+fn strip_subcmd_prefix(result: &mut ManpageResult, base: &str) { + let prefix = format!("{base}-"); + for sc in &mut result.subcommands { + if let Some(rest) = sc.name.strip_prefix(&prefix) { + sc.name = rest.to_string(); + } + } +} + +fn strip_manpage_subcmd_prefixes(result: &mut ManpageResult, file: &Path, cmd_name: &str) { + let filename_base = cmd_name_of_manpage(file); + if !filename_base.is_empty() { + strip_subcmd_prefix(result, &filename_base); + } + let hyphenated_cmd = cmd_name.replace(' ', "-"); + if !hyphenated_cmd.is_empty() && hyphenated_cmd != filename_base { + strip_subcmd_prefix(result, &hyphenated_cmd); + } +} + +/// enqueue child jobs for each discovered subcommand. shared between the +/// manpage and help branches of process_pool_job. +fn enqueue_subcommands( + job: &PoolJob, + subcommands: &[ManpageSubcommand], + submit: &Submitter, +) { + // matches the sequential recurse_subcommand depth check (`depth > MAX`), + // not `>=`, so we get 6 levels (0..=5) of recursion. without this we + // were cutting off the last layer of deep clap trees like jay. + if job.depth > MAX_RECURSE_DEPTH { + return; + } + for sc in subcommands { + if sc.name.len() < 2 || sc.name.starts_with('-') || sc.name == "help" { + continue; + } + let mut next = job.sub_args.clone(); + next.push(sc.name.clone()); + submit.submit(PoolJob { + bin_path: job.bin_path.clone(), + base_cmd: job.base_cmd.clone(), + sub_args: next, + depth: job.depth + 1, + }); + } +} + +/// per-job handler called by every worker. populates the cache + enqueues +/// child jobs (one per discovered subcommand) onto the same pool. +/// +/// source priority is: (1) native completions, (2) manpage, (3) --help. +/// --help text is fetched at step 1 only as a probe for the completions +/// subcommand; it is not mined for content unless steps 1 and 2 both miss. 
+fn process_pool_job(ctx: &ScrapeCtx, job: PoolJob, submit: &Submitter) { + let full_cmd = job.full_cmd(); + if ctx.indexed.lock().contains(&full_cmd) { + return; + } + let bin_s = job.bin_path.to_string_lossy().to_string(); + + // 1. native completions (top-level only — sub-commands don't ship + // their own completion payloads). classify_binary scans the ELF for + // "complet" needles, and try_native_completion confirms by invoking + // the completions subcommand. + if job.sub_args.is_empty() { + let class = classify_binary(&job.bin_path, &job.bin_path); + if matches!(class, Classify::Skip) { + return; + } + if matches!(class, Classify::HasNativeCompletions) + && let Some(nu) = try_native_completion(&job.bin_path, ctx.timeout_ms) + { + let _ = write_native(&ctx.cache_dir, &full_cmd, &nu); + ctx.indexed.lock().insert(full_cmd); + return; + } + } + + // 2. manpage as primary content source — structured documentation + // over the curated --help summary. + if !ctx.help_only.contains(&job.base_cmd) && !ctx.help_only.contains(&full_cmd) { + let hyphenated = hyphenated_cmd(&job); + if let Some(mp_path) = find_manpage_path(&ctx.mandirs, &hyphenated) + && let Ok(contents) = read_manpage_file(&mp_path) + { + let mut mp_result = parse_manpage_string(&contents); + if !mp_result.entries.is_empty() || !mp_result.subcommands.is_empty() { + strip_subcmd_prefix(&mut mp_result, &hyphenated); + let _ = write_result(&ctx.cache_dir, &full_cmd, "manpage", &mp_result); + ctx.indexed.lock().insert(full_cmd); + enqueue_subcommands(&job, &mp_result.subcommands, submit); + return; + } + } + } + + // 3. fallback: scrape --help text for content. 
+ let text = if job.sub_args.is_empty() { + try_help(&job.bin_path, ctx.timeout_ms) + } else { + try_help_args(&bin_s, &job.sub_args, ctx.timeout_ms) + }; + let Some(text) = text else { return }; + + let result = parse_help_text(&text); + if result.entries.is_empty() && result.subcommands.is_empty() && result.positionals.is_empty() { + return; + } + + // self-listing detection for sub-probes: if the leaf token shows up in + // the result's subcommand list, the binary probably echoed the parent + // help (didn't recognize the token). discard. + if let Some(leaf) = job.sub_args.last() + && result + .subcommands + .iter() + .any(|sc| sc.name.eq_ignore_ascii_case(leaf)) + { + return; + } + + let _ = write_result(&ctx.cache_dir, &full_cmd, "help", &result); + ctx.indexed.lock().insert(full_cmd); + enqueue_subcommands(&job, &result.subcommands, submit); +} + +fn cmd_index( + bindirs: &[PathBuf], + mandirs: &[PathBuf], + ignorelist: &HashSet, + help_only: &HashSet, + dir: &Path, + timeout_ms: u64, + num_workers: usize, +) -> std::io::Result<()> { + ensure_dir(dir)?; + let binaries = list_binaries(bindirs); + let binary_names: HashSet = binaries + .iter() + .filter(|(name, _)| !ignorelist.contains(name)) + .map(|(name, _)| name.clone()) + .collect(); + + // phase 1: parallel scrape of every eligible binary via the BFS pool. + // shared state lives in an Arc; the `indexed` set is the + // one mutable bit and uses parking_lot::Mutex. 
+ let ctx = Arc::new(ScrapeCtx { + cache_dir: dir.to_path_buf(), + mandirs: mandirs.to_vec(), + help_only: help_only.clone(), + indexed: Mutex::new(HashSet::new()), + timeout_ms, + }); + let pool = ScrapePool::new(num_workers, { + let ctx = ctx.clone(); + move |job: PoolJob, submit: &Submitter| { + process_pool_job(&ctx, job, submit); + } + }); + for (name, path) in &binaries { + if ignorelist.contains(name) { + continue; + } + pool.submit(PoolJob { + bin_path: path.clone(), + base_cmd: name.clone(), + sub_args: Vec::new(), + depth: 0, + }); + } + pool.wait(); + // unwrap the indexed set back out for phase 2 — by this point no + // workers are alive so the Arc has only one strong reference. + let mut indexed: HashSet = Arc::try_unwrap(ctx) + .ok() + .map(|c| c.indexed.into_inner()) + .unwrap_or_default(); + + // process manpages for commands not yet indexed (unless they're in help-only). + // shorter filenames sort first so parent manpages (e.g. nix-env.1) are + // processed before subpage manpages (nix-env-install.1). 
+ let mut manpages = list_manpages(mandirs); + manpages.sort_by(|a, b| { + let alen = a.file_name().map(|s| s.len()).unwrap_or(0); + let blen = b.file_name().map(|s| s.len()).unwrap_or(0); + alen.cmp(&blen).then_with(|| a.cmp(b)) + }); + for manpage_path in manpages { + let Some((name, result, sub_sections)) = process_manpage(&manpage_path) else { + continue; + }; + if !manpage_name_has_installed_command(&name, &binary_names) { + continue; + } + let base_cmd = cmd_name_of_manpage(&manpage_path); + if indexed.contains(&name) { + if name != base_cmd { + eprintln!( + "warning: {} extracted cmd \"{}\" (already indexed), skipping", + manpage_path + .file_name() + .and_then(|s| s.to_str()) + .unwrap_or(""), + name + ); + } + continue; + } + if help_only.contains(&name) { + continue; + } + if is_nushell_builtin(&name) { + continue; + } + // clap-style SUBCOMMAND sections produce real, fully-populated + // sub-files (each with its own flags + positionals); they take + // priority over COMMANDS-section leaf stubs. + write_result(dir, &name, "manpage", &result)?; + indexed.insert(name.clone()); + for (sub_cmd, sub_result) in &sub_sections { + if indexed.contains(sub_cmd) { + continue; + } + write_result(dir, sub_cmd, "manpage", sub_result)?; + indexed.insert(sub_cmd.clone()); + } + // for COMMANDS-section subcommands that aren't already covered by + // a SUBCOMMAND section (or a per-subcommand manpage), write a + // description-only stub so the completer treats them as leaves. + // a real per-subcommand manpage processed later will overwrite the + // stub since we deliberately don't add it to `indexed`. 
+ if sub_sections.is_empty() { + for sc in &result.subcommands { + let sub_cmd = format!("{name} {}", sc.name); + if indexed.contains(&sub_cmd) { + continue; + } + let stub = ManpageResult { + entries: Vec::new(), + subcommands: Vec::new(), + positionals: Default::default(), + description: sc.desc.clone(), + }; + write_result(dir, &sub_cmd, "manpage", &stub)?; + } + } + } + + println!("indexed {} commands into {}", indexed.len(), dir.display()); + Ok(()) +} + +// --- manpage subcommand --- + +fn cmd_manpage(file: &Path) -> std::io::Result<()> { + if let Some((name, result, sub_sections)) = process_manpage(file) { + print!("{}", generate_extern(&name, &result)); + for (sub_cmd, sub_result) in sub_sections { + print!("{}", generate_extern(&sub_cmd, &sub_result)); + } + } + Ok(()) +} + +fn cmd_manpage_dir(dir: &Path) -> std::io::Result<()> { + for section in COMMAND_SECTIONS { + let secdir = dir.join(format!("man{section}")); + let Ok(entries) = fs::read_dir(&secdir) else { + continue; + }; + for entry in entries.flatten() { + let path = entry.path(); + if let Some((name, result, sub_sections)) = process_manpage(&path) { + print!("{}", generate_extern(&name, &result)); + for (sub_cmd, sub_result) in sub_sections { + print!("{}", generate_extern(&sub_cmd, &sub_result)); + } + } + } + } + Ok(()) +} + +// --- query / dump / complete --- + +fn cmd_query(cmd: &str, dirs: &[PathBuf]) -> std::io::Result<()> { + match lookup_raw(dirs, cmd) { + Some(data) => { + print!("{data}"); + Ok(()) + } + None => { + eprintln!("not found: {cmd}"); + std::process::exit(1); + } + } +} + +fn cmd_dump(dirs: &[PathBuf]) { + let cmds = all_commands(dirs); + println!("{} commands", cmds.len()); + for cmd in &cmds { + let src = file_type_of(dirs, cmd).unwrap_or_else(|| "?".to_string()); + println!("{src:>8} {cmd}"); + } +} + +/// look up a command's path in $PATH. 
+fn find_in_path(name: &str) -> Option { + let path_var = std::env::var("PATH").ok()?; + for dir in path_var.split(':') { + let candidate = Path::new(dir).join(name); + if is_executable(&candidate) { + return Some(candidate); + } + } + None +} + +fn executable_span_path(span: &str) -> Option { + if !span.contains('/') { + return None; + } + let path = PathBuf::from(span); + is_executable(&path).then_some(path) +} + +fn command_name_for_path(path: &Path) -> Option { + path.file_name() + .and_then(|name| name.to_str()) + .filter(|name| !name.is_empty()) + .map(ToOwned::to_owned) +} + +/// compute completion match quality. zero means no match. +/// +/// scoring tiers: +/// - exact match: 1000 +/// - prefix match: 900 + length bonus +/// - subsequence match: per-character score with bonuses for word boundaries +/// and consecutive matches +fn fuzzy_score(needle: &str, haystack: &str) -> i32 { + let needle_len = needle.chars().count(); + let haystack_len = haystack.chars().count(); + if needle_len == 0 { + return 1; + } + if needle_len > haystack_len { + return 0; + } + if needle == haystack { + return 1000; + } + + let needle_lc = needle.to_ascii_lowercase(); + let haystack_lc = haystack.to_ascii_lowercase(); + if haystack_lc.starts_with(&needle_lc) { + return 900 + (needle_len as i32 * 100 / haystack_len as i32); + } + + let needle_chars: Vec = needle_lc.chars().collect(); + let haystack_chars: Vec = haystack.chars().collect(); + let haystack_lc_chars: Vec = haystack_lc.chars().collect(); + + let mut needle_idx = 0usize; + let mut score = 0i32; + let mut prev_match: Option = None; + + for (hay_idx, c) in haystack_lc_chars.iter().enumerate() { + if needle_idx >= needle_len { + break; + } + if *c != needle_chars[needle_idx] { + continue; + } + + let boundary = hay_idx == 0 + || haystack_chars[hay_idx - 1] == '-' + || haystack_chars[hay_idx - 1] == '_' + || (haystack_chars[hay_idx - 1].is_ascii_lowercase() + && haystack_chars[hay_idx].is_ascii_uppercase()); + let 
consecutive = prev_match == Some(hay_idx.saturating_sub(1)); + score += if boundary { 50 } else { 10 }; + if consecutive { + score += 20; + } + needle_idx += 1; + prev_match = Some(hay_idx); + } + + if needle_idx == needle_len { score } else { 0 } +} + +fn json_escape(s: &str) -> String { + let mut out = String::with_capacity(s.len() + 2); + for c in s.chars() { + match c { + '"' => out.push_str("\\\""), + '\\' => out.push_str("\\\\"), + '\n' => out.push_str("\\n"), + '\r' => out.push_str("\\r"), + '\t' => out.push_str("\\t"), + c if (c as u32) < 0x20 => out.push_str(&format!("\\u{:04x}", c as u32)), + c => out.push(c), + } + } + out +} + +fn completion_json(value: &str, desc: &str) -> String { + format!( + r#"{{"value":"{}","description":"{}"}}"#, + json_escape(value), + json_escape(desc) + ) +} + +fn print_completion_candidates(candidates: &[String]) { + if candidates.is_empty() { + println!("null"); + } else { + println!("[{}]", candidates.join(",")); + } +} + +#[derive(Clone, Debug)] +struct AdbDevice { + serial: String, + desc: String, + transport_id: Option, +} + +enum AdbDeviceCompletion { + Serial { + prefix: String, + replacement_prefix: String, + }, + TransportId { + prefix: String, + replacement_prefix: String, + }, +} + +fn adb_device_completion(rest: &[String]) -> Option { + if !adb_command_tokens(rest).is_empty() { + return None; + } + let current = rest.last().map(String::as_str).unwrap_or(""); + if let Some(prefix) = current.strip_prefix("--serial=") { + return Some(AdbDeviceCompletion::Serial { + prefix: prefix.to_string(), + replacement_prefix: "--serial=".to_string(), + }); + } + if let Some(prefix) = current.strip_prefix("--one-device=") { + return Some(AdbDeviceCompletion::Serial { + prefix: prefix.to_string(), + replacement_prefix: "--one-device=".to_string(), + }); + } + if let Some(prefix) = current.strip_prefix("--transport-id=") { + return Some(AdbDeviceCompletion::TransportId { + prefix: prefix.to_string(), + replacement_prefix: 
"--transport-id=".to_string(), + }); + } + if rest.len() >= 2 { + let prev = rest[rest.len() - 2].as_str(); + if prev == "-s" || prev == "--serial" || prev == "--one-device" { + return Some(AdbDeviceCompletion::Serial { + prefix: current.to_string(), + replacement_prefix: String::new(), + }); + } + if prev == "-t" || prev == "--transport-id" { + return Some(AdbDeviceCompletion::TransportId { + prefix: current.to_string(), + replacement_prefix: String::new(), + }); + } + } + None +} + +fn parse_adb_devices(output: &str) -> Vec { + let mut out = Vec::new(); + for line in output.lines() { + let trimmed = line.trim(); + if trimmed.is_empty() + || trimmed.starts_with('*') + || trimmed.eq_ignore_ascii_case("List of devices attached") + { + continue; + } + + let parts: Vec<&str> = trimmed.split_whitespace().collect(); + if parts.len() < 2 { + continue; + } + let serial = parts[0]; + let state = if parts.get(1) == Some(&"no") && parts.get(2) == Some(&"permissions") { + "no permissions" + } else { + parts[1] + }; + if serial.eq_ignore_ascii_case("list") { + continue; + } + if !is_adb_device_state(state) { + continue; + } + + let mut details = Vec::new(); + let mut transport_id = None; + let detail_start = if state == "no permissions" { 3 } else { 2 }; + for part in parts.iter().skip(detail_start) { + if let Some(model) = part.strip_prefix("model:") { + details.push(model.replace('_', " ")); + } else if let Some(product) = part.strip_prefix("product:") { + details.push(product.replace('_', " ")); + } else if let Some(id) = part.strip_prefix("transport_id:") { + transport_id = Some(id.to_string()); + } + } + let desc = if details.is_empty() { + state.to_string() + } else { + format!("{state} {}", details.join(" ")) + }; + out.push(AdbDevice { + serial: serial.to_string(), + desc, + transport_id, + }); + } + out +} + +fn is_adb_device_state(state: &str) -> bool { + matches!( + state, + "device" + | "offline" + | "unauthorized" + | "recovery" + | "sideload" + | "rescue" + | "no 
permissions" + ) +} + +fn adb_device_candidates( + path: &Path, + completion: AdbDeviceCompletion, + timeout_ms: u64, +) -> Vec { + let args = vec![ + path.to_string_lossy().to_string(), + "devices".to_string(), + "-l".to_string(), + ]; + let Some(output) = run_cmd(&args, timeout_ms) else { + return Vec::new(); + }; + let mut scored = Vec::new(); + for device in parse_adb_devices(&output) { + match &completion { + AdbDeviceCompletion::Serial { + prefix, + replacement_prefix, + } => { + let score = prefix_score(prefix, &device.serial); + if score > 0 { + scored.push(( + score, + completion_json( + &format!("{replacement_prefix}{}", &device.serial), + &device.desc, + ), + )); + } + } + AdbDeviceCompletion::TransportId { + prefix, + replacement_prefix, + } => { + if let Some(id) = &device.transport_id { + let score = prefix_score(prefix, id); + if score > 0 { + scored.push(( + score, + completion_json( + &format!("{replacement_prefix}{id}"), + &format!("{} {}", &device.serial, &device.desc), + ), + )); + } + } + } + } + } + scored.sort_by(|a, b| b.0.cmp(&a.0)); + scored.into_iter().map(|(_, json)| json).collect() +} + +fn prefix_score(prefix: &str, value: &str) -> i32 { + if prefix.is_empty() { + return 1; + } + let prefix = prefix.to_ascii_lowercase(); + let value = value.to_ascii_lowercase(); + if prefix == value { + 1000 + } else if value.starts_with(&prefix) { + 900 + } else { + 0 + } +} + +fn adb_selector_args(rest: &[String]) -> Vec { + let mut out = Vec::new(); + let mut i = 0; + while i < rest.len() { + let token = rest[i].as_str(); + if matches!(token, "-s" | "--serial" | "-t" | "--transport-id") { + if i + 1 < rest.len() && !rest[i + 1].is_empty() { + out.push(rest[i].clone()); + out.push(rest[i + 1].clone()); + i += 2; + continue; + } + } else if (token.starts_with("--serial=") || token.starts_with("--transport-id=")) + && !token.ends_with('=') + { + out.push(rest[i].clone()); + } + i += 1; + } + out +} + +fn adb_command_tokens(rest: &[String]) -> Vec<&str> 
{ + let mut out = Vec::new(); + let mut i = 0; + while i < rest.len() { + let token = rest[i].as_str(); + if matches!( + token, + "-s" | "--serial" | "-t" | "--transport-id" | "--one-device" + ) { + i += if i + 1 < rest.len() { 2 } else { 1 }; + continue; + } + if token.starts_with("--serial=") + || token.starts_with("--transport-id=") + || token.starts_with("--one-device=") + { + i += 1; + continue; + } + out.push(token); + i += 1; + } + out +} + +fn adb_package_completion_prefix(rest: &[String]) -> Option<&str> { + let tokens = adb_command_tokens(rest); + let first = *tokens.first()?; + if first == "uninstall" { + return package_prefix_for_arg_tail(&tokens[1..], &["--user"]); + } + if tokens.len() >= 4 && tokens[0] == "shell" && tokens[1] == "pm" { + let action = tokens[2]; + if matches!(action, "clear" | "disable-user" | "enable") { + return package_prefix_for_arg_tail(&tokens[3..], &["--user"]); + } + } + if tokens.len() >= 4 && tokens[0] == "shell" && tokens[1] == "am" && tokens[2] == "force-stop" { + return package_prefix_for_arg_tail(&tokens[3..], &["--user"]); + } + None +} + +fn package_prefix_for_arg_tail<'a>(args: &[&'a str], value_flags: &[&str]) -> Option<&'a str> { + let current = *args.last()?; + if current.starts_with('-') { + return None; + } + if args.len() >= 2 && value_flags.contains(&args[args.len() - 2]) { + return None; + } + let mut positional_count = 0usize; + let mut i = 0usize; + let end = args.len().saturating_sub(1); + while i < end { + let token = args[i]; + if token.starts_with('-') { + i += if value_flags.contains(&token) && i + 1 < end { + 2 + } else { + 1 + }; + } else { + positional_count += 1; + i += 1; + } + } + (positional_count == 0).then_some(current) +} + +fn parse_adb_packages(output: &str) -> Vec { + let mut out = Vec::new(); + for line in output.lines() { + let Some(package) = line.trim().strip_prefix("package:") else { + continue; + }; + let package = package + .rsplit_once('=') + .map(|(_, rhs)| rhs) + 
.unwrap_or(package); + let package = package.trim(); + if !package.is_empty() { + out.push(package.to_string()); + } + } + out +} + +fn adb_package_candidates( + path: &Path, + selector_args: &[String], + prefix: &str, + timeout_ms: u64, +) -> Vec { + let mut args = vec![path.to_string_lossy().to_string()]; + args.extend(selector_args.iter().cloned()); + args.extend( + ["shell", "pm", "list", "packages"] + .into_iter() + .map(str::to_string), + ); + let Some(output) = run_cmd(&args, timeout_ms) else { + return Vec::new(); + }; + let mut scored = Vec::new(); + for package in parse_adb_packages(&output) { + let score = prefix_score(prefix, &package); + if score > 0 { + scored.push((score, completion_json(&package, "package"))); + } + } + scored.sort_by(|a, b| b.0.cmp(&a.0)); + scored.into_iter().map(|(_, json)| json).collect() +} + +fn dynamic_value_completions( + cmd_name: &str, + rest: &[String], + explicit_cmd_path: Option<&Path>, + timeout_ms: u64, +) -> Option> { + if cmd_name != "adb" { + return None; + } + let path = explicit_cmd_path + .map(Path::to_path_buf) + .or_else(|| find_in_path(cmd_name))?; + if let Some(completion) = adb_device_completion(rest) { + return Some(adb_device_candidates(&path, completion, timeout_ms)); + } + if let Some(prefix) = adb_package_completion_prefix(rest) { + let selectors = adb_selector_args(rest); + return Some(adb_package_candidates( + &path, &selectors, prefix, timeout_ms, + )); + } + None +} + +/// dynamically scrape --help for a command not in the cache, write the result +/// into the user store, and return its parsed form. discovered subcommands +/// are also written. 
+fn resolve_and_cache( + user_dir: &Path, + mandirs: &[PathBuf], + cmd_name: &str, + path: &Path, + timeout_ms: u64, +) -> Option { + resolve_command_path_and_cache(user_dir, mandirs, cmd_name, &[], path, timeout_ms) +} + +fn resolve_command_path_and_cache( + user_dir: &Path, + mandirs: &[PathBuf], + base_cmd: &str, + sub_args: &[String], + path: &Path, + timeout_ms: u64, +) -> Option { + let full_cmd = if sub_args.is_empty() { + base_cmd.to_string() + } else { + format!("{base_cmd} {}", sub_args.join(" ")) + }; + let hyphenated = if sub_args.is_empty() { + base_cmd.to_string() + } else { + format!("{base_cmd}-{}", sub_args.join("-")) + }; + + // 1. native completions + if matches!(classify_binary(path, path), Classify::HasNativeCompletions) + && let Some(nu) = try_native_completion(path, timeout_ms) + { + let _ = write_native(user_dir, base_cmd, &nu); + return Some(parse_nu_completions(&full_cmd, &nu)); + } + // 2. manpage as primary content source. + if let Some(mp_path) = find_manpage_path(mandirs, &hyphenated) + && let Ok(contents) = read_manpage_file(&mp_path) + { + let mut result = parse_manpage_string(&contents); + if !result.entries.is_empty() || !result.subcommands.is_empty() { + strip_subcmd_prefix(&mut result, &hyphenated); + let _ = write_result(user_dir, &full_cmd, "manpage", &result); + return Some(result); + } + } + // 3. fallback: scrape --help text. 
+ let text = if sub_args.is_empty() { + try_help(path, timeout_ms) + } else { + let bin_s = path.to_string_lossy().to_string(); + try_help_args(&bin_s, sub_args, timeout_ms) + }?; + let parsed = parse_help_text(&text); + if parsed.entries.is_empty() && parsed.subcommands.is_empty() && parsed.positionals.is_empty() { + return None; + } + if let Some(leaf) = sub_args.last() + && parsed + .subcommands + .iter() + .any(|sc| sc.name.eq_ignore_ascii_case(leaf)) + { + return None; + } + + let _ = write_result(user_dir, &full_cmd, "help", &parsed); + if sub_args.is_empty() { + let mut sub_acc: Vec<(String, ManpageResult)> = Vec::new(); + help_resolve(path, base_cmd, 1, timeout_ms, &mut sub_acc); + for (cmd, r) in sub_acc.into_iter().skip(1) { + let _ = write_result(user_dir, &cmd, "help", &r); + } + } else { + let bin_s = path.to_string_lossy().to_string(); + let inner_subs: Vec = parsed + .subcommands + .iter() + .map(|sc| sc.name.clone()) + .filter(|n| n.len() >= 2 && !n.starts_with('-') && n != "help") + .collect(); + let mut sub_acc: Vec<(String, ManpageResult)> = Vec::new(); + for sub in inner_subs { + let mut next = sub_args.to_vec(); + next.push(sub); + recurse_subcommand( + &bin_s, + base_cmd, + &next, + sub_args.len() as u32 + 2, + timeout_ms, + &mut sub_acc, + ); + } + for (cmd, r) in sub_acc { + let _ = write_result(user_dir, &cmd, "help", &r); + } + } + Some(parsed) +} + +const ELEVATION_COMMANDS: &[&str] = &["sudo", "doas", "pkexec", "su", "run0"]; + +fn cmd_complete( + spans: &[String], + user_dir: &Path, + system_dirs: &[PathBuf], + mandirs: &[PathBuf], + timeout_ms: u64, +) { + let mut dirs: Vec = system_dirs.to_vec(); + dirs.push(user_dir.to_path_buf()); + + // skip past elevation wrappers (sudo, doas) to find the real command + let mut explicit_cmd_path: Option = None; + let mut spans: Vec = match spans.first() { + Some(first) if ELEVATION_COMMANDS.contains(&first.as_str()) => { + let rest = &spans[1..]; + let mut real_spans = None; + for (idx, s) in 
rest.iter().enumerate() { + if let Some(path) = executable_span_path(s) + && let Some(name) = command_name_for_path(&path) + { + let mut target = rest[idx..].to_vec(); + target[0] = name; + explicit_cmd_path = Some(path); + real_spans = Some(target); + break; + } + if !s.is_empty() + && !s.starts_with('-') + && (lookup(&dirs, s).is_some() || find_in_path(s).is_some()) + { + real_spans = Some(rest[idx..].to_vec()); + break; + } + } + real_spans.unwrap_or_else(|| spans.to_vec()) + } + _ => spans.to_vec(), + }; + if explicit_cmd_path.is_none() + && let Some(first) = spans.first() + && let Some(path) = executable_span_path(first) + && let Some(name) = command_name_for_path(&path) + { + spans[0] = name; + explicit_cmd_path = Some(path); + } + + if spans.is_empty() { + println!("null"); + return; + } + + let cmd_name = spans[0].clone(); + let rest: Vec = spans[1..].to_vec(); + + if let Some(candidates) = + dynamic_value_completions(&cmd_name, &rest, explicit_cmd_path.as_deref(), timeout_ms) + { + print_completion_candidates(&candidates); + return; + } + + // strip intermediate flag tokens — they aren't part of subcommand path + let mut tokens: Vec = vec![cmd_name.clone()]; + if !rest.is_empty() { + let (last, leading) = rest.split_last().unwrap(); + for t in leading { + if !t.starts_with('-') || t.is_empty() { + tokens.push(t.clone()); + } + } + tokens.push(last.clone()); + } + + let last_token = rest.last().cloned().unwrap_or_default(); + // lookup tokens exclude the partial unless the user has typed a trailing space + let lookup_tokens: Vec = if last_token.is_empty() { + tokens.clone() + } else if tokens.len() > 1 { + tokens[..tokens.len() - 1].to_vec() + } else { + vec![cmd_name.clone()] + }; + + // try longest-prefix match: "git stash apply" → "git stash" → "git" + let find_result = |toks: &[String]| -> Option<(String, ManpageResult, usize)> { + let n = toks.len(); + for drop in 0..n { + let prefix = &toks[..n - drop]; + if prefix.is_empty() { + continue; + } + let 
name = prefix.join(" "); + if let Some(r) = lookup(&dirs, &name) { + return Some((name, r, prefix.len())); + } + } + None + }; + + let mut found = find_result(&lookup_tokens); + + // dynamic resolve: if nothing matches or only a parent matched, try --help + let resolve_tokens: Vec = lookup_tokens + .iter() + .filter(|t| !t.is_empty()) + .cloned() + .collect(); + let lookup_depth = lookup_tokens.len(); + let resolve_depth = resolve_tokens.len(); + let need_resolve = match &found { + Some((_, _, depth)) => *depth < resolve_depth, + None => resolve_depth > 0, + }; + if need_resolve + && let Some(path) = explicit_cmd_path + .as_ref() + .cloned() + .or_else(|| find_in_path(&cmd_name)) + { + // build extended mandirs from the binary's own prefix as well + let mut all_mandirs = mandirs.to_vec(); + if let Some(parent) = path.parent() + && let Some(prefix) = parent.parent() + { + let share_man = prefix.join("share/man"); + if share_man.is_dir() { + all_mandirs.push(share_man); + } + } + let sub_args = if resolve_tokens.len() > 1 { + resolve_tokens[1..].to_vec() + } else { + Vec::new() + }; + let resolved = if sub_args.is_empty() { + resolve_and_cache(user_dir, &all_mandirs, &cmd_name, &path, timeout_ms) + } else { + resolve_command_path_and_cache( + user_dir, + &all_mandirs, + &cmd_name, + &sub_args, + &path, + timeout_ms, + ) + }; + if resolved.is_some() { + found = find_result(&lookup_tokens); + } + } + + let typing_flag = last_token.starts_with('-') && !last_token.is_empty(); + let candidates: Vec = match &found { + None => Vec::new(), + Some((matched_name, r, depth)) => { + let mut scored: Vec<(i32, String)> = Vec::new(); + // subcommand candidates (skip if match is too shallow) + if *depth >= lookup_depth.saturating_sub(1) { + let subs: Vec = if !r.subcommands.is_empty() { + r.subcommands.clone() + } else { + subcommands_of(&dirs, matched_name) + }; + for sc in &subs { + let s = fuzzy_score(&last_token, &sc.name); + if s > 0 { + scored.push((s, 
completion_json(&sc.name, &sc.desc))); + } + } + } + // flag candidates + if typing_flag { + for e in &r.entries { + let base_desc = match &e.param { + Some(OwnedParam::Mandatory(p)) => { + if e.desc.is_empty() { + format!("<{p}>") + } else { + format!("{} <{p}>", e.desc) + } + } + Some(OwnedParam::Optional(p)) => { + if e.desc.is_empty() { + format!("[{p}]") + } else { + format!("{} [{p}]", e.desc) + } + } + None => e.desc.clone(), + }; + let (flag, desc) = match &e.switch { + OwnedSwitch::Long(l) => (format!("--{l}"), base_desc), + OwnedSwitch::Short(c) => (format!("-{c}"), base_desc), + OwnedSwitch::Both(c, l) => { + let long_flag = format!("--{l}"); + let short_flag = format!("-{c}"); + let ls = fuzzy_score(&last_token, &long_flag); + let ss = fuzzy_score(&last_token, &short_flag); + if ss > ls { + (short_flag, format!("(aka {long_flag}) {base_desc}")) + } else { + (long_flag.clone(), format!("(aka {short_flag}) {base_desc}")) + } + } + }; + let s = fuzzy_score(&last_token, &flag); + if s > 0 { + scored.push((s, completion_json(&flag, &desc))); + } + } + } + scored.sort_by(|a, b| b.0.cmp(&a.0)); + scored.into_iter().map(|(_, json)| json).collect() + } + }; + + // protocol: null = hand off to nushell's file completer; [...] = our candidates + let has_subs = match &found { + Some((matched_name, r, _)) => { + !r.subcommands.is_empty() || !subcommands_of(&dirs, matched_name).is_empty() + } + None => false, + }; + // hand off at non-flag leaf positions so file and dynamic completers can + // answer argument prefixes. when the token starts with "-", keep flags. + let want_files = !typing_flag && !has_subs && (last_token.is_empty() || candidates.is_empty()); + if want_files || candidates.is_empty() { + println!("null"); + } else { + print_completion_candidates(&candidates); + } +} + +// --- completions self-emission --- + +fn cmd_completions() { + // emit completions for inshellah itself. 
+ let entries: Vec = vec![ManpageEntry { + switch: OwnedSwitch::Both('h', "help".to_string()), + param: None, + desc: "show help".to_string(), + }]; + let subs = [ + "index", + "manpage", + "manpage-dir", + "complete", + "query", + "dump", + "completions", + ]; + let mut subcommands = Vec::new(); + for s in subs { + subcommands.push(ManpageSubcommand { + name: s.to_string(), + desc: String::new(), + }); + } + let result = ManpageResult { + entries, + subcommands, + positionals: Default::default(), + description: "nushell completions engine".to_string(), + }; + print!("{}", generate_module("inshellah", &result)); +} + +// --- argument parsing --- + +struct IndexArgs { + prefixes: Vec, + dir: Option, + ignore: Option, + help_only: Option, + timeout_ms: u64, + workers: usize, +} + +fn parse_index_args(args: &[String]) -> IndexArgs { + let mut out = IndexArgs { + prefixes: Vec::new(), + dir: None, + ignore: None, + help_only: None, + timeout_ms: DEFAULT_TIMEOUT_MS, + workers: default_workers(), + }; + let mut i = 0; + while i < args.len() { + match args[i].as_str() { + "--dir" => { + i += 1; + if i < args.len() { + out.dir = Some(PathBuf::from(&args[i])); + } + } + "--ignore" => { + i += 1; + if i < args.len() { + out.ignore = Some(PathBuf::from(&args[i])); + } + } + "--help-only" => { + i += 1; + if i < args.len() { + out.help_only = Some(PathBuf::from(&args[i])); + } + } + "--timeout-ms" => { + i += 1; + if i < args.len() + && let Ok(n) = args[i].parse::() + { + out.timeout_ms = n; + } + } + "--workers" => { + i += 1; + if i < args.len() + && let Ok(n) = args[i].parse::() + { + out.workers = n.max(1); + } + } + other => { + out.prefixes.push(PathBuf::from(other)); + } + } + i += 1; + } + out +} + +/// best-effort thread count default: `available_parallelism` (1.59+), else 4. 
+fn default_workers() -> usize { + std::thread::available_parallelism() + .map(|n| n.get()) + .unwrap_or(4) +} + +fn man_dir_of_prefix(prefix: &Path) -> PathBuf { + prefix.join("share/man") +} + +/// parse --dir PATH[:PATH...], optional --timeout-ms N, plus any +/// positional args. when --dir isn't supplied, returns the default cache +/// dir as the single entry. +fn parse_dir_args(args: &[String]) -> (Vec, Vec, u64) { + let mut positional = Vec::new(); + let mut dirs: Option> = None; + let mut timeout_ms = DEFAULT_TIMEOUT_MS; + let mut i = 0; + while i < args.len() { + match args[i].as_str() { + "--dir" => { + i += 1; + if i < args.len() { + dirs = Some(args[i].split(':').map(PathBuf::from).collect()); + } + } + "--timeout-ms" => { + i += 1; + if i < args.len() + && let Ok(n) = args[i].parse::() + { + timeout_ms = n; + } + } + _ => { + positional.push(args[i].clone()); + } + } + i += 1; + } + let dirs = dirs.unwrap_or_else(|| vec![default_store_path()]); + (positional, dirs, timeout_ms) +} + +fn main() { + let args: Vec = std::env::args().collect(); + if args.len() < 2 { + usage(); + std::process::exit(1); + } + match args[1].as_str() { + "index" => { + let parsed = parse_index_args(&args[2..]); + if parsed.prefixes.is_empty() { + eprintln!("error: index requires at least one PREFIX"); + std::process::exit(1); + } + let dir = parsed.dir.unwrap_or_else(default_store_path); + let ignorelist = parsed + .ignore + .as_deref() + .map(load_ignorelist) + .unwrap_or_default(); + let help_only = parsed + .help_only + .as_deref() + .map(load_ignorelist) + .unwrap_or_default(); + let bindirs: Vec = parsed.prefixes.iter().map(|p| p.join("bin")).collect(); + let mandirs: Vec = parsed + .prefixes + .iter() + .map(|p| man_dir_of_prefix(p)) + .collect(); + if let Err(e) = cmd_index( + &bindirs, + &mandirs, + &ignorelist, + &help_only, + &dir, + parsed.timeout_ms, + parsed.workers, + ) { + eprintln!("index failed: {e}"); + std::process::exit(1); + } + } + "manpage" => { + if 
args.len() < 3 { + eprintln!("error: manpage requires a FILE argument"); + std::process::exit(1); + } + if let Err(e) = cmd_manpage(Path::new(&args[2])) { + eprintln!("manpage failed: {e}"); + std::process::exit(1); + } + } + "manpage-dir" => { + if args.len() < 3 { + eprintln!("error: manpage-dir requires a DIR argument"); + std::process::exit(1); + } + if let Err(e) = cmd_manpage_dir(Path::new(&args[2])) { + eprintln!("manpage-dir failed: {e}"); + std::process::exit(1); + } + } + "complete" => { + let (positional, dirs, timeout_ms) = parse_dir_args(&args[2..]); + // first dir is the writable user cache; rest are read-only system dirs + let (user_dir, system_dirs): (PathBuf, Vec) = match dirs.split_first() { + Some((first, rest)) => (first.clone(), rest.to_vec()), + None => (default_store_path(), Vec::new()), + }; + // mandirs default to share/man siblings of each system dir + let mandirs: Vec = system_dirs + .iter() + .filter_map(|d| d.parent().map(|p| p.join("share/man"))) + .filter(|p| p.is_dir()) + .collect(); + cmd_complete(&positional, &user_dir, &system_dirs, &mandirs, timeout_ms); + } + "query" => { + let (positional, dirs, _timeout_ms) = parse_dir_args(&args[2..]); + if positional.is_empty() { + eprintln!("error: query requires a CMD argument"); + std::process::exit(1); + } + let cmd = positional.join(" "); + if let Err(e) = cmd_query(&cmd, &dirs) { + eprintln!("query failed: {e}"); + std::process::exit(1); + } + } + "dump" => { + let (_, dirs, _timeout_ms) = parse_dir_args(&args[2..]); + cmd_dump(&dirs); + } + "completions" => cmd_completions(), + "--help" | "-h" | "help" => usage(), + other => { + eprintln!("unknown subcommand: {other}"); + usage(); + std::process::exit(1); + } + } + // make warning go away + let _ = filename_of_command; +} diff --git a/src/parsers/help.rs b/src/parsers/help.rs new file mode 100644 index 0000000..ab88656 --- /dev/null +++ b/src/parsers/help.rs @@ -0,0 +1,187 @@ +mod description; +mod helpers; +mod options; +mod 
positionals; +mod subcommands; + +pub use options::{param_parser, parse_usage_flags, switch_parser}; +pub use positionals::{ + extract_cli11_positionals, extract_usage_positionals, parse_usage_args, skip_command_name, +}; + +use std::collections::HashMap; + +use crate::{ + parsers::help::{description::description, helpers::get_indent, subcommands::subcommand_entry}, + types::*, +}; +use nom::{IResult, Parser, character::complete::space0, combinator::opt}; + +use crate::make_parser; + +type EntryParts<'a> = ( + &'a str, + (Switch<'a>, Option>), + (&'a str, Vec<&'a str>), +); + +// parse a single flag entry: indent + switch + optional param + description. +make_parser!(entry -> OptionEntry<'a>, + ( + space0, + (switch_parser, opt(param_parser)), + description, + ) + => |(_, (switch, param), (first, cont)) + : EntryParts<'a>| + { + let mut desc: Vec<&str> = Vec::with_capacity(1 + cont.len()); + if !first.trim().is_empty() { desc.push(first); } + desc.extend(cont.into_iter().filter(|l| !l.trim().is_empty())); + OptionEntry { switch, param, desc } + } +); + +/// dedup raw subcommands by case-insensitive name, keeping the entry with +/// the longest description. preserves first-seen ordering. 
+fn dedup_subcommands<'a>(raw: Vec>) -> Vec> { + let mut by_name: HashMap> = HashMap::new(); + let mut order: Vec = Vec::new(); + for sc in raw { + let key = sc.name.to_ascii_lowercase(); + match by_name.get(&key) { + Some(prev) if prev.desc.len() >= sc.desc.len() => {} + _ => { + if !by_name.contains_key(&key) { + order.push(key.clone()); + } + by_name.insert(key, sc); + } + } + } + order + .into_iter() + .map(|k| by_name.remove(&k).unwrap()) + .collect() +} + +#[derive(Clone, Copy, PartialEq, Eq)] +enum HelpSection { + Unknown, + Options, + Commands, + Other, +} + +fn classify_section_line(line: &str) -> Option { + let (idx, indent) = get_indent(line); + if indent > 4 { + return None; + } + let trimmed = line[idx..].trim(); + if trimmed.is_empty() { + return None; + } + let without_colon = trimmed.trim_end_matches(':').trim(); + let lower = without_colon.to_ascii_lowercase(); + + if lower.starts_with("usage") { + return Some(HelpSection::Unknown); + } + if lower.starts_with("valid arguments") + || lower.contains(" is one of the following") + || lower.contains(" defaults to") + || lower == "examples" + || lower == "example" + { + return Some(HelpSection::Other); + } + let command_header = matches!(lower.as_str(), "command" | "commands" | "subcommands") + || lower.ends_with(" commands") + || lower.ends_with(" subcommands"); + if command_header && !lower.contains("option") && !lower.contains("flag") { + return Some(HelpSection::Commands); + } + if lower.contains("argument") + || lower == "args" + || lower == "positionals" + || lower == "positional arguments" + { + return Some(HelpSection::Other); + } + if lower.contains("option") || lower.contains("flag") || trimmed.ends_with(':') { + return Some(HelpSection::Options); + } + None +} + +fn consume_line(s: &str) -> &str { + match s.find('\n') { + Some(idx) => &s[idx + 1..], + None => "", + } +} + +fn parser_made_progress(original: &str, rem: &str) -> bool { + rem.len() < original.len() +} + +/// build the final 
HelpResult by scanning help text with lightweight section +/// awareness. options are accepted in option-like sections and before a +/// section is known; subcommands are accepted only in command-like sections. +fn build_help_result<'a>(original: &'a str) -> HelpResult<'a> { + let mut entries = Vec::new(); + let mut raw_subcommands: Vec> = Vec::new(); + let mut section = HelpSection::Unknown; + let mut rem = original; + + while !rem.is_empty() { + let line = rem.split_once('\n').map(|(line, _)| line).unwrap_or(rem); + if let Some(next_section) = classify_section_line(line) { + section = next_section; + rem = consume_line(rem); + continue; + } + + if matches!(section, HelpSection::Unknown | HelpSection::Options) + && let Ok((next, parsed)) = entry(rem) + && parser_made_progress(rem, next) + { + entries.push(parsed); + rem = next; + continue; + } + + if section == HelpSection::Commands + && let Ok((next, parsed)) = subcommand_entry(rem) + && parser_made_progress(rem, next) + { + raw_subcommands.push(parsed); + rem = next; + continue; + } + + rem = consume_line(rem); + } + + let subcommands = dedup_subcommands(raw_subcommands); + // cli11 positional section takes priority over the usage-line scan + // when both are present — cli11 carries types and optionality. + let positionals = match extract_cli11_positionals(original) { + Ok((_, p)) if !p.is_empty() => p, + _ => extract_usage_positionals(original) + .map(|(_, p)| p) + .unwrap_or_default(), + }; + HelpResult { + entries, + subcommands, + positionals, + desc: "", + } +} + +/// top-level help parser. 
+pub fn help_parser(s: &str) -> IResult<&str, HelpResult<'_>> { + Ok(("", build_help_result(s))) +} diff --git a/src/parsers/help/description.rs b/src/parsers/help/description.rs new file mode 100644 index 0000000..b10ab72 --- /dev/null +++ b/src/parsers/help/description.rs @@ -0,0 +1,37 @@ +use nom::{ + IResult, Parser, + character::complete::space0, + combinator::verify, + multi::many0, + sequence::{preceded, terminated}, +}; + +use crate::make_parser; +use crate::parsers::help::helpers::{at_least_indent, eol, rest_of_line}; + +// continuation line: an indented (≥8 visual cols), non-flag-shaped line +// belonging to the previous flag's description. blank-but-indented lines +// are accepted (content = ""), filtered out by the caller's join. +make_parser!(continuation_line -> &'a str, + verify( + preceded( + // assert ≥8 visual cols of leading horizontal whitespace + // without consuming — space0 inside `rest_of_line`'s preceded + // will eat them next. + at_least_indent(8), + terminated(preceded(space0, rest_of_line), eol) + ), + // reject lines whose first non-space char is '-' — that's a new + // flag entry, not a continuation of the previous one. + |content: &&str| !content.starts_with('-') + ) +); + +// description: the line of text after the switch+param, plus any +// continuation lines. always succeeds — first line may be empty (when +// the switch is followed immediately by a newline, "clap long" style). 
+make_parser!(pub description -> (&'a str, Vec<&'a str>), +( + terminated(preceded(space0, rest_of_line), eol), + many0(continuation_line), +)); diff --git a/src/parsers/help/helpers.rs b/src/parsers/help/helpers.rs new file mode 100644 index 0000000..c6892a3 --- /dev/null +++ b/src/parsers/help/helpers.rs @@ -0,0 +1,105 @@ +use nom::{ + AsChar, IResult, Parser, branch::alt, bytes::complete::take_till, + character::complete::line_ending, combinator::eof, +}; +#[allow(unused_imports)] +use nom::{bytes::complete::take_while, combinator::peek, combinator::verify}; + +#[macro_export] +macro_rules! make_parser { + (pub $name:ident -> $out:ty, $parser:expr => $wrap:expr) => { + #[allow(clippy::needless_lifetimes)] + #[allow(mismatched_lifetime_syntaxes)] + pub fn $name<'a>(s: &'a str) -> IResult<&'a str, $out> { + let (rem, val) = $parser.parse(s)?; + Ok((rem, $wrap(val))) + } + }; + (pub $name:ident -> $out:ty, $parser:expr) => { + #[allow(clippy::needless_lifetimes)] + #[allow(mismatched_lifetime_syntaxes)] + pub fn $name<'a>(s: &'a str) -> IResult<&'a str, $out> { + $parser.parse(s) + } + }; + ($name:ident -> $out:ty, $parser:expr => $wrap:expr) => { + #[allow(clippy::needless_lifetimes)] + #[allow(mismatched_lifetime_syntaxes)] + fn $name<'a>(s: &'a str) -> IResult<&'a str, $out> { + let (rem, val) = $parser.parse(s)?; + Ok((rem, $wrap(val))) + } + }; + ($name:ident -> $out:ty, $parser:expr) => { + #[allow(clippy::needless_lifetimes)] + #[allow(mismatched_lifetime_syntaxes)] + fn $name<'a>(s: &'a str) -> IResult<&'a str, $out> { + $parser.parse(s) + } + }; +} + +#[macro_export] +macro_rules! 
/// compute the visual indent of the leading whitespace run of `s`.
/// spaces count 1, tabs count 8 (typical terminal default); the total
/// saturates at `u8::MAX` instead of overflowing.
///
/// counting stops at the first char that is neither a space nor a tab, so
/// passing a whole line yields only its indent — blanks between or after
/// words are not added to the result.
pub fn visual_indent(s: &str) -> u8 {
    s.chars()
        .map_while(|c| match c {
            ' ' => Some(1u8),
            '\t' => Some(8),
            // first non-blank char ends the leading run.
            _ => None,
        })
        .fold(0u8, u8::saturating_add)
}
/// walks the leading run of spaces and tabs in `s` and returns
/// `(byte index of the first char after that run, visual indent)`.
/// spaces count 1 and tabs count 8; the indent saturates at `u8::MAX`.
///
/// when `s` is empty or consists only of spaces/tabs there is no
/// non-blank char, so the index is `s.len()` — `&s[index..]` is then the
/// empty remainder rather than the whole string.
pub fn get_indent(s: &str) -> (usize, u8) {
    let mut indent = 0u8;
    for (i, c) in s.char_indices() {
        let width = match c {
            ' ' => 1,
            '\t' => 8,
            // first non-blank char: its byte offset ends the indent.
            _ => return (i, indent),
        };
        // saturate so very deep indentation cannot overflow the u8.
        indent = indent.saturating_add(width);
    }
    (s.len(), indent)
}
as desc, so we +// widen, then reject anything that doesn't pass the all-caps check. +make_parser!(spaced_uppercase_param -> Param<'a>, + preceded( + char(' '), + verify( + take_while1(|c: char| + c.is_ascii_alphabetic() || c.is_ascii_digit() || c == '_' || c == '-' + ), + |s: &str| { + let first = match s.chars().next() { Some(c) => c, None => return false }; + if !(first.is_ascii_uppercase() || first == '_') { return false; } + s.chars().all(|c| c.is_ascii_uppercase() || c.is_ascii_digit() || c == '_') + } + ) + ) => Param::Mandatory); + +make_parser!(spaced_angle_param -> Param<'a>, + preceded(char(' '), delimited(char('<'), take_till1(|c| c == '>'), char('>'))) => Param::Mandatory); + +make_parser!(spaced_opt_angle_param -> Param<'a>, + preceded(char(' '), delimited(char('<'), + delimited(char('['), take_while1(|c| c != ']'), char(']')), + char('>'))) => Param::Optional); + +make_parser!(spaced_angle_param_after_space -> Param<'a>, + preceded(space1, delimited(char('<'), take_till1(|c| c == '>'), char('>'))) => Param::Mandatory); + +// take the full lowercase token then verify it's <=10 chars. a +// take_while_m_n with a 10-char cap would leave a partial match — e.g. +// "--foo nanoseconds" would extract param "nanosecond" and leave "s" as +// the description. a word longer than 10 chars is almost certainly the +// start of the description, not a type annotation. +make_parser!(spaced_type_param -> Param<'a>, + preceded( + char(' '), + verify( + take_while1(|c: char| !c.is_whitespace()), + |s: &str| s.len() <= 10 && s.chars().all(|c| c.is_ascii_lowercase()) + ) + ) => Param::Mandatory +); + +make_parser!(pub param_parser -> Param<'a>, alt(( + eq_optional_angle_param, + eq_optional_param, + eq_mandatory_param, + spaced_opt_angle_param, + spaced_angle_param_after_space, + spaced_angle_param, + spaced_uppercase_param, + spaced_type_param, +))); + +macro_rules! 
switch_pair { + ($name:ident, $left:expr, $sep:expr, $right:expr => |$a:ident, $b:ident| $body:expr) => { + fn $name<'a>(s: &'a str) -> IResult<&'a str, Switch<'a>> { + use nom::sequence::separated_pair; + let (rem, ($a, $b)) = separated_pair($left, $sep, $right).parse(s)?; + Ok((rem, $body)) + } + }; +} + +switch_pair!(short_comma_long, + short_switch, comma, long_switch => |s, l| Switch::Both(s, l)); + +switch_pair!(short_comma_negatable_long, + short_switch, comma, negatable_long_switch => |s, l| Switch::Both(s, l)); + +switch_pair!(short_space_long, + short_switch, char(' '), long_switch => |s, l| Switch::Both(s, l)); + +switch_pair!(short_space_negatable_long, + short_switch, char(' '), negatable_long_switch => |s, l| Switch::Both(s, l)); + +make_parser!(slash_sep -> (), + value((), delimited(space0, char('/'), space0))); + +switch_pair!(long_slash_short, + long_switch, slash_sep, short_switch => |l, s| Switch::Both(s, l)); + +make_parser!(short_as_switch -> Switch<'a>, short_switch => Switch::Short); +make_parser!(negatable_long_as_switch -> Switch<'a>, negatable_long_switch => Switch::Long); +make_parser!(long_as_switch -> Switch<'a>, long_switch => Switch::Long); + +make_parser!(pub switch_parser -> Switch<'a>, + alt(( + short_comma_negatable_long, + short_space_negatable_long, + short_comma_long, + short_space_long, + long_slash_short, + short_as_switch, + negatable_long_as_switch, + long_as_switch, + )) +); + +// `{--long | -s}` — manpage SYNOPSIS-line switch pair. nix-env's +// synopsis is the canonical case: `[{--file | -f} path] [{--profile | +// -p} path]`. emits Switch::Both with the long name. 
+make_parser!(brace_pipe_long_short -> Switch<'a>, + separated_pair(long_switch, (space0, char('|'), space0), short_switch) + => |(l, s): (&'a str, char)| Switch::Both(s, l) +); + +make_parser!(brace_pipe_short_long -> Switch<'a>, + separated_pair(short_switch, (space0, char('|'), space0), long_switch) + => |(s, l): (char, &'a str)| Switch::Both(s, l) +); + +make_parser!(brace_pipe_switch -> Switch<'a>, + delimited( + (char('{'), space0), + alt((brace_pipe_long_short, brace_pipe_short_long)), + (space0, char('}')) + ) +); + +make_parser!(usage_switch_parser -> Switch<'a>, + alt((brace_pipe_switch, switch_parser)) +); + +// consume any chars except `]`. used to swallow trailing tokens inside a +// flag bracket — e.g. `[--option name value]` keeps switch=Long("option") +// and param=Mandatory("name"), discarding ` value` before the closing `]`. +make_parser!(take_till_bracket -> &'a str, take_till(|c: char| c == ']')); + +// `[ [param] ]` inside the SYNOPSIS line. +make_parser!(flag_in_bracket -> (Switch<'a>, Option>), + delimited( + (char('['), space0), + (usage_switch_parser, opt(param_parser)), + (take_till_bracket, char(']')) + ) +); + +// walk the joined SYNOPSIS-line text, collecting every flag-bracketed +// switch + its first param. non-flag tokens (positional brackets, +// command name, ellipses) are skipped one char at a time. +make_parser!(pub parse_usage_flags -> Vec<(Switch<'a>, Option>)>, + many0(alt(( + map(flag_in_bracket, Some), + // `value(None, ...)` requires `None: Clone` which forces Clone + // on Switch/Param; `map(..., |_| None)` doesn't. 
+ map(satisfy(|c| c != '\n' && c != '\r'), |_| None), + ))) + => |v: Vec, Option>)>>| + v.into_iter().flatten().collect() +); diff --git a/src/parsers/help/positionals.rs b/src/parsers/help/positionals.rs new file mode 100644 index 0000000..5c137c2 --- /dev/null +++ b/src/parsers/help/positionals.rs @@ -0,0 +1,400 @@ +use crate::parsers::help::helpers::rest_of_line; +use crate::types::Positional; +use crate::{make_parser, make_predicate}; +use nom::branch::alt; +use nom::bytes::complete::{tag, tag_no_case, take_till, take_till1, take_while, take_while1}; +use nom::character::complete::{char, line_ending, satisfy, space0, space1}; +use nom::combinator::{map, not, opt, peek, recognize, value, verify}; +use nom::multi::many0; +use nom::sequence::{delimited, preceded, terminated}; +use nom::{AsChar, IResult, Parser}; + +#[derive(Clone)] +enum PositionalParse<'a> { + Curly, + Flag, + Skip, + Mandatory(&'a str), + Optional(&'a str), + ManVariadic(&'a str), + OptVariadic(&'a str), +} + +make_predicate!(is_word_char, |c| c.is_alphanumeric() + || matches!(c, '-' | '_' | '/' | '.')); + +make_predicate!(is_pos_char, |c| c.is_ascii_uppercase() + || c.is_numeric() + || matches!(c, '_' | '-')); + +make_parser!(section_label -> (), + value((), alt(( + tag_no_case("options"), + tag_no_case("option"), + tag_no_case("flags"), + tag_no_case("flag") + ))) +); + +make_parser!(ellipses -> (), + value((), + alt((tag("..."), tag("\u{2026}"))) + ) +); + +make_parser!(braces -> PositionalParse<'a>, + value(PositionalParse::Curly, delimited(char('{'), take_till1(|c| c == '}'), char('}'))) +); + +// FIXME should this be a take_while is_option_char? 
+// why tf do we have a ']' condition +make_parser!(flag -> PositionalParse<'a>, + value(PositionalParse::Flag, preceded(char('-'), take_till1(|c: char| c.is_space() || c == ']'))) +); + +fn check_positional(s: &str) -> bool { + let s = s.trim(); + if s.is_empty() { + return false; + } + // reject names starting with '-' — these are flag tokens accidentally + // captured by the bracket parser, e.g. "[--at-operation]" in jj's + // synopsis. without this guard every `[--flag]` token would be + // recorded as a positional named "--flag". + if s.starts_with('-') { + return false; + } + if section_label.parse(s).is_ok() { + return false; + } + let upper = s.to_ascii_uppercase(); + if matches!(upper.as_str(), "OPTIONS" | "OPTION" | "FLAGS" | "FLAG") { + return false; + } + s.chars() + .all(|c| c.is_alphanumeric() || matches!(c, '-' | '_' | '/' | '.')) +} + +// recognize a balanced `[...]` block, tolerating ONE level of nested +// brackets inside. expressed entirely via nom combinators: +// +// `[` + many0(alt((nested_bracket_block, non_bracket_char))) + `]` +// +// nested_bracket_block is `[ chars_until_] ]`, which means we accept a +// single inner `[...]` correctly but not arbitrarily-deep nesting — +// manpages don't go deeper than two levels. +// returns the inner content (everything between the outer brackets). +make_parser!(balanced_bracket_inner -> &'a str, + recognize(delimited( + char('['), + many0(alt(( + recognize((char('['), take_till(|c: char| c == ']'), char(']'))), + recognize(satisfy(|c: char| c != ']' && c != '[')), + ))), + char(']'), + )) + => |whole: &'a str| &whole[1..whole.len() - 1] +); + +/// extract a positional name from already-trimmed bracket-inner content. +/// returns the name slice and a flag indicating whether the bracket inner +/// carried a trailing `...` (in-bracket variadic marker). +fn parse_bracket_inner_name(inner: &str) -> Option<(&str, bool)> { + let inner = inner.trim(); + // strip trailing "..." for in-bracket variadic. 
+ let (rest, has_dots) = if let Some(stripped) = inner.strip_suffix("...") { + (stripped.trim_end(), true) + } else if let Some(stripped) = inner.strip_suffix('\u{2026}') { + (stripped.trim_end(), true) + } else { + (inner, false) + }; + if rest.starts_with('[') { + let mut found = None; + let mut remaining = rest; + while let Some(start) = remaining.find('[') { + let after_start = &remaining[start + 1..]; + let Some(end) = after_start.find(']') else { + break; + }; + let nested = &after_start[..end]; + if let Some((nested_name, nested_dots)) = parse_bracket_inner_name(nested) + && check_positional(nested_name) + { + found = Some((nested_name, has_dots || nested_dots)); + } + remaining = &after_start[end + 1..]; + } + return found; + } + let name = if let Some(after_lt) = rest.strip_prefix('<') { + // angle-bracket name: take everything up to the matching '>' + let end = after_lt.find('>')?; + let inner = after_lt[..end].trim(); + let (inner, inner_dots) = if let Some(stripped) = inner.strip_suffix("...") { + (stripped.trim_end(), true) + } else if let Some(stripped) = inner.strip_suffix('\u{2026}') { + (stripped.trim_end(), true) + } else { + (inner, false) + }; + return Some((inner, has_dots || inner_dots)); + } else { + // bare name: take leading word + let end = rest + .find(|c: char| c.is_whitespace() || c == '[' || c == ']') + .unwrap_or(rest.len()); + if end == 0 { + return None; + } + &rest[..end] + }; + Some((name, has_dots)) +} + +// extract a balanced `[...]` block and decompose its inner content into +// (name, has-inner-`...` flag). `map_opt` turns a `None` from +// `parse_bracket_inner_name` into a nom parse error. +make_parser!(opt_bracket_name -> (&'a str, bool), + nom::combinator::map_opt(balanced_bracket_inner, parse_bracket_inner_name) +); + +make_parser!( + opt_positional -> PositionalParse<'a>, + verify( + // tuple parser: (name + in-bracket variadic, post-bracket ellipsis). 
+ // matches "[name]", "[name...]", "[name ...]", "[name] ...", + // "[]", and one-level nests like "[ [...]]". + (opt_bracket_name, opt(ellipses)), + |((name, _), _): &((&'a str, bool), Option<()>)| check_positional(name) + ) => |((name, has_inner_dots), post_dots): ((&'a str, bool), Option<()>)| { + if has_inner_dots || post_dots.is_some() { + PositionalParse::OptVariadic(name) + } else { + PositionalParse::Optional(name) + } + } +); + +make_parser!(man_positional -> PositionalParse<'a>, + verify( + ( + delimited( + char('<'), + ( + take_till1(|c| c == '.' || c == '\u{2026}' || c == '>'), + opt(ellipses) + ), + char('>') + ), + opt(ellipses) + ), + |((ss, _), _)| check_positional(ss) + ) => |((p, v), v1): ((&'a str, Option<()>), Option<()>)| + if v.is_some() || v1.is_some() { PositionalParse::ManVariadic(p) } + else { PositionalParse::Mandatory(p) } +); + +make_parser!(allcaps_positional -> PositionalParse<'a>, + verify( + ( + preceded( + peek( + satisfy(|c: char| c.is_ascii_uppercase()) + ), + take_while1(is_pos_char) + ), + opt( + alt(( + tag("..."), + tag("\u{2026}")) + ) + ) + ), + |(ss, _): &(&str, _)| check_positional(ss) + ) => |(p, v): (&'a str, Option<&'a str>)| + if v.is_some() { PositionalParse::ManVariadic(p) } else { PositionalParse::Mandatory(p) } +); + +fn caseless_push<'a>(k: &'a str, v: Positional, acc: &mut Vec<(&'a str, Positional)>) { + let dupe = acc.iter().any(|(ik, _)| ik.eq_ignore_ascii_case(k)); + if !dupe { + acc.push((k, v)); + } +} + +// parse_usage_args runs on a single logical usage line. SKIP refuses to +// cross a newline boundary so many0 stops at end-of-line — without this +// the parser would happily wander into the OPTIONS section and treat +// every `--flag ` angle-bracket parameter as a positional. +// +// the inner positional terminator uses peek(line_ending) instead of +// consuming the newline, so the trailing `opt(line_ending)` in the +// outer delimited eats it cleanly and we never advance past the usage +// line. 
+make_parser!(pub parse_usage_args -> Vec<(&'a str, Positional)>, + (delimited( + space0, + many0( + alt(( + map( + ( + terminated( + alt(( + braces, + opt_positional, + value(PositionalParse::Skip, balanced_bracket_inner), + man_positional, + flag, + allcaps_positional, + )), + alt(( + space1, + value("", peek(line_ending)), + value("", peek(nom::combinator::eof)), + )) + ), + // catch "[section] ..." patterns where the ellipsis is + // on the *next* token, separated by whitespace. + opt(terminated( + alt((tag("..."), tag("\u{2026}"))), + alt(( + space1, + value("", peek(line_ending)), + value("", peek(nom::combinator::eof)), + )) + )) + ), + |(positional, trailing): (PositionalParse<'a>, Option<_>)| { + if trailing.is_none() { positional } + else { + match positional { + PositionalParse::Optional(n) => PositionalParse::OptVariadic(n), + PositionalParse::Mandatory(n) => PositionalParse::ManVariadic(n), + other => other, + } + } + } + ), + // SKIP must NOT consume a newline. without this, many0 keeps + // iterating past the usage line into OPTIONS-section flag + // syntax and over-extracts positionals. 
+ value(PositionalParse::Skip, satisfy(|c: char| c != '\n' && c != '\r')), + )) + ), + opt((space0, line_ending)) + )) => |p: Vec>| + p.into_iter().fold(Vec::new(), |mut acc, parse| + { + match parse { + PositionalParse::Curly => (), + PositionalParse::Flag => (), + PositionalParse::Skip => (), + PositionalParse::OptVariadic(arg) => caseless_push(arg, Positional { + optional: true, + variadic: true + }, &mut acc), + PositionalParse::ManVariadic(arg) => caseless_push(arg, Positional { + optional: false, + variadic: true + }, &mut acc), + PositionalParse::Optional(arg) => caseless_push(arg, Positional { + optional: true, + variadic: false, + }, &mut acc), + PositionalParse::Mandatory(arg) => caseless_push(arg, Positional { + optional: false, + variadic: false + }, &mut acc), + } + acc + }) +); + +make_parser!(pub skip_command_name -> (), + value((), preceded(space0, + many0( + ( + verify( + preceded(not(char('-')), take_while1(is_word_char)), + |ss: &str| ss.chars().any(|c: char| c.is_ascii_lowercase()) + ), + space1 + ) + ) + )) +); + +make_parser!(find_usage_line -> (), + value((), preceded( + space0, + terminated( + tag_no_case("usage"), + // accept any of: + // "Usage:" — inline form with colon + // "Usage args" — inline form, space follows the word + // "USAGE\n cmd args" — clap-style header on its own line + alt( + ( + value((), char(':')), + value((), peek(line_ending)), + value((), peek(satisfy(|c: char| c == ' ' || c == '\t'))), + ) + ) + ) + )) +); + +make_parser!(pub extract_usage_positionals -> Vec<(&'a str, Positional)>, + preceded( + many0(preceded(not(find_usage_line), (rest_of_line, line_ending))), + preceded( + (find_usage_line, space0, opt(line_ending), space0, skip_command_name), + parse_usage_args + ) + ) +); + +make_predicate!(is_cli11_name_char, |c| c.is_alphanumeric() + || matches!(c, '_' | '-')); + +make_parser!(cli11_section_header -> (), + value((), + delimited( + space0, + alt((tag("POSITIONALS:"), tag("Positionals:"))), + (rest_of_line, 
/// placeholder-body predicate: true for any alphanumeric char and for the
/// punctuation `_`, `-`, `.`, `|` and `,`.
fn is_placeholder(c: char) -> bool {
    c.is_alphanumeric() || matches!(c, '_' | '-' | '.' | '|' | ',')
}
| '|' | ',') +} + +make_parser!( + skip_arg_placeholders -> (), + value( + (), + many0(preceded( + // peek ahead one char (don't consume) so the per-branch parser can + // see the full token. needed because the bare ALL_CAPS branch must + // verify the *entire* token before deciding to consume. + char(' '), + alt(( + // <...> bracketed placeholder + delimited(char('<'), take_while1(is_placeholder), char('>')), + // [...] optional bracketed placeholder + delimited(char('['), take_while1(is_placeholder), char(']')), + // bare ALL_CAPS placeholder — first char must be uppercase or + // a digit (allows e.g. "N", "M2"), and the whole token must + // be uppercase-friendly. rejects "NixOS"-style mixed-case so + // descriptions don't get swallowed. + verify( + take_while1(is_bare_placeholder_char), + |s: &str| { + let first = s.chars().next().unwrap(); + first.is_ascii_uppercase() || first.is_ascii_digit() + } + ), + )), + )), + ) +); + +// parse a subcommand entry: leading whitespace, then a name (2+ option +// chars, not starting with '-'), optional argument placeholders, exactly +// two spaces, optional padding, then the description text and eol. +make_parser!(pub subcommand_entry -> Subcommand<'a>, + ( + preceded( + space0, + verify( + preceded(not(char('-')), take_while1(is_option_char)), + |n: &str| n.len() >= 2, + ), + ), + skip_arg_placeholders, + tag(" "), + space0, + terminated(take_till(|c: char| c.is_newline()), eol), + ) => |(name, _, _, _, desc): (&'a str, _, _, _, &'a str)| { + // some help formats prefix desc with "- " (manpage-style); strip it. + let d = desc.trim_start(); + let desc = d.strip_prefix("- ").map(|s| s.trim_start()).unwrap_or(d); + Subcommand { name, desc } + } +); diff --git a/src/parsers/manpage.rs b/src/parsers/manpage.rs new file mode 100644 index 0000000..651598b --- /dev/null +++ b/src/parsers/manpage.rs @@ -0,0 +1,335 @@ +//! parse unix manpages (groff/mdoc format) into a structured result. +//! +//! 
manpages are written in roff/groff markup — a decades-old typesetting language +//! used by man(1). this module strips the formatting and extracts structured data +//! (flags, subcommands, positionals) from the raw groff source. +//! +//! there are two major manpage macro packages: +//! - man (groff) — used by gnu/linux tools. uses macros like .SH, .TP, .IP, .PP +//! - mdoc (bsd) — used by bsd tools. uses .Sh, .Fl, .Ar, .Op, .It, .Bl/.El +//! +//! this module handles both, auto-detecting the format by checking for .Sh macros. +//! +//! for groff manpages, flag extraction uses multiple "strategies" that target +//! different common formatting patterns: +//! - strategy_tp: .TP tagged paragraphs (gnu coreutils, help2man) +//! - strategy_ip: .IP indented paragraphs (curl, hand-written) +//! - strategy_pp_rs: .PP + .RS/.RE blocks (git, docbook) +//! - strategy_nix: nix3-style bullet .IP with .UR/.UE hyperlinks +//! - strategy_deroff: fallback — strip all groff, feed to help text parser +//! +//! the module tries all applicable strategies and picks the one that extracts +//! the most flag entries, on the theory that more results = better match. 
+ +mod commands; +mod groff; +mod mdoc; +mod sections; +mod strategies; + +use std::io::{self, Read}; +use std::path::Path; + +use crate::types::{HelpResult, OptionEntry, Param, Positional, Subcommand, Switch}; + +pub use self::groff::{GroffLine, classify_line, strip_groff_escapes}; +pub use self::sections::{extract_subcommand_sections, extract_synopsis_command}; + +#[derive(Debug, Clone, PartialEq, Eq)] +pub enum OwnedSwitch { + Short(char), + Long(String), + Both(char, String), +} + +#[derive(Debug, Clone, PartialEq, Eq)] +pub enum OwnedParam { + Mandatory(String), + Optional(String), +} + +#[derive(Debug, Clone)] +pub struct ManpageEntry { + pub switch: OwnedSwitch, + pub param: Option, + pub desc: String, +} + +#[derive(Debug, Clone)] +pub struct ManpageSubcommand { + pub name: String, + pub desc: String, +} + +#[derive(Debug, Clone, Default)] +pub struct ManpageResult { + pub entries: Vec, + pub subcommands: Vec, + pub positionals: Vec<(String, Positional)>, + pub description: String, +} + +impl From<&Switch<'_>> for OwnedSwitch { + fn from(s: &Switch<'_>) -> Self { + match s { + Switch::Short(c) => OwnedSwitch::Short(*c), + Switch::Long(l) => OwnedSwitch::Long((*l).to_string()), + Switch::Both(c, l) => OwnedSwitch::Both(*c, (*l).to_string()), + } + } +} + +impl From<&Param<'_>> for OwnedParam { + fn from(p: &Param<'_>) -> Self { + match p { + Param::Mandatory(s) => OwnedParam::Mandatory((*s).to_string()), + Param::Optional(s) => OwnedParam::Optional((*s).to_string()), + } + } +} + +impl From<&OptionEntry<'_>> for ManpageEntry { + fn from(e: &OptionEntry<'_>) -> Self { + let desc: String = e + .desc + .iter() + .map(|s| s.trim()) + .filter(|s| !s.is_empty()) + .collect::>() + .join(" "); + ManpageEntry { + switch: (&e.switch).into(), + param: e.param.as_ref().map(Into::into), + desc, + } + } +} + +impl From<&Subcommand<'_>> for ManpageSubcommand { + fn from(sc: &Subcommand<'_>) -> Self { + // lowercase the subcommand name here so (a) file naming is + // 
consistent (meat_yum.json vs meat_YUM.json) and (b) recursive + // --help probes use the lowercase form, which is what most real + // CLIs accept — even tools like meat that DISPLAY uppercase + // names in their help text dispatch on the lowercased argument. + ManpageSubcommand { + name: sc.name.to_ascii_lowercase(), + desc: sc.desc.to_string(), + } + } +} + +impl From<&HelpResult<'_>> for ManpageResult { + fn from(r: &HelpResult<'_>) -> Self { + ManpageResult { + entries: r.entries.iter().map(Into::into).collect(), + subcommands: r.subcommands.iter().map(Into::into).collect(), + // positional names are stored lowercased so output is + // stable across the various places we extract them from + // (synopsis, usage, cli11 sections). + positionals: r + .positionals + .iter() + .map(|(k, v)| (k.to_ascii_lowercase(), v.clone())) + .collect(), + description: r.desc.to_string(), + } + } +} + +/// parse a manpage from its classified lines. +/// auto-detects mdoc vs groff format. for groff, runs the multi-strategy +/// extraction pipeline. +pub fn parse_manpage_lines(lines: &[GroffLine]) -> ManpageResult { + if mdoc::is_mdoc(lines) { + mdoc::parse_mdoc_lines(lines) + } else { + let options_section = sections::extract_options_section(lines); + let mut entries = strategies::extract_entries(&options_section); + // merge SYNOPSIS-only flags (nix-env's `[{--profile | -p} path]` + // pattern, where the flag is declared in the synopsis but never + // listed as an entry in the OPTIONS body). body entries take + // precedence on duplicate names — they carry the descriptions. 
+ let synopsis_flags = sections::extract_synopsis_flags(lines); + if !synopsis_flags.is_empty() { + let have_long: std::collections::HashSet = entries + .iter() + .filter_map(|e| match &e.switch { + OwnedSwitch::Long(l) | OwnedSwitch::Both(_, l) => Some(l.to_ascii_lowercase()), + _ => None, + }) + .collect(); + let have_short: std::collections::HashSet = entries + .iter() + .filter_map(|e| match &e.switch { + OwnedSwitch::Short(c) | OwnedSwitch::Both(c, _) => Some(*c), + _ => None, + }) + .collect(); + for e in synopsis_flags { + let dup = match &e.switch { + OwnedSwitch::Long(l) => have_long.contains(&l.to_ascii_lowercase()), + OwnedSwitch::Short(c) => have_short.contains(c), + OwnedSwitch::Both(c, l) => { + have_short.contains(c) || have_long.contains(&l.to_ascii_lowercase()) + } + }; + if !dup { + entries.push(e); + } + } + } + let positionals = sections::extract_synopsis_positionals(lines); + let commands_section = sections::extract_commands_section(lines); + let mut subcommands = commands::extract_subcommands_from_commands(&commands_section); + for positional in sections::extract_description_positionals(lines) { + if !subcommands + .iter() + .any(|sc| sc.name.eq_ignore_ascii_case(&positional.name)) + { + subcommands.push(positional); + } + } + ManpageResult { + entries, + subcommands, + positionals, + description: String::new(), + } + } +} + +/// parse a manpage from its raw string contents. +/// splits into lines, parses, then extracts the NAME section description. +pub fn parse_manpage_string(contents: &str) -> ManpageResult { + let lines: Vec = contents.split('\n').map(classify_line).collect(); + let mut result = parse_manpage_lines(&lines); + if let Some(desc) = sections::extract_name_description(&lines) { + result.description = desc; + } + result +} + +/// parse a manpage and also pull out clap-style `.SH SUBCOMMAND` sections +/// as separate per-subcommand results. 
each subcommand section in a +/// clap-generated manpage is its own command with its own flags; the +/// parent's subcommand list is populated from their names. +/// +/// returns (main_result, sub_results) where each sub_result has +/// name=full_command ("nh os"), desc, and its own ManpageResult. +pub fn parse_manpage_with_subs(contents: &str) -> (ManpageResult, Vec<(String, ManpageResult)>) { + let lines: Vec = contents.split('\n').map(classify_line).collect(); + let mut result = parse_manpage_lines(&lines); + if let Some(desc) = sections::extract_name_description(&lines) { + result.description = desc; + } + let sub_sections = sections::extract_subcommand_sections(&lines); + if !sub_sections.is_empty() { + // overwrite subcommands with the SUBCOMMAND-section names — + // these are the authoritative list for clap-generated manpages. + result.subcommands = sub_sections + .iter() + .map(|(name, desc, _)| ManpageSubcommand { + name: name.to_ascii_lowercase(), + desc: desc.clone(), + }) + .collect(); + } + // each SUBCOMMAND section body is parsed via the same strategy-picker + // as the top-level OPTIONS section — clap puts flag definitions + // directly under the .SH SUBCOMMAND header with no inner .SH wrapping, + // so parse_manpage_lines (which looks for a child OPTIONS section) + // would come back empty. + let subs: Vec<(String, ManpageResult)> = sub_sections + .into_iter() + .map(|(name, desc, lines)| { + let entries = strategies::extract_entries(&lines); + let sub_result = ManpageResult { + entries, + subcommands: Vec::new(), + positionals: Default::default(), + description: desc, + }; + (name, sub_result) + }) + .collect(); + (result, subs) +} + +/// read a manpage file from disk. handles .gz compressed files (the common +/// case — most installed manpages are gzipped). plain text files are read directly. 
+pub fn read_manpage_file>(path: P) -> io::Result { + let path = path.as_ref(); + let bytes = std::fs::read(path)?; + if path.extension().and_then(|e| e.to_str()) == Some("gz") { + let mut decoder = flate2::read::GzDecoder::new(&bytes[..]); + let mut out = String::new(); + decoder.read_to_string(&mut out)?; + Ok(out) + } else { + String::from_utf8(bytes).map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e)) + } +} + +/// read + parse a manpage file in one step. +pub fn parse_manpage_file>(path: P) -> io::Result { + let contents = read_manpage_file(path)?; + Ok(parse_manpage_string(&contents)) +} + +#[cfg(test)] +mod tests { + use super::*; + + const TP_MANPAGE: &str = r#".TH FOO 1 "2024" "1.0" "User Commands" +.SH NAME +foo \- a synthetic test command +.SH SYNOPSIS +.B foo +[\fIOPTIONS\fR] [output] +.SH OPTIONS +.TP +\fB\-v\fR, \fB\-\-verbose\fR +increase output verbosity +.TP +\fB\-o\fR \fIFILE\fR, \fB\-\-output\fR=\fIFILE\fR +write to FILE +.TP +\fB\-h\fR, \fB\-\-help\fR +show this help and exit +"#; + + #[test] + fn tp_strategy_extracts_flags() { + let r = parse_manpage_string(TP_MANPAGE); + assert_eq!( + r.entries.len(), + 3, + "expected 3 entries, got {:?}", + r.entries + ); + assert_eq!(r.description, "a synthetic test command"); + assert!(matches!( + r.entries[0].switch, + OwnedSwitch::Both('v', ref l) if l == "verbose" + )); + assert!(matches!( + r.entries[2].switch, + OwnedSwitch::Both('h', ref l) if l == "help" + )); + assert!(r.entries[0].desc.contains("verbosity")); + } + + #[test] + fn mdoc_format_detected() { + let src = ".Sh NAME\n.Nm test\n.Nd a test\n.Sh DESCRIPTION\nstuff\n"; + let lines: Vec = src.split('\n').map(classify_line).collect(); + assert!(mdoc::is_mdoc(&lines)); + } + + #[test] + fn groff_escapes_stripped() { + let stripped = groff::strip_groff_escapes("\\fB\\-v\\fR \\fIfile\\fR"); + assert_eq!(stripped.trim(), "-v file"); + } +} diff --git a/src/parsers/manpage/commands.rs b/src/parsers/manpage/commands.rs new file mode 100644 
index 0000000..942de34 --- /dev/null +++ b/src/parsers/manpage/commands.rs @@ -0,0 +1,157 @@ +//! COMMANDS section subcommand extraction. +//! +//! some manpages (notably systemctl) have a dedicated COMMANDS section +//! listing subcommands with descriptions. these use .PP + bold name + +//! .RS/.RE blocks: +//! .PP +//! \fBstart\fR \fIUNIT\fR... +//! .RS 4 +//! Start (activate) one or more units. +//! .RE + +use crate::parsers::manpage::ManpageSubcommand; +use crate::parsers::manpage::groff::{GroffLine, strip_groff_escapes, strip_inline_macro_args}; + +/// validate that the extracted name looks like a subcommand: lowercase, +/// at least 2 chars, no leading dash. +fn is_valid_subcmd(name: &str) -> bool { + name.len() >= 2 + && !name.starts_with('-') + && name + .chars() + .all(|c| c.is_ascii_lowercase() || c.is_ascii_digit() || c == '-' || c == '_') +} + +/// extract subcommand name from a bold groff text like +/// "\fBlist\-units\fR [\fIPATTERN\fR...]" -> "list-units" +fn extract_bold_command_name(text: &str) -> Option { + let trimmed = text.trim(); + if trimmed.len() >= 4 && trimmed.starts_with("\\fB") { + // look for \fB...\fR at the start: find the next '\\' and take + // the segment between \fB and there. 
+ let after = &trimmed[3..]; + let segment_end = after.find('\\').unwrap_or(after.len()); + let name_part = &after[..segment_end]; + let reconstructed = format!("\\fB{name_part}\\fR"); + let name = normalize_command_token(strip_groff_escapes(&reconstructed).trim()); + if is_valid_subcmd(&name) { + return Some(name); + } + return None; + } + // fallback: take the first whitespace-delimited word of the stripped text + let stripped = strip_groff_escapes(trimmed); + let first_word = stripped.split_whitespace().next().unwrap_or(""); + let name = normalize_command_token(first_word); + if is_valid_subcmd(&name) { + Some(name) + } else { + None + } +} + +fn normalize_command_token(token: &str) -> String { + let token = token.trim(); + let token = token + .find('(') + .map(|idx| &token[..idx]) + .unwrap_or(token) + .trim_end_matches(','); + token.to_string() +} + +fn extract_command_name_from_line(line: &GroffLine) -> Option { + match line { + GroffLine::Text(tag) => extract_bold_command_name(tag), + GroffLine::Macro { name, args } + if matches!( + name.as_str(), + "B" | "BI" | "BR" | "I" | "IR" | "IB" | "RB" | "RI" + ) => + { + let rendered = strip_groff_escapes(&strip_inline_macro_args(args)); + extract_bold_command_name(&rendered) + } + _ => None, + } +} + +/// walk through commands section lines, extracting subcommand name+description +/// pairs from .PP + Text + .RS/.RE blocks. +pub fn extract_subcommands_from_commands(lines: &[GroffLine]) -> Vec { + let mut out = Vec::new(); + let mut i = 0; + while i < lines.len() { + if let GroffLine::Macro { name, .. 
} = &lines[i] + && name == "PP" + { + i += 1; + if i >= lines.len() { + continue; + } + if let Some(name) = extract_command_name_from_line(&lines[i]) { + let (desc, new_i) = collect_subcmd_desc(lines, i + 1); + let short_desc = first_sentence(&desc); + out.push(ManpageSubcommand { + name: name.to_ascii_lowercase(), + desc: short_desc, + }); + i = new_i; + continue; + } else { + i += 1; + } + } else { + i += 1; + } + } + out +} + +/// collect the description for a subcommand entry. handles .RS/.RE blocks +/// and stops at the next .PP/.SH/.SS boundary. +fn collect_subcmd_desc(lines: &[GroffLine], start: usize) -> (String, usize) { + let mut acc: Vec = Vec::new(); + let mut i = start; + while i < lines.len() { + match &lines[i] { + GroffLine::Macro { name, .. } if name == "RS" => { + i += 1; + // inside .RS — collect until .RE or boundary + while i < lines.len() { + match &lines[i] { + GroffLine::Macro { name, .. } if name == "RE" => { + return (acc.join(" "), i + 1); + } + GroffLine::Text(t) => { + acc.push(t.clone()); + i += 1; + } + GroffLine::Macro { name, .. } + if name == "PP" || name == "SH" || name == "SS" => + { + return (acc.join(" "), i); + } + _ => i += 1, + } + } + return (acc.join(" "), i); + } + GroffLine::Text(t) => { + acc.push(t.clone()); + i += 1; + } + _ => return (acc.join(" "), i), + } + } + (acc.join(" "), i) +} + +/// take the first sentence (up to '.') as the description. +fn first_sentence(s: &str) -> String { + let s = s.trim(); + match s.find('.') { + Some(idx) if idx > 0 => s[..idx].trim().to_string(), + _ => s.to_string(), + } +} diff --git a/src/parsers/manpage/groff.rs b/src/parsers/manpage/groff.rs new file mode 100644 index 0000000..4196fac --- /dev/null +++ b/src/parsers/manpage/groff.rs @@ -0,0 +1,385 @@ +//! groff escape/formatting stripping and line classification. +//! +//! groff escapes start with backslash and use various continuation syntaxes. +//! we strip them, replacing named characters (like \(aq for apostrophe) with +//! 
their text equivalents and discarding formatting directives. +//! +//! also exports `make_macro_walker!`, the manpage-side analogue of the +//! help parser's `make_parser!`. all of our strategy_* functions are +//! "scan lines, on each .MACRO_NAME run a handler, advance, accumulate" +//! — this macro factors out the loop scaffolding so each strategy reduces +//! to its specific extraction logic. + +/// walk a `&[GroffLine]` slice, and on each macro whose name matches +/// `$mname`, invoke the body with `(lines, i, args)` where: +/// - `lines` is the full slice (for slicing further bodies) +/// - `i` is the current index of the matched macro +/// - `args` is the macro's argument string (by reference) +/// +/// the body returns `Option<(T, usize)>`. `Some((value, new_i))` pushes +/// `value` and advances the cursor to `new_i` (typically computed as +/// `lines.len() - rest.len()` after `collect_text_lines`). `None` +/// advances by one line and keeps scanning. +/// +/// matches the help-parser pattern `make_parser!(name -> T, parser => wrap)`: +/// the macro hides the loop scaffolding, the handler expresses the actual +/// extraction logic. +#[macro_export] +macro_rules! make_macro_walker { + (pub $name:ident -> Vec<$t:ty>, on macro $mname:expr => + |$lines:ident, $i:ident, $args:ident| $body:expr) => { + pub fn $name(lines_input: &[$crate::parsers::manpage::GroffLine]) -> Vec<$t> { + let mut out = Vec::new(); + let mut cursor = 0; + let $lines: &[$crate::parsers::manpage::GroffLine] = lines_input; + while cursor < $lines.len() { + if let $crate::parsers::manpage::GroffLine::Macro { + name: macro_name, + args: $args, + } = &$lines[cursor] + { + if macro_name == $mname { + let $i = cursor; + // wrap the handler body in an IIFE so an early + // `return None` inside the handler returns from the + // closure, not from the surrounding strategy function. 
+ #[allow(clippy::redundant_closure_call)] + let result: Option<($t, usize)> = (|| $body)(); + if let Some((value, new_i)) = result { + out.push(value); + cursor = new_i; + continue; + } + } + } + cursor += 1; + } + out + } + }; +} + +/// every line in a manpage is classified as one of four types. +/// this classification drives all subsequent parsing — strategies +/// pattern-match on sequences of classified lines. +#[derive(Debug, Clone, PartialEq, Eq)] +pub enum GroffLine { + /// macro name + args, e.g. ("SH", "OPTIONS") or ("TP", "") + Macro { name: String, args: String }, + /// plain text after groff stripping + Text(String), + /// empty line + Blank, + /// groff comment: .backslash-quote or backslash-quote + Comment, +} + +/// translate a groff named character escape to its text equivalent. +/// groff uses two-letter codes like "aq" for apostrophe, "lq"/"rq" for +/// left/right quotes, "em"/"en" for dashes. +fn named_char_of(name: &str) -> Option { + match name { + "aq" => Some('\''), + "lq" | "Lq" | "rq" | "Rq" => Some('"'), + "em" | "en" => Some('-'), + _ => None, + } +} + +fn is_alnum(c: u8) -> bool { + c.is_ascii_alphanumeric() +} + +/// strip groff escape sequences, replacing named characters with text +/// equivalents and discarding formatting directives. +pub fn strip_groff_escapes(source: &str) -> String { + let bytes = source.as_bytes(); + let len = bytes.len(); + let mut buffer = String::with_capacity(len); + let mut pos = 0; + let mut prev_char: u8 = 0; + + while pos < len { + if bytes[pos] == b'\\' && pos + 1 < len { + let next = bytes[pos + 1]; + match next { + b'f' => { + // font escape: \fB, \fI, \fP, \fR, \f(XX, \f[...] + if pos + 2 < len { + let font_char = bytes[pos + 2]; + // insert space before italic font to preserve word boundaries + // e.g. 
\fB--max-results\fR\fIcount\fR -> "--max-results count" + if font_char == b'I' && is_alnum(prev_char) { + buffer.push(' '); + prev_char = b' '; + } + if font_char == b'(' { + pos += 5; // \f(XX — two-character font name + } else if font_char == b'[' { + pos += 3; + skip_to_byte(bytes, len, &mut pos, b']'); + if pos < len { + pos += 1; + } + } else { + pos += 3; // \fX — single-character font selector + } + } else { + pos += 2; + } + } + b'-' => { + // escaped hyphen-minus — emit a plain hyphen + buffer.push('-'); + prev_char = b'-'; + pos += 2; + } + b'&' | b'/' | b',' => { + // zero-width characters — discard without output + pos += 2; + } + b'(' => { + // two-char named character: \(aq, \(lq, \(rq, etc. + if pos + 3 < len { + let name = &source[pos + 2..pos + 4]; + if let Some(c) = named_char_of(name) { + buffer.push(c); + prev_char = c as u8; + } + pos += 4; + } else { + pos += 2; + } + } + b'[' => { + // bracketed named character: \[aq], \[lq], etc. + pos += 2; + let start = pos; + skip_to_byte(bytes, len, &mut pos, b']'); + if pos < len { + let name = &source[start..pos]; + if let Some(c) = named_char_of(name) { + buffer.push(c); + prev_char = c as u8; + } + pos += 1; + } + } + b's' => { + // size escape: \sN, \s+N, \s-N — skip the numeric argument + pos += 2; + if pos < len && (bytes[pos] == b'+' || bytes[pos] == b'-') { + pos += 1; + } + if pos < len && bytes[pos].is_ascii_digit() { + pos += 1; + } + if pos < len && bytes[pos].is_ascii_digit() { + pos += 1; + } + } + b'm' => { + // color escape: \m[...] — skip the bracketed color name + pos += 2; + if pos < len && bytes[pos] == b'[' { + pos += 1; + skip_to_byte(bytes, len, &mut pos, b']'); + if pos < len { + pos += 1; + } + } + } + b'X' => { + // device control: \X'...' — skip the single-quoted payload + pos += 2; + if pos < len && bytes[pos] == b'\'' { + pos += 1; + skip_to_byte(bytes, len, &mut pos, b'\''); + if pos < len { + pos += 1; + } + } + } + b'*' => { + // string variable: \*X or \*(XX or \*[...] 
— skip the reference + pos += 2; + skip_groff_reference(bytes, len, &mut pos); + } + b'n' => { + // number register: \nX or \n(XX or \n[...] — skip the reference + pos += 2; + skip_groff_reference(bytes, len, &mut pos); + } + b'e' => { + // escaped backslash literal + buffer.push('\\'); + prev_char = b'\\'; + pos += 2; + } + b'\\' => { + // double backslash — emit one + buffer.push('\\'); + prev_char = b'\\'; + pos += 2; + } + b' ' | b'~' => { + // escaped/non-breaking space — emit a regular space + buffer.push(' '); + prev_char = b' '; + pos += 2; + } + _ => { + // unknown escape — skip the two-character sequence + pos += 2; + } + } + } else { + // copy a full utf-8 char from source to buffer + let c = source[pos..].chars().next().unwrap(); + buffer.push(c); + prev_char = if c.is_ascii() { c as u8 } else { 0 }; + pos += c.len_utf8(); + } + } + buffer +} + +fn skip_to_byte(bytes: &[u8], len: usize, pos: &mut usize, delim: u8) { + while *pos < len && bytes[*pos] != delim { + *pos += 1; + } +} + +/// skip a groff reference that uses one of three sub-forms: +/// single char — e.g. \*X or \nX +/// ( + 2 chars — e.g. \*(XX or \n(XX +/// [ to ] — e.g. \*[name] or \n[name] +fn skip_groff_reference(bytes: &[u8], len: usize, pos: &mut usize) { + if *pos < len { + if bytes[*pos] == b'(' { + *pos += 3; // skip past '(' + two-character name + } else if bytes[*pos] == b'[' { + *pos += 1; + skip_to_byte(bytes, len, pos, b']'); + if *pos < len { + *pos += 1; + } + } else { + *pos += 1; + } + } +} + +/// strip inline macro formatting: .BI, .BR, .IR, etc. +/// these macros alternate between fonts for their arguments, e.g.: +/// .BI "--output " "FILE" +/// becomes "--outputFILE" (arguments concatenated without spaces). +/// +/// quoted strings are kept together (quotes stripped), but unquoted spaces +/// are consumed. this matches groff's actual rendering of these macros. 
+pub fn strip_inline_macro_args(text: &str) -> String { + let bytes = text.as_bytes(); + let len = bytes.len(); + let mut buffer = String::with_capacity(len); + let mut pos = 0; + while pos < len { + if bytes[pos] == b'"' { + // quoted argument — copy characters up to the closing quote + pos += 1; + while pos < len && bytes[pos] != b'"' { + let c = text[pos..].chars().next().unwrap(); + buffer.push(c); + pos += c.len_utf8(); + } + if pos < len { + pos += 1; + } + } else if bytes[pos] == b' ' || bytes[pos] == b'\t' { + // unquoted whitespace — skip (arguments are concatenated) + pos += 1; + } else { + let c = text[pos..].chars().next().unwrap(); + buffer.push(c); + pos += c.len_utf8(); + } + } + buffer +} + +/// render same-font macro arguments (.B/.I) where arguments are separated +/// by spaces. quote delimiters group arguments in roff source but should +/// not become part of the visible text. +pub fn strip_space_macro_args(text: &str) -> String { + strip_groff_escapes(&text.replace('"', "")) + .trim() + .to_string() +} + +/// strip escapes and trim whitespace. +pub fn strip_groff(line: &str) -> String { + strip_groff_escapes(line).trim().to_string() +} + +/// refined comment detection — the base classify_line may miss some comment +/// forms, so this wrapper checks more carefully before falling through. +fn is_comment_line(line: &str) -> bool { + let bytes = line.as_bytes(); + let len = bytes.len(); + (len >= 3 && bytes[0] == b'.' && bytes[1] == b'\\' && bytes[2] == b'"') + || (len >= 2 && bytes[0] == b'\\' && bytes[1] == b'"') +} + +/// classify a single line of manpage source. +/// macro lines start with '.' or '\'' (groff alternate control char). +/// the macro name is split from its arguments at the first space/tab. +/// arguments wrapped in double quotes are unquoted. 
+pub fn classify_line(line: &str) -> GroffLine { + if is_comment_line(line) { + return GroffLine::Comment; + } + let len = line.len(); + if len == 0 { + return GroffLine::Blank; + } + let bytes = line.as_bytes(); + // base classify also flags dot-backslash forms as comments + if len >= 2 && bytes[0] == b'.' && bytes[1] == b'\\' && (len < 3 || bytes[2] == b'"') { + return GroffLine::Comment; + } + if len >= 3 && bytes[0] == b'\\' && bytes[1] == b'"' { + return GroffLine::Comment; + } + if bytes[0] == b'.' || bytes[0] == b'\'' { + // macro line — extract macro name and arguments + let rest = line[1..].trim(); + let split_at = rest.find([' ', '\t']); + match split_at { + Some(idx) => { + let name = rest[..idx].to_string(); + let args = rest[idx + 1..].trim(); + // strip surrounding quotes from arguments + let args = if args.len() >= 2 + && args.starts_with('"') + && args.ends_with('"') + && !args[1..args.len() - 1].contains('"') + { + args[1..args.len() - 1].to_string() + } else { + args.to_string() + }; + GroffLine::Macro { name, args } + } + None => GroffLine::Macro { + name: rest.to_string(), + args: String::new(), + }, + } + } else { + let stripped = strip_groff(line); + if stripped.is_empty() { + GroffLine::Blank + } else { + GroffLine::Text(stripped) + } + } +} diff --git a/src/parsers/manpage/mdoc.rs b/src/parsers/manpage/mdoc.rs new file mode 100644 index 0000000..a4af154 --- /dev/null +++ b/src/parsers/manpage/mdoc.rs @@ -0,0 +1,237 @@ +//! BSD mdoc format support. +//! +//! mdoc is the bsd manpage macro package. it uses semantic macros rather than +//! presentation macros: +//! .Fl v -> flag: -v +//! .Ar file -> argument: file +//! .Op ... -> optional: [...] +//! .Bl/.It/.El -> list begin/item/end +//! 
.Sh -> section header (note lowercase 'h', vs groff's .SH) + +use crate::parsers::manpage::groff::{GroffLine, strip_groff_escapes}; +use crate::parsers::manpage::{ManpageEntry, ManpageResult, OwnedParam, OwnedSwitch}; +use crate::types::Positional; + +/// detect mdoc format by looking for any .Sh macro. +pub fn is_mdoc(lines: &[GroffLine]) -> bool { + lines + .iter() + .any(|l| matches!(l, GroffLine::Macro { name, .. } if name == "Sh")) +} + +/// extract renderable text from an mdoc line, skipping structural macros. +fn mdoc_text_of(line: &GroffLine) -> Option { + match line { + GroffLine::Text(t) => Some(strip_groff_escapes(t)), + GroffLine::Macro { name, args } => match name.as_str() { + "Pp" | "Bl" | "El" | "Sh" | "Ss" | "Os" | "Dd" | "Dt" | "Oo" | "Oc" | "Op" => None, + _ => { + let text = strip_groff_escapes(args); + let text = text.trim(); + if text.is_empty() { + None + } else { + Some(text.to_string()) + } + } + }, + _ => None, + } +} + +/// parse an mdoc .It (list item) line that contains flag definitions. +/// mdoc .It lines look like: ".It Fl v Ar file" +/// where Fl = flag, Ar = argument. +fn parse_mdoc_it(args: &str) -> Option { + let words: Vec<&str> = args + .split(' ') + .filter(|w| !w.is_empty() && *w != "Ns") + .collect(); + let param = match words.as_slice() { + [_, _, "Ar", name, ..] => Some(OwnedParam::Mandatory(name.to_string())), + _ => None, + }; + match words.as_slice() { + ["Fl", ch, ..] if ch.len() == 1 && ch.chars().next().unwrap().is_ascii_alphanumeric() => { + Some(ManpageEntry { + switch: OwnedSwitch::Short(ch.chars().next().unwrap()), + param, + desc: String::new(), + }) + } + ["Fl", name, ..] if name.len() > 1 && name.starts_with('-') => Some(ManpageEntry { + switch: OwnedSwitch::Long(name[1..].to_string()), + param, + desc: String::new(), + }), + _ => None, + } +} + +/// extract a positional argument from an mdoc line (.Ar or .Op Ar). 
+fn positional_of_mdoc_line(args: &str) -> Option<(String, bool)> { + let words: Vec<&str> = args.split(' ').filter(|w| !w.is_empty()).collect(); + let variadic = words.contains(&"..."); + match words.first() { + Some(name) if name.len() >= 2 => Some((name.to_ascii_lowercase(), variadic)), + _ => None, + } +} + +/// parse an entire mdoc-format manpage. +/// walks through all classified lines looking for: +/// 1. .Bl/.It/.El list blocks containing flag definitions +/// 2. .Sh SYNOPSIS sections containing positional arguments (.Ar, .Op Ar) +pub fn parse_mdoc_lines(lines: &[GroffLine]) -> ManpageResult { + // collect description for an entry — until next structural macro + fn desc_of(lines: &[GroffLine], start: usize) -> (String, usize) { + let mut acc: Vec = Vec::new(); + let mut i = start; + while i < lines.len() { + if let GroffLine::Macro { name, .. } = &lines[i] + && matches!(name.as_str(), "It" | "El" | "Sh" | "Ss") + { + break; + } + if let Some(t) = mdoc_text_of(&lines[i]) { + acc.push(t); + } + i += 1; + } + (acc.join(" ").trim().to_string(), i) + } + + fn skip_to_el(lines: &[GroffLine], start: usize) -> usize { + let mut i = start; + while i < lines.len() { + if let GroffLine::Macro { name, .. } = &lines[i] + && name == "El" + { + return i + 1; + } + i += 1; + } + i + } + + /// parse a single .It entry: extract flag, collect description. + fn parse_it( + args: &str, + lines: &[GroffLine], + start: usize, + entries: &mut Vec, + ) -> usize { + let (desc, new_start) = desc_of(lines, start); + if let Some(mut entry) = parse_mdoc_it(args) { + entry.desc = desc; + entries.push(entry); + } + new_start + } + + /// parse all .It entries within a .Bl/.El option list. + fn parse_option_list( + entries: &mut Vec, + lines: &[GroffLine], + start: usize, + ) -> usize { + let mut i = start; + while i < lines.len() { + match &lines[i] { + GroffLine::Macro { name, .. 
} if name == "El" => return i + 1, + GroffLine::Macro { name, args } if name == "It" => { + i = parse_it(args, lines, i + 1, entries); + } + _ => i += 1, + } + } + i + } + + fn parse_synopsis( + positionals: &mut Vec<(String, bool, bool)>, + lines: &[GroffLine], + start: usize, + ) -> usize { + let mut i = start; + while i < lines.len() { + match &lines[i] { + GroffLine::Macro { name, .. } if name == "Sh" => return i, + GroffLine::Macro { name, args } if name == "Ar" => { + if let Some((n, v)) = positional_of_mdoc_line(args) { + positionals.push((n, false, v)); + } + i += 1; + } + GroffLine::Macro { name, args } if name == "Op" => { + let words: Vec<&str> = args.split(' ').filter(|w| !w.is_empty()).collect(); + if matches!(words.first(), Some(&"Ar")) { + let rest = if args.len() > 3 { &args[3..] } else { "" }; + if let Some((n, v)) = positional_of_mdoc_line(rest) { + positionals.push((n, true, v)); + } + } + i += 1; + } + _ => i += 1, + } + } + i + } + + let mut entries: Vec = Vec::new(); + let mut positionals: Vec<(String, bool, bool)> = Vec::new(); + let mut i = 0; + while i < lines.len() { + // .Bl + .It header sequence — peek at first .It to decide if this is a flag list + if let GroffLine::Macro { name: n1, .. 
} = &lines[i] + && n1 == "Bl" + { + let j = i + 1; + if j < lines.len() + && let GroffLine::Macro { + name: n2, + args: it_args, + } = &lines[j] + && n2 == "It" + { + let words: Vec<&str> = it_args.split(' ').filter(|w| !w.is_empty()).collect(); + if matches!(words.first(), Some(&"Fl")) { + let k = parse_it(it_args, lines, j + 1, &mut entries); + i = parse_option_list(&mut entries, lines, k); + continue; + } else { + i = skip_to_el(lines, j + 1); + continue; + } + } + i = skip_to_el(lines, j); + continue; + } + if let GroffLine::Macro { name, args } = &lines[i] + && name == "Sh" + && args.trim().eq_ignore_ascii_case("SYNOPSIS") + { + i = parse_synopsis(&mut positionals, lines, i + 1); + continue; + } + i += 1; + } + + // deduplicate positionals by name, preserving first-seen order + let mut seen: Vec = Vec::new(); + let mut deduped: Vec<(String, Positional)> = Vec::new(); + for (name, optional, variadic) in positionals { + if !seen.contains(&name) { + seen.push(name.clone()); + deduped.push((name, Positional { optional, variadic })); + } + } + + ManpageResult { + entries, + subcommands: Vec::new(), + positionals: deduped, + description: String::new(), + } +} diff --git a/src/parsers/manpage/sections.rs b/src/parsers/manpage/sections.rs new file mode 100644 index 0000000..423fa81 --- /dev/null +++ b/src/parsers/manpage/sections.rs @@ -0,0 +1,851 @@ +//! section extraction from manpages. +//! +//! manpages are divided into sections by .SH macros. we extract OPTIONS, +//! NAME, SYNOPSIS, and COMMANDS sections for their specific content. 
+ +use nom::{Parser, sequence::preceded}; + +use crate::parsers::help::{parse_usage_args, parse_usage_flags, skip_command_name}; +use crate::parsers::manpage::groff::{ + GroffLine, strip_groff_escapes, strip_inline_macro_args, strip_space_macro_args, +}; +use crate::parsers::manpage::{ManpageEntry, ManpageSubcommand, OwnedParam, OwnedSwitch}; +use crate::types::{Param, Positional, Switch}; + +fn is_options_section(name: &str) -> bool { + let upper = name.trim().to_ascii_uppercase(); + upper == "OPTIONS" || upper.contains("OPTION") +} + +/// extract the lines from the OPTIONS section(s). collects from all +/// option-like .SH sections and concatenates them (handles the nix pattern +/// of "Options" and "Common Options" being separate sections). +/// falls back to DESCRIPTION if no OPTIONS section exists. +pub fn extract_options_section(lines: &[GroffLine]) -> Vec { + let mut acc: Vec = Vec::new(); + let mut i = 0; + while i < lines.len() { + if let GroffLine::Macro { name, args } = &lines[i] + && name == "SH" + && is_options_section(args) + { + i += 1; + // synthetic separator between concatenated sections so that + // collect_desc_text (which stops on SH/SS) does not let descriptions + // bleed between sections. + if !acc.is_empty() { + acc.push(GroffLine::Macro { + name: "SH".to_string(), + args: String::new(), + }); + } + while i < lines.len() { + if let GroffLine::Macro { name, .. } = &lines[i] + && name == "SH" + { + break; + } + acc.push(lines[i].clone()); + i += 1; + } + } else { + i += 1; + } + } + if !acc.is_empty() { + return acc; + } + // fallback: DESCRIPTION section + let mut i = 0; + while i < lines.len() { + if let GroffLine::Macro { name, args } = &lines[i] + && name == "SH" + && args.trim().eq_ignore_ascii_case("DESCRIPTION") + { + i += 1; + let mut desc_acc: Vec = Vec::new(); + while i < lines.len() { + if let GroffLine::Macro { name, .. 
} = &lines[i] + && name == "SH" + { + break; + } + desc_acc.push(lines[i].clone()); + i += 1; + } + return desc_acc; + } + i += 1; + } + Vec::new() +} + +fn extract_named_section(lines: &[GroffLine], section_name: &str) -> Vec { + let mut i = 0; + while i < lines.len() { + if let GroffLine::Macro { name, args } = &lines[i] + && name == "SH" + && args.trim().eq_ignore_ascii_case(section_name) + { + i += 1; + let mut acc: Vec = Vec::new(); + while i < lines.len() { + if let GroffLine::Macro { name, .. } = &lines[i] + && name == "SH" + { + break; + } + acc.push(lines[i].clone()); + i += 1; + } + return acc; + } + i += 1; + } + Vec::new() +} + +/// the NAME section follows the convention "command \- short description". +/// extract the part after "\-" as the command's description. +/// handles both "\-" (groff) and " - " (plain text) separators. +pub fn extract_name_description(lines: &[GroffLine]) -> Option { + let mut i = 0; + while i < lines.len() { + if let GroffLine::Macro { name, args } = &lines[i] + && name == "SH" + && args.trim().eq_ignore_ascii_case("NAME") + { + i += 1; + let mut acc: Vec = Vec::new(); + while i < lines.len() { + if let GroffLine::Macro { name, .. 
} = &lines[i] + && name == "SH" + { + break; + } + match &lines[i] { + GroffLine::Text(t) => acc.push(t.clone()), + GroffLine::Macro { name, args } + if matches!(name.as_str(), "B" | "BI" | "BR" | "I" | "IR") => + { + let text = strip_groff_escapes(&strip_inline_macro_args(args)); + let text = text.trim(); + if !text.is_empty() { + acc.push(text.to_string()); + } + } + GroffLine::Macro { name, args } if name == "Nm" => { + let text = strip_groff_escapes(args); + let text = text.trim(); + if !text.is_empty() { + acc.push(text.to_string()); + } + } + GroffLine::Macro { name, args } if name == "Nd" => { + let text = strip_groff_escapes(args); + let text = text.trim(); + if !text.is_empty() { + acc.push(format!("\\- {text}")); + } + } + _ => (), + } + i += 1; + } + let full = acc.join(" ").trim().to_string(); + return split_name_separator(&full); + } + i += 1; + } + None +} + +/// split a NAME line on either "\-" (groff) or " - " (plain). +/// returns the part after the separator, trimmed. +fn split_name_separator(full: &str) -> Option { + // search for either marker + let groff_idx = find_padded(full, "\\-"); + let dash_idx = find_padded(full, " - "); + let idx = match (groff_idx, dash_idx) { + (Some(a), Some(b)) => Some(a.min(b)), + (Some(a), None) => Some(a), + (None, Some(b)) => Some(b), + (None, None) => None, + }?; + // skip past the matched separator + let after = if full[idx..].starts_with("\\-") { + &full[idx + 2..] + } else { + &full[idx + 3..] + }; + let desc = after.trim().to_string(); + if desc.is_empty() { None } else { Some(desc) } +} + +/// find a marker preceded and followed by optional surrounding space. +/// approximated by a simple substring search — accepts spaces on either +/// side without enforcing how many. +fn find_padded(s: &str, needle: &str) -> Option { + s.find(needle) +} + +/// extract the command name from the SYNOPSIS section. 
+/// +/// the SYNOPSIS section shows how to invoke the command: +/// .SH SYNOPSIS +/// .B git add +/// [\fIOPTIONS\fR] [\fB\-\-\fR] [\fI\fR...] +/// +/// we extract the command name by taking consecutive "word" tokens until +/// we hit something that looks like an argument (starts with [, <, -, etc.). +pub fn extract_synopsis_command(contents: &str) -> Option { + // pre-replace italic text (\fI...\fR) with angle-bracketed placeholders + // before classification strips the font info. italic in groff indicates + // a parameter/placeholder (e.g. \fIoperation\fR), not a command word. + // the angle brackets cause extract_cmd to stop at these tokens since + // '<' is in its stop set. + let preprocessed: Vec = contents + .split('\n') + .map(replace_italic_with_angles) + .collect(); + let classified: Vec = preprocessed + .iter() + .map(|line| crate::parsers::manpage::groff::classify_line(line)) + .collect(); + let mut i = 0; + while i < classified.len() { + if let Some((stop_on_ss, content_start)) = synopsis_heading_at(&classified, i) { + i = content_start; + while i < classified.len() { + match &classified[i] { + GroffLine::Macro { name, .. 
} + if name == "SH" || (stop_on_ss && name == "SS") => + { + return None; + } + GroffLine::Text(text) => { + let trimmed = text.trim(); + if let Some(cmd) = synopsis_command_candidate(trimmed, true) { + return Some(cmd); + } + i += 1; + } + GroffLine::Macro { name, args } if name == "SY" => { + let text = strip_groff_escapes(args); + if let Some(cmd) = synopsis_command_candidate(text.trim(), false) { + return Some(cmd); + } + i += 1; + } + GroffLine::Macro { name, args } + if matches!(name.as_str(), "B" | "BI" | "BR") => + { + let text = render_synopsis_command_macro(name, args); + if let Some(cmd) = synopsis_command_candidate(text.trim(), false) { + return Some(cmd); + } + i += 1; + } + _ => i += 1, + } + } + return None; + } + i += 1; + } + None +} + +fn synopsis_heading_at(lines: &[GroffLine], i: usize) -> Option<(bool, usize)> { + let GroffLine::Macro { name, args } = &lines[i] else { + return None; + }; + if !matches!(name.as_str(), "SH" | "SS") { + return None; + } + if args.trim().eq_ignore_ascii_case("SYNOPSIS") { + return Some((name == "SS", i + 1)); + } + if !args.trim().is_empty() { + return None; + } + let mut j = i + 1; + while j < lines.len() { + match &lines[j] { + GroffLine::Text(text) if text.trim().eq_ignore_ascii_case("SYNOPSIS") => { + return Some((name == "SS", j + 1)); + } + GroffLine::Blank | GroffLine::Comment => j += 1, + _ => return None, + } + } + None +} + +fn render_synopsis_command_macro(name: &str, args: &str) -> String { + match name { + "B" | "I" => strip_space_macro_args(args), + _ => strip_groff_escapes(&strip_inline_macro_args(args)) + .trim() + .to_string(), + } +} + +fn synopsis_command_candidate(line: &str, reject_long_unmarked: bool) -> Option { + let trimmed = line.trim(); + if trimmed.is_empty() || trimmed.ends_with(':') { + return None; + } + let cmd = extract_cmd(trimmed)?; + if cmd.starts_with('.') { + return None; + } + if looks_like_synopsis_prose(trimmed, &cmd, reject_long_unmarked) { + None + } else { + Some(cmd) + } 
+} + +fn looks_like_synopsis_prose(line: &str, cmd: &str, reject_long_unmarked: bool) -> bool { + let Some(first) = cmd.split_whitespace().next() else { + return true; + }; + if matches!( + first.to_ascii_lowercase().as_str(), + "a" | "an" | "and" | "or" | "the" | "this" | "these" + ) { + return true; + } + + let line_has_invocation_marker = line.split_whitespace().any(|word| { + word.starts_with('[') + || word.starts_with('<') + || word.starts_with('-') + || word.starts_with('{') + }) || line.contains('|'); + if line.ends_with('.') && !line_has_invocation_marker { + return true; + } + if reject_long_unmarked && cmd.split_whitespace().count() > 3 && !line_has_invocation_marker { + return true; + } + let looks_like_sentence_starter = first.chars().next().is_some_and(|c| c.is_ascii_uppercase()) + && first.chars().skip(1).all(|c| c.is_ascii_lowercase()); + looks_like_sentence_starter + && line.split_whitespace().count() > 1 + && !line_has_invocation_marker +} + +/// replace \fI...\f[RP] sequences with <...> so italic params are seen as +/// non-word tokens by extract_cmd. +/// +/// exception: some manpages put the command name itself in italics (e.g. +/// git-am.1's synopsis reads `\fIgit am\fR ...`). when the first italic +/// block on the line appears at the very start (preceded only by +/// whitespace) and its content looks like a command word, we strip the +/// font markers but leave the content bare so extract_cmd treats it as +/// the command name rather than a placeholder. 
+fn replace_italic_with_angles(line: &str) -> String { + let bytes = line.as_bytes(); + let len = bytes.len(); + let mut out = String::with_capacity(len); + let mut i = 0; + let mut command_consumed = false; + while i < len { + // byte-compare to avoid panicking on non-ASCII char boundaries + if i + 3 <= len && &bytes[i..i + 3] == b"\\fI" { + // find closing \fR or \fP — scan to next '\\' + let inner_start = i + 3; + let mut j = inner_start; + while j < len && bytes[j] != b'\\' { + j += 1; + } + if j + 3 <= len + && bytes[j] == b'\\' + && bytes[j + 1] == b'f' + && (bytes[j + 2] == b'R' || bytes[j + 2] == b'P') + { + let inner = &line[inner_start..j]; + let at_line_start = !command_consumed && line[..i].chars().all(char::is_whitespace); + if at_line_start && italic_looks_like_command(inner) { + out.push_str(inner); + command_consumed = true; + } else { + out.push('<'); + out.push_str(inner); + out.push('>'); + } + i = j + 3; + continue; + } + } + let c = line[i..].chars().next().unwrap(); + out.push(c); + i += c.len_utf8(); + } + out +} + +/// is the italic content something that looks like a command name (rather +/// than a placeholder)? lowercase letters, digits, hyphens, underscores, +/// dots, and spaces only, after groff escapes (like `\-`) are resolved. +fn italic_looks_like_command(inner: &str) -> bool { + let stripped = strip_groff_escapes(inner); + let trimmed = stripped.trim(); + !trimmed.is_empty() + && trimmed.chars().all(|c| { + c.is_ascii_lowercase() || c.is_ascii_digit() || matches!(c, '-' | '_' | '.' | ' ') + }) +} + +/// extract the command name from a synopsis line by taking leading word tokens. 
+fn extract_cmd(line: &str) -> Option { + let words: Vec<&str> = line.split(' ').filter(|w| !w.is_empty()).collect(); + let is_cmd_char = |c: char| c.is_ascii_alphanumeric() || matches!(c, '-' | '_' | '.'); + let mut taken: Vec<&str> = Vec::new(); + for word in words { + let first = word.chars().next().unwrap(); + if matches!(first, '[' | '-' | '<' | '(' | '{') { + break; + } + if word.chars().all(is_cmd_char) { + taken.push(word); + } else { + break; + } + } + if taken.is_empty() { + None + } else { + Some(taken.join(" ")) + } +} + +/// extract the lines that form the SYNOPSIS section. +fn extract_synopsis_section(lines: &[GroffLine]) -> Vec { + extract_named_section(lines, "SYNOPSIS") +} + +/// extract positional arguments from the SYNOPSIS section. +/// joins all text/formatting macro lines via `join_synopsis_text`, then +/// skips the command name prefix and runs `parse_usage_args` on the rest. +pub fn extract_synopsis_positionals(lines: &[GroffLine]) -> Vec<(String, Positional)> { + let full = join_synopsis_text(lines); + if full.is_empty() { + return Vec::new(); + } + let result: nom::IResult<&str, Vec<(&str, Positional)>> = + preceded(skip_command_name, parse_usage_args).parse(&full); + match result { + Ok((_, map)) => map + .into_iter() + .map(|(k, v)| (k.to_ascii_lowercase(), v)) + .collect(), + Err(_) => Vec::new(), + } +} + +/// join the SYNOPSIS section into a single line of plain text, stripping +/// groff escapes and inline font macros. shared by both the positional +/// and flag extractors so they see identical input. +fn join_synopsis_text(lines: &[GroffLine]) -> String { + let section = extract_synopsis_section(lines); + let mut acc: Vec = Vec::new(); + for line in section { + match line { + GroffLine::Macro { name, .. 
} if name == "SS" || name == "br" => break, + GroffLine::Macro { name, args } if name == "SY" => { + let text = strip_groff_escapes(&args).trim().to_string(); + if !text.is_empty() { + acc.push(text); + } + } + GroffLine::Macro { name, args } if name == "I" => { + let text = strip_groff_escapes(&args).trim().to_string(); + if !text.is_empty() { + acc.push(format!("<{text}>")); + } + } + GroffLine::Macro { name, args } if name == "IR" => { + let text = render_leading_italic_arg(&args); + if !text.is_empty() { + acc.push(text); + } + } + GroffLine::Text(t) => { + let text = strip_groff_escapes(&t).trim().to_string(); + if !text.is_empty() { + acc.push(text); + } + } + GroffLine::Macro { name, args } if name == "B" => { + let text = strip_space_macro_args(&args); + if !text.is_empty() { + acc.push(text); + } + } + GroffLine::Macro { name, args } + if matches!(name.as_str(), "B" | "BI" | "BR" | "IB" | "RB" | "RI") => + { + let text = strip_groff_escapes(&strip_inline_macro_args(&args)); + let text = text.trim(); + if !text.is_empty() { + acc.push(text.to_string()); + } + } + _ => (), + } + } + acc.join(" ").trim().to_string() +} + +fn render_leading_italic_arg(args: &str) -> String { + let trimmed = args.trim(); + if trimmed.is_empty() { + return String::new(); + } + let (first, rest) = match trimmed.find(char::is_whitespace) { + Some(idx) => (&trimmed[..idx], trimmed[idx..].trim()), + None => (trimmed, ""), + }; + let first = strip_groff_escapes(first).trim().to_string(); + if first.is_empty() { + return String::new(); + } + let rest = strip_groff_escapes(&strip_inline_macro_args(rest)); + let rest = rest.trim(); + if rest.is_empty() { + format!("<{first}>") + } else { + format!("<{first}> {rest}") + } +} + +fn to_owned_switch(s: Switch<'_>) -> OwnedSwitch { + match s { + Switch::Short(c) => OwnedSwitch::Short(c), + Switch::Long(l) => OwnedSwitch::Long(l.to_string()), + Switch::Both(c, l) => OwnedSwitch::Both(c, l.to_string()), + } +} + +fn to_owned_param(p: 
Param<'_>) -> OwnedParam { + match p { + Param::Mandatory(s) => OwnedParam::Mandatory(s.to_string()), + Param::Optional(s) => OwnedParam::Optional(s.to_string()), + } +} + +/// extract flag-tagged entries from the SYNOPSIS line. some manpages +/// (notably nix-env, sed) declare flags only in the synopsis and never +/// repeat them as entries in the OPTIONS body, so the body-only pass +/// misses them. we join the synopsis text the same way the positional +/// extractor does, then run `parse_usage_flags` over every bracketed +/// switch+param. callers merge with body entries; body wins on duplicate +/// flag names since body descriptions are richer. +pub fn extract_synopsis_flags(lines: &[GroffLine]) -> Vec { + let full = join_synopsis_text(lines); + if full.is_empty() { + return Vec::new(); + } + let result: nom::IResult<&str, Vec<(Switch<'_>, Option>)>> = + preceded(skip_command_name, parse_usage_flags).parse(&full); + match result { + Ok((_, pairs)) => pairs + .into_iter() + .map(|(switch, param)| ManpageEntry { + switch: to_owned_switch(switch), + param: param.map(to_owned_param), + desc: String::new(), + }) + .collect(), + Err(_) => Vec::new(), + } +} + +/// extract first-positional choices from prose lists in DESCRIPTION. +/// +/// getent(1) is the motivating shape: the synopsis has a `database` +/// positional, while the actual database names are documented as a tagged +/// list under DESCRIPTION rather than as subcommands or options. The +/// completion model currently has no separate "positional choices" channel, +/// so these are represented as subcommand-like candidates for completion. 
+pub fn extract_description_positionals(lines: &[GroffLine]) -> Vec { + let description = extract_named_section(lines, "DESCRIPTION"); + if description.is_empty() || !description_mentions_listed_database(&description) { + return Vec::new(); + } + + let mut out = Vec::new(); + let mut seen = std::collections::HashSet::new(); + let mut i = 0; + let mut in_database_list = false; + while i < description.len() { + match &description[i] { + GroffLine::Text(text) + if text.to_ascii_lowercase().contains("listed below") + || text.to_ascii_lowercase().contains("may be any of") => + { + in_database_list = true; + i += 1; + } + GroffLine::Macro { name, .. } if name == "TP" && in_database_list => { + if i + 1 >= description.len() { + break; + } + let Some(name) = description_tag_name(&description[i + 1]) else { + i += 1; + continue; + }; + if !is_description_choice_name(&name) { + i += 1; + continue; + } + let (desc, new_i) = collect_description_choice_desc(&description, i + 2); + if seen.insert(name.clone()) { + out.push(ManpageSubcommand { name, desc }); + } + i = new_i; + } + _ => { + i += 1; + } + } + } + out +} + +fn description_mentions_listed_database(lines: &[GroffLine]) -> bool { + let mut saw_database = false; + let mut saw_list = false; + for line in lines { + let text = match line { + GroffLine::Text(text) => text.clone(), + GroffLine::Macro { name, args } + if matches!(name.as_str(), "B" | "BI" | "BR" | "I" | "IR" | "RI") => + { + strip_groff_escapes(&strip_inline_macro_args(args)) + } + _ => String::new(), + }; + let lower = text.to_ascii_lowercase(); + saw_database |= lower.contains("database"); + saw_list |= lower.contains("listed below") || lower.contains("may be any of"); + } + saw_database && saw_list +} + +fn description_tag_name(line: &GroffLine) -> Option { + match line { + GroffLine::Text(text) => Some(text.trim().to_string()), + GroffLine::Macro { name, args } + if matches!(name.as_str(), "B" | "BI" | "BR" | "I" | "IR") => + { + Some( + 
strip_groff_escapes(&strip_inline_macro_args(args)) + .trim() + .to_string(), + ) + } + _ => None, + } +} + +fn is_description_choice_name(name: &str) -> bool { + !name.is_empty() + && name.len() <= 32 + && !name.starts_with('-') + && name + .chars() + .all(|c| c.is_ascii_lowercase() || c.is_ascii_digit() || c == '-' || c == '_') +} + +fn collect_description_choice_desc(lines: &[GroffLine], start: usize) -> (String, usize) { + let mut parts = Vec::new(); + let mut i = start; + while i < lines.len() { + match &lines[i] { + GroffLine::Macro { name, .. } if matches!(name.as_str(), "TP" | "SH" | "SS") => { + break; + } + GroffLine::Text(text) => { + parts.push(text.clone()); + i += 1; + } + GroffLine::Macro { name, args } + if matches!(name.as_str(), "B" | "BI" | "BR" | "I" | "IR" | "RI") => + { + let text = strip_groff_escapes(&strip_inline_macro_args(args)); + let text = text.trim(); + if !text.is_empty() { + parts.push(text.to_string()); + } + i += 1; + } + GroffLine::Blank | GroffLine::Comment => { + i += 1; + } + GroffLine::Macro { .. } => { + i += 1; + } + } + } + (first_sentence(&parts.join(" ")), i) +} + +fn first_sentence(text: &str) -> String { + let text = text.split_whitespace().collect::>().join(" "); + for marker in [". ", ".) "] { + if let Some(idx) = text.find(marker) { + return text[..idx + 1].trim().to_string(); + } + } + text.trim().to_string() +} + +fn is_commands_section(name: &str) -> bool { + let trimmed = name.trim(); + // strip a trailing parenthetical group so "HIGH-LEVEL COMMANDS (PORCELAIN)" + // (which is git.1's pattern) is treated as "HIGH-LEVEL COMMANDS". + let core = match (trimmed.rfind('('), trimmed.ends_with(')')) { + (Some(open), true) => trimmed[..open].trim(), + _ => trimmed, + }; + let upper = core.to_ascii_uppercase(); + if upper == "COMMAND" || upper == "COMMANDS" { + return true; + } + // accept headings ending in " COMMANDS" — catches "GIT COMMANDS", + // "MAIN COMMANDS", "HIGH-LEVEL COMMANDS", "LOW-LEVEL COMMANDS". 
the + // leading space prevents matches against "COMMAND LINE OPTIONS" etc. + upper.ends_with(" COMMANDS") +} + +/// find all COMMANDS/.COMMAND sections and collect their lines. +pub fn extract_commands_section(lines: &[GroffLine]) -> Vec { + let mut acc: Vec = Vec::new(); + let mut i = 0; + while i < lines.len() { + if let GroffLine::Macro { name, args } = &lines[i] + && name == "SH" + && is_commands_section(args) + { + i += 1; + while i < lines.len() { + if let GroffLine::Macro { name, .. } = &lines[i] + && name == "SH" + { + break; + } + acc.push(lines[i].clone()); + i += 1; + } + } else { + i += 1; + } + } + acc +} + +/// extract SUBCOMMAND-style sections (clap-generated manpages put each +/// subcommand under its own .SH SUBCOMMAND header with a Usage: line). +/// returns triples of (name, description, lines) so the caller can re-parse +/// each section as its own help_result. +pub fn extract_subcommand_sections(lines: &[GroffLine]) -> Vec<(String, String, Vec)> { + // split into sections at .SH boundaries, keeping only SUBCOMMAND(S) ones + let mut sections: Vec> = Vec::new(); + let mut current_name: Option = None; + let mut current: Vec = Vec::new(); + for line in lines { + if let GroffLine::Macro { name, args } = line + && name == "SH" + { + if current_name.is_some() { + sections.push(std::mem::take(&mut current)); + } + let n = args.trim().to_ascii_uppercase(); + if n == "SUBCOMMAND" || n == "SUBCOMMANDS" { + current_name = Some(n); + } else { + current_name = None; + } + continue; + } + if current_name.is_some() { + current.push(line.clone()); + } + } + if current_name.is_some() { + sections.push(current); + } + + let mut out = Vec::new(); + for section in sections { + // scan section lines for the Usage: line to get the subcommand name + let mut subcmd_name: Option = None; + let mut desc_lines: Vec = Vec::new(); + for line in §ion { + if subcmd_name.is_some() { + break; + } + match line { + GroffLine::Text(t) => match find_usage_name(t) { + Some(name) => 
subcmd_name = Some(name), + None => desc_lines.push(t.clone()), + }, + GroffLine::Macro { name, args } + if matches!(name.as_str(), "TP" | "B" | "BI" | "BR") => + { + let text = strip_groff_escapes(&strip_inline_macro_args(args)); + let text = text.trim(); + subcmd_name = find_usage_name(text); + } + _ => (), + } + } + if let Some(name) = subcmd_name { + let desc_raw = desc_lines.join(" "); + let desc = strip_groff_escapes(&desc_raw).trim().to_string(); + let desc = strip_backtick_words(&desc); + out.push((name, desc, section)); + } + } + out +} + +/// look for "Usage: NAME" and return NAME if found. +/// NAME contains alphanumeric, underscore, or dash. +fn find_usage_name(text: &str) -> Option { + const MARKER: &str = "Usage: "; + let idx = text.find(MARKER)?; + let after = &text[idx + MARKER.len()..]; + let end = after + .find(|c: char| !(c.is_ascii_alphanumeric() || c == '_' || c == '-')) + .unwrap_or(after.len()); + if end == 0 { + None + } else { + Some(after[..end].to_string()) + } +} + +/// strip backtick-quoted words: `word` -> word. +fn strip_backtick_words(s: &str) -> String { + let mut out = String::with_capacity(s.len()); + let mut i = 0; + let bytes = s.as_bytes(); + while i < bytes.len() { + if bytes[i] == b'`' + && let Some(end) = s[i + 1..].find('`') + { + out.push_str(&s[i + 1..i + 1 + end]); + i += end + 2; + continue; + } + let c = s[i..].chars().next().unwrap(); + out.push(c); + i += c.len_utf8(); + } + out +} diff --git a/src/parsers/manpage/strategies.rs b/src/parsers/manpage/strategies.rs new file mode 100644 index 0000000..855c468 --- /dev/null +++ b/src/parsers/manpage/strategies.rs @@ -0,0 +1,456 @@ +//! strategy-based entry extraction. +//! +//! rather than a single monolithic parser, we use multiple "strategies" that +//! each target a specific groff formatting pattern. this is necessary because +//! manpage authors use very different macro combinations for the same purpose. 
+ +use nom::{Parser, combinator::opt}; + +use crate::make_macro_walker; +use crate::parsers::help::{help_parser, param_parser, switch_parser}; +use crate::parsers::manpage::groff::{ + GroffLine, strip_groff_escapes, strip_inline_macro_args, strip_space_macro_args, +}; +use crate::parsers::manpage::{ManpageEntry, OwnedParam, OwnedSwitch}; +use crate::types::{Param, Switch}; + +/// collect consecutive text lines, joining them with spaces. +/// returns (collected, remaining). +fn collect_text_lines(lines: &[GroffLine]) -> (String, &[GroffLine]) { + let mut acc: Vec<&str> = Vec::new(); + let mut i = 0; + while i < lines.len() { + match &lines[i] { + GroffLine::Text(t) => acc.push(t), + _ => break, + } + i += 1; + } + (acc.join(" "), &lines[i..]) +} + +fn collect_description_lines(lines: &[GroffLine], start: usize) -> (String, usize) { + let mut acc: Vec = Vec::new(); + let mut i = start; + while i < lines.len() { + match &lines[i] { + GroffLine::Macro { name, .. } + if matches!(name.as_str(), "TP" | "TQ" | "IP" | "PP" | "SH" | "SS") => + { + break; + } + GroffLine::Text(t) => { + acc.push(t.clone()); + i += 1; + } + GroffLine::Macro { name, args } + if matches!( + name.as_str(), + "B" | "BI" | "BR" | "I" | "IR" | "IB" | "RB" | "RI" + ) => + { + let text = tag_of_macro(name, args); + if !text.is_empty() { + acc.push(text); + } + i += 1; + } + GroffLine::Blank | GroffLine::Comment => { + i += 1; + } + GroffLine::Macro { .. } => { + i += 1; + } + } + } + (acc.join(" "), i) +} + +fn to_owned_switch(s: Switch<'_>) -> OwnedSwitch { + match s { + Switch::Short(c) => OwnedSwitch::Short(c), + Switch::Long(l) => OwnedSwitch::Long(l.to_string()), + Switch::Both(c, l) => OwnedSwitch::Both(c, l.to_string()), + } +} + +fn to_owned_param(p: Param<'_>) -> OwnedParam { + match p { + Param::Mandatory(s) => OwnedParam::Mandatory(s.to_string()), + Param::Optional(s) => OwnedParam::Optional(s.to_string()), + } +} + +/// attempt to parse a tag string (e.g. 
"-v, --verbose FILE") into an entry. +/// uses the nom switch_parser + param_parser from the help module. +/// returns None if the tag doesn't look like a flag definition. +pub fn parse_tag_to_entry(tag: &str, desc: String) -> Option { + let tag = strip_groff_escapes(tag); + let tag = tag.trim(); + let result: nom::IResult<&str, (Switch<'_>, Option>)> = + (switch_parser, opt(param_parser)).parse(tag); + match result { + Ok((_, (switch, param))) => Some(ManpageEntry { + switch: to_owned_switch(switch), + param: param.map(to_owned_param), + desc, + }), + Err(_) => None, + } +} + +/// extract tag text from a macro line. +/// .B and .I preserve spaces (single argument); .BI, .BR, .IR alternate +/// fonts and concatenate arguments. +pub fn tag_of_macro(name: &str, args: &str) -> String { + match name { + "B" | "I" => strip_space_macro_args(args), + _ => strip_groff_escapes(&strip_inline_macro_args(args)) + .trim() + .to_string(), + } +} + +// strategy a: .TP style (most common — gnu coreutils, help2man). +// .TP introduces a tagged paragraph: the next line is the "tag" (flag name) +// and subsequent text lines are the description. the tag can be plain text +// or wrapped in a formatting macro (.B, .BI, etc.). +pub fn strategy_tp(lines: &[GroffLine]) -> Vec { + let mut out = Vec::new(); + let mut i = 0; + while i < lines.len() { + let GroffLine::Macro { name, .. 
} = &lines[i] else { + i += 1; + continue; + }; + if name != "TP" { + i += 1; + continue; + } + + let (tags, body_start) = collect_tp_tags(lines, i + 1); + if tags.is_empty() { + i += 1; + continue; + } + let (desc, new_i) = collect_description_lines(lines, body_start); + out.extend(entries_from_tag_alternates(&tags, desc)); + i = new_i; + } + out +} + +fn collect_tp_tags(lines: &[GroffLine], start: usize) -> (Vec, usize) { + let mut tags = Vec::new(); + let mut i = start; + loop { + if i >= lines.len() { + break; + } + let Some(tag) = tag_from_line(&lines[i]) else { + break; + }; + tags.push(tag); + i += 1; + if i < lines.len() && matches!(&lines[i], GroffLine::Macro { name, .. } if name == "TQ") { + i += 1; + continue; + } + break; + } + (tags, i) +} + +fn tag_from_line(line: &GroffLine) -> Option { + match line { + GroffLine::Text(tag) => Some(tag.clone()), + GroffLine::Macro { name, args } + if matches!( + name.as_str(), + "B" | "I" | "BI" | "BR" | "IR" | "IB" | "RB" | "RI" + ) => + { + Some(tag_of_macro(name, args)) + } + _ => None, + } +} + +fn entries_from_tag_alternates(tags: &[String], desc: String) -> Vec { + let entries: Vec = tags + .iter() + .filter_map(|tag| parse_tag_to_entry(tag, desc.clone())) + .collect(); + if entries.len() == 2 + && let Some(combined) = combine_short_long_alternates(&entries[0], &entries[1]) + { + return vec![combined]; + } + entries +} + +fn combine_short_long_alternates( + left: &ManpageEntry, + right: &ManpageEntry, +) -> Option { + match (&left.switch, &right.switch) { + (OwnedSwitch::Long(l), OwnedSwitch::Short(c)) => Some(ManpageEntry { + switch: OwnedSwitch::Both(*c, l.clone()), + param: left.param.clone().or_else(|| right.param.clone()), + desc: left.desc.clone(), + }), + (OwnedSwitch::Short(c), OwnedSwitch::Long(l)) => Some(ManpageEntry { + switch: OwnedSwitch::Both(*c, l.clone()), + param: right.param.clone().or_else(|| left.param.clone()), + desc: left.desc.clone(), + }), + _ => None, + } +} + +// strategy b: .IP 
style (curl, hand-written manpages). +// .IP takes an inline tag argument: .IP "-v, --verbose" +// the description follows as text lines. +make_macro_walker!(pub strategy_ip -> Vec, on macro "IP" => + |lines, i, args| { + let tag = strip_groff_escapes(args); + let (desc, rest) = collect_text_lines(&lines[i + 1..]); + let new_i = lines.len() - rest.len(); + parse_tag_to_entry(&tag, desc).map(|e| (e, new_i)) + } +); + +// strategy c: .PP + .RS/.RE style (git, docbook-generated manpages). +// flag entries are introduced by .PP (paragraph), with the flag name as +// plain text, followed by a .RS (indent) block containing the description, +// closed by .RE (de-indent). +make_macro_walker!(pub strategy_pp_rs -> Vec, on macro "PP" => + |lines, i, _args| { + if i + 1 >= lines.len() { return None; } + if let GroffLine::Text(tag) = &lines[i + 1] { + let (desc, new_i) = collect_pp_rs_desc(lines, i + 2); + parse_tag_to_entry(tag, desc).map(|e| (e, new_i)) + } else { + None + } + } +); + +fn collect_pp_rs_desc(lines: &[GroffLine], start: usize) -> (String, usize) { + let mut acc: Vec = Vec::new(); + let mut i = start; + // outer: look for .RS marker or text + while i < lines.len() { + match &lines[i] { + GroffLine::Macro { name, .. } if name == "RS" => { + i += 1; + // inside .RS — collect until .RE or boundary macro + while i < lines.len() { + match &lines[i] { + GroffLine::Macro { name, .. } if name == "RE" => { + return (acc.join(" "), i + 1); + } + GroffLine::Text(t) => { + acc.push(t.clone()); + i += 1; + } + GroffLine::Macro { name, .. } if name == "PP" || name == "SH" => { + return (acc.join(" "), i); + } + _ => i += 1, + } + } + return (acc.join(" "), i); + } + GroffLine::Text(t) => { + acc.push(t.clone()); + i += 1; + } + _ => return (acc.join(" "), i), + } + } + (acc.join(" "), i) +} + +/// strategy d: deroff fallback — strip all groff markup, then feed the +/// resulting plain text through the help parser. 
+pub fn strategy_deroff(lines: &[GroffLine]) -> Vec { + let mut buffer = String::with_capacity(256); + for line in lines { + match line { + GroffLine::Text(text) => { + buffer.push_str(text); + buffer.push('\n'); + } + GroffLine::Macro { name, args } + if matches!(name.as_str(), "BI" | "BR" | "IR" | "B" | "I") => + { + let text = strip_groff_escapes(&strip_inline_macro_args(args)); + buffer.push_str(&text); + buffer.push('\n'); + } + GroffLine::Blank => buffer.push('\n'), + _ => (), + } + } + match help_parser(&buffer) { + Ok((_, result)) => result + .entries + .into_iter() + .map(|e| ManpageEntry { + switch: to_owned_switch(e.switch), + param: e.param.map(to_owned_param), + desc: e.desc.join(" "), + }) + .collect(), + Err(_) => Vec::new(), + } +} + +fn is_bullet_ip(args: &str) -> bool { + !args.trim().is_empty() +} + +// strategy e: nix3-style bullet .IP with .UR/.UE hyperlinks. +// nix's manpages use .IP with bullet markers for flag entries, interleaved +// with .UR/.UE hyperlink macros. the flag tag is in text lines after the +// bullet .IP, and the description follows a non-bullet .IP marker. +make_macro_walker!(pub strategy_nix -> Vec, on macro "IP" => + |lines, i, args| { + if !is_bullet_ip(args) { return None; } + // collect tag: skip .UR/.UE macros, gather Text lines + let mut tag_idx = i + 1; + let mut tag_parts: Vec = Vec::new(); + while tag_idx < lines.len() { + match &lines[tag_idx] { + GroffLine::Macro { name, .. 
} if name == "UR" || name == "UE" => { + tag_idx += 1; + } + GroffLine::Text(t) => { + tag_parts.push(t.clone()); + tag_idx += 1; + } + _ => break, + } + } + let tag = tag_parts.join(" "); + let (desc, new_i) = collect_nix_desc(lines, tag_idx); + parse_tag_to_entry(&tag, desc).map(|e| (e, new_i)) + } +); + +fn collect_nix_desc(lines: &[GroffLine], start: usize) -> (String, usize) { + if start >= lines.len() { + return (String::new(), start); + } + let mut i = start; + // require non-bullet .IP marker for description + if let GroffLine::Macro { name, args } = &lines[i] + && name == "IP" + && args.trim().is_empty() + { + i += 1; + } else { + return (String::new(), start); + } + let mut parts: Vec = Vec::new(); + while i < lines.len() { + match &lines[i] { + GroffLine::Text(t) => { + parts.push(t.clone()); + i += 1; + } + GroffLine::Macro { name, args } if name == "IP" => { + if !args.trim().is_empty() { + // next bullet entry — stop + return (parts.join(" "), i); + } + // non-bullet .IP = continuation paragraph + i += 1; + } + GroffLine::Macro { name, .. } if name == "SS" || name == "SH" => { + return (parts.join(" "), i); + } + GroffLine::Macro { name, .. } if name == "RS" => { + i = skip_rs(lines, i + 1, 1); + } + GroffLine::Macro { .. } => { + i += 1; + } + GroffLine::Blank | GroffLine::Comment => { + i += 1; + } + } + } + (parts.join(" "), i) +} + +fn skip_rs(lines: &[GroffLine], start: usize, mut depth: usize) -> usize { + let mut i = start; + while i < lines.len() { + if let GroffLine::Macro { name, .. } = &lines[i] { + if name == "RE" { + depth -= 1; + if depth == 0 { + return i + 1; + } + } else if name == "RS" { + depth += 1; + } + } + i += 1; + } + i +} + +/// count occurrences of a specific macro in the section. +fn count_macro(name: &str, lines: &[GroffLine]) -> usize { + lines + .iter() + .filter(|line| matches!(line, GroffLine::Macro { name: n, .. } if n == name)) + .count() +} + +/// auto-detect and try strategies, return the one with most entries. 
+/// first counts macros to determine which strategies are applicable, +/// then runs all applicable ones and picks the winner by entry count. +/// if no specialized strategy produces results, falls back to deroff. +pub fn extract_entries(lines: &[GroffLine]) -> Vec { + let tp = count_macro("TP", lines); + let ip = count_macro("IP", lines); + let pp = count_macro("PP", lines); + let rs = count_macro("RS", lines); + let ur = count_macro("UR", lines); + + let mut specialized: Vec<(&str, Vec)> = Vec::new(); + if tp > 0 { + specialized.push(("TP", strategy_tp(lines))); + } + if ip > 0 { + specialized.push(("IP", strategy_ip(lines))); + } + if pp > 0 && rs > 0 { + specialized.push(("PP+RS", strategy_pp_rs(lines))); + } + if ur > 0 && ip > 0 { + specialized.push(("nix", strategy_nix(lines))); + } + let candidates: Vec<(&str, Vec)> = { + let filtered: Vec<_> = specialized + .into_iter() + .filter(|(_, e)| !e.is_empty()) + .collect(); + if filtered.is_empty() { + vec![("deroff", strategy_deroff(lines))] + } else { + filtered + } + }; + let mut best: Vec = Vec::new(); + for (_, entries) in candidates { + if entries.len() >= best.len() { + best = entries; + } + } + best +} diff --git a/src/parsers/mod.rs b/src/parsers/mod.rs new file mode 100644 index 0000000..1f8090a --- /dev/null +++ b/src/parsers/mod.rs @@ -0,0 +1,3 @@ +pub mod help; +pub mod manpage; +pub mod nushell; diff --git a/src/parsers/nushell.rs b/src/parsers/nushell.rs new file mode 100644 index 0000000..eaf4bcc --- /dev/null +++ b/src/parsers/nushell.rs @@ -0,0 +1,475 @@ +//! generate nushell `extern` definitions from parsed help data. +//! +//! this module is the code generation backend. it takes a [`ManpageResult`] +//! (from the help or manpage parsers) and produces nushell source that defines +//! `extern` declarations — nushell's mechanism for teaching the shell about +//! external commands' flags and subcommands so it can offer completions. +//! +//! key responsibilities: +//! 
- deduplicating flag entries (same flag from multiple help sources) +//! - mapping parameter names to nushell types (path, int, string) +//! - formatting flags in nushell syntax: --flag(-f): type # description +//! - handling positional arguments with nushell's ordering constraints +//! - escaping special characters for nushell string literals + +use std::borrow::Cow; +use std::collections::{HashMap, HashSet}; +use std::sync::OnceLock; + +use crate::parsers::manpage::{ + ManpageEntry, ManpageResult, ManpageSubcommand, OwnedParam, OwnedSwitch, +}; +use crate::types::Positional; + +/// nushell built-in commands and keywords — we must never generate `extern` +/// definitions for these because it would shadow nushell's own implementations. +/// maintained manually and should be updated with new nushell releases. +pub const NUSHELL_BUILTINS: &[&str] = &[ + "alias", + "all", + "ansi", + "any", + "append", + "ast", + "attr", + "bits", + "break", + "bytes", + "cal", + "cd", + "char", + "chunk-by", + "chunks", + "clear", + "collect", + "columns", + "commandline", + "compact", + "complete", + "config", + "const", + "continue", + "cp", + "date", + "debug", + "decode", + "def", + "default", + "describe", + "detect", + "do", + "drop", + "du", + "each", + "echo", + "encode", + "enumerate", + "error", + "every", + "exec", + "exit", + "explain", + "explore", + "export", + "export-env", + "extern", + "fill", + "filter", + "find", + "first", + "flatten", + "for", + "format", + "from", + "generate", + "get", + "glob", + "grid", + "group-by", + "hash", + "headers", + "help", + "hide", + "hide-env", + "histogram", + "history", + "http", + "if", + "ignore", + "input", + "insert", + "inspect", + "interleave", + "into", + "is-admin", + "is-empty", + "is-not-empty", + "is-terminal", + "items", + "job", + "join", + "keybindings", + "kill", + "last", + "length", + "let", + "let-env", + "lines", + "load-env", + "loop", + "ls", + "match", + "math", + "merge", + "metadata", + "mkdir", + 
"mktemp", + "module", + "move", + "mut", + "mv", + "nu-check", + "nu-highlight", + "open", + "overlay", + "panic", + "par-each", + "parse", + "path", + "plugin", + "port", + "prepend", + "print", + "ps", + "query", + "random", + "reduce", + "reject", + "rename", + "return", + "reverse", + "rm", + "roll", + "rotate", + "run-external", + "save", + "schema", + "scope", + "select", + "seq", + "shuffle", + "skip", + "sleep", + "slice", + "sort", + "sort-by", + "source", + "source-env", + "split", + "start", + "stor", + "str", + "sys", + "table", + "take", + "tee", + "term", + "timeit", + "to", + "touch", + "transpose", + "try", + "tutor", + "ulimit", + "umask", + "uname", + "uniq", + "uniq-by", + "unlet", + "update", + "upsert", + "url", + "use", + "values", + "version", + "view", + "watch", + "where", + "which", + "while", + "whoami", + "window", + "with-env", + "wrap", + "zip", +]; + +fn builtin_set() -> &'static HashSet<&'static str> { + static SET: OnceLock> = OnceLock::new(); + SET.get_or_init(|| NUSHELL_BUILTINS.iter().copied().collect()) +} + +/// returns true if the given command name collides with a nushell built-in. +pub fn is_nushell_builtin(cmd: &str) -> bool { + builtin_set().contains(cmd) +} + +/// map parameter names to nushell types. +/// nushell's `extern` declarations use typed parameters, so we infer the type +/// from the parameter name. file/path-related names become "path" (enables +/// path completion), numeric names become "int", everything else is "string". +pub fn nushell_type_of_param(name: &str) -> &'static str { + match name { + "FILE" | "file" | "PATH" | "path" | "DIR" | "dir" | "DIRECTORY" | "FILENAME" + | "PATTERNFILE" => "path", + "NUM" | "N" | "COUNT" | "NUMBER" | "int" | "INT" | "COLS" | "WIDTH" | "LINES" | "DEPTH" + | "depth" => "int", + _ => "string", + } +} + +/// escape a string for use inside nushell double-quoted string literals. +/// only double quotes and backslashes need escaping in nushell's syntax. 
+pub fn escape_nu(s: &str) -> Cow<'_, str> { + if !s.contains('"') && !s.contains('\\') { + Cow::Borrowed(s) + } else { + let mut buf = String::with_capacity(s.len() + 4); + for c in s.chars() { + match c { + '"' => buf.push_str("\\\""), + '\\' => buf.push_str("\\\\"), + c => buf.push(c), + } + } + Cow::Owned(buf) + } +} + +fn entry_key(e: &ManpageEntry) -> String { + match &e.switch { + OwnedSwitch::Short(c) => format!("-{c}"), + OwnedSwitch::Long(l) | OwnedSwitch::Both(_, l) => format!("--{l}"), + } +} + +fn entry_score(e: &ManpageEntry) -> i32 { + let switch_bonus = if matches!(e.switch, OwnedSwitch::Both(_, _)) { + 10 + } else { + 0 + }; + let param_bonus = if e.param.is_some() { 5 } else { 0 }; + let desc_bonus = (e.desc.len() / 10).min(5) as i32; + switch_bonus + param_bonus + desc_bonus +} + +/// deduplicate flag entries that refer to the same flag. +/// +/// when the same flag appears multiple times (e.g. from overlapping manpage +/// sections or repeated help text), we keep the "best" version using a score: +/// - both short+long form present: +10 (most informative) +/// - has a parameter: +5 +/// - description length bonus: up to +5 +/// +/// after deduplication by long name, we also remove standalone short flags +/// whose letter is already covered by a Both(short, long) entry. this prevents +/// emitting both "-v" and "--verbose(-v)" which nushell would reject as a +/// duplicate. the filtering preserves original ordering from the help text. 
+pub fn dedup_entries(entries: &[ManpageEntry]) -> Vec { + let mut best: HashMap = HashMap::new(); + for e in entries { + let key = entry_key(e); + match best.get(&key) { + Some(prev) if entry_score(prev) >= entry_score(e) => {} + _ => { + best.insert(key, e); + } + } + } + let mut covered: HashSet = HashSet::new(); + for e in best.values() { + if let OwnedSwitch::Both(c, _) = &e.switch { + covered.insert(*c); + } + } + let mut seen: HashSet = HashSet::new(); + let mut out: Vec = Vec::new(); + for e in entries { + let key = entry_key(e); + if seen.contains(&key) { + continue; + } + if let OwnedSwitch::Short(c) = &e.switch + && covered.contains(c) + { + continue; + } + seen.insert(key.clone()); + out.push((*best.get(&key).unwrap()).clone()); + } + out +} + +/// format a single flag entry as a nushell `extern` parameter line. +/// output examples: +/// " --verbose(-v) # increase verbosity" +/// " --output(-o): path # write output to file" +/// " -n: int # number of results" +/// +/// the description is right-padded to column 40 with a "# " comment prefix. +pub fn format_flag(entry: &ManpageEntry) -> String { + let name = match &entry.switch { + OwnedSwitch::Both(c, l) => format!("--{l}(-{c})"), + OwnedSwitch::Long(l) => format!("--{l}"), + OwnedSwitch::Short(c) => format!("-{c}"), + }; + let typed = match &entry.param { + Some(OwnedParam::Mandatory(p)) | Some(OwnedParam::Optional(p)) => { + format!(": {}", nushell_type_of_param(p)) + } + None => String::new(), + }; + let flag = format!(" {name}{typed}"); + if entry.desc.is_empty() { + flag + } else { + let pad_len = 40usize.saturating_sub(flag.len()).max(1); + format!("{flag}{}# {}", " ".repeat(pad_len), entry.desc) + } +} + +/// format a positional argument as a nushell `extern` parameter line. +/// nushell syntax: "...name: type" for variadic, "name?: type" for optional. +/// hyphens in names are converted to underscores since nushell identifiers +/// cannot contain hyphens. 
+pub fn format_positional(name: &str, p: &Positional) -> String { + let name_underscored: String = name + .chars() + .map(|c| if c == '-' { '_' } else { c }) + .collect(); + let prefix = if p.variadic { "..." } else { "" }; + let suffix = if p.optional && !p.variadic { "?" } else { "" }; + let typ = nushell_type_of_param(&name.to_ascii_uppercase()); + format!(" {prefix}{name_underscored}{suffix}: {typ}") +} + +/// enforce nushell's positional argument ordering rules: +/// 1. no required positional may follow an optional one +/// 2. at most one variadic ("rest") parameter is allowed +/// +/// if a required positional appears after an optional one, it is silently +/// promoted to optional. duplicate variadic params are dropped. +pub fn fixup_positionals(positionals: Vec<(String, Positional)>) -> Vec<(String, Positional)> { + let mut seen_optional = false; + let mut seen_variadic = false; + let mut out = Vec::with_capacity(positionals.len()); + for (name, mut p) in positionals { + if p.variadic { + if seen_variadic { + continue; + } + seen_variadic = true; + seen_optional = true; + out.push((name, p)); + } else if seen_optional { + p.optional = true; + out.push((name, p)); + } else { + seen_optional = p.optional; + out.push((name, p)); + } + } + out +} + +/// derive a nushell `module` name from a command name. +/// replaces non-alphanumeric characters with hyphens and appends "-completions". +pub fn module_name_of(cmd_name: &str) -> String { + let mut s: String = cmd_name + .chars() + .map(|c| { + if c.is_ascii_alphanumeric() || c == '-' || c == '_' { + c + } else { + '-' + } + }) + .collect(); + s.push_str("-completions"); + s +} + +/// generate the full nushell `extern` block for a command. 
+/// +/// produces output like: +/// export extern "git add" [ +/// ...pathspec?: path +/// --verbose(-v) # be verbose +/// --dry-run(-n) # dry run +/// ] +/// +/// subcommands that weren't resolved into their own full definitions get +/// stub `extern` blocks with just a comment containing their description: +/// export extern "git stash" [ # stash changes +/// ] +pub fn generate_extern(cmd_name: &str, result: &ManpageResult) -> String { + let entries = dedup_entries(&result.entries); + let escaped_name = escape_nu(cmd_name); + let positionals = fixup_positionals(result.positionals.clone()); + + let mut out = String::new(); + out.push_str(&format!("export extern \"{escaped_name}\" [\n")); + for (name, p) in &positionals { + out.push_str(&format_positional(name, p)); + out.push('\n'); + } + for entry in &entries { + out.push_str(&format_flag(entry)); + out.push('\n'); + } + out.push_str("]\n"); + + for sc in &result.subcommands { + out.push_str(&format!( + "\nexport extern \"{} {}\" [ # {}\n]\n", + escaped_name, + escape_nu(&sc.name), + escape_nu(&sc.desc) + )); + } + out +} + +/// generate a complete nushell `module` wrapping the `extern`. +/// output: "module git-completions { ... }\n\nuse git-completions *\n" +/// the `use` at the end makes the `extern` immediately available in scope. +pub fn generate_module(cmd_name: &str, result: &ManpageResult) -> String { + let mod_name = module_name_of(cmd_name); + format!( + "module {mod_name} {{\n{}}}\n\nuse {mod_name} *\n", + generate_extern(cmd_name, result) + ) +} + +/// convenience wrapper: generate an `extern` from just a list of entries. +pub fn generate_extern_from_entries(cmd_name: &str, entries: Vec) -> String { + generate_extern( + cmd_name, + &ManpageResult { + entries, + subcommands: Vec::new(), + positionals: Vec::new(), + description: String::new(), + }, + ) +} + +/// stub subcommand entry used when extracting subcommands from a parsed +/// help result for nushell output. 
+pub fn manpage_subcommand_from(name: &str, desc: &str) -> ManpageSubcommand { + ManpageSubcommand { + name: name.to_string(), + desc: desc.to_string(), + } +} diff --git a/src/pool.rs b/src/pool.rs new file mode 100644 index 0000000..76fee66 --- /dev/null +++ b/src/pool.rs @@ -0,0 +1,233 @@ +//! BFS-queue worker pool for parallel subprocess scraping. +//! +//! workers pull jobs from a shared queue and call a user-supplied +//! handler; the handler gets a `Submitter` to push newly-discovered +//! child jobs back onto the same queue. when the in-flight count +//! reaches zero the pool shuts down and `wait` returns. +//! +//! the queue-back design is deliberate: command-help trees are uneven +//! (one binary has 30 subs, another has 1). queue-back keeps every +//! worker fed; spawn-in-place would leave cores idle on lopsided trees. +//! +//! synchronization: `parking_lot::Condvar` parks workers when the queue is +//! empty. the queue, in-flight count, and close state live under one mutex so +//! the condvar predicate cannot miss a wakeup. +//! parking_lot gives no-poison locks (no `Result` noise on every +//! `lock()`) and a single-syscall fast path in the uncontended case. + +use std::collections::VecDeque; +use std::sync::Arc; +use std::thread::{self, JoinHandle}; + +use parking_lot::{Condvar, Mutex}; + +struct State { + queue: VecDeque, + /// jobs created but not yet completed. counts both queued and + /// in-progress jobs. workers can exit once wait() has closed the pool + /// and this reaches 0. + in_flight: usize, + /// set by wait(), which is also the point where top-level submission is + /// done. workers must not exit on transient empty periods before this. + closed: bool, +} + +/// shared state held behind an `Arc` by every worker and by the +/// submitter handles handed to the per-job handler. 
+struct Inner { + state: Mutex>, + notify: Condvar, +} + +impl Inner { + fn submit(&self, job: J) { + let mut state = self.state.lock(); + state.in_flight += 1; + state.queue.push_back(job); + self.notify.notify_one(); + } + + fn next(&self) -> Option { + let mut state = self.state.lock(); + loop { + if let Some(job) = state.queue.pop_front() { + return Some(job); + } + if state.closed && state.in_flight == 0 { + return None; + } + self.notify.wait(&mut state); + } + } + + fn complete(&self) { + let mut state = self.state.lock(); + state.in_flight -= 1; + if state.closed && state.in_flight == 0 { + // we were the last in-flight job after wait() closed top-level + // submission, so parked workers can wake and exit. + self.notify.notify_all(); + } + } +} + +/// cheap-to-clone handle that lets a job handler enqueue further jobs. +/// passed by reference to the handler closure. +pub struct Submitter { + inner: Arc>, +} + +impl Clone for Submitter { + fn clone(&self) -> Self { + Submitter { + inner: self.inner.clone(), + } + } +} + +impl Submitter { + pub fn submit(&self, job: J) { + self.inner.submit(job); + } +} + +/// BFS-queue worker pool. each worker pulls a job, calls the handler +/// (which may submit further jobs via the passed `Submitter`), then marks +/// the job complete. when in-flight reaches zero the pool shuts down and +/// `wait` returns. +pub struct ScrapePool { + inner: Arc>, + workers: Vec>, +} + +impl ScrapePool { + /// spawn `num_workers` threads that run `handler` on each job pulled + /// from the queue. the handler receives the job by value and a + /// `&Submitter` for enqueuing children. 
+ pub fn new(num_workers: usize, handler: F) -> Self + where + F: Fn(J, &Submitter) + Send + Sync + 'static, + { + let inner = Arc::new(Inner { + state: Mutex::new(State { + queue: VecDeque::new(), + in_flight: 0, + closed: false, + }), + notify: Condvar::new(), + }); + let handler = Arc::new(handler); + let workers = (0..num_workers.max(1)) + .map(|_| { + let inner = inner.clone(); + let handler = handler.clone(); + thread::spawn(move || { + let submitter = Submitter { + inner: inner.clone(), + }; + while let Some(job) = inner.next() { + handler(job, &submitter); + inner.complete(); + } + }) + }) + .collect(); + ScrapePool { inner, workers } + } + + /// submit a top-level job. typically called by the orchestrating + /// thread before `wait`; handlers should use `Submitter::submit`. + pub fn submit(&self, job: J) { + self.inner.submit(job); + } + + /// block until all jobs (initial + transitively discovered) have + /// completed, then join every worker thread. + pub fn wait(self) { + { + let mut state = self.inner.state.lock(); + state.closed = true; + // Wake workers so they can either drain queued work or exit if + // the pool was empty. The close flag is guarded by this same lock, + // so this cannot race with a worker entering the condvar wait. 
+ self.inner.notify.notify_all(); + } + for w in self.workers { + let _ = w.join(); + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use std::sync::atomic::{AtomicUsize, Ordering}; + use std::time::Duration; + + #[test] + fn flat_jobs_processed_once_each() { + let collected: Arc>> = Arc::new(Mutex::new(Vec::new())); + let pool = ScrapePool::new(4, { + let collected = collected.clone(); + move |n: u32, _: &Submitter| { + collected.lock().push(n); + } + }); + for i in 0..100u32 { + pool.submit(i); + } + pool.wait(); + let mut got = collected.lock().clone(); + got.sort(); + assert_eq!(got, (0..100).collect::>()); + } + + #[test] + fn discovered_children_processed_to_completion() { + // BFS expansion: every odd number under 10 spawns its successor. + let collected: Arc>> = Arc::new(Mutex::new(Vec::new())); + let pool = ScrapePool::new(2, { + let collected = collected.clone(); + move |n: u32, sub: &Submitter| { + collected.lock().push(n); + if n < 10 && n % 2 == 1 { + sub.submit(n + 1); + } + } + }); + for i in [1u32, 3, 5, 7, 9] { + pool.submit(i); + } + pool.wait(); + let mut got = collected.lock().clone(); + got.sort(); + assert_eq!(got, vec![1, 2, 3, 4, 5, 6, 7, 8, 9, 10]); + } + + #[test] + fn transient_empty_queue_before_wait_does_not_stop_workers() { + let processed = Arc::new(AtomicUsize::new(0)); + let pool = ScrapePool::new(1, { + let processed = processed.clone(); + move |_: u32, _: &Submitter| { + processed.fetch_add(1, Ordering::SeqCst); + } + }); + + pool.submit(1); + while processed.load(Ordering::SeqCst) == 0 { + thread::yield_now(); + } + thread::sleep(Duration::from_millis(10)); + pool.submit(2); + pool.wait(); + + assert_eq!(processed.load(Ordering::SeqCst), 2); + } + + #[test] + fn wait_with_no_jobs_returns_immediately() { + let pool: ScrapePool<()> = ScrapePool::new(2, |_, _| {}); + pool.wait(); + } +} diff --git a/src/store.rs b/src/store.rs new file mode 100644 index 0000000..fd1a09c --- /dev/null +++ b/src/store.rs @@ -0,0 +1,657 @@ +//! 
filesystem store for parsed completion data. +//! +//! write side: serialize ManpageResult to JSON, derive sanitised +//! filenames from command names ("git add" → git_add.json). +//! +//! read side: look up a command by name across the user cache + system +//! dirs, deserialize JSON or parse a .nu extern blob back into a result. + +use std::collections::HashMap; +use std::fs; +use std::io; +use std::path::{Path, PathBuf}; + +use serde_json::Value; + +use crate::parsers::manpage::{ + ManpageEntry, ManpageResult, ManpageSubcommand, OwnedParam, OwnedSwitch, +}; +use crate::types::Positional; + +/// default cache directory: $XDG_CACHE_HOME/inshellah, falling back to +/// $HOME/.cache/inshellah. +pub fn default_store_path() -> PathBuf { + if let Ok(xdg) = std::env::var("XDG_CACHE_HOME") + && !xdg.is_empty() + { + return PathBuf::from(xdg).join("inshellah"); + } + if let Ok(home) = std::env::var("HOME") { + return PathBuf::from(home).join(".cache/inshellah"); + } + PathBuf::from(".cache/inshellah") +} + +/// create directory and all parents. +pub fn ensure_dir(dir: &Path) -> io::Result<()> { + fs::create_dir_all(dir) +} + +/// derive a safe filename from a command name. +/// spaces in subcommand names ("git add") become "_" ("git_add"). +/// any other non-filesystem-safe characters are also replaced. +pub fn filename_of_command(cmd: &str) -> String { + cmd.chars() + .map(|c| match c { + 'a'..='z' | 'A'..='Z' | '0'..='9' | '-' | '_' | '.' => c, + ' ' => '_', + _ => '_', + }) + .collect() +} + +/// reverse: a filename "git_add" produces command name "git add". +/// underscores are flipped to spaces unconditionally — names that +/// genuinely contained an underscore round-trip as spaces, which is +/// acceptable since the read side is only used for display. 
+pub fn command_of_filename(base: &str) -> String { + base.replace('_', " ") +} + +fn escape_json(s: &str) -> String { + let mut out = String::with_capacity(s.len() + 2); + for c in s.chars() { + match c { + '"' => out.push_str("\\\""), + '\\' => out.push_str("\\\\"), + '\n' => out.push_str("\\n"), + '\r' => out.push_str("\\r"), + '\t' => out.push_str("\\t"), + '\x08' => out.push_str("\\b"), + '\x0c' => out.push_str("\\f"), + c if (c as u32) < 0x20 => { + out.push_str(&format!("\\u{:04x}", c as u32)); + } + c => out.push(c), + } + } + out +} + +fn json_string(s: &str) -> String { + format!("\"{}\"", escape_json(s)) +} + +fn json_switch(s: &OwnedSwitch) -> String { + match s { + OwnedSwitch::Short(c) => { + format!( + r#"{{"type":"short","char":{}}}"#, + json_string(&c.to_string()) + ) + } + OwnedSwitch::Long(l) => { + format!(r#"{{"type":"long","name":{}}}"#, json_string(l)) + } + OwnedSwitch::Both(c, l) => format!( + r#"{{"type":"both","char":{},"name":{}}}"#, + json_string(&c.to_string()), + json_string(l) + ), + } +} + +fn json_param(p: &Option) -> String { + match p { + None => "null".to_string(), + Some(OwnedParam::Mandatory(n)) => { + format!(r#"{{"kind":"mandatory","name":{}}}"#, json_string(n)) + } + Some(OwnedParam::Optional(n)) => { + format!(r#"{{"kind":"optional","name":{}}}"#, json_string(n)) + } + } +} + +fn json_entry(e: &ManpageEntry) -> String { + format!( + r#"{{"switch":{},"param":{},"desc":{}}}"#, + json_switch(&e.switch), + json_param(&e.param), + json_string(&e.desc) + ) +} + +fn json_subcommand(sc: &ManpageSubcommand) -> String { + format!( + r#"{{"name":{},"desc":{}}}"#, + json_string(&sc.name), + json_string(&sc.desc) + ) +} + +fn json_positional(name: &str, p: &Positional) -> String { + format!( + r#"{{"name":{},"optional":{},"variadic":{}}}"#, + json_string(name), + p.optional, + p.variadic + ) +} + +fn json_list String>(items: &[T], f: F) -> String { + let parts: Vec = items.iter().map(f).collect(); + format!("[{}]", parts.join(",")) +} 
+ +/// serialize a ManpageResult to JSON: +/// {"source":..., "description":..., "entries":[...], +/// "subcommands":[...], "positionals":[...]} +pub fn json_of_result(source: &str, result: &ManpageResult) -> String { + let entries = json_list(&result.entries, json_entry); + let subcommands = json_list(&result.subcommands, json_subcommand); + let positionals_parts: Vec = result + .positionals + .iter() + .map(|(name, p)| json_positional(name, p)) + .collect(); + let positionals = format!("[{}]", positionals_parts.join(",")); + format!( + r#"{{"source":{},"description":{},"entries":{},"subcommands":{},"positionals":{}}}"#, + json_string(source), + json_string(&result.description), + entries, + subcommands, + positionals, + ) +} + +pub fn write_file(path: &Path, contents: &str) -> io::Result<()> { + if let Some(parent) = path.parent() { + fs::create_dir_all(parent)?; + } + fs::write(path, contents) +} + +/// write the parsed result for `command` into `dir` as JSON. +pub fn write_result( + dir: &Path, + command: &str, + source: &str, + result: &ManpageResult, +) -> io::Result<()> { + let path = dir.join(format!("{}.json", filename_of_command(command))); + write_file(&path, &json_of_result(source, result)) +} + +/// write a native-nushell completion blob (the binary supplied its own). 
+pub fn write_native(dir: &Path, command: &str, data: &str) -> io::Result<()> { + let path = dir.join(format!("{}.nu", filename_of_command(command))); + write_file(&path, data) +} + +// --- read side --- + +fn read_file(path: &Path) -> Option { + fs::read_to_string(path).ok() +} + +fn read_json_result(path: &Path) -> Option<(String, ManpageResult)> { + let data = read_file(path)?; + let v = serde_json::from_str::(&data).ok()?; + let source = v + .get("source") + .and_then(|x| x.as_str()) + .unwrap_or("json") + .to_string(); + Some((source, result_from_json(&v))) +} + +fn switch_from_json(v: &Value) -> Option { + let t = v.get("type")?.as_str()?; + match t { + "short" => { + let c = v.get("char")?.as_str()?.chars().next()?; + Some(OwnedSwitch::Short(c)) + } + "long" => Some(OwnedSwitch::Long(v.get("name")?.as_str()?.to_string())), + "both" => { + let c = v.get("char")?.as_str()?.chars().next()?; + let n = v.get("name")?.as_str()?.to_string(); + Some(OwnedSwitch::Both(c, n)) + } + _ => None, + } +} + +fn param_from_json(v: &Value) -> Option { + if v.is_null() { + return None; + } + let kind = v.get("kind")?.as_str()?; + let name = v.get("name")?.as_str()?.to_string(); + Some(match kind { + "mandatory" => OwnedParam::Mandatory(name), + "optional" => OwnedParam::Optional(name), + _ => return None, + }) +} + +fn entry_from_json(v: &Value) -> Option { + let switch = switch_from_json(v.get("switch")?)?; + let param = v.get("param").and_then(param_from_json); + let desc = v + .get("desc") + .and_then(|d| d.as_str()) + .unwrap_or("") + .to_string(); + Some(ManpageEntry { + switch, + param, + desc, + }) +} + +fn subcommand_from_json(v: &Value) -> Option { + let name = v.get("name")?.as_str()?.to_string(); + let desc = v + .get("desc") + .and_then(|d| d.as_str()) + .unwrap_or("") + .to_string(); + Some(ManpageSubcommand { name, desc }) +} + +fn positional_from_json(v: &Value) -> Option<(String, Positional)> { + let name = v.get("name")?.as_str()?.to_string(); + let optional = 
v.get("optional").and_then(|x| x.as_bool()).unwrap_or(false); + let variadic = v.get("variadic").and_then(|x| x.as_bool()).unwrap_or(false); + Some((name, Positional { optional, variadic })) +} + +/// deserialize a JSON cache entry into ManpageResult. +pub fn result_from_json(v: &Value) -> ManpageResult { + let description = v + .get("description") + .and_then(|d| d.as_str()) + .unwrap_or("") + .to_string(); + let entries = v + .get("entries") + .and_then(|x| x.as_array()) + .map(|arr| arr.iter().filter_map(entry_from_json).collect()) + .unwrap_or_default(); + let subcommands = v + .get("subcommands") + .and_then(|x| x.as_array()) + .map(|arr| arr.iter().filter_map(subcommand_from_json).collect()) + .unwrap_or_default(); + let positionals = v + .get("positionals") + .and_then(|x| x.as_array()) + .map(|arr| arr.iter().filter_map(positional_from_json).collect()) + .unwrap_or_default(); + ManpageResult { + entries, + subcommands, + positionals, + description, + } +} + +/// parse nushell `export extern` blocks out of a .nu source file. +/// +/// returns the help_result that matches `target_cmd` — its entries, +/// positionals, and any other extern blocks under it (`cmd sub`) are +/// folded into the subcommands list. 
+pub fn parse_nu_completions(target_cmd: &str, contents: &str) -> ManpageResult { + let mut blocks: Vec = Vec::new(); + let mut current_desc = String::new(); + let mut in_block = false; + let mut block = NuBlock::default(); + + for line in contents.split('\n') { + let trimmed = line.trim(); + if !in_block { + if let Some(stripped) = trimmed.strip_prefix("# ") { + current_desc = stripped.trim().to_string(); + } else if trimmed.contains("export extern") + && let Some(cmd) = extract_extern_name(trimmed) + { + in_block = true; + block = NuBlock { + cmd, + description: std::mem::take(&mut current_desc), + ..Default::default() + }; + } else { + current_desc.clear(); + } + } else if trimmed.starts_with(']') { + blocks.push(std::mem::take(&mut block)); + in_block = false; + } else { + let (param_part, desc) = match trimmed.find('#') { + Some(idx) => (trimmed[..idx].trim(), trimmed[idx + 1..].trim()), + None => (trimmed, ""), + }; + parse_nu_param_line_into(param_part, desc, &mut block); + } + } + if in_block { + blocks.push(block); + } + + // find the block matching target_cmd + let Some(matched) = blocks.iter().find(|b| b.cmd == target_cmd) else { + return ManpageResult::default(); + }; + + // collect immediate subcommands from other blocks ("target sub" pattern) + let prefix = format!("{target_cmd} "); + let mut subcommands: Vec = Vec::new(); + for b in &blocks { + if let Some(suffix) = b.cmd.strip_prefix(&prefix) + && !suffix.contains(' ') + && !suffix.is_empty() + { + subcommands.push(ManpageSubcommand { + name: suffix.to_string(), + desc: b.description.clone(), + }); + } + } + + ManpageResult { + entries: matched.entries.clone(), + subcommands, + positionals: matched.positionals.clone(), + description: matched.description.clone(), + } +} + +fn extract_extern_name(line: &str) -> Option { + let idx = line.find("export extern")?; + let after = line[idx + "export extern".len()..].trim_start(); + if let Some(rest) = after.strip_prefix('"') { + let end = rest.find('"')?; + 
Some(rest[..end].to_string()) + } else { + let end = after + .find(|c: char| !(c.is_ascii_alphanumeric() || c == '_' || c == '-')) + .unwrap_or(after.len()); + if end == 0 { + None + } else { + Some(after[..end].to_string()) + } + } +} + +fn parse_nu_param_line_into(param_part: &str, desc: &str, block: &mut NuBlock) { + if param_part.len() < 2 { + return; + } + if let Some(after) = param_part.strip_prefix("--") { + // long flag: --name(-c): type or --name: type or --name + let (name, rest) = split_at_non_name_char(after); + if name.is_empty() { + return; + } + let mut short: Option = None; + let mut rest = rest; + if let Some(after_open) = rest.strip_prefix("(-") + && let Some(c) = after_open.chars().next() + && after_open[c.len_utf8()..].starts_with(')') + { + short = Some(c); + rest = &after_open[c.len_utf8() + 1..]; + } + let param = parse_type_suffix(rest); + let switch = match short { + Some(c) => OwnedSwitch::Both(c, name.to_string()), + None => OwnedSwitch::Long(name.to_string()), + }; + block.entries.push(ManpageEntry { + switch, + param, + desc: desc.to_string(), + }); + } else if param_part.starts_with('-') { + // short flag: -c + if let Some(c) = param_part.chars().nth(1) + && c.is_ascii_alphanumeric() + { + block.entries.push(ManpageEntry { + switch: OwnedSwitch::Short(c), + param: None, + desc: desc.to_string(), + }); + } + } else { + // positional: name: type or name?: type or ...name: type + let variadic = param_part.starts_with("..."); + let after_prefix = if variadic { + ¶m_part[3..] 
+ } else { + param_part + }; + let optional = after_prefix.contains('?'); + let name_end = after_prefix.find([':', '?']).unwrap_or(after_prefix.len()); + let name = after_prefix[..name_end].trim(); + let name: String = name + .chars() + .map(|c| if c == '-' { '_' } else { c }) + .collect(); + if !name.is_empty() && !name.starts_with('-') { + let duplicate = block + .positionals + .iter() + .any(|(existing, _)| existing.eq_ignore_ascii_case(&name)); + if !duplicate { + block.positionals.push(( + name, + Positional { + optional: optional || variadic, + variadic, + }, + )); + } + } + } +} + +fn split_at_non_name_char(s: &str) -> (&str, &str) { + let end = s + .find(|c: char| !(c.is_ascii_alphanumeric() || c == '-')) + .unwrap_or(s.len()); + (&s[..end], &s[end..]) +} + +/// parse a `: type` suffix into an OwnedParam (always Mandatory since the +/// nushell extern syntax doesn't distinguish optional-with-default). +fn parse_type_suffix(s: &str) -> Option { + let s = s.trim_start(); + let s = s.strip_prefix(':')?; + let s = s.trim_start(); + let end = s + .find(|c: char| !c.is_ascii_alphabetic()) + .unwrap_or(s.len()); + if end == 0 { + None + } else { + Some(OwnedParam::Mandatory(s[..end].to_string())) + } +} + +#[derive(Default)] +struct NuBlock { + cmd: String, + entries: Vec, + positionals: Vec<(String, Positional)>, + description: String, +} + +/// look up a command's parsed result. source priority is native nushell, +/// then manpage JSON, then help JSON. parent .nu files are searched for +/// subcommand lookups because clap-generated .nu files contain all extern +/// blocks in a single file. 
+pub fn lookup(dirs: &[PathBuf], command: &str) -> Option { + let base_name = filename_of_command(command); + let parent_base = command + .find(' ') + .map(|i| filename_of_command(&command[..i])); + + for directory in dirs { + let nu_path = directory.join(format!("{base_name}.nu")); + if let Some(data) = read_file(&nu_path) { + return Some(parse_nu_completions(command, &data)); + } + if let Some(pb) = &parent_base { + let parent_nu = directory.join(format!("{pb}.nu")); + if let Some(data) = read_file(&parent_nu) { + let r = parse_nu_completions(command, &data); + if !r.entries.is_empty() || !r.subcommands.is_empty() || !r.positionals.is_empty() { + return Some(r); + } + } + } + } + + for directory in dirs { + let json_path = directory.join(format!("{base_name}.json")); + if let Some((source, result)) = read_json_result(&json_path) + && source != "help" + { + return Some(result); + } + } + + for directory in dirs { + let json_path = directory.join(format!("{base_name}.json")); + if let Some((_, result)) = read_json_result(&json_path) { + return Some(result); + } + } + None +} + +/// look up a command's raw stored data (JSON or .nu source). +pub fn lookup_raw(dirs: &[PathBuf], command: &str) -> Option { + let base_name = filename_of_command(command); + for directory in dirs { + let nu_path = directory.join(format!("{base_name}.nu")); + if let Some(data) = read_file(&nu_path) { + return Some(data); + } + } + for directory in dirs { + let json_path = directory.join(format!("{base_name}.json")); + if let Some(data) = read_file(&json_path) { + return Some(data); + } + } + None +} + +fn chop_extension(filename: &str) -> Option<&str> { + filename + .strip_suffix(".json") + .or_else(|| filename.strip_suffix(".nu")) +} + +/// list all indexed commands across all store directories. +/// returns a sorted, deduplicated list of command names. 
+pub fn all_commands(dirs: &[PathBuf]) -> Vec { + let mut out: std::collections::BTreeSet = std::collections::BTreeSet::new(); + for directory in dirs { + let Ok(entries) = fs::read_dir(directory) else { + continue; + }; + for entry in entries.flatten() { + if let Some(name) = entry.file_name().to_str() + && let Some(base) = chop_extension(name) + { + out.insert(command_of_filename(base)); + } + } + } + out.into_iter().collect() +} + +/// discover subcommands of a command by scanning filenames in the store +/// (e.g. for "git", finds "git_add.json", "git_log.json"). +pub fn subcommands_of(dirs: &[PathBuf], command: &str) -> Vec { + let prefix = format!("{}_", filename_of_command(command)); + let mut seen: HashMap = HashMap::new(); + for directory in dirs { + let Ok(entries) = fs::read_dir(directory) else { + continue; + }; + for entry in entries.flatten() { + let Some(filename) = entry.file_name().to_str().map(|s| s.to_string()) else { + continue; + }; + if !filename.starts_with(&prefix) { + continue; + } + let is_json = filename.ends_with(".json"); + let Some(base) = chop_extension(&filename) else { + continue; + }; + let rest = &base[prefix.len()..]; + if rest.is_empty() || rest.contains('_') { + continue; + } + if seen.contains_key(rest) { + continue; + } + let desc = if is_json { + read_file(&entry.path()) + .and_then(|d| serde_json::from_str::(&d).ok()) + .and_then(|v| { + v.get("description") + .and_then(|x| x.as_str()) + .map(|s| s.to_string()) + }) + .unwrap_or_default() + } else { + String::new() + }; + seen.insert( + rest.to_string(), + ManpageSubcommand { + name: rest.to_string(), + desc, + }, + ); + } + } + let mut out: Vec = seen.into_values().collect(); + out.sort_by(|a, b| a.name.cmp(&b.name)); + out +} + +/// determine how a command was indexed: "help", "manpage", "native", etc. +/// for JSON files, returns the "source" field. for .nu files, returns "native". 
+pub fn file_type_of(dirs: &[PathBuf], command: &str) -> Option { + let base = filename_of_command(command); + for directory in dirs { + let nu_path = directory.join(format!("{base}.nu")); + if nu_path.exists() { + return Some("native".to_string()); + } + } + for directory in dirs { + let json_path = directory.join(format!("{base}.json")); + if json_path.exists() { + return Some( + read_file(&json_path) + .and_then(|d| serde_json::from_str::(&d).ok()) + .and_then(|v| v.get("source").and_then(|x| x.as_str()).map(String::from)) + .unwrap_or_else(|| "json".to_string()), + ); + } + } + None +} diff --git a/src/types.rs b/src/types.rs new file mode 100644 index 0000000..ac6b01d --- /dev/null +++ b/src/types.rs @@ -0,0 +1,34 @@ +pub enum Switch<'a> { + Short(char), + Long(&'a str), + Both(char, &'a str), +} + +pub enum Param<'a> { + Mandatory(&'a str), + Optional(&'a str), +} + +pub struct OptionEntry<'a> { + pub switch: Switch<'a>, + pub param: Option>, + pub desc: Vec<&'a str>, +} + +pub struct Subcommand<'a> { + pub name: &'a str, + pub desc: &'a str, +} + +#[derive(Debug, Clone)] +pub struct Positional { + pub optional: bool, + pub variadic: bool, +} + +pub struct HelpResult<'a> { + pub entries: Vec>, + pub subcommands: Vec>, + pub positionals: Vec<(&'a str, Positional)>, + pub desc: &'a str, +} diff --git a/test/dune b/test/dune deleted file mode 100644 index d54a2fb..0000000 --- a/test/dune +++ /dev/null @@ -1,3 +0,0 @@ -(test - (name test_inshellah) - (libraries inshellah str)) diff --git a/test/test_inshellah.ml b/test/test_inshellah.ml deleted file mode 100644 index 8f7b25e..0000000 --- a/test/test_inshellah.ml +++ /dev/null @@ -1,610 +0,0 @@ -open Inshellah.Parser -open Inshellah.Manpage -open Inshellah.Nushell - -let failures = ref 0 -let passes = ref 0 - -let check name condition = - if condition then begin - incr passes; - Printf.printf " PASS: %s\n" name - end else begin - incr failures; - Printf.printf " FAIL: %s\n" name - end - -let parse txt = - match 
parse_help txt with - | Ok r -> r - | Error msg -> failwith (Printf.sprintf "parse_help failed: %s" msg) - -(* --- Help parser tests --- *) - -let test_gnu_basic () = - Printf.printf "\n== GNU basic flags ==\n"; - let r = parse " -a, --all do not ignore entries starting with .\n" in - check "one entry" (List.length r.entries = 1); - let e = List.hd r.entries in - check "both switch" (e.switch = Both ('a', "all")); - check "no param" (e.param = None); - check "desc" (String.length e.desc > 0) - -let test_gnu_eq_param () = - Printf.printf "\n== GNU = param ==\n"; - let r = parse " --block-size=SIZE scale sizes by SIZE\n" in - check "one entry" (List.length r.entries = 1); - let e = List.hd r.entries in - check "long switch" (e.switch = Long "block-size"); - check "mandatory param" (e.param = Some (Mandatory "SIZE")) - -let test_gnu_opt_param () = - Printf.printf "\n== GNU optional param ==\n"; - let r = parse " --color[=WHEN] color the output WHEN\n" in - check "one entry" (List.length r.entries = 1); - let e = List.hd r.entries in - check "long switch" (e.switch = Long "color"); - check "optional param" (e.param = Some (Optional "WHEN")) - -let test_underscore_param () = - Printf.printf "\n== Underscore in param (TIME_STYLE) ==\n"; - let r = parse " --time-style=TIME_STYLE time/date format\n" in - check "one entry" (List.length r.entries = 1); - let e = List.hd r.entries in - check "param with underscore" (e.param = Some (Mandatory "TIME_STYLE")) - -let test_short_only () = - Printf.printf "\n== Short-only flag ==\n"; - let r = parse " -v verbose output\n" in - check "one entry" (List.length r.entries = 1); - check "short switch" ((List.hd r.entries).switch = Short 'v') - -let test_long_only () = - Printf.printf "\n== Long-only flag ==\n"; - let r = parse " --help display help\n" in - check "one entry" (List.length r.entries = 1); - check "long switch" ((List.hd r.entries).switch = Long "help") - -let test_multiline_desc () = - Printf.printf "\n== Multi-line 
description ==\n"; - let r = parse {| --block-size=SIZE with -l, scale sizes by SIZE when printing them; - e.g., '--block-size=M'; see SIZE format below -|} in - check "one entry" (List.length r.entries = 1); - let e = List.hd r.entries in - check "desc includes continuation" (String.length e.desc > 50) - -let test_multiple_entries () = - Printf.printf "\n== Multiple entries ==\n"; - let r = parse {| -a, --all do not ignore entries starting with . - -A, --almost-all do not list implied . and .. - --author with -l, print the author of each file -|} in - check "three entries" (List.length r.entries = 3) - -let test_clap_short_sections () = - Printf.printf "\n== Clap short with section headers ==\n"; - let r = parse {|INPUT OPTIONS: - -e, --regexp=PATTERN A pattern to search for. - -f, --file=PATTERNFILE Search for patterns from the given file. -SEARCH OPTIONS: - -s, --case-sensitive Search case sensitively. -|} in - check "three entries" (List.length r.entries = 3); - let e = List.hd r.entries in - check "first is regexp" (e.switch = Both ('e', "regexp")); - check "first has param" (e.param = Some (Mandatory "PATTERN")) - -let test_clap_long_style () = - Printf.printf "\n== Clap long style (desc below flag) ==\n"; - let r = parse {| -H, --hidden - Include hidden directories and files. - - --no-ignore - Do not respect ignore files. -|} in - check "two entries" (List.length r.entries = 2); - let e = List.hd r.entries in - check "hidden switch" (e.switch = Both ('H', "hidden")); - check "desc below" (String.length e.desc > 0) - -let test_clap_long_angle_param () = - Printf.printf "\n== Clap long angle bracket param ==\n"; - let r = parse {| --nonprintable-notation - Set notation for non-printable characters. 
-|} in - check "one entry" (List.length r.entries = 1); - let e = List.hd r.entries in - check "long switch" (e.switch = Long "nonprintable-notation"); - check "angle param" (e.param = Some (Mandatory "notation")) - -let test_space_upper_param () = - Printf.printf "\n== Space-separated ALL_CAPS param ==\n"; - let r = parse " -f, --foo FOO foo help\n" in - check "one entry" (List.length r.entries = 1); - let e = List.hd r.entries in - check "switch" (e.switch = Both ('f', "foo")); - check "space param" (e.param = Some (Mandatory "FOO")) - -let test_go_cobra_flags () = - Printf.printf "\n== Go/Cobra flags ==\n"; - let r = parse {|Flags: - -D, --debug Enable debug mode - -H, --host string Daemon socket to connect to - -v, --version Print version information -|} in - check "three flag entries" (List.length r.entries = 3); - (* Check the host flag has a type param *) - let host = List.nth r.entries 1 in - check "host switch" (host.switch = Both ('H', "host")); - check "host type param" (host.param = Some (Mandatory "string")) - -let test_go_cobra_subcommands () = - Printf.printf "\n== Go/Cobra subcommands ==\n"; - let r = parse {|Common Commands: - run Create and run a new container from an image - exec Execute a command in a running container - build Build an image from a Dockerfile -|} in - check "has subcommands" (List.length r.subcommands > 0) - -let test_busybox_tab () = - Printf.printf "\n== Busybox tab-indented ==\n"; - let r = parse "\t-1\tOne column output\n\t-a\tInclude names starting with .\n" in - check "two entries" (List.length r.entries = 2); - check "first is -1" ((List.hd r.entries).switch = Short '1') - -let test_no_debug_prints () = - Printf.printf "\n== No debug side effects ==\n"; - (* The old parser had print_endline at module load time. - If we got here without "opt param is running" on stdout, we're good. 
*) - check "no debug prints" true - -(* --- Manpage parser tests --- *) - -let test_manpage_tp_style () = - Printf.printf "\n== Manpage .TP style ==\n"; - let groff = {|.SH OPTIONS -.TP -\fB\-a\fR, \fB\-\-all\fR -do not ignore entries starting with . -.TP -\fB\-A\fR, \fB\-\-almost\-all\fR -do not list implied . and .. -.TP -\fB\-\-block\-size\fR=\fISIZE\fR -with \fB\-l\fR, scale sizes by SIZE -.SH AUTHOR -Written by someone. -|} in - let result = parse_manpage_string groff in - check "three entries" (List.length result.entries = 3); - if List.length result.entries >= 1 then begin - let e = List.hd result.entries in - check "first is -a/--all" (e.switch = Both ('a', "all")); - check "first desc" (String.length e.desc > 0) - end; - if List.length result.entries >= 3 then begin - let e = List.nth result.entries 2 in - check "block-size switch" (e.switch = Long "block-size"); - check "block-size param" (e.param = Some (Mandatory "SIZE")) - end - -let test_manpage_ip_style () = - Printf.printf "\n== Manpage .IP style ==\n"; - let groff = {|.SH OPTIONS -.IP "\fB\-k\fR, \fB\-\-insecure\fR" -Allow insecure connections. -.IP "\fB\-o\fR, \fB\-\-output\fR \fIfile\fR" -Write output to file. 
-.SH SEE ALSO -|} in - let result = parse_manpage_string groff in - check "two entries" (List.length result.entries = 2); - if List.length result.entries >= 1 then begin - let e = List.hd result.entries in - check "first is -k/--insecure" (e.switch = Both ('k', "insecure")) - end - -let test_manpage_groff_stripping () = - Printf.printf "\n== Groff escape stripping ==\n"; - let s = strip_groff_escapes {|\fB\-\-color\fR[=\fIWHEN\fR]|} in - check "font escapes removed" (not (String.contains s 'f' && String.contains s 'B')); - check "dashes converted" (String.contains s '-'); - let s2 = strip_groff_escapes {|\(aqhello\(aq|} in - check "aq -> quote" (String.contains s2 '\'') - -let test_manpage_empty_options () = - Printf.printf "\n== Manpage with no OPTIONS section ==\n"; - let groff = {|.SH NAME -foo \- does stuff -.SH DESCRIPTION -Does stuff. -|} in - let result = parse_manpage_string groff in - check "no entries" (List.length result.entries = 0) - -let test_slash_switch_separator () = - Printf.printf "\n== Slash switch separator (--long / -s) ==\n"; - let r = parse " --verbose / -v Increase verbosity\n" in - check "one entry" (List.length r.entries = 1); - let e = List.hd r.entries in - check "both switch" (e.switch = Both ('v', "verbose")); - check "no param" (e.param = None); - check "desc" (e.desc = "Increase verbosity") - -let test_manpage_nix3_style () = - Printf.printf "\n== Manpage nix3 style ==\n"; - let groff = {|.SH Options -.SS Logging-related options -.IP "\(bu" 3 -.UR #opt-verbose -\f(CR--verbose\fR -.UE -/ \f(CR-v\fR -.IP -Increase the logging verbosity level. -.IP "\(bu" 3 -.UR #opt-quiet -\f(CR--quiet\fR -.UE -.IP -Decrease the logging verbosity level. 
-.SH SEE ALSO -|} in - let result = parse_manpage_string groff in - check "two entries" (List.length result.entries = 2); - if List.length result.entries >= 1 then begin - let e = List.hd result.entries in - check "verbose is Both" (e.switch = Both ('v', "verbose")); - check "verbose desc" (String.length e.desc > 0) - end; - if List.length result.entries >= 2 then begin - let e = List.nth result.entries 1 in - check "quiet is Long" (e.switch = Long "quiet"); - check "quiet desc" (String.length e.desc > 0) - end - -let test_manpage_nix3_with_params () = - Printf.printf "\n== Manpage nix3 with params ==\n"; - let groff = {|.SH Options -.IP "\(bu" 3 -.UR #opt-arg -\f(CR--arg\fR -.UE -\fIname\fR \fIexpr\fR -.IP -Pass the value as the argument name to Nix functions. -.IP "\(bu" 3 -.UR #opt-include -\f(CR--include\fR -.UE -/ \f(CR-I\fR \fIpath\fR -.IP -Add path to search path entries. -.IP -This option may be given multiple times. -.SH SEE ALSO -|} in - let result = parse_manpage_string groff in - check "two entries" (List.length result.entries = 2); - if List.length result.entries >= 1 then begin - let e = List.hd result.entries in - check "arg is Long" (e.switch = Long "arg"); - check "arg has param" (e.param <> None) - end; - if List.length result.entries >= 2 then begin - let e = List.nth result.entries 1 in - check "include is Both" (e.switch = Both ('I', "include")); - check "include has path param" (e.param = Some (Mandatory "path")) - end - -let test_synopsis_subcommand () = - Printf.printf "\n== SYNOPSIS subcommand detection ==\n"; - let groff = {|.SH "SYNOPSIS" -.sp -.nf -\fBgit\fR \fBcommit\fR [\fB\-a\fR | \fB\-\-interactive\fR] -.fi -.SH "DESCRIPTION" -|} in - let cmd = extract_synopsis_command groff in - check "detected git commit" (cmd = Some "git commit") - -let test_synopsis_standalone () = - Printf.printf "\n== SYNOPSIS standalone command ==\n"; - let groff = {|.SH Synopsis -.LP -\f(CRnix-build\fR [\fIpaths\fR] -.SH Description -|} in - let cmd = 
extract_synopsis_command groff in - check "detected nix-build" (cmd = Some "nix-build") - -let test_synopsis_nix3 () = - Printf.printf "\n== SYNOPSIS nix3 subcommand ==\n"; - let groff = {|.SH Synopsis -.LP -\f(CRnix run\fR [\fIoption\fR] \fIinstallable\fR -.SH Description -|} in - let cmd = extract_synopsis_command groff in - check "detected nix run" (cmd = Some "nix run") - -(* --- Nushell generation tests --- *) - -let contains s sub = - try - let _ = Str.search_forward (Str.regexp_string sub) s 0 in true - with Not_found -> false - -let test_nushell_basic () = - Printf.printf "\n== Nushell basic extern ==\n"; - let r = parse " -a, --all do not ignore entries starting with .\n" in - let nu = generate_extern "ls" r in - check "has extern" (contains nu "export extern \"ls\""); - check "has --all(-a)" (contains nu "--all(-a)"); - check "has comment" (contains nu "# do not ignore") - -let test_nushell_param_types () = - Printf.printf "\n== Nushell param type mapping ==\n"; - let r = parse {| -w, --width=COLS set output width - --block-size=SIZE scale sizes - -o, --output FILE output file -|} in - let nu = generate_extern "ls" r in - check "COLS -> int" (contains nu "--width(-w): int"); - check "SIZE -> string" (contains nu "--block-size: string"); - check "FILE -> path" (contains nu "--output(-o): path") - -let test_nushell_subcommands () = - Printf.printf "\n== Nushell subcommands ==\n"; - let r = parse {|Common Commands: - run Create and run a new container - exec Execute a command - -Flags: - -D, --debug Enable debug mode -|} in - let nu = generate_extern "docker" r in - check "has main extern" (contains nu "export extern \"docker\""); - check "has --debug" (contains nu "--debug(-D)"); - check "has run subcommand" (contains nu "export extern \"docker run\""); - check "has exec subcommand" (contains nu "export extern \"docker exec\"") - -let test_nushell_from_manpage () = - Printf.printf "\n== Nushell from manpage ==\n"; - let groff = {|.SH OPTIONS -.TP 
-\fB\-a\fR, \fB\-\-all\fR -do not ignore entries starting with . -.TP -\fB\-\-block\-size\fR=\fISIZE\fR -scale sizes by SIZE -.SH AUTHOR -|} in - let result = parse_manpage_string groff in - let nu = generate_extern "ls" result in - check "has extern" (contains nu "export extern \"ls\""); - check "has --all(-a)" (contains nu "--all(-a)"); - check "has --block-size" (contains nu "--block-size: string") - -let test_nushell_module () = - Printf.printf "\n== Nushell module wrapper ==\n"; - let r = parse " -v, --verbose verbose output\n" in - let nu = generate_module "myapp" r in - check "has module" (contains nu "module myapp-completions"); - check "has extern inside" (contains nu "export extern \"myapp\""); - check "has flag" (contains nu "--verbose(-v)") - -let test_dedup_entries () = - Printf.printf "\n== Deduplication ==\n"; - let r = parse {| -v, --verbose verbose output - --verbose verbose mode - -v be verbose -|} in - let nu = generate_extern "test" r in - (* Count occurrences of --verbose *) - let count = - let re = Str.regexp_string "--verbose" in - let n = ref 0 in - let i = ref 0 in - (try while true do - let _ = Str.search_forward re nu !i in - incr n; i := Str.match_end () - done with Not_found -> ()); - !n - in - check "verbose appears once" (count = 1); - check "best version kept (Both)" (contains nu "--verbose(-v)") - -let test_dedup_manpage () = - Printf.printf "\n== Dedup from manpage ==\n"; - let groff = {|.SH OPTIONS -.TP -\fB\-v\fR, \fB\-\-verbose\fR -Be verbose. -.SH DESCRIPTION -Use \fB\-v\fR for verbose output. -Use \fB\-\-verbose\fR to see more. 
-|} in - let result = parse_manpage_string groff in - let nu = generate_extern "test" result in - check "has --verbose(-v)" (contains nu "--verbose(-v)"); - (* Should not have standalone -v or duplicate --verbose *) - let lines = String.split_on_char '\n' nu in - let verbose_lines = List.filter (fun l -> contains l "verbose") lines in - check "only one verbose line" (List.length verbose_lines = 1) - -let test_commands_section_subcommands () = - Printf.printf "\n== COMMANDS section subcommand extraction ==\n"; - (* manpages like systemctl have a COMMANDS section with bold command names - * inside .PP + .RS/.RE blocks. these should be extracted as subcommands - * and treated as leaf nodes (no entries of their own). *) - let groff = {|.SH OPTIONS -.TP -\fB\-\-user\fR -Talk to the service manager of the calling user. -.TP -\fB\-\-system\fR -Talk to the service manager of the system. -.SH COMMANDS -.PP -\fBstart\fR \fIUNIT\fR\&... -.RS 4 -Start (activate) one or more units. -.RE -.PP -\fBstop\fR \fIUNIT\fR\&... -.RS 4 -Stop (deactivate) one or more units. -.RE -.PP -\fBreload\fR \fIUNIT\fR\&... -.RS 4 -Asks all units to reload their configuration. 
-.RE -.SH SEE ALSO -|} in - let result = parse_manpage_string groff in - check "has options entries" (List.length result.entries = 2); - check "has subcommands" (List.length result.subcommands = 3); - let sc_names = List.map (fun (sc : subcommand) -> sc.name) result.subcommands in - check "has start" (List.mem "start" sc_names); - check "has stop" (List.mem "stop" sc_names); - check "has reload" (List.mem "reload" sc_names); - (* verify subcommand descriptions are extracted *) - let start_sc = List.find (fun (sc : subcommand) -> sc.name = "start") result.subcommands in - check "start has desc" (String.length start_sc.desc > 0) - -let test_self_listing_detection () = - Printf.printf "\n== Self-listing subcommand detection ==\n"; - (* when a subcommand's --help shows the parent's help text, - * the subcommand name appears in its own subcommand list. - * the parser should detect this — tested via parse_help. *) - let help_text = {|systemctl [OPTIONS...] COMMAND ... - -Unit Commands: - start UNIT... Start (activate) one or more units - stop UNIT... Stop (deactivate) one or more units - status [PATTERN...] Show runtime status - -Options: - --user Talk to the user service manager - --system Talk to the system service manager -|} in - let r = parse help_text in - let has_start = List.exists (fun (sc : subcommand) -> sc.name = "start") r.subcommands in - check "detected start as subcommand" has_start; - (* the self-listing logic (in main.ml) would check: is "start" in r.subcommands? - * here we just verify the parser extracts it correctly. 
*) - check "has entries too" (List.length r.entries >= 2) - -let test_nu_file_parsing () = - Printf.printf "\n== .nu file parsing ==\n"; - let nu_source = {|module completions { - - # Unofficial CLI tool - export extern mytool [ - --help(-h) # Print help - --version(-V) # Print version - ] - - # List all items - export extern "mytool list" [ - --raw # Output as JSON - --format(-f): string # Output format - --help(-h) # Print help - name?: string # Filter by name - ] - -} - -use completions * -|} in - let r = Inshellah.Store.parse_nu_completions "mytool" nu_source in - check "has entries" (List.length r.entries = 2); - check "has subcommands" (List.length r.subcommands >= 1); - let list_sc = List.find_opt (fun (sc : subcommand) -> sc.name = "list") r.subcommands in - check "has list subcommand" (list_sc <> None); - check "description" (r.description = "Unofficial CLI tool"); - (* test subcommand lookup *) - let r2 = Inshellah.Store.parse_nu_completions "mytool list" nu_source in - check "list has entries" (List.length r2.entries = 3); - let has_format = List.exists (fun (e : entry) -> - e.switch = Both ('f', "format")) r2.entries in - check "list has --format(-f)" has_format; - check "list has positional" (List.length r2.positionals >= 1) - -let test_italic_synopsis () = - Printf.printf "\n== Italic in SYNOPSIS ==\n"; - let groff = {|.SH Synopsis -.LP -\f(CRnix-env\fR \fIoperation\fR [\fIoptions\fR] [\fIarguments…\fR] -.SH Description -|} in - let cmd = extract_synopsis_command groff in - check "no phantom operation" (cmd = Some "nix-env") - -let test_font_boundary_spacing () = - Printf.printf "\n== Font boundary spacing ==\n"; - (* \fB--max-results\fR\fIcount\fR should become "--max-results count" *) - let s = strip_groff_escapes {|\fB\-\-max\-results\fR\fIcount\fR|} in - check "has space before param" (contains s "--max-results count"); - (* \fB--color\fR[=\fIWHEN\fR] should NOT insert space before = *) - let s2 = strip_groff_escapes 
{|\fB\-\-color\fR[=\fIWHEN\fR]|} in - check "no space before =" (contains s2 "--color[=WHEN]") - -let () = - Printf.printf "Running help parser tests...\n"; - test_gnu_basic (); - test_gnu_eq_param (); - test_gnu_opt_param (); - test_underscore_param (); - test_short_only (); - test_long_only (); - test_multiline_desc (); - test_multiple_entries (); - test_clap_short_sections (); - test_clap_long_style (); - test_clap_long_angle_param (); - test_space_upper_param (); - test_go_cobra_flags (); - test_go_cobra_subcommands (); - test_busybox_tab (); - test_no_debug_prints (); - - Printf.printf "\nRunning manpage parser tests...\n"; - test_manpage_tp_style (); - test_manpage_ip_style (); - test_manpage_groff_stripping (); - test_manpage_empty_options (); - test_slash_switch_separator (); - test_manpage_nix3_style (); - test_manpage_nix3_with_params (); - test_synopsis_subcommand (); - test_synopsis_standalone (); - test_synopsis_nix3 (); - - Printf.printf "\nRunning nushell generation tests...\n"; - test_nushell_basic (); - test_nushell_param_types (); - test_nushell_subcommands (); - test_nushell_from_manpage (); - test_nushell_module (); - - Printf.printf "\nRunning dedup and font tests...\n"; - test_dedup_entries (); - test_dedup_manpage (); - test_font_boundary_spacing (); - - Printf.printf "\nRunning COMMANDS section tests...\n"; - test_commands_section_subcommands (); - test_self_listing_detection (); - - Printf.printf "\nRunning .nu and synopsis tests...\n"; - test_nu_file_parsing (); - test_italic_synopsis (); - - Printf.printf "\n=== Results: %d passed, %d failed ===\n" !passes !failures; - if !failures > 0 then exit 1 diff --git a/tests/git_clone_fix.rs b/tests/git_clone_fix.rs new file mode 100644 index 0000000..c12f0e1 --- /dev/null +++ b/tests/git_clone_fix.rs @@ -0,0 +1,78 @@ +use inshellah::parsers::help::help_parser; + +#[test] +fn parser_recovers_past_no_bracket_long_form() { + // git clone -h produces lines like `--[no-]progress` that switch_parser + 
// can't parse. previously the help parser got stuck on these because + // skip_non_option_line refused to skip option-looking lines. now it falls + // through to skip, letting the parser continue to the next real entry. + let text = r#"usage: git clone [] [--] [

] + + -v, --[no-]verbose be more verbose + -q, --[no-]quiet be more quiet + --[no-]progress force progress reporting + --[no-]reject-shallow don't clone shallow repository + -n, --no-checkout don't create a checkout + --checkout opposite of --no-checkout + -s, --[no-]shared setup as shared repository +"#; + let (_, r) = help_parser(text).expect("parse"); + // before the fix: only 2 entries (-v, -q) before the parser got stuck. + // after: -v, -q, -n/--no-checkout, --checkout, -s, plus any others. + assert!( + r.entries.len() >= 4, + "expected ≥4 entries, got {}", + r.entries.len() + ); + assert!( + r.entries.iter().any(|e| { + matches!( + &e.switch, + inshellah::types::Switch::Both('v', l) if *l == "verbose" + ) + }), + "expected -v/--verbose from --[no-]verbose, got {:?}", + r.entries.len() + ); +} + +#[test] +fn parser_keeps_negatable_params() { + let text = r#"usage: git clone [] [--] [] + + -j, --[no-]jobs number of submodules cloned in parallel + --[no-]recurse-submodules[=] + initialize submodules in the clone + --[no-]reject-shallow don't clone shallow repository +"#; + let (_, r) = help_parser(text).expect("parse"); + let jobs = r + .entries + .iter() + .find(|e| matches!(&e.switch, inshellah::types::Switch::Both('j', l) if *l == "jobs")) + .expect("jobs entry"); + assert!(matches!( + &jobs.param, + Some(inshellah::types::Param::Mandatory("n")) + )); + + let recurse = r + .entries + .iter() + .find(|e| matches!(&e.switch, inshellah::types::Switch::Long(l) if *l == "recurse-submodules")) + .expect("recurse-submodules entry"); + assert!(matches!( + &recurse.param, + Some(inshellah::types::Param::Optional("pathspec")) + )); + + let reject = r + .entries + .iter() + .find(|e| matches!(&e.switch, inshellah::types::Switch::Long(l) if *l == "reject-shallow")) + .expect("reject-shallow entry"); + assert!( + reject.param.is_none(), + "reject-shallow should not parse prose as a param" + ); +} diff --git a/tests/manpage_cli.rs b/tests/manpage_cli.rs new file mode 
100644 index 0000000..8fc2e0d --- /dev/null +++ b/tests/manpage_cli.rs @@ -0,0 +1,150 @@ +use std::fs; +use std::process::Command; +use std::time::{SystemTime, UNIX_EPOCH}; + +fn unique_temp_dir(name: &str) -> std::path::PathBuf { + let nanos = SystemTime::now() + .duration_since(UNIX_EPOCH) + .expect("system time") + .as_nanos(); + std::env::temp_dir().join(format!("{name}-{}-{nanos}", std::process::id())) +} + +#[test] +fn manpage_command_uses_synopsis_name() { + let root = unique_temp_dir("inshellah-manpage-cli"); + fs::create_dir_all(&root).expect("temp dir"); + let manpage = root.join("btrfs-check.8"); + fs::write( + &manpage, + r#".SH SYNOPSIS +btrfs check [options] +.SH OPTIONS +.TP +\fB\-\-repair\fR +try to repair the filesystem +"#, + ) + .expect("write manpage"); + + let output = Command::new(env!("CARGO_BIN_EXE_inshellah")) + .arg("manpage") + .arg(&manpage) + .output() + .expect("run inshellah manpage"); + + assert!( + output.status.success(), + "stderr = {}", + String::from_utf8_lossy(&output.stderr) + ); + let stdout = String::from_utf8(output.stdout).expect("stdout"); + assert!( + stdout.contains("export extern \"btrfs check\""), + "stdout = {stdout}" + ); + assert!( + !stdout.contains("export extern \"btrfs-check\""), + "stdout = {stdout}" + ); + + let _ = fs::remove_dir_all(root); +} + +#[test] +fn manpage_command_strips_git_style_subcommand_prefixes() { + let root = unique_temp_dir("inshellah-manpage-cli"); + fs::create_dir_all(&root).expect("temp dir"); + let manpage = root.join("git.1"); + fs::write( + &manpage, + r#".SH SYNOPSIS +git [--version] [--help] [] +.SH OPTIONS +.TP +\fB\-\-version\fR +show version +.SH "GIT COMMANDS" +.SS "Main porcelain commands" +.PP +.BR git-add (1) +.RS 4 +Add file contents to the index. 
+.RE +"#, + ) + .expect("write manpage"); + + let output = Command::new(env!("CARGO_BIN_EXE_inshellah")) + .arg("manpage") + .arg(&manpage) + .output() + .expect("run inshellah manpage"); + + assert!( + output.status.success(), + "stderr = {}", + String::from_utf8_lossy(&output.stderr) + ); + let stdout = String::from_utf8(output.stdout).expect("stdout"); + assert!( + stdout.contains("export extern \"git add\""), + "stdout = {stdout}" + ); + assert!( + !stdout.contains("export extern \"git git-add\""), + "stdout = {stdout}" + ); + + let _ = fs::remove_dir_all(root); +} + +#[test] +fn manpage_command_falls_back_when_synopsis_starts_with_prose() { + let root = unique_temp_dir("inshellah-manpage-cli"); + fs::create_dir_all(&root).expect("temp dir"); + let manpage = root.join("ld.so.8"); + fs::write( + &manpage, + r#".SH SYNOPSIS +The dynamic linker can be run either indirectly by running some +dynamically linked program or shared object +(in which case no command-line options +to the dynamic linker can be passed and, in the ELF case, the dynamic linker +which is stored in the +.B .interp +section of the program is executed) or directly by running: +.P +.I /lib/ld\-linux.so.* +[OPTIONS] [PROGRAM [ARGUMENTS]] +.SH OPTIONS +.TP +.BI \-\-argv0\~ string +Set argv[0] to the value string. 
+"#, + ) + .expect("write manpage"); + + let output = Command::new(env!("CARGO_BIN_EXE_inshellah")) + .arg("manpage") + .arg(&manpage) + .output() + .expect("run inshellah manpage"); + + assert!( + output.status.success(), + "stderr = {}", + String::from_utf8_lossy(&output.stderr) + ); + let stdout = String::from_utf8(output.stdout).expect("stdout"); + assert!( + stdout.contains("export extern \"ld.so\""), + "stdout = {stdout}" + ); + assert!( + !stdout.contains("export extern \"The\""), + "stdout = {stdout}" + ); + + let _ = fs::remove_dir_all(root); +} diff --git a/tests/nushell-completer.nu b/tests/nushell-completer.nu new file mode 100644 index 0000000..0913c36 --- /dev/null +++ b/tests/nushell-completer.nu @@ -0,0 +1,128 @@ +def fail [msg: string] { + error make {msg: $msg} +} + +def assert-eq [actual expected msg: string] { + if $actual != $expected { + fail $"($msg): expected ($expected | to nuon), got ($actual | to nuon)" + } +} + +def assert-contains [items needle msg: string] { + if not ($needle in $items) { + fail $"($msg): expected ($items | to nuon) to contain ($needle | to nuon)" + } +} + +def values [items] { + $items | default [] | get value +} + +let completer = $env.config.completions.external.completer + +def _assert_elevation_wrappers_accept_command_tails [p: path] { + sudo nix-env --set -p /nix/var/nix/profiles/system $p + doas nix-env --set -p /nix/var/nix/profiles/system $p +} + +'[{"value":"--static","description":"from static cache"}]' | save --force $env.INSHELLAH_STATIC_FILE +let static_result = do $completer [demo ""] +assert-eq ($static_result | get 0.value) "--static" "static completion pass-through" +'[{"value":"--server","description":"from static cache"},{"value":"--preserve","description":"from static cache"}]' | save --force $env.INSHELLAH_STATIC_FILE +let static_fuzzy_result = do $completer [demo ser] +assert-eq (values $static_fuzzy_result) ['--server' '--preserve'] "static fuzzy completions are not refiltered by shim" + +"{" | 
save --force $env.INSHELLAH_STATIC_FILE +let bad_static_result = do $completer [demo ""] +assert-eq $bad_static_result null "bad static JSON falls back cleanly" +"" | save --force $env.INSHELLAH_STATIC_FILE + +assert-eq (do $completer [nix]) null "nix completion ignores too-short spans" +let nix_commands = do $completer [nix ""] +assert-eq ($nix_commands | get 0.value) "build" "nix command completion uses NIX_GET_COMPLETIONS" +let nix_pkg = do $completer [nix "flake#pkg"] +assert-eq ($nix_pkg | get 0.description) "raw package description" "nix descriptions are raw strings" + +let systemctl_empty = do $completer [systemctl daemon-reload ""] +assert-eq $systemctl_empty null "systemctl does not offer units for non-unit verbs" +let systemctl_units = do $completer [systemctl status ""] +assert-eq ($systemctl_units | get 0.value) "demo.service" "systemctl offers units for unit verbs" +let systemctl_prefixed_units = do $completer [systemctl start g] +assert-eq ($systemctl_prefixed_units | get 0.value) "greetd.service" "systemctl unit completions accept typed prefixes" + +let kubectl_pods = do $completer [kubectl get pods -n prod ""] +assert-eq ($kubectl_pods | get 0.value) "pod-a" "kubectl resource names complete" +assert-eq (open $env.KUBECTL_ARGS_FILE | str contains "-n prod") true "kubectl preserves namespace flags" +let kubectl_rollout = do $completer [kubectl rollout status deployment ""] +assert-eq ($kubectl_rollout | get 0.description) "deployment" "kubectl rollout uses resource kind, not action" + +let cargo_packages = do $completer [cargo test -p ""] +assert-eq (values $cargo_packages) [app-lib helper-lib] "cargo -p completes packages" +let cargo_bins = do $completer [cargo run --bin ""] +assert-eq (values $cargo_bins) [app-cli] "cargo --bin completes only bin targets" + +"[]" | save --force $env.INSHELLAH_STATIC_FILE +let git_top = do $completer [git ""] +assert-contains (values $git_top) "remote" "git top-level completes common commands" +assert-contains 
(values $git_top) "stash" "git top-level includes stash" +let git_push = do $completer [git push ""] +assert-eq (values $git_push) [origin upstream] "empty static completions fall through to git remotes" +let git_remote_verbs = do $completer [git remote ""] +assert-eq (values $git_remote_verbs) [add rename remove rm set-head set-branches get-url set-url show prune update] "git remote completes subcommands" +let git_remote_filtered = do $completer [git remote sho] +assert-eq (values $git_remote_filtered) [show] "git remote subcommands filter by typed prefix" +let git_remote_fuzzy = do $completer [git remote shw] +assert-eq (values $git_remote_fuzzy) [show] "git remote subcommands use fuzzy filtering" +let git_remote_exact = do $completer [git remote show] +assert-eq $git_remote_exact null "exact dynamic completion disappears" +let git_remote_show = do $completer [git remote show ""] +assert-eq (values $git_remote_show) [origin upstream] "git remote show completes remote names" +let git_fetch = do $completer [git fetch ""] +assert-eq (values $git_fetch) [origin upstream] "git fetch completes remotes" +let git_fetch_ref = do $completer [git fetch origin ""] +assert-contains (values $git_fetch_ref) "main" "git fetch after remote completes refs" +let git_branch_delete = do $completer [git branch -d ""] +assert-eq (values $git_branch_delete) [main feature] "git branch delete completes local branches" +let git_tag_delete = do $completer [git tag -d ""] +assert-eq (values $git_tag_delete) [v1.0 v2.0] "git tag delete completes tags" +let git_stash_apply = do $completer [git stash apply ""] +assert-eq (values $git_stash_apply) ['stash@{0}'] "git stash apply completes stashes" +let git_submodule_update = do $completer [git submodule update ""] +assert-eq (values $git_submodule_update) [deps/demo] "git submodule update completes submodule paths" +let git_bisect = do $completer [git bisect ""] +assert-contains (values $git_bisect) "good" "git bisect completes subcommands" +let 
git_bisect_good = do $completer [git bisect good ""] +assert-contains (values $git_bisect_good) "main" "git bisect good completes refs" +let git_add_paths = do $completer [git add ""] +assert-eq (values $git_add_paths) [src/main.rs new-file.txt renamed.txt] "git add completes changed paths" +let git_rm_paths = do $completer [git rm ""] +assert-eq (values $git_rm_paths) [src/main.rs README.md] "git rm completes tracked paths" +"" | save --force $env.INSHELLAH_STATIC_FILE +let git_worktree_add = do $completer [git worktree add ""] +assert-eq $git_worktree_add null "git worktree add first argument falls back to files" +let git_worktree_remove = do $completer [git worktree remove ""] +assert-eq ($git_worktree_remove | get 0.value) "/repo/linked" "git worktree remove completes existing worktrees" + +"[]" | save --force $env.INSHELLAH_STATIC_FILE +let jj_top = do $completer [jj ""] +assert-contains (values $jj_top) "bookmark" "jj top-level completes common commands" +assert-contains (values $jj_top) "git" "jj top-level includes git command" +let jj_bookmarks = do $completer [jj bookmark delete ""] +assert-eq (values $jj_bookmarks) [main feature origin/main] "jj bookmark delete completes bookmarks" +let jj_tags = do $completer [jj tag delete ""] +assert-eq (values $jj_tags) [v1.0 v2.0] "jj tag delete completes tags" +let jj_git_fetch = do $completer [jj git fetch ""] +assert-eq (values $jj_git_fetch) [origin upstream] "jj git fetch completes remotes" +let jj_git_remote_verbs = do $completer [jj git remote ""] +assert-eq (values $jj_git_remote_verbs) [add list remove rename set-url] "jj git remote completes subcommands" +let jj_git_remote_remove = do $completer [jj git remote remove ""] +assert-eq (values $jj_git_remote_remove) [origin upstream] "jj git remote remove completes remotes" +let jj_revs = do $completer [jj rebase -d ""] +assert-eq (values $jj_revs) [k m] "jj revision flags complete revisions" +let jj_ops = do $completer [jj op restore ""] +assert-eq (values 
$jj_ops) [abc123] "jj op restore completes operations" +let jj_files = do $completer [jj file show ""] +assert-eq (values $jj_files) [src/main.rs README.md] "jj file show completes repo files" +let jj_workspaces = do $completer [jj workspace forget ""] +assert-eq (values $jj_workspaces) [default linked] "jj workspace forget completes workspaces" +"" | save --force $env.INSHELLAH_STATIC_FILE diff --git a/tests/ports.rs b/tests/ports.rs new file mode 100644 index 0000000..3a3fe4e --- /dev/null +++ b/tests/ports.rs @@ -0,0 +1,915 @@ +//! Tests ported from ../inshellah/test/test_inshellah.ml. +//! +//! Covers the help parser, manpage parser, groff stripping, and nushell +//! generation. The single .nu store parser test (`test_nu_file_parsing`) is +//! not included — it requires porting store.ml first. + +use inshellah::parsers::help::help_parser; +use inshellah::parsers::manpage::{ + ManpageResult, OwnedParam, OwnedSwitch, extract_synopsis_command, parse_manpage_string, + strip_groff_escapes, +}; +use inshellah::parsers::nushell::{generate_extern, generate_module}; +use inshellah::store::{json_of_result, parse_nu_completions, result_from_json}; +use inshellah::types::{HelpResult, Param, Switch}; + +fn parse(txt: &str) -> HelpResult<'_> { + match help_parser(txt) { + Ok((_, r)) => r, + Err(e) => panic!("parse_help failed: {e:?}"), + } +} + +// --- Help parser tests --- + +#[test] +fn gnu_basic() { + let r = parse(" -a, --all do not ignore entries starting with .\n"); + assert_eq!(r.entries.len(), 1); + let e = &r.entries[0]; + assert!(matches!(&e.switch, Switch::Both('a', l) if *l == "all")); + assert!(e.param.is_none()); + assert!(!e.desc.is_empty()); +} + +#[test] +fn gnu_eq_param() { + let r = parse(" --block-size=SIZE scale sizes by SIZE\n"); + assert_eq!(r.entries.len(), 1); + let e = &r.entries[0]; + assert!(matches!(&e.switch, Switch::Long(l) if *l == "block-size")); + assert!(matches!(&e.param, Some(Param::Mandatory(p)) if *p == "SIZE")); +} + +#[test] +fn 
gnu_opt_param() { + let r = parse(" --color[=WHEN] color the output WHEN\n"); + assert_eq!(r.entries.len(), 1); + let e = &r.entries[0]; + assert!(matches!(&e.switch, Switch::Long(l) if *l == "color")); + assert!(matches!(&e.param, Some(Param::Optional(p)) if *p == "WHEN")); +} + +#[test] +fn underscore_param() { + let r = parse(" --time-style=TIME_STYLE time/date format\n"); + assert_eq!(r.entries.len(), 1); + let e = &r.entries[0]; + assert!(matches!(&e.param, Some(Param::Mandatory(p)) if *p == "TIME_STYLE")); +} + +#[test] +fn short_only() { + let r = parse(" -v verbose output\n"); + assert_eq!(r.entries.len(), 1); + assert!(matches!(r.entries[0].switch, Switch::Short('v'))); +} + +#[test] +fn long_only() { + let r = parse(" --help display help\n"); + assert_eq!(r.entries.len(), 1); + assert!(matches!(&r.entries[0].switch, Switch::Long(l) if *l == "help")); +} + +#[test] +fn multiline_desc() { + let txt = " --block-size=SIZE with -l, scale sizes by SIZE when printing them;\n e.g., '--block-size=M'; see SIZE format below\n"; + let r = parse(txt); + assert_eq!(r.entries.len(), 1); + let combined: String = r.entries[0].desc.join(" "); + assert!(combined.len() > 50, "desc was: {combined}"); +} + +#[test] +fn multiple_entries() { + let txt = " -a, --all do not ignore entries starting with .\n -A, --almost-all do not list implied . 
and ..\n --author with -l, print the author of each file\n"; + let r = parse(txt); + assert_eq!(r.entries.len(), 3); +} + +#[test] +fn clap_short_sections() { + let txt = "INPUT OPTIONS:\n -e, --regexp=PATTERN A pattern to search for.\n -f, --file=PATTERNFILE Search for patterns from the given file.\nSEARCH OPTIONS:\n -s, --case-sensitive Search case sensitively.\n"; + let r = parse(txt); + assert_eq!(r.entries.len(), 3); + let e = &r.entries[0]; + assert!(matches!(&e.switch, Switch::Both('e', l) if *l == "regexp")); + assert!(matches!(&e.param, Some(Param::Mandatory(p)) if *p == "PATTERN")); +} + +#[test] +fn clap_long_style() { + let txt = " -H, --hidden\n Include hidden directories and files.\n\n --no-ignore\n Do not respect ignore files.\n"; + let r = parse(txt); + assert_eq!(r.entries.len(), 2); + let e = &r.entries[0]; + assert!(matches!(&e.switch, Switch::Both('H', l) if *l == "hidden")); + assert!(!e.desc.is_empty()); +} + +#[test] +fn clap_long_angle_param() { + let txt = " --nonprintable-notation \n Set notation for non-printable characters.\n"; + let r = parse(txt); + assert_eq!(r.entries.len(), 1); + let e = &r.entries[0]; + assert!(matches!(&e.switch, Switch::Long(l) if *l == "nonprintable-notation")); + assert!(matches!(&e.param, Some(Param::Mandatory(p)) if *p == "notation")); +} + +#[test] +fn space_upper_param() { + let r = parse(" -f, --foo FOO foo help\n"); + assert_eq!(r.entries.len(), 1); + let e = &r.entries[0]; + assert!(matches!(&e.switch, Switch::Both('f', l) if *l == "foo")); + assert!(matches!(&e.param, Some(Param::Mandatory(p)) if *p == "FOO")); +} + +#[test] +fn go_cobra_flags() { + let txt = "Flags:\n -D, --debug Enable debug mode\n -H, --host string Daemon socket to connect to\n -v, --version Print version information\n"; + let r = parse(txt); + assert_eq!(r.entries.len(), 3); + let host = &r.entries[1]; + assert!(matches!(&host.switch, Switch::Both('H', l) if *l == "host")); + assert!(matches!(&host.param, Some(Param::Mandatory(p)) 
if *p == "string")); +} + +#[test] +fn go_cobra_subcommands() { + let txt = "Common Commands:\n run Create and run a new container from an image\n exec Execute a command in a running container\n build Build an image from a Dockerfile\n"; + let r = parse(txt); + assert!( + !r.subcommands.is_empty(), + "expected subcommands, got: {:?}", + r.subcommands.len() + ); +} + +#[test] +fn help_parser_ignores_value_enums_and_defaults() { + let txt = r#"Usage: tar [OPTION...] [FILE]... + + Main operation mode: + -c, --create create a new archive + + Archive format selection: + + -H, --format=FORMAT create archive of the given format + + FORMAT is one of the following: + gnu GNU tar 1.13.x format + oldgnu GNU format as per tar <= 1.12 + pax POSIX 1003.1-2001 (pax) format + posix same as pax + ustar POSIX 1003.1-1988 (ustar) format + v7 old V7 tar format + +*This* tar defaults to: +--format=gnu -f- -b20 --quoting-style=escape +--rmt-command=/nix/store/example/libexec/rmt +"#; + let r = parse(txt); + assert!( + r.subcommands.is_empty(), + "enum values became subcommands: {:?}", + r.subcommands.len() + ); + assert!( + !r.entries + .iter() + .any(|e| matches!(&e.switch, Switch::Long(l) if *l == "rmt-command")), + "default lines should not become flags" + ); + assert!( + r.entries + .iter() + .any(|e| matches!(&e.switch, Switch::Both('H', l) if *l == "format")), + "real option should still be parsed" + ); +} + +#[test] +fn busybox_tab() { + let r = parse("\t-1\tOne column output\n\t-a\tInclude names starting with .\n"); + assert_eq!(r.entries.len(), 2); + assert!(matches!(r.entries[0].switch, Switch::Short('1'))); +} + +#[test] +fn no_debug_prints() { + // the old ocaml parser had print_endline at module load time; this test + // documents that no such side effects exist in the rust port. 
+ let _ = parse(" -v verbose\n"); +} + +#[test] +fn slash_switch_separator() { + let r = parse(" --verbose / -v Increase verbosity\n"); + assert_eq!(r.entries.len(), 1); + let e = &r.entries[0]; + assert!(matches!(&e.switch, Switch::Both('v', l) if *l == "verbose")); + assert!(e.param.is_none()); + let combined: String = e.desc.join(" "); + assert_eq!(combined.trim(), "Increase verbosity"); +} + +// --- Manpage parser tests --- + +#[test] +fn manpage_tp_style() { + let groff = r#".SH OPTIONS +.TP +\fB\-a\fR, \fB\-\-all\fR +do not ignore entries starting with . +.TP +\fB\-A\fR, \fB\-\-almost\-all\fR +do not list implied . and .. +.TP +\fB\-\-block\-size\fR=\fISIZE\fR +with \fB\-l\fR, scale sizes by SIZE +.SH AUTHOR +Written by someone. +"#; + let r = parse_manpage_string(groff); + assert_eq!(r.entries.len(), 3, "entries: {:?}", r.entries); + assert!(matches!(&r.entries[0].switch, OwnedSwitch::Both('a', l) if l == "all")); + assert!(!r.entries[0].desc.is_empty()); + assert!(matches!(&r.entries[2].switch, OwnedSwitch::Long(l) if l == "block-size")); + assert!(matches!(&r.entries[2].param, Some(OwnedParam::Mandatory(p)) if p == "SIZE")); +} + +#[test] +fn manpage_ip_style() { + let groff = r#".SH OPTIONS +.IP "\fB\-k\fR, \fB\-\-insecure\fR" +Allow insecure connections. +.IP "\fB\-o\fR, \fB\-\-output\fR \fIfile\fR" +Write output to file. 
+.SH SEE ALSO +"#; + let r = parse_manpage_string(groff); + assert_eq!(r.entries.len(), 2, "entries: {:?}", r.entries); + assert!(matches!(&r.entries[0].switch, OwnedSwitch::Both('k', l) if l == "insecure")); +} + +#[test] +fn manpage_groff_stripping() { + let s = strip_groff_escapes(r#"\fB\-\-color\fR[=\fIWHEN\fR]"#); + // font escapes removed + assert!(!(s.contains('f') && s.contains('B') && s.contains('\\'))); + // dashes converted + assert!(s.contains('-')); + let s2 = strip_groff_escapes(r#"\(aqhello\(aq"#); + assert!(s2.contains('\''), "expected apostrophe in: {s2}"); +} + +#[test] +fn manpage_getent_databases_from_description() { + let groff = r#".SH SYNOPSIS +.SY getent +.RI [ option \~.\|.\|.\&] +.I database +.IR key \~.\|.\|. +.YS +.SH DESCRIPTION +The +.I database +may be any of those supported by the GNU C Library, listed below: +.TP +.B passwd +When no +.I key +is provided, enumerate the passwd database. +.TP +.B services +When no +.I key +is provided, enumerate the services database. +.SH OPTIONS +.TP +.BI \-\-service\~ service +.TQ +.BI \-s\~ service +Override all databases with the specified service. +.TP +.BI \-\-service\~ database : service +.TQ +.BI \-s\~ database : service +Override only specified databases with the specified service. +.TP +.B \-\-usage +Print a short usage summary and exit. 
+"#; + let r = parse_manpage_string(groff); + let positional_names: Vec<&str> = r + .positionals + .iter() + .map(|(name, _)| name.as_str()) + .collect(); + assert_eq!(positional_names, vec!["database", "key"]); + + let service = r + .entries + .iter() + .find(|e| matches!(&e.switch, OwnedSwitch::Both('s', name) if name == "service")) + .expect("expected --service(-s)"); + assert!(matches!( + &service.param, + Some(OwnedParam::Mandatory(param)) if param == "service" + )); + assert!( + !r.entries + .iter() + .any(|e| matches!(&e.switch, OwnedSwitch::Long(name) if name == "serviceservice" || name == "servicedatabase")), + "entries: {:?}", + r.entries + ); + + let subcommands: Vec<&str> = r.subcommands.iter().map(|sc| sc.name.as_str()).collect(); + assert!( + subcommands.contains(&"passwd"), + "subcommands: {subcommands:?}" + ); + assert!( + subcommands.contains(&"services"), + "subcommands: {subcommands:?}" + ); + + let nu = generate_extern("getent", &r); + assert!(nu.contains("database: string"), "nu = {nu}"); + assert!(nu.contains("...key: string"), "nu = {nu}"); + assert!(nu.contains("--service(-s): string"), "nu = {nu}"); + assert!(!nu.contains("--servicedatabase"), "nu = {nu}"); + assert!(nu.contains("export extern \"getent passwd\""), "nu = {nu}"); +} + +#[test] +fn manpage_b_macro_option_tag_with_embedded_quotes() { + let groff = r#".SH OPTIONS +.TP +.B "\-s ""\fIprogram\fR [\fIargument \fR...]\fB""\fR, \fB\-\-speller=""\fIprogram\fR [\fIargument \fR...]\fB""" +Use this command to perform spell checking and correcting. 
+"#; + let r = parse_manpage_string(groff); + assert!( + r.entries + .iter() + .any(|e| matches!(e.switch, OwnedSwitch::Short('s'))), + "entries: {:?}", + r.entries + ); +} + +#[test] +fn manpage_synopsis_b_macro_bracket_args_keep_spaces() { + let groff = r#".SH "SYNOPSIS" +.B "rtmon" +.RI "[ " OPTIONS " ] " +.BI "file " FILE +.BR "[ " all +.RI "| " OBJECTS +.RB "]" +.ti -8 +.I OBJECTS +.B ":= [" link "]" "[" address "]" "[" route "]" +.SH OPTIONS +"#; + let r = parse_manpage_string(groff); + let positional_names: Vec<&str> = r + .positionals + .iter() + .map(|(name, _)| name.as_str()) + .collect(); + assert!( + !positional_names.contains(&"ptions") + && positional_names.contains(&"link") + && positional_names.contains(&"address"), + "positionals: {positional_names:?}" + ); +} + +#[test] +fn bracketed_angle_positionals_keep_inner_ellipsis() { + let groff = r#".SH SYNOPSIS +.B bzip2 +.RB [ " \-cdfkqstvzVL123456789 " ] +[ +.I "filenames \&..." +] +.SH OPTIONS +"#; + let r = parse_manpage_string(groff); + assert!( + r.positionals + .iter() + .any(|(name, positional)| name == "filenames" && positional.variadic), + "positionals: {:?}", + r.positionals + ); +} + +#[test] +fn nested_optional_positionals_keep_last_valid_inner_name() { + let groff = r#".SH SYNOPSIS +\fBfc-cat\fR [ \fB-rvVh\fR ] + [ \fB [ \fIfonts-cache-%version%-files\fB ] [ \fIdirs\fB ] \fR\fI...\fR ] +.SH OPTIONS +"#; + let r = parse_manpage_string(groff); + assert!( + r.positionals + .iter() + .any(|(name, positional)| name == "dirs" && positional.optional && positional.variadic), + "positionals: {:?}", + r.positionals + ); +} + +#[test] +fn manpage_empty_options() { + let groff = ".SH NAME\nfoo \\- does stuff\n.SH DESCRIPTION\nDoes stuff.\n"; + let r = parse_manpage_string(groff); + assert_eq!(r.entries.len(), 0); +} + +#[test] +fn manpage_nix3_style() { + let groff = r#".SH Options +.SS Logging-related options +.IP "\(bu" 3 +.UR #opt-verbose +\f(CR--verbose\fR +.UE +/ \f(CR-v\fR +.IP +Increase the 
logging verbosity level. +.IP "\(bu" 3 +.UR #opt-quiet +\f(CR--quiet\fR +.UE +.IP +Decrease the logging verbosity level. +.SH SEE ALSO +"#; + let r = parse_manpage_string(groff); + assert_eq!(r.entries.len(), 2, "entries: {:?}", r.entries); + assert!(matches!(&r.entries[0].switch, OwnedSwitch::Both('v', l) if l == "verbose")); + assert!(!r.entries[0].desc.is_empty()); + assert!(matches!(&r.entries[1].switch, OwnedSwitch::Long(l) if l == "quiet")); + assert!(!r.entries[1].desc.is_empty()); +} + +#[test] +fn manpage_nix3_with_params() { + let groff = r#".SH Options +.IP "\(bu" 3 +.UR #opt-arg +\f(CR--arg\fR +.UE +\fIname\fR \fIexpr\fR +.IP +Pass the value as the argument name to Nix functions. +.IP "\(bu" 3 +.UR #opt-include +\f(CR--include\fR +.UE +/ \f(CR-I\fR \fIpath\fR +.IP +Add path to search path entries. +.IP +This option may be given multiple times. +.SH SEE ALSO +"#; + let r = parse_manpage_string(groff); + assert_eq!(r.entries.len(), 2, "entries: {:?}", r.entries); + assert!(matches!(&r.entries[0].switch, OwnedSwitch::Long(l) if l == "arg")); + assert!(r.entries[0].param.is_some()); + assert!(matches!(&r.entries[1].switch, OwnedSwitch::Both('I', l) if l == "include")); + assert!(matches!(&r.entries[1].param, Some(OwnedParam::Mandatory(p)) if p == "path")); +} + +#[test] +fn synopsis_subcommand() { + let groff = r#".SH "SYNOPSIS" +.sp +.nf +\fBgit\fR \fBcommit\fR [\fB\-a\fR | \fB\-\-interactive\fR] +.fi +.SH "DESCRIPTION" +"#; + let cmd = extract_synopsis_command(groff); + assert_eq!(cmd.as_deref(), Some("git commit")); +} + +#[test] +fn synopsis_standalone() { + let groff = ".SH Synopsis\n.LP\n\\f(CRnix-build\\fR [\\fIpaths\\fR]\n.SH Description\n"; + let cmd = extract_synopsis_command(groff); + assert_eq!(cmd.as_deref(), Some("nix-build")); +} + +#[test] +fn synopsis_nix3() { + let groff = ".SH Synopsis\n.LP\n\\f(CRnix run\\fR [\\fIoption\\fR] \\fIinstallable\\fR\n.SH Description\n"; + let cmd = extract_synopsis_command(groff); + assert_eq!(cmd.as_deref(), 
Some("nix run")); +} + +#[test] +fn italic_synopsis() { + let groff = ".SH Synopsis\n.LP\n\\f(CRnix-env\\fR \\fIoperation\\fR [\\fIoptions\\fR] [\\fIarguments…\\fR]\n.SH Description\n"; + let cmd = extract_synopsis_command(groff); + assert_eq!(cmd.as_deref(), Some("nix-env")); +} + +#[test] +fn synopsis_italic_command_name() { + // git-am.1 (and many other git manpages) put the entire command + // invocation in italics: `\fIgit am\fR [...]`. should still resolve + // to "git am" rather than treating it as a placeholder. + let groff = ".SH \"SYNOPSIS\"\n.sp\n.nf\n\\fIgit am\\fR [\\-\\-signoff] [\\-\\-keep]\n.fi\n.SH \"DESCRIPTION\"\n"; + let cmd = extract_synopsis_command(groff); + assert_eq!(cmd.as_deref(), Some("git am")); +} + +#[test] +fn synopsis_skips_prose_before_invocation() { + let groff = r#".SH SYNOPSIS +The dynamic linker can be run either indirectly by running some +dynamically linked program or shared object +(in which case no command-line options +to the dynamic linker can be passed and, in the ELF case, the dynamic linker +which is stored in the +.B .interp +section of the program is executed) or directly by running: +.P +.I /lib/ld\-linux.so.* +[OPTIONS] [PROGRAM [ARGUMENTS]] +.SH DESCRIPTION +"#; + let cmd = extract_synopsis_command(groff); + assert_eq!(cmd.as_deref(), None); +} + +#[test] +fn synopsis_skips_labels_before_invocation() { + let groff = r#".SH "SYNOPSIS" +.sp +Set up a loop device: +.sp +\fBlosetup\fP [options] \fB\-f\fP|\fIloopdev file\fP +.sp +Get info: +.RS 4 +\fBlosetup\fP \fIloopdev\fP +.RE +.SH "DESCRIPTION" +"#; + let cmd = extract_synopsis_command(groff); + assert_eq!(cmd.as_deref(), Some("losetup")); +} + +#[test] +fn synopsis_b_macro_preserves_command_spaces() { + let groff = r#".SH "SYNOPSIS" +.sp +.B ip link +.RI " { " COMMAND " | " +.BR help " }" +.SH "DESCRIPTION" +"#; + let cmd = extract_synopsis_command(groff); + assert_eq!(cmd.as_deref(), Some("ip link")); +} + +#[test] +fn 
synopsis_br_macro_preserves_quoted_command_spaces() { + let groff = r#".SH "SYNOPSIS" +.sp +.BR "ip monitor" " [ " all " |" +.IR OBJECT-LIST " ]" +.SH "DESCRIPTION" +"#; + let cmd = extract_synopsis_command(groff); + assert_eq!(cmd.as_deref(), Some("ip monitor")); +} + +#[test] +fn synopsis_long_b_macro_is_not_prose() { + let groff = r#".SH SYNOPSIS +.ad l +.in +8 +.ti -8 +.B tipc peer remove address +.IR ADDRESS +.SH OPTIONS +"#; + let cmd = extract_synopsis_command(groff); + assert_eq!(cmd.as_deref(), Some("tipc peer remove address")); +} + +#[test] +fn synopsis_ss_heading_is_accepted() { + let groff = r#".SH Name +.LP +\f(CRnix-env --set\fR - set profile to contain a specified derivation +.SS +Synopsis +.LP +\f(CRnix-env\fR \f(CR--set\fR \fIdrvname\fR +.SS +Description +"#; + let cmd = extract_synopsis_command(groff); + assert_eq!(cmd.as_deref(), Some("nix-env")); +} + +// --- Font/dedup tests (only the font-spacing one is portable) --- + +#[test] +fn font_boundary_spacing() { + // \fB--max-results\fR\fIcount\fR should become "--max-results count" + let s = strip_groff_escapes(r#"\fB\-\-max\-results\fR\fIcount\fR"#); + assert!(s.contains("--max-results count"), "got: {s}"); + // \fB--color\fR[=\fIWHEN\fR] should NOT insert space before = + let s2 = strip_groff_escapes(r#"\fB\-\-color\fR[=\fIWHEN\fR]"#); + assert!(s2.contains("--color[=WHEN]"), "got: {s2}"); +} + +// --- COMMANDS section tests --- + +#[test] +fn commands_section_subcommands() { + let groff = r#".SH OPTIONS +.TP +\fB\-\-user\fR +Talk to the service manager of the calling user. +.TP +\fB\-\-system\fR +Talk to the service manager of the system. +.SH COMMANDS +.PP +\fBstart\fR \fIUNIT\fR\&... +.RS 4 +Start (activate) one or more units. +.RE +.PP +\fBstop\fR \fIUNIT\fR\&... +.RS 4 +Stop (deactivate) one or more units. +.RE +.PP +\fBreload\fR \fIUNIT\fR\&... +.RS 4 +Asks all units to reload their configuration. 
+.RE +.SH SEE ALSO +"#; + let r = parse_manpage_string(groff); + assert_eq!(r.entries.len(), 2, "options entries: {:?}", r.entries); + assert_eq!(r.subcommands.len(), 3, "subcommands: {:?}", r.subcommands); + let names: Vec<&str> = r.subcommands.iter().map(|sc| sc.name.as_str()).collect(); + assert!(names.contains(&"start")); + assert!(names.contains(&"stop")); + assert!(names.contains(&"reload")); + let start_sc = r.subcommands.iter().find(|sc| sc.name == "start").unwrap(); + assert!(!start_sc.desc.is_empty()); +} + +#[test] +fn commands_section_git_style_refs() { + let groff = r#".SH OPTIONS +.TP +\fB\-\-version\fR +Show version. +.SH "GIT COMMANDS" +.SS "Main porcelain commands" +.PP +.BR git-add (1) +.RS 4 +Add file contents to the index. +.RE +.PP +\fBgit-commit\fR(1) +.RS 4 +Record changes to the repository. +.RE +"#; + let r = parse_manpage_string(groff); + let names: Vec<&str> = r.subcommands.iter().map(|sc| sc.name.as_str()).collect(); + assert!( + names.contains(&"git-add"), + "subcommands: {:?}", + r.subcommands + ); + assert!( + names.contains(&"git-commit"), + "subcommands: {:?}", + r.subcommands + ); + let add = r + .subcommands + .iter() + .find(|sc| sc.name == "git-add") + .unwrap(); + assert!(add.desc.contains("Add file contents")); +} + +// --- Nushell generation tests --- + +fn to_owned_result(r: &HelpResult<'_>) -> ManpageResult { + r.into() +} + +#[test] +fn nushell_basic() { + let r = parse(" -a, --all do not ignore entries starting with .\n"); + let nu = generate_extern("ls", &to_owned_result(&r)); + assert!(nu.contains("export extern \"ls\""), "nu = {nu}"); + assert!(nu.contains("--all(-a)"), "nu = {nu}"); + assert!(nu.contains("# do not ignore"), "nu = {nu}"); +} + +#[test] +fn nushell_param_types() { + let txt = " -w, --width=COLS set output width\n --block-size=SIZE scale sizes\n -o, --output FILE output file\n"; + let r = parse(txt); + let nu = generate_extern("ls", &to_owned_result(&r)); + assert!(nu.contains("--width(-w): int"), "nu = 
{nu}"); + assert!(nu.contains("--block-size: string"), "nu = {nu}"); + assert!(nu.contains("--output(-o): path"), "nu = {nu}"); +} + +#[test] +fn nushell_subcommands() { + let txt = "Common Commands:\n run Create and run a new container\n exec Execute a command\n\nFlags:\n -D, --debug Enable debug mode\n"; + let r = parse(txt); + let nu = generate_extern("docker", &to_owned_result(&r)); + assert!(nu.contains("export extern \"docker\""), "nu = {nu}"); + assert!(nu.contains("--debug(-D)"), "nu = {nu}"); + assert!(nu.contains("export extern \"docker run\""), "nu = {nu}"); + assert!(nu.contains("export extern \"docker exec\""), "nu = {nu}"); +} + +#[test] +fn positional_order_survives_cache_and_generation() { + let txt = "usage: git clone [] [--] [directory]\n"; + let result = to_owned_result(&parse(txt)); + assert_eq!( + result + .positionals + .iter() + .map(|(name, _)| name.as_str()) + .collect::>(), + vec!["repository", "directory"] + ); + + let json = json_of_result("help", &result); + let value = serde_json::from_str(&json).expect("cache json"); + let cached = result_from_json(&value); + assert_eq!( + cached + .positionals + .iter() + .map(|(name, _)| name.as_str()) + .collect::>(), + vec!["repository", "directory"] + ); + + let nu = generate_extern("git clone", &cached); + let repository = nu + .find("repository: string") + .expect("repository positional"); + let directory = nu.find("directory?: path").expect("directory positional"); + assert!(repository < directory, "nu = {nu}"); +} + +#[test] +fn nushell_from_manpage() { + let groff = r#".SH OPTIONS +.TP +\fB\-a\fR, \fB\-\-all\fR +do not ignore entries starting with . 
+.TP +\fB\-\-block\-size\fR=\fISIZE\fR +scale sizes by SIZE +.SH AUTHOR +"#; + let result = parse_manpage_string(groff); + let nu = generate_extern("ls", &result); + assert!(nu.contains("export extern \"ls\""), "nu = {nu}"); + assert!(nu.contains("--all(-a)"), "nu = {nu}"); + assert!(nu.contains("--block-size: string"), "nu = {nu}"); +} + +#[test] +fn nushell_module() { + let r = parse(" -v, --verbose verbose output\n"); + let nu = generate_module("myapp", &to_owned_result(&r)); + assert!(nu.contains("module myapp-completions"), "nu = {nu}"); + assert!(nu.contains("export extern \"myapp\""), "nu = {nu}"); + assert!(nu.contains("--verbose(-v)"), "nu = {nu}"); +} + +#[test] +fn dedup_entries_help() { + let txt = " -v, --verbose verbose output\n --verbose verbose mode\n -v be verbose\n"; + let r = parse(txt); + let nu = generate_extern("test", &to_owned_result(&r)); + let count = nu.matches("--verbose").count(); + assert_eq!(count, 1, "expected --verbose to appear once, nu = {nu}"); + assert!(nu.contains("--verbose(-v)"), "nu = {nu}"); +} + +#[test] +fn dedup_manpage_entries() { + let groff = r#".SH OPTIONS +.TP +\fB\-v\fR, \fB\-\-verbose\fR +Be verbose. +.SH DESCRIPTION +Use \fB\-v\fR for verbose output. +Use \fB\-\-verbose\fR to see more. 
+"#; + let result = parse_manpage_string(groff); + let nu = generate_extern("test", &result); + assert!(nu.contains("--verbose(-v)"), "nu = {nu}"); + let verbose_lines: Vec<&str> = nu.lines().filter(|l| l.contains("verbose")).collect(); + assert_eq!( + verbose_lines.len(), + 1, + "expected 1 verbose line, got: {verbose_lines:?}" + ); +} + +#[test] +fn nu_file_parsing() { + let nu_source = r#"module completions { + + # Unofficial CLI tool + export extern mytool [ + --help(-h) # Print help + --version(-V) # Print version + ] + + # List all items + export extern "mytool list" [ + --raw # Output as JSON + --format(-f): string # Output format + --help(-h) # Print help + name?: string # Filter by name + ] + +} + +use completions * +"#; + let r = parse_nu_completions("mytool", nu_source); + assert_eq!(r.entries.len(), 2, "entries: {:?}", r.entries); + assert!( + !r.subcommands.is_empty(), + "subcommands: {:?}", + r.subcommands + ); + assert!(r.subcommands.iter().any(|sc| sc.name == "list")); + assert_eq!(r.description, "Unofficial CLI tool"); + + let r2 = parse_nu_completions("mytool list", nu_source); + assert_eq!(r2.entries.len(), 3, "list entries: {:?}", r2.entries); + let has_format = r2 + .entries + .iter() + .any(|e| matches!(&e.switch, OwnedSwitch::Both('f', l) if l == "format")); + assert!( + has_format, + "list should have --format(-f): {:?}", + r2.entries + ); + assert!(!r2.positionals.is_empty(), "list should have a positional"); +} + +#[test] +fn self_listing_detection() { + let txt = r#"systemctl [OPTIONS...] COMMAND ... + +Unit Commands: + start UNIT... Start (activate) one or more units + stop UNIT... Stop (deactivate) one or more units + status [PATTERN...] 
Show runtime status + +Options: + --user Talk to the user service manager + --system Talk to the system service manager +"#; + let r = parse(txt); + let has_start = r.subcommands.iter().any(|sc| sc.name == "start"); + assert!( + has_start, + "expected start in subcommands: {:?}", + r.subcommands.iter().map(|sc| sc.name).collect::>() + ); + assert!(r.entries.len() >= 2); +} diff --git a/tests/runtime_complete.rs b/tests/runtime_complete.rs new file mode 100644 index 0000000..9d209b7 --- /dev/null +++ b/tests/runtime_complete.rs @@ -0,0 +1,500 @@ +use std::fs; +use std::os::unix::fs::PermissionsExt; +use std::process::Command; +use std::time::{SystemTime, UNIX_EPOCH}; + +use inshellah::parsers::manpage::{ManpageEntry, ManpageResult, ManpageSubcommand, OwnedSwitch}; +use inshellah::store::write_result; + +fn unique_temp_dir(name: &str) -> std::path::PathBuf { + let nanos = SystemTime::now() + .duration_since(UNIX_EPOCH) + .expect("system time") + .as_nanos(); + std::env::temp_dir().join(format!("{name}-{}-{nanos}", std::process::id())) +} + +#[test] +fn complete_scrapes_missing_subcommand_when_parent_is_cached() { + let root = unique_temp_dir("inshellah-runtime-complete"); + let bin_dir = root.join("bin"); + let cache_dir = root.join("cache"); + fs::create_dir_all(&bin_dir).expect("bin dir"); + fs::create_dir_all(&cache_dir).expect("cache dir"); + + let fakecmd = bin_dir.join("fakecmd"); + fs::write( + &fakecmd, + r#"#!/bin/sh +if [ "$1" = "clone" ]; then + if [ "$2" = "--help" ] || [ "$2" = "-h" ]; then + cat <<'EOF' +Usage: fakecmd clone [OPTIONS] [directory] + +Options: + --depth clone depth + -v, --verbose verbose +EOF + exit 0 + fi +fi + +if [ "$1" = "--help" ] || [ "$1" = "-h" ]; then + cat <<'EOF' +Usage: fakecmd [OPTIONS] COMMAND + +Commands: + clone Clone a repository + +Options: + -h, --help show help +EOF + exit 0 +fi + +exit 2 +"#, + ) + .expect("write fakecmd"); + let mut perms = fs::metadata(&fakecmd).expect("metadata").permissions(); + 
perms.set_mode(0o755); + fs::set_permissions(&fakecmd, perms).expect("chmod"); + + let parent = ManpageResult { + entries: Vec::new(), + subcommands: vec![ManpageSubcommand { + name: "clone".to_string(), + desc: "Clone a repository".to_string(), + }], + positionals: Vec::new(), + description: String::new(), + }; + write_result(&cache_dir, "fakecmd", "help", &parent).expect("parent cache"); + + let old_path = std::env::var_os("PATH").unwrap_or_default(); + let output = Command::new(env!("CARGO_BIN_EXE_inshellah")) + .arg("complete") + .arg("--dir") + .arg(&cache_dir) + .arg("--timeout-ms") + .arg("1000") + .arg("fakecmd") + .arg("clone") + .arg("--") + .env( + "PATH", + format!("{}:{}", bin_dir.display(), old_path.to_string_lossy()), + ) + .output() + .expect("run inshellah complete"); + + assert!( + output.status.success(), + "stderr = {}", + String::from_utf8_lossy(&output.stderr) + ); + let stdout = String::from_utf8(output.stdout).expect("stdout"); + assert!(stdout.contains("--depth"), "stdout = {stdout}"); + assert!( + cache_dir.join("fakecmd_clone.json").is_file(), + "subcommand cache was not written" + ); + + let _ = fs::remove_dir_all(root); +} + +#[test] +fn complete_does_not_scan_path_at_command_position() { + let root = unique_temp_dir("inshellah-command-position-complete"); + let bin_dir = root.join("bin"); + let cache_dir = root.join("cache"); + fs::create_dir_all(&bin_dir).expect("bin dir"); + fs::create_dir_all(&cache_dir).expect("cache dir"); + + let fake_git = bin_dir.join("git"); + fs::write(&fake_git, "#!/bin/sh\nexit 0\n").expect("write fake git"); + let mut perms = fs::metadata(&fake_git).expect("metadata").permissions(); + perms.set_mode(0o755); + fs::set_permissions(&fake_git, perms).expect("chmod"); + + let output = Command::new(env!("CARGO_BIN_EXE_inshellah")) + .arg("complete") + .arg("--dir") + .arg(&cache_dir) + .arg("gi") + .env("PATH", &bin_dir) + .output() + .expect("run inshellah complete"); + + assert!( + output.status.success(), + 
"stderr = {}", + String::from_utf8_lossy(&output.stderr) + ); + let stdout = String::from_utf8(output.stdout).expect("stdout"); + assert_eq!(stdout.trim(), "null", "stdout = {stdout}"); + + let _ = fs::remove_dir_all(root); +} + +#[test] +fn complete_uses_boundary_aware_fuzzy_ranking() { + let root = unique_temp_dir("inshellah-fuzzy-complete"); + let cache_dir = root.join("cache"); + fs::create_dir_all(&cache_dir).expect("cache dir"); + + let result = ManpageResult { + entries: Vec::new(), + subcommands: vec![ + ManpageSubcommand { + name: "load".to_string(), + desc: "load something".to_string(), + }, + ManpageSubcommand { + name: "clone".to_string(), + desc: "clone something".to_string(), + }, + ], + positionals: Vec::new(), + description: String::new(), + }; + write_result(&cache_dir, "demo", "help", &result).expect("cache"); + + let output = Command::new(env!("CARGO_BIN_EXE_inshellah")) + .arg("complete") + .arg("--dir") + .arg(&cache_dir) + .arg("demo") + .arg("lo") + .output() + .expect("run inshellah complete"); + + assert!( + output.status.success(), + "stderr = {}", + String::from_utf8_lossy(&output.stderr) + ); + let stdout = String::from_utf8(output.stdout).expect("stdout"); + let load_pos = stdout.find(r#""value":"load""#).unwrap_or(usize::MAX); + let clone_pos = stdout.find(r#""value":"clone""#).unwrap_or(usize::MAX); + assert!( + load_pos < clone_pos, + "expected boundary match to outrank substring match, stdout = {stdout}" + ); + + let _ = fs::remove_dir_all(root); +} + +#[test] +fn complete_returns_flags_only_after_hyphen() { + let root = unique_temp_dir("inshellah-flag-prefix-complete"); + let cache_dir = root.join("cache"); + fs::create_dir_all(&cache_dir).expect("cache dir"); + + let result = ManpageResult { + entries: vec![ManpageEntry { + switch: OwnedSwitch::Long("verbose".to_string()), + param: None, + desc: "verbose output".to_string(), + }], + subcommands: Vec::new(), + positionals: Vec::new(), + description: String::new(), + }; + 
write_result(&cache_dir, "demo", "help", &result).expect("cache"); + + let argument_output = Command::new(env!("CARGO_BIN_EXE_inshellah")) + .arg("complete") + .arg("--dir") + .arg(&cache_dir) + .arg("demo") + .arg("") + .output() + .expect("run inshellah complete"); + assert!( + argument_output.status.success(), + "stderr = {}", + String::from_utf8_lossy(&argument_output.stderr) + ); + let argument_stdout = String::from_utf8(argument_output.stdout).expect("stdout"); + assert_eq!(argument_stdout.trim(), "null", "stdout = {argument_stdout}"); + + let flag_output = Command::new(env!("CARGO_BIN_EXE_inshellah")) + .arg("complete") + .arg("--dir") + .arg(&cache_dir) + .arg("demo") + .arg("--") + .output() + .expect("run inshellah complete"); + assert!( + flag_output.status.success(), + "stderr = {}", + String::from_utf8_lossy(&flag_output.stderr) + ); + let flag_stdout = String::from_utf8(flag_output.stdout).expect("stdout"); + assert!( + flag_stdout.contains(r#""value":"--verbose""#), + "stdout = {flag_stdout}" + ); + + let _ = fs::remove_dir_all(root); +} + +#[test] +fn complete_resolves_absolute_path_after_elevation_wrapper() { + let root = unique_temp_dir("inshellah-absolute-elevation-complete"); + let bin_dir = root.join("bin"); + let cache_dir = root.join("cache"); + fs::create_dir_all(&bin_dir).expect("bin dir"); + fs::create_dir_all(&cache_dir).expect("cache dir"); + + let fakecmd = bin_dir.join("fakecmd"); + fs::write( + &fakecmd, + r#"#!/bin/sh +if [ "$1" = "--help" ] || [ "$1" = "-h" ]; then + printf '%s\n' 'Usage: fakecmd [OPTIONS]' '' 'Options:' ' --verbose verbose output' + exit 0 +fi +exit 2 +"#, + ) + .expect("write fakecmd"); + let mut perms = fs::metadata(&fakecmd).expect("metadata").permissions(); + perms.set_mode(0o755); + fs::set_permissions(&fakecmd, perms).expect("chmod"); + + let output = Command::new(env!("CARGO_BIN_EXE_inshellah")) + .arg("complete") + .arg("--dir") + .arg(&cache_dir) + .arg("--timeout-ms") + .arg("1000") + .arg("sudo") + 
.arg(&fakecmd) + .arg("--") + .env("PATH", "") + .output() + .expect("run inshellah complete"); + + assert!( + output.status.success(), + "stderr = {}", + String::from_utf8_lossy(&output.stderr) + ); + let stdout = String::from_utf8(output.stdout).expect("stdout"); + assert!( + stdout.contains(r#""value":"--verbose""#), + "stdout = {stdout}" + ); + + let _ = fs::remove_dir_all(root); +} + +#[test] +fn complete_adb_dynamic_values_use_live_devices_and_packages() { + let root = unique_temp_dir("inshellah-adb-dynamic-complete"); + let bin_dir = root.join("bin"); + let cache_dir = root.join("cache"); + fs::create_dir_all(&bin_dir).expect("bin dir"); + fs::create_dir_all(&cache_dir).expect("cache dir"); + + let adb = bin_dir.join("adb"); + fs::write( + &adb, + r#"#!/bin/sh +selector="" +case "$1" in + -s|--serial|--one-device) + selector="$2" + shift 2 + ;; + -t|--transport-id) + selector="transport:$2" + shift 2 + ;; + --serial=*) + selector="${1#--serial=}" + shift + ;; + --one-device=*) + selector="${1#--one-device=}" + shift + ;; + --transport-id=*) + selector="transport:${1#--transport-id=}" + shift + ;; +esac + +if [ "$1" = "devices" ] && [ "$2" = "-l" ]; then + printf '%s\n' 'List of devices attached' + printf '%s\n' 'emulator-5554 device product:sdk_gphone_x86 model:Pixel_8 device:emu transport_id:1' + printf '%s\n' 'R58M123456 device product:oriole model:Pixel_6 device:oriole transport_id:2' + printf '%s\n' 'offline-1 offline transport_id:3' + exit 0 +fi + +if [ "$1" = "shell" ] && [ "$2" = "pm" ] && [ "$3" = "list" ] && [ "$4" = "packages" ]; then + case "$selector" in + emulator-5554) + printf '%s\n' 'package:com.example.emu' + printf '%s\n' 'package:org.example.shared' + ;; + transport:2) + printf '%s\n' 'package:com.example.transport' + printf '%s\n' 'package:org.example.transport' + ;; + *) + printf '%s\n' 'package:com.default.app' + printf '%s\n' 'package:/data/app/org.default.path/base.apk=org.default.path' + ;; + esac + exit 0 +fi + +exit 2 +"#, + ) + 
.expect("write adb"); + let mut perms = fs::metadata(&adb).expect("metadata").permissions(); + perms.set_mode(0o755); + fs::set_permissions(&adb, perms).expect("chmod"); + + let run_complete = |args: &[&str]| -> String { + let mut cmd = Command::new(env!("CARGO_BIN_EXE_inshellah")); + cmd.arg("complete") + .arg("--dir") + .arg(&cache_dir) + .arg("--timeout-ms") + .arg("1000"); + for arg in args { + cmd.arg(arg); + } + let output = cmd + .env("PATH", &bin_dir) + .output() + .expect("run inshellah complete"); + assert!( + output.status.success(), + "stderr = {}", + String::from_utf8_lossy(&output.stderr) + ); + String::from_utf8(output.stdout).expect("stdout") + }; + + let stdout = run_complete(&["adb", "-s", ""]); + assert!( + stdout.contains(r#""value":"emulator-5554""#), + "stdout = {stdout}" + ); + assert!( + stdout.contains(r#""description":"device sdk gphone x86 Pixel 8""#), + "stdout = {stdout}" + ); + assert!( + stdout.contains(r#""value":"R58M123456""#), + "stdout = {stdout}" + ); + assert!( + stdout.contains(r#""value":"offline-1""#), + "stdout = {stdout}" + ); + + let prefixed_stdout = run_complete(&["adb", "--serial=R5"]); + assert!( + prefixed_stdout.contains(r#""value":"--serial=R58M123456""#), + "stdout = {prefixed_stdout}" + ); + assert!( + !prefixed_stdout.contains(r#""value":"--serial=emulator-5554""#), + "stdout = {prefixed_stdout}" + ); + + let one_device_stdout = run_complete(&["adb", "--one-device", ""]); + assert!( + one_device_stdout.contains(r#""value":"emulator-5554""#), + "stdout = {one_device_stdout}" + ); + + let transport_stdout = run_complete(&["adb", "-t", ""]); + assert!( + transport_stdout.contains(r#""value":"1""#), + "stdout = {transport_stdout}" + ); + assert!( + transport_stdout.contains(r#""description":"emulator-5554 device sdk gphone x86 Pixel 8""#), + "stdout = {transport_stdout}" + ); + assert!( + transport_stdout.contains(r#""value":"2""#), + "stdout = {transport_stdout}" + ); + + let transport_prefixed_stdout = 
run_complete(&["adb", "--transport-id=2"]); + assert!( + transport_prefixed_stdout.contains(r#""value":"--transport-id=2""#), + "stdout = {transport_prefixed_stdout}" + ); + assert!( + !transport_prefixed_stdout.contains(r#""value":"--transport-id=1""#), + "stdout = {transport_prefixed_stdout}" + ); + + let uninstall_stdout = run_complete(&["adb", "uninstall", "org"]); + assert!( + uninstall_stdout.contains(r#""value":"org.default.path""#), + "stdout = {uninstall_stdout}" + ); + assert!( + !uninstall_stdout.contains(r#""value":"com.default.app""#), + "stdout = {uninstall_stdout}" + ); + + let clear_stdout = run_complete(&["adb", "-s", "emulator-5554", "shell", "pm", "clear", ""]); + assert!( + clear_stdout.contains(r#""value":"com.example.emu""#), + "stdout = {clear_stdout}" + ); + assert!( + !clear_stdout.contains(r#""value":"com.example.transport""#), + "stdout = {clear_stdout}" + ); + + let force_stop_stdout = run_complete(&[ + "adb", + "-t", + "2", + "shell", + "am", + "force-stop", + "--user", + "0", + "com.", + ]); + assert!( + force_stop_stdout.contains(r#""value":"com.example.transport""#), + "stdout = {force_stop_stdout}" + ); + assert!( + !force_stop_stdout.contains(r#""value":"com.example.emu""#), + "stdout = {force_stop_stdout}" + ); + + let flag_value_stdout = run_complete(&["adb", "shell", "pm", "enable", "--user", ""]); + assert_eq!( + flag_value_stdout.trim(), + "null", + "stdout = {flag_value_stdout}" + ); + + let shell_flag_stdout = run_complete(&["adb", "shell", "-s", ""]); + assert_eq!( + shell_flag_stdout.trim(), + "null", + "stdout = {shell_flag_stdout}" + ); + + let _ = fs::remove_dir_all(root); +} diff --git a/tests/self_completions.rs b/tests/self_completions.rs new file mode 100644 index 0000000..14b8667 --- /dev/null +++ b/tests/self_completions.rs @@ -0,0 +1,31 @@ +use std::process::Command; + +#[test] +fn inshellah_completions_include_all_subcommands() { + let output = Command::new(env!("CARGO_BIN_EXE_inshellah")) + .arg("completions") 
+ .output() + .expect("run inshellah completions"); + + assert!( + output.status.success(), + "stderr = {}", + String::from_utf8_lossy(&output.stderr) + ); + let stdout = String::from_utf8(output.stdout).expect("stdout"); + for subcommand in [ + "index", + "manpage", + "manpage-dir", + "complete", + "query", + "dump", + "completions", + ] { + let extern_name = format!("export extern \"inshellah {subcommand}\""); + assert!( + stdout.contains(&extern_name), + "missing {extern_name}; stdout = {stdout}" + ); + } +}