comprehensive completion generation: native, manpage, --help

Three-strategy pipeline with priority: native completion generators
(e.g. CMD completions nushell) > manpage parsing > --help fallback.
Single `generate` command produces one module-wrapped .nu file per
command. Parallel execution scaled to cores, 200ms timeouts, ELF
string scanning to skip binaries without -h support, native gzip
decompression via camlzip, SYNOPSIS-based subcommand detection,
nix3 manpage strategy, deduplication, nushell builtin exclusion.
This commit is contained in:
atagen 2026-03-21 02:07:46 +11:00
parent 01ccf64efc
commit 7f0ec8ab4d
9 changed files with 937 additions and 265 deletions

View file

@ -1,3 +1,3 @@
(library
(name inshellah)
(libraries angstrom angstrom-unix str unix))
(libraries angstrom angstrom-unix camlzip str unix))

View file

@ -6,6 +6,11 @@ let strip_groff_escapes s =
let buf = Buffer.create (String.length s) in
let len = String.length s in
let i = ref 0 in
let last = ref '\000' in
let put c = Buffer.add_char buf c; last := c in
let is_alnum c =
(c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9')
in
while !i < len do
if s.[!i] = '\\' && !i + 1 < len then begin
let next = s.[!i + 1] in
@ -13,10 +18,13 @@ let strip_groff_escapes s =
| 'f' ->
(* Font escape: \fB, \fI, \fP, \fR, \f(XX, \f[...] *)
if !i + 2 < len then begin
if s.[!i + 2] = '(' then
i := !i + 4 (* \f(XX *)
else if s.[!i + 2] = '[' then begin
(* \f[...] - skip to ] *)
let fc = s.[!i + 2] in
(* Insert a space before an italic font switch to preserve word boundaries,
e.g. \fB--max-results\fR\fIcount\fR -> "--max-results count" *)
if fc = 'I' && is_alnum !last then put ' ';
if fc = '(' then
i := !i + 5 (* \f(XX *)
else if fc = '[' then begin
i := !i + 3;
while !i < len && s.[!i] <> ']' do incr i done;
if !i < len then incr i
@ -25,7 +33,7 @@ let strip_groff_escapes s =
end else
i := !i + 2
| '-' ->
Buffer.add_char buf '-';
put '-';
i := !i + 2
| '&' | '/' | ',' ->
(* Zero-width characters *)
@ -35,10 +43,10 @@ let strip_groff_escapes s =
if !i + 3 < len then begin
let name = String.sub s (!i + 2) 2 in
(match name with
| "aq" -> Buffer.add_char buf '\''
| "lq" | "Lq" -> Buffer.add_char buf '"'
| "rq" | "Rq" -> Buffer.add_char buf '"'
| "em" | "en" -> Buffer.add_char buf '-'
| "aq" -> put '\''
| "lq" | "Lq" -> put '"'
| "rq" | "Rq" -> put '"'
| "em" | "en" -> put '-'
| _ -> ());
i := !i + 4
end else
@ -51,9 +59,9 @@ let strip_groff_escapes s =
if !i < len then begin
let name = String.sub s start (!i - start) in
(match name with
| "aq" -> Buffer.add_char buf '\''
| "lq" | "Lq" -> Buffer.add_char buf '"'
| "rq" | "Rq" -> Buffer.add_char buf '"'
| "aq" -> put '\''
| "lq" | "Lq" -> put '"'
| "rq" | "Rq" -> put '"'
| _ -> ());
incr i
end
@ -106,19 +114,19 @@ let strip_groff_escapes s =
incr i
end
| 'e' ->
Buffer.add_char buf '\\';
put '\\';
i := !i + 2
| '\\' ->
Buffer.add_char buf '\\';
put '\\';
i := !i + 2
| ' ' ->
Buffer.add_char buf ' ';
put ' ';
i := !i + 2
| _ ->
(* Unknown escape, skip *)
i := !i + 2
end else begin
Buffer.add_char buf s.[!i];
put s.[!i];
incr i
end
done;
@ -262,18 +270,29 @@ let parse_tag_to_entry tag desc =
| Ok (switch, param) -> Some { switch; param; desc }
| Error _ -> None
(* Turn the argument string of a font macro into plain tag text.
   [name] is the macro name without the leading dot, [args] its raw arguments.
   .B and .I keep their arguments as written (spaces preserved); every other
   macro is first passed through [strip_inline_macro_args].  In both cases
   groff escapes are then removed and the result is trimmed. *)
let tag_of_macro name args =
  let raw =
    if name = "B" || name = "I" then args
    else strip_inline_macro_args args
  in
  String.trim (strip_groff_escapes raw)
(* Strategy A: .TP style (most common — GNU coreutils, help2man) *)
let strategy_tp lines =
let rec walk lines acc =
match lines with
| [] -> List.rev acc
| Macro ("TP", _) :: rest ->
(* Next text line is the tag *)
(* Next line is the tag — could be Text or a formatting macro *)
begin match rest with
| Text tag :: rest2 ->
let (desc, rest3) = collect_text_lines rest2 [] in
let entry = parse_tag_to_entry tag desc in
walk rest3 (match entry with Some e -> e :: acc | None -> acc)
| Macro (("B" | "I" | "BI" | "BR" | "IR") as m, args) :: rest2 ->
let tag = tag_of_macro m args in
let (desc, rest3) = collect_text_lines rest2 [] in
let entry = parse_tag_to_entry tag desc in
walk rest3 (match entry with Some e -> e :: acc | None -> acc)
| _ -> walk rest acc
end
| _ :: rest -> walk rest acc
@ -352,6 +371,62 @@ let strategy_deroff_lines lines =
| Ok result -> result.entries
| Error _ -> []
(* Strategy E: Nix3-style bullet .IP with .UR/.UE hyperlinks.

   An entry starts at an .IP macro whose argument is non-blank (a "bullet").
   The tag is made of the Text lines that follow it, with .UR/.UE macros
   skipped.  The description, if any, is introduced by an .IP macro with a
   blank argument and runs until the next bullet .IP, .SS or .SH; anything
   between .RS and the matching .RE is dropped.  Tag and description are
   handed to [parse_tag_to_entry]; pairs it rejects are ignored.  Entries are
   returned in document order. *)
let strategy_nix lines =
  let is_bullet_ip args = String.trim args <> "" in
  (* Text fragments are accumulated newest-first. *)
  let join parts = String.concat " " (List.rev parts) in
  (* Tag text: Text lines, ignoring hyperlink macros; stops at anything else. *)
  let rec collect_tag parts = function
    | Macro (("UR" | "UE"), _) :: tl -> collect_tag parts tl
    | Text s :: tl -> collect_tag (s :: parts) tl
    | remaining -> (join parts, remaining)
  in
  (* Description text, up to the next bullet or section heading. *)
  let rec collect_desc_text parts = function
    | [] -> (join parts, [])
    | Text s :: tl -> collect_desc_text (s :: parts) tl
    | (Macro ("IP", args) :: _) as remaining when is_bullet_ip args ->
      (join parts, remaining)
    | (Macro (("SS" | "SH"), _) :: _) as remaining ->
      (join parts, remaining)
    | Macro ("RS", _) :: tl -> skip_rs parts 1 tl
    (* Non-bullet .IP (continuation paragraph) and every other macro, blank
       line or comment contribute nothing and are stepped over. *)
    | (Macro _ | Blank | Comment) :: tl -> collect_desc_text parts tl
  (* Drop everything up to the .RE that closes the current .RS nesting. *)
  and skip_rs parts depth = function
    | [] -> (join parts, [])
    | Macro ("RE", _) :: tl ->
      if depth <= 1 then collect_desc_text parts tl
      else skip_rs parts (depth - 1) tl
    | Macro ("RS", _) :: tl -> skip_rs parts (depth + 1) tl
    | _ :: tl -> skip_rs parts depth tl
  in
  (* A description exists only if the tag is directly followed by a
     blank-argument .IP; otherwise it is empty and no input is consumed. *)
  let collect_desc = function
    | Macro ("IP", dargs) :: tl when not (is_bullet_ip dargs) ->
      collect_desc_text [] tl
    | remaining -> ("", remaining)
  in
  let rec walk acc = function
    | [] -> List.rev acc
    | Macro ("IP", args) :: tl when is_bullet_ip args ->
      let (tag, after_tag) = collect_tag [] tl in
      let (desc, after_desc) = collect_desc after_tag in
      let acc =
        match parse_tag_to_entry tag desc with
        | Some e -> e :: acc
        | None -> acc
      in
      walk acc after_desc
    | _ :: tl -> walk acc tl
  in
  walk [] lines
(* Count macros of a given type *)
let count_macro name lines =
List.fold_left (fun n line ->
@ -370,46 +445,106 @@ let extract_entries lines =
(* Try PP+RS if both present *)
if count_macro "PP" lines > 0 && count_macro "RS" lines > 0 then
results := ("PP+RS", strategy_pp_rs lines) :: !results;
(* Try nix3 style if UR macros present *)
if count_macro "UR" lines > 0 && count_macro "IP" lines > 0 then
results := ("nix", strategy_nix lines) :: !results;
(* Always try deroff as fallback *)
results := ("deroff", strategy_deroff_lines lines) :: !results;
(* Pick the result with the most entries *)
(* Prefer specialized strategies over deroff fallback *)
let specialized =
List.filter (fun (name, entries) -> name <> "deroff" && entries <> []) !results
in
let candidates = if specialized <> [] then specialized else !results in
let best =
List.fold_left (fun (best_name, best_entries) (name, entries) ->
if List.length entries >= List.length best_entries then (name, entries)
else (best_name, best_entries)
) ("none", []) !results
) ("none", []) candidates
in
snd best
(* --- SYNOPSIS command name extraction --- *)

(* Find the command name (plus any leading subcommand words) shown in the
   SYNOPSIS section of a manpage given as raw source lines.

   The first non-empty candidate after the ".SH SYNOPSIS" heading is used:
   either a Text line or the arguments of a .B/.BI/.BR macro.  From that
   candidate, leading space-separated words are kept for as long as they
   consist only of [A-Za-z0-9._-] and do not start with '[', '-', '<', '('
   or '{'.  Returns [None] when there is no SYNOPSIS section, when the next
   .SH is reached first, or when the candidate yields no such word. *)
let extract_synopsis_command_lines lines =
  let classified = List.map classify_line lines in
  let is_synopsis name =
    String.uppercase_ascii (String.trim name) = "SYNOPSIS"
  in
  let is_cmd_char = function
    | 'a'..'z' | 'A'..'Z' | '0'..'9' | '-' | '_' | '.' -> true
    | _ -> false
  in
  (* Words opening an option or placeholder end the command name.  Only
     called on non-empty words. *)
  let starts_argument w =
    match w.[0] with
    | '[' | '-' | '<' | '(' | '{' -> true
    | _ -> false
  in
  let extract_cmd line =
    let words =
      String.split_on_char ' ' (String.trim line)
      |> List.filter (fun w -> w <> "")
    in
    let rec take = function
      | w :: rest
        when not (starts_argument w) && String.for_all is_cmd_char w ->
        w :: take rest
      | _ -> []
    in
    match take words with
    | [] -> None
    | cmd -> Some (String.concat " " cmd)
  in
  let rec find = function
    | [] -> None
    | Macro ("SH", args) :: rest when is_synopsis args -> collect rest
    | _ :: rest -> find rest
  and collect = function
    | [] -> None
    | Macro ("SH", _) :: _ -> None
    | Text s :: rest -> from_candidate (String.trim s) rest
    | Macro (("B" | "BI" | "BR") as m, args) :: rest ->
      (* Use the shared helper so that ".B git commit" keeps its spaces,
         exactly as it does when the same macro appears as a .TP tag. *)
      from_candidate (tag_of_macro m args) rest
    | _ :: rest -> collect rest
  (* An empty candidate (whitespace-only text, or a font macro without
     arguments) carries no name: keep scanning instead of giving up. *)
  and from_candidate s rest =
    if s = "" then collect rest else extract_cmd s
  in
  find classified
(* Same as [extract_synopsis_command_lines], for a manpage held in a single
   string: the text is split on '\n' first. *)
let extract_synopsis_command contents =
  contents
  |> String.split_on_char '\n'
  |> extract_synopsis_command_lines
(* --- Top-level API --- *)

(* Parse a manpage given as raw source lines.  The options section is
   isolated with [extract_options_section] and its flags are extracted with
   [extract_entries].  The result record never carries subcommands. *)
let parse_manpage_lines lines =
  let entries = lines |> extract_options_section |> extract_entries in
  { entries; subcommands = [] }
(* Parse a manpage held in a single string: split on '\n' and delegate to
   [parse_manpage_lines]. *)
let parse_manpage_string contents =
  contents
  |> String.split_on_char '\n'
  |> parse_manpage_lines
(* Parse a gzip-compressed manpage by piping it through an external
   "gzip -dc" process.  [path] is shell-quoted before being placed on the
   command line.  The output is read line by line, each line re-terminated
   with '\n', and the accumulated text is handed to [parse_manpage_string]. *)
let parse_manpage_gzipped_file path =
  let ic = Unix.open_process_in (Printf.sprintf "gzip -dc %s" (Filename.quote path)) in
  let buf = Buffer.create 4096 in
  let drain () =
    try
      while true do
        let line = input_line ic in
        Buffer.add_string buf line;
        Buffer.add_char buf '\n'
      done
    with End_of_file -> ()
  in
  (* Close the pipe and wait for the child even if reading fails.  The exit
     status is deliberately ignored: whatever was decoded is still parsed. *)
  Fun.protect
    ~finally:(fun () -> ignore (Unix.close_process_in ic : Unix.process_status))
    drain;
  parse_manpage_string (Buffer.contents buf)
let parse_manpage_file path =
if Filename.check_suffix path ".gz" then
parse_manpage_gzipped_file path
else begin
(* Read a manpage from disk and return its contents as a string.
   Paths ending in ".gz" are decompressed in-process with camlzip's [Gzip];
   any other path is read verbatim.  The channel is closed on every exit
   path, including when a read raises (the exception then propagates). *)
let read_manpage_file path =
  if Filename.check_suffix path ".gz" then begin
    let ic = Gzip.open_in path in
    Fun.protect ~finally:(fun () -> Gzip.close_in ic) (fun () ->
      let buf = Buffer.create 8192 in
      let chunk = Bytes.create 8192 in
      (* A read of 0 bytes marks the end of the stream; End_of_file is
         tolerated as an end marker too. *)
      let rec drain () =
        match Gzip.input ic chunk 0 (Bytes.length chunk) with
        | 0 -> ()
        | n ->
          Buffer.add_subbytes buf chunk 0 n;
          drain ()
        | exception End_of_file -> ()
      in
      drain ();
      Buffer.contents buf)
  end else begin
    (* Binary mode keeps [in_channel_length] consistent with the number of
       bytes actually readable on platforms that translate line endings. *)
    let ic = open_in_bin path in
    Fun.protect ~finally:(fun () -> close_in_noerr ic) (fun () ->
      really_input_string ic (in_channel_length ic))
  end
(* Load the manpage at [path] with [read_manpage_file] and parse its
   contents with [parse_manpage_string]. *)
let parse_manpage_file path =
  let contents = read_manpage_file path in
  parse_manpage_string contents

View file

@ -1,6 +1,90 @@
open Parser
(* Map a param name/type hint to a nushell type *)
(* Nushell built-in commands and keywords, grouped by first letter. *)
let nushell_builtins = [
  "alias"; "all"; "ansi"; "any"; "append"; "ast"; "attr";
  "bits"; "break"; "bytes";
  "cal"; "cd"; "char"; "chunk-by"; "chunks"; "clear"; "collect";
  "columns"; "commandline"; "compact"; "complete"; "config"; "const";
  "continue"; "cp";
  "date"; "debug"; "decode"; "def"; "default"; "describe"; "detect";
  "do"; "drop"; "du";
  "each"; "echo"; "encode"; "enumerate"; "error"; "every"; "exec";
  "exit"; "explain"; "explore"; "export"; "export-env"; "extern";
  "fill"; "filter"; "find"; "first"; "flatten"; "for"; "format"; "from";
  "generate"; "get"; "glob"; "grid"; "group-by";
  "hash"; "headers"; "help"; "hide"; "hide-env"; "histogram";
  "history"; "http";
  "if"; "ignore"; "input"; "insert"; "inspect"; "interleave"; "into";
  "is-admin"; "is-empty"; "is-not-empty"; "is-terminal"; "items";
  "job"; "join";
  "keybindings"; "kill";
  "last"; "length"; "let"; "let-env"; "lines"; "load-env"; "loop"; "ls";
  "match"; "math"; "merge"; "metadata"; "mkdir"; "mktemp"; "module";
  "move"; "mut"; "mv";
  "nu-check"; "nu-highlight";
  "open"; "overlay";
  "panic"; "par-each"; "parse"; "path"; "plugin"; "port"; "prepend"; "print"; "ps";
  "query";
  "random"; "reduce"; "reject"; "rename"; "return"; "reverse"; "rm";
  "roll"; "rotate"; "run-external";
  "save"; "schema"; "scope"; "select"; "seq"; "shuffle"; "skip"; "sleep";
  "slice"; "sort"; "sort-by"; "source"; "source-env"; "split"; "start";
  "stor"; "str"; "sys";
  "table"; "take"; "tee"; "term"; "timeit"; "to"; "touch"; "transpose";
  "try"; "tutor";
  "ulimit"; "umask"; "uname"; "uniq"; "uniq-by"; "unlet"; "update";
  "upsert"; "url"; "use";
  "values"; "version"; "view";
  "watch"; "where"; "which"; "while"; "whoami"; "window"; "with-env"; "wrap";
  "zip";
]

(* Membership table over [nushell_builtins]; built the first time it is
   forced.  Only the keys matter, every value is [true]. *)
let builtin_set =
  lazy
    (nushell_builtins
     |> List.to_seq
     |> Seq.map (fun name -> (name, true))
     |> Hashtbl.of_seq)

(* [true] iff [cmd] is exactly one of the names in [nushell_builtins]. *)
let is_nushell_builtin cmd =
  let table = Lazy.force builtin_set in
  Hashtbl.mem table cmd
(* Remove duplicate flags from [entries], keeping first-occurrence order.

   Entries are keyed by their long name when they have one ("--name") and by
   their short name otherwise ("-c").  For each key the highest-scoring entry
   wins, the earliest one on ties.  A short-only entry is dropped altogether
   when one of the winners already pairs that short letter with a long name. *)
let dedup_entries entries =
  let key_of e =
    match e.switch with
    | Short c -> Printf.sprintf "-%c" c
    | Long l | Both (_, l) -> Printf.sprintf "--%s" l
  in
  (* Richer entries score higher: short+long form (10), a parameter (5),
     and one point per 10 characters of description, capped at 5. *)
  let score e =
    let switch_pts = match e.switch with Both _ -> 10 | Short _ | Long _ -> 0 in
    let param_pts = match e.param with Some _ -> 5 | None -> 0 in
    let desc_pts = min 5 (String.length e.desc / 10) in
    switch_pts + param_pts + desc_pts
  in
  (* Pass 1: best entry per key. *)
  let best = Hashtbl.create 64 in
  let consider e =
    let k = key_of e in
    let keep_existing =
      match Hashtbl.find_opt best k with
      | Some prev -> score prev >= score e
      | None -> false
    in
    if not keep_existing then Hashtbl.replace best k e
  in
  List.iter consider entries;
  (* Short letters already represented by a winning short+long entry. *)
  let covered_shorts = Hashtbl.create 16 in
  Hashtbl.iter
    (fun _ e ->
      match e.switch with
      | Both (c, _) -> Hashtbl.replace covered_shorts c true
      | Short _ | Long _ -> ())
    best;
  (* Pass 2: emit each key once, at the position of its first occurrence. *)
  let seen = Hashtbl.create 64 in
  let emit e =
    let k = key_of e in
    let shadowed =
      match e.switch with
      | Short c -> Hashtbl.mem covered_shorts c
      | Long _ | Both _ -> false
    in
    if Hashtbl.mem seen k || shadowed then None
    else begin
      Hashtbl.add seen k true;
      Hashtbl.find_opt best k
    end
  in
  List.filter_map emit entries
let nushell_type_of_param = function
| "FILE" | "file" | "PATH" | "path" | "DIR" | "dir" | "DIRECTORY"
| "FILENAME" | "PATTERNFILE" -> "path"
@ -8,75 +92,63 @@ let nushell_type_of_param = function
| "LINES" | "DEPTH" | "depth" -> "int"
| _ -> "string"
(* Escape [s] for use inside a double-quoted nushell string: every '"' and
   '\' is prefixed with a backslash.  The surrounding quotes are NOT added
   here; callers supply them.  A string containing neither character is
   returned as is, without copying. *)
let escape_nu s =
  if not (String.contains s '"') && not (String.contains s '\\') then s
  else begin
    let buf = Buffer.create (String.length s + 4) in
    String.iter
      (fun c ->
        match c with
        | '"' -> Buffer.add_string buf "\\\""
        | '\\' -> Buffer.add_string buf "\\\\"
        | _ -> Buffer.add_char buf c)
      s;
    Buffer.contents buf
  end
(* Format a single flag as one line of a nushell extern signature.

   The line is " --long(-s)", " --long" or " -s", followed by ": <type>"
   when the flag takes a parameter (mandatory and optional parameters are
   rendered alike; the type comes from [nushell_type_of_param]).  A non-empty
   description is appended as a "# ..." comment, padded so that it starts at
   column 40 where possible and always after at least one space. *)
let format_flag entry =
  let name =
    match entry.switch with
    | Both (s, l) -> Printf.sprintf "--%s(-%c)" l s
    | Long l -> Printf.sprintf "--%s" l
    | Short s -> Printf.sprintf "-%c" s
  in
  let typed =
    match entry.param with
    | Some (Mandatory p) | Some (Optional p) -> ": " ^ nushell_type_of_param p
    | None -> ""
  in
  let flag = " " ^ name ^ typed in
  if String.length entry.desc = 0 then flag
  else
    let pad_len = max 1 (40 - String.length flag) in
    flag ^ String.make pad_len ' ' ^ "# " ^ entry.desc
(* Append the nushell extern definitions for [cmd_name] to [buf]:
   first the main "export extern" block with one line per flag (after
   [dedup_entries]), then one empty extern per subcommand whose description
   is placed as a comment on its opening line. *)
let write_extern buf cmd_name result =
  let flags = dedup_entries result.entries in
  let quoted_cmd = escape_nu cmd_name in
  Printf.bprintf buf "export extern \"%s\" [\n" quoted_cmd;
  let emit_flag e =
    Buffer.add_string buf (format_flag e);
    Buffer.add_char buf '\n'
  in
  List.iter emit_flag flags;
  Buffer.add_string buf "]\n";
  let emit_subcommand (sc : subcommand) =
    Printf.bprintf buf "\nexport extern \"%s %s\" [ # %s\n]\n"
      quoted_cmd (escape_nu sc.name) (escape_nu sc.desc)
  in
  List.iter emit_subcommand result.subcommands
(* Generate the nushell extern definitions for a command and return them as
   a string.  All formatting is delegated to [write_extern], so the text is
   identical to what [generate_module] places inside its module body. *)
let generate_extern cmd_name result =
  let buf = Buffer.create 1024 in
  write_extern buf cmd_name result;
  Buffer.contents buf
(* Generate a complete nushell module named "<cmd_name>-completions" whose
   body is the output of [generate_extern]. *)
let generate_module cmd_name result =
  let body = generate_extern cmd_name result in
  Printf.sprintf "module %s-completions {\n%s}\n" cmd_name body
(* Derive a module name from a command name: every character other than an
   ASCII letter, digit, '-' or '_' becomes '-', and "-completions" is
   appended. *)
let module_name_of cmd_name =
  let sanitize c =
    match c with
    | 'a'..'z' | 'A'..'Z' | '0'..'9' | '-' | '_' -> c
    | _ -> '-'
  in
  String.map sanitize cmd_name ^ "-completions"
(* Generate a self-activating nushell module for a command: the extern
   definitions from [write_extern] wrapped in "module <name> { ... }",
   followed by "use <name> *".  The module name comes from
   [module_name_of]. *)
let generate_module cmd_name result =
  let module_name = module_name_of cmd_name in
  let out = Buffer.create 1024 in
  Buffer.add_string out (Printf.sprintf "module %s {\n" module_name);
  write_extern out cmd_name result;
  Buffer.add_string out (Printf.sprintf "}\n\nuse %s *\n" module_name);
  Buffer.contents out
(* Generate extern definitions from a bare list of flag entries, as produced
   by manpage parsing; the command is treated as having no subcommands. *)
let generate_extern_from_entries cmd_name entries =
  generate_extern cmd_name { entries; subcommands = [] }

View file

@ -128,12 +128,15 @@ let param_parser =
space_upper_param; space_type_param ]
>>| fun a -> Some a)
(* Switch parser.  Accepted forms, tried in this order:
     -a, --all     short, comma, long   -> Both
     --all / -a    long, slash, short   -> Both
     -a            short only           -> Short
     --all         long only            -> Long
   The combined forms come first so that a lone short or long switch is only
   accepted when no partner follows. *)
let switch_parser =
  let short_then_long =
    short_switch >>= fun s ->
    comma *> long_switch >>| fun l -> Both (s, l)
  in
  let long_slash_short =
    long_switch >>= fun l ->
    inline_ws *> char '/' *> inline_ws *>
    short_switch >>| fun s -> Both (s, l)
  in
  let short_only = short_switch >>| fun s -> Short s in
  let long_only = long_switch >>| fun l -> Long l in
  choice [ short_then_long; long_slash_short; short_only; long_only ]
@ -219,10 +222,15 @@ let entry =
(* --- Subcommand parsing --- *)
(* A subcommand line: " name description" *)
(* Characters allowed in a subcommand name: ASCII letters, digits, '-'
   and '_'. *)
let is_subcommand_char c =
  match c with
  | '-' | '_' -> true
  | _ ->
    (c >= 'a' && c <= 'z')
    || (c >= 'A' && c <= 'Z')
    || (c >= '0' && c <= '9')
(* Parse one subcommand line: optional leading whitespace, a name made of
   [is_subcommand_char] characters, a gap, then the description up to the
   end of the line.  Names shorter than two characters are rejected.  The
   description is returned trimmed. *)
let subcommand_entry =
  inline_ws *>
  take_while1 is_subcommand_char >>= fun name ->
  if String.length name < 2 then fail "subcommand name too short"
  else
    (* Must have at least 2 spaces before description *)
    char ' ' *> char ' ' *> inline_ws *>
    rest_of_line <* eol >>| fun desc ->
    { name; desc = String.trim desc }