inshellah/lib/manpage.ml
atagen 7f0ec8ab4d comprehensive completion generation: native, manpage, --help
Three-strategy pipeline with priority: native completion generators
(e.g. CMD completions nushell) > manpage parsing > --help fallback.
Single `generate` command produces one module-wrapped .nu file per
command. Parallel execution scaled to cores, 200ms timeouts, ELF
string scanning to skip binaries without -h support, native gzip
decompression via camlzip, SYNOPSIS-based subcommand detection,
nix3 manpage strategy, deduplication, nushell builtin exclusion.
2026-03-21 02:14:11 +11:00

550 lines
18 KiB
OCaml

open Parser
(* --- Groff escape/formatting stripper --- *)
(* [strip_groff_escapes s] returns [s] with groff inline escape sequences
   removed or replaced by a plain-text equivalent:
   - font escapes (\fX, \f(XX, \f[...]) are dropped; a space is inserted
     before a switch to italic when the previous output character is
     alphanumeric, so that an option and its argument stay separate words;
   - \- becomes a hyphen, \e and a doubled backslash become one backslash,
     backslash-space becomes a space;
   - \&, \/ and \, (zero-width) are dropped;
   - named glyphs \(xx and \[name] are mapped through one shared table
     (aq, lq, Lq, rq, Rq, em, en); unknown names are dropped;
   - size (\s), colour (\m[...]), device control (\X), string (\*) and
     number-register (\n) escapes are dropped together with their argument;
   - any other escape drops the backslash and the following character.
   A backslash that is the last character of [s] is copied unchanged. *)
let strip_groff_escapes s =
  let buf = Buffer.create (String.length s) in
  let len = String.length s in
  let i = ref 0 in
  (* Last character written; consulted by the italic word-boundary rule. *)
  let last = ref '\000' in
  let put c = Buffer.add_char buf c; last := c in
  let is_alnum c =
    (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9')
  in
  (* Plain-text rendering of a named glyph, shared by \(xx and \[name] so
     both spellings of the same glyph produce the same output. *)
  let put_named = function
    | "aq" -> put '\''
    | "lq" | "Lq" | "rq" | "Rq" -> put '"'
    | "em" | "en" -> put '-'
    | _ -> ()
  in
  (* With [!i] just past an opening bracket, advance past the closing one
     (or to the end of the string if it is missing). *)
  let skip_to_close_bracket () =
    while !i < len && s.[!i] <> ']' do incr i done;
    if !i < len then incr i
  in
  (* With [!i] on the first character after \* or \n, skip the name in any
     of its three spellings: X, (XX or [...]. *)
  let skip_register_name () =
    if !i < len then begin
      match s.[!i] with
      | '(' ->
        (* The parenthesis plus a two-character name. *)
        i := !i + 3
      | '[' ->
        incr i;
        skip_to_close_bracket ()
      | _ -> incr i
    end
  in
  while !i < len do
    if s.[!i] = '\\' && !i + 1 < len then begin
      match s.[!i + 1] with
      | 'f' ->
        (* Font escape: \fB, \fI, \fP, \fR, \f(XX, \f[...] *)
        if !i + 2 < len then begin
          let fc = s.[!i + 2] in
          (* Insert space before italic font to preserve word boundaries
             e.g. \fB--max-results\fR\fIcount\fR gives "--max-results count" *)
          if fc = 'I' && is_alnum !last then put ' ';
          if fc = '(' then
            i := !i + 5
          else if fc = '[' then begin
            i := !i + 3;
            skip_to_close_bracket ()
          end else
            i := !i + 3
        end else
          i := !i + 2
      | '-' ->
        put '-';
        i := !i + 2
      | '&' | '/' | ',' ->
        (* Zero-width characters *)
        i := !i + 2
      | '(' ->
        (* Two-character named glyph: \(aq, \(lq, \(rq, ... *)
        if !i + 3 < len then begin
          put_named (String.sub s (!i + 2) 2);
          i := !i + 4
        end else
          i := !i + 2
      | '[' ->
        (* Bracketed named glyph: \[...]; an unterminated one swallows the
           rest of the string without emitting anything. *)
        i := !i + 2;
        let start = !i in
        while !i < len && s.[!i] <> ']' do incr i done;
        if !i < len then begin
          put_named (String.sub s start (!i - start));
          incr i
        end
      | 's' ->
        (* Size escape: \sN, \s+N, \s-N with at most two digits. Quoted and
           bracketed size arguments are not recognised here. *)
        i := !i + 2;
        if !i < len && (s.[!i] = '+' || s.[!i] = '-') then incr i;
        if !i < len && s.[!i] >= '0' && s.[!i] <= '9' then incr i;
        if !i < len && s.[!i] >= '0' && s.[!i] <= '9' then incr i
      | 'm' ->
        (* Colour escape: \m[...] *)
        i := !i + 2;
        if !i < len && s.[!i] = '[' then begin
          incr i;
          skip_to_close_bracket ()
        end
      | 'X' ->
        (* Device control: \X followed by a single-quoted argument *)
        i := !i + 2;
        if !i < len && s.[!i] = '\'' then begin
          incr i;
          while !i < len && s.[!i] <> '\'' do incr i done;
          if !i < len then incr i
        end
      | '*' | 'n' ->
        (* String variable (\*) or number register (\n): X, (XX or [...] *)
        i := !i + 2;
        skip_register_name ()
      | 'e' | '\\' ->
        put '\\';
        i := !i + 2
      | ' ' ->
        put ' ';
        i := !i + 2
      | _ ->
        (* Unknown escape, skip *)
        i := !i + 2
    end else begin
      put s.[!i];
      incr i
    end
  done;
  Buffer.contents buf
(* Flatten the argument list of a font-alternating macro (.BI, .BR, .IR, ...)
   into one string. Unquoted spaces and tabs are argument separators and are
   dropped; double quotes are dropped as well, but everything between a pair
   of them (blanks included) is kept verbatim. An unterminated quote runs to
   the end of the string. *)
let strip_inline_macro_args s =
  let n = String.length s in
  let out = Buffer.create n in
  (* Two mutually recursive scanners: one for text outside quotes, one for
     text inside a quoted argument. *)
  let rec outside k =
    if k < n then begin
      match s.[k] with
      | '"' -> inside (k + 1)
      | ' ' | '\t' -> outside (k + 1)
      | c ->
        Buffer.add_char out c;
        outside (k + 1)
    end
  and inside k =
    if k < n then begin
      match s.[k] with
      | '"' -> outside (k + 1)
      | c ->
        Buffer.add_char out c;
        inside (k + 1)
    end
  in
  outside 0;
  Buffer.contents out
(* Remove groff escapes from [line], then trim leading and trailing
   whitespace from the result. *)
let strip_groff line = String.trim (strip_groff_escapes line)
(* --- Line classification --- *)
(* One line of manpage source after classification by [classify_line]. *)
type groff_line =
  | Macro of string * string (* macro name and its argument text, e.g. ("SH", "OPTIONS") or ("TP", "") *)
  | Text of string (* plain text after groff stripping *)
  | Blank (* an empty line, or a text line that is empty once stripped *)
  | Comment (* a groff comment line *)
(* True when [line] is a groff comment: it starts with dot, backslash,
   double quote, or with backslash, double quote. *)
let is_comment_line line =
  let len = String.length line in
  (len >= 3 && line.[0] = '.' && line.[1] = '\\' && line.[2] = '"')
  || (len >= 2 && line.[0] = '\\' && line.[1] = '"')

(* Classify one raw manpage source line.
   - the empty string is [Blank];
   - comment lines (see [is_comment_line]) and a bare dot-backslash are
     [Comment];
   - a line starting with a dot or a single quote is a [Macro]: the name is
     everything up to the first space or tab, the arguments are the trimmed
     remainder, with one pair of enclosing double quotes removed when the
     arguments both start and end with a double quote;
   - anything else is [Text] after groff stripping, or [Blank] when nothing
     is left once stripped. *)
let classify_line line =
  let len = String.length line in
  if len = 0 then Blank
  else if is_comment_line line then Comment
  else if len = 2 && line.[0] = '.' && line.[1] = '\\' then
    (* Dot-backslash with nothing after it. *)
    Comment
  else if line.[0] = '.' || line.[0] = '\'' then begin
    let rest = String.trim (String.sub line 1 (len - 1)) in
    let rest_len = String.length rest in
    (* Position of the first blank, whichever of space or tab comes first,
       so a tab-separated macro name is not glued to its first argument. *)
    let rec first_blank k =
      if k >= rest_len then None
      else
        match rest.[k] with
        | ' ' | '\t' -> Some k
        | _ -> first_blank (k + 1)
    in
    match first_blank 0 with
    | None -> Macro (rest, "")
    | Some pos ->
      let name = String.sub rest 0 pos in
      let args =
        String.trim (String.sub rest (pos + 1) (rest_len - pos - 1))
      in
      let alen = String.length args in
      let args =
        if alen >= 2 && args.[0] = '"' && args.[alen - 1] = '"' then
          String.sub args 1 (alen - 2)
        else args
      in
      Macro (name, args)
  end else begin
    match strip_groff line with
    | "" -> Blank
    | stripped -> Text stripped
  end
(* --- Section extraction --- *)
(* Classify [lines] and return the body of the section that should hold the
   option descriptions: the first .SH section whose upper-cased title
   contains OPTION, or failing that the first section titled DESCRIPTION.
   The body stops before the next .SH macro. Returns the empty list when
   neither section exists. *)
let extract_options_section lines =
  let option_re = Str.regexp_string "OPTION" in
  let normalise title = String.uppercase_ascii (String.trim title) in
  let mentions_option title =
    match Str.search_forward option_re (normalise title) 0 with
    | _ -> true
    | exception Not_found -> false
  in
  let is_description title = normalise title = "DESCRIPTION" in
  (* Lines up to, but not including, the next .SH heading. *)
  let rec body_until_sh acc = function
    | [] | Macro ("SH", _) :: _ -> List.rev acc
    | line :: tl -> body_until_sh (line :: acc) tl
  in
  (* Body of the first .SH section whose title satisfies [wanted]. *)
  let rec first_section wanted = function
    | [] -> None
    | Macro ("SH", title) :: tl when wanted title ->
      Some (body_until_sh [] tl)
    | _ :: tl -> first_section wanted tl
  in
  let classified = List.map classify_line lines in
  match first_section mentions_option classified with
  | Some section -> section
  | None ->
    (match first_section is_description classified with
     | Some section -> section
     | None -> [])
(* --- Strategy-based entry extraction --- *)
(* Consume the run of consecutive [Text] lines at the head of [lines].
   [acc] holds already collected fragments in reverse order. Returns the
   fragments joined with single spaces, paired with the remaining lines
   starting at the first non-[Text] element. *)
let collect_text_lines lines acc =
  let rec gather collected = function
    | Text s :: tl -> gather (s :: collected) tl
    | remaining -> (String.concat " " (List.rev collected), remaining)
  in
  gather acc lines
(* Build an entry from a tag line and its description. The tag is stripped
   of groff escapes and trimmed, then [switch_parser] followed by
   [param_parser] is run over its prefix. Returns [None] when the tag does
   not parse. *)
let parse_tag_to_entry tag desc =
  let cleaned = String.trim (strip_groff_escapes tag) in
  let tag_parser =
    Angstrom.lift2
      (fun switch param -> (switch, param))
      switch_parser param_parser
  in
  match
    Angstrom.parse_string ~consume:Angstrom.Consume.Prefix tag_parser cleaned
  with
  | Error _ -> None
  | Ok (switch, param) -> Some { switch; param; desc }
(* Tag text carried by a font macro line. For .B and .I the arguments are
   used as written, so spaces between words survive; for every other macro
   name the arguments are first concatenated with
   [strip_inline_macro_args]. In both cases groff escapes are then removed
   and the result trimmed. *)
let tag_of_macro name args =
  let joined =
    match name with
    | "B" | "I" -> args
    | _ -> strip_inline_macro_args args
  in
  String.trim (strip_groff_escapes joined)
(* Strategy A: .TP style (most common: GNU coreutils, help2man).
   The line after each .TP is the tag, given either as plain text or as one
   of the font macros .B/.I/.BI/.BR/.IR. The [Text] lines that follow form
   the description. A .TP followed by anything else is skipped, as are tags
   that do not parse. *)
let strategy_tp lines =
  (* Tag and remaining lines when the head of the list is a usable tag. *)
  let tag_after_tp = function
    | Text tag :: tl -> Some (tag, tl)
    | Macro ((("B" | "I" | "BI" | "BR" | "IR") as m), args) :: tl ->
      Some (tag_of_macro m args, tl)
    | _ -> None
  in
  let rec walk acc = function
    | [] -> List.rev acc
    | Macro ("TP", _) :: rest ->
      (match tag_after_tp rest with
       | None -> walk acc rest
       | Some (tag, after_tag) ->
         let desc, remaining = collect_text_lines after_tag [] in
         let acc =
           match parse_tag_to_entry tag desc with
           | Some entry -> entry :: acc
           | None -> acc
         in
         walk acc remaining)
    | _ :: rest -> walk acc rest
  in
  walk [] lines
(* Strategy B: .IP style (curl, hand-written pages).
   The argument text of each .IP macro is the tag; the [Text] lines directly
   after it are the description. Tags that do not parse are skipped. *)
let strategy_ip lines =
  let rec walk acc = function
    | [] -> List.rev acc
    | Macro ("IP", raw_tag) :: tl ->
      let tag = strip_groff_escapes raw_tag in
      let desc, remaining = collect_text_lines tl [] in
      let acc =
        match parse_tag_to_entry tag desc with
        | Some entry -> entry :: acc
        | None -> acc
      in
      walk acc remaining
    | _ :: tl -> walk acc tl
  in
  walk [] lines
(* Strategy C: .PP + .RS/.RE style (git, DocBook).
   A .PP immediately followed by a [Text] line introduces a tag. The
   description is made of the [Text] lines after the tag plus, once an .RS
   is met, the [Text] lines up to the matching .RE. Inside the .RS block
   other lines are ignored, and a .PP or .SH ends the block early without
   being consumed. *)
let strategy_pp_rs lines =
  let join parts = String.concat " " (List.rev parts) in
  (* Description text before any .RS has been seen. *)
  let rec before_rs parts = function
    | Macro ("RS", _) :: tl -> inside_rs parts tl
    | Text s :: tl -> before_rs (s :: parts) tl
    | remaining -> (join parts, remaining)
  (* Description text inside the indented block. *)
  and inside_rs parts = function
    | [] -> (join parts, [])
    | Macro ("RE", _) :: tl -> (join parts, tl)
    | Text s :: tl -> inside_rs (s :: parts) tl
    | (Macro (("PP" | "SH"), _) :: _) as remaining -> (join parts, remaining)
    | _ :: tl -> inside_rs parts tl
  in
  let rec walk acc = function
    | [] -> List.rev acc
    | Macro ("PP", _) :: Text tag :: after_tag ->
      let desc, remaining = before_rs [] after_tag in
      let acc =
        match parse_tag_to_entry tag desc with
        | Some entry -> entry :: acc
        | None -> acc
      in
      walk acc remaining
    | _ :: tl -> walk acc tl
  in
  walk [] lines
(* Strategy D: deroff fallback. Render the classified lines back to plain
   text and hand the result to the help-text parser [parse_help].
   [Text] lines are kept as they are, the font macros .B/.I/.BI/.BR/.IR
   contribute their flattened arguments, [Blank] contributes an empty line,
   and every other line is dropped. Each rendered line ends with a newline.
   Returns the empty list when [parse_help] fails. *)
let strategy_deroff_lines lines =
  let render = function
    | Text s -> Some s
    | Macro (("BI" | "BR" | "IR" | "B" | "I"), args) ->
      Some (strip_groff_escapes (strip_inline_macro_args args))
    | Blank -> Some ""
    | Macro _ | Comment -> None
  in
  let text =
    lines
    |> List.filter_map render
    |> List.map (fun rendered -> rendered ^ "\n")
    |> String.concat ""
  in
  match parse_help text with
  | Ok parsed -> parsed.entries
  | Error _ -> []
(* Strategy E: Nix3-style pages, where each option is an .IP macro that
   carries a non-empty argument, optionally wrapped in .UR/.UE hyperlink
   macros.
   Layout recognised:
   - an .IP with a non-empty argument starts an entry;
   - the [Text] lines after it (skipping .UR/.UE) form the tag;
   - an .IP with an empty argument then introduces the description, which
     runs until the next .IP with a non-empty argument, an .SS or an .SH.
     Within it, .RS/.RE blocks are skipped entirely (nesting is tracked),
     while other macros, blank lines and comments are ignored.
   When the tag is not followed by an empty-argument .IP the description is
   the empty string. Tags that do not parse are skipped. *)
let strategy_nix lines =
  let has_args args = String.trim args <> "" in
  let join parts = String.concat " " (List.rev parts) in
  let rec read_tag parts = function
    | Macro (("UR" | "UE"), _) :: tl -> read_tag parts tl
    | Text s :: tl -> read_tag (s :: parts) tl
    | remaining -> (join parts, remaining)
  in
  let rec read_body parts = function
    | [] -> (join parts, [])
    | Text s :: tl -> read_body (s :: parts) tl
    | (Macro ("IP", args) :: _) as remaining when has_args args ->
      (* Next entry starts here; leave it for the caller. *)
      (join parts, remaining)
    | (Macro (("SS" | "SH"), _) :: _) as remaining -> (join parts, remaining)
    | Macro ("RS", _) :: tl -> skip_nested 1 parts tl
    | (Macro _ | Blank | Comment) :: tl ->
      (* Includes an empty-argument .IP, i.e. a continuation paragraph. *)
      read_body parts tl
  (* Discard everything up to the .RE that closes the current .RS block. *)
  and skip_nested depth parts = function
    | [] -> (join parts, [])
    | Macro ("RE", _) :: tl ->
      if depth <= 1 then read_body parts tl
      else skip_nested (depth - 1) parts tl
    | Macro ("RS", _) :: tl -> skip_nested (depth + 1) parts tl
    | _ :: tl -> skip_nested depth parts tl
  in
  let read_description = function
    | Macro ("IP", args) :: tl when not (has_args args) -> read_body [] tl
    | remaining -> ("", remaining)
  in
  let rec walk acc = function
    | [] -> List.rev acc
    | Macro ("IP", args) :: tl when has_args args ->
      let tag, after_tag = read_tag [] tl in
      let desc, remaining = read_description after_tag in
      let acc =
        match parse_tag_to_entry tag desc with
        | Some entry -> entry :: acc
        | None -> acc
      in
      walk acc remaining
    | _ :: tl -> walk acc tl
  in
  walk [] lines
(* Number of [Macro] lines in [lines] whose macro name equals [name]. *)
let count_macro name lines =
  let is_wanted = function
    | Macro (m, _) -> String.equal m name
    | Text _ | Blank | Comment -> false
  in
  List.length (List.filter is_wanted lines)
(* Run every strategy whose trigger macros occur in [lines] and return the
   entries of the most productive one.
   Triggers: TP needs .TP; IP needs .IP; PP+RS needs both .PP and .RS; nix
   needs both .UR and .IP. The deroff strategy always runs but is only used
   when no specialised strategy produced an entry. Among specialised
   strategies the longest result wins; on equal length the preference order
   is TP, then IP, then PP+RS, then nix. *)
let extract_entries lines =
  let has name = count_macro name lines > 0 in
  let run_if applicable strategy =
    if applicable then Some (strategy lines) else None
  in
  let tp = run_if (has "TP") strategy_tp in
  let ip = run_if (has "IP") strategy_ip in
  let pp_rs = run_if (has "PP" && has "RS") strategy_pp_rs in
  let nix = run_if (has "UR" && has "IP") strategy_nix in
  let deroff = strategy_deroff_lines lines in
  (* Listed least preferred first: a later result replaces an earlier one
     of the same length. *)
  let specialized =
    List.filter_map
      (function
        | Some (_ :: _ as entries) -> Some entries
        | Some [] | None -> None)
      [ nix; pp_rs; ip; tp ]
  in
  match specialized with
  | [] -> deroff
  | _ :: _ ->
    let pick (best_len, best) entries =
      let n = List.length entries in
      if n >= best_len then (n, entries) else (best_len, best)
    in
    snd (List.fold_left pick (0, []) specialized)
(* --- SYNOPSIS command name extraction --- *)
(* Extract the command name (possibly with subcommand words) from the
   SYNOPSIS section of a manpage given as raw [lines].
   The first [Text] line, or the first .B/.BI/.BR macro line, found after
   the .SH SYNOPSIS heading and before the next .SH decides the result.
   From that line the leading space-separated words are taken for as long
   as they consist only of letters, digits, hyphen, underscore and dot, and
   do not start with an opening bracket, parenthesis, brace, angle bracket
   or hyphen. Returns [None] when there is no SYNOPSIS section, no such
   line, or no leading command word.
   Macro lines go through [tag_of_macro], so the words of a .B line stay
   separated by spaces while .BI/.BR arguments are concatenated. *)
let extract_synopsis_command_lines lines =
  let is_synopsis title =
    String.uppercase_ascii (String.trim title) = "SYNOPSIS"
  in
  let is_cmd_char = function
    | 'a'..'z' | 'A'..'Z' | '0'..'9' | '-' | '_' | '.' -> true
    | _ -> false
  in
  (* Words starting with one of these begin the argument part of the
     synopsis. Only called on non-empty words. *)
  let starts_argument w =
    match w.[0] with
    | '[' | '-' | '<' | '(' | '{' -> true
    | _ -> false
  in
  let extract_cmd line =
    let words =
      String.split_on_char ' ' (String.trim line)
      |> List.filter (fun w -> w <> "")
    in
    (* Only the short leading run of command words is kept, so plain
       recursion is fine here. *)
    let rec take = function
      | w :: rest
        when (not (starts_argument w)) && String.for_all is_cmd_char w ->
        w :: take rest
      | _ -> []
    in
    match take words with
    | [] -> None
    | cmd -> Some (String.concat " " cmd)
  in
  let command_of s = if s = "" then None else extract_cmd s in
  let rec find = function
    | [] -> None
    | Macro ("SH", title) :: rest when is_synopsis title -> collect rest
    | _ :: rest -> find rest
  and collect = function
    | [] | Macro ("SH", _) :: _ -> None
    | Text s :: _ -> command_of (String.trim s)
    | Macro ((("B" | "BI" | "BR") as m), args) :: _ ->
      command_of (tag_of_macro m args)
    | _ :: rest -> collect rest
  in
  find (List.map classify_line lines)
(* Same as [extract_synopsis_command_lines], taking the whole manpage
   source as one string that is split on newline characters. *)
let extract_synopsis_command contents =
  contents |> String.split_on_char '\n' |> extract_synopsis_command_lines
(* --- Top-level API --- *)
(* Parse a manpage given as raw source lines: locate the options section,
   extract its entries, and return them in a result whose [subcommands]
   list is empty. *)
let parse_manpage_lines lines =
  let entries = lines |> extract_options_section |> extract_entries in
  { entries; subcommands = [] }
(* Parse a manpage given as one string; the string is split on newline
   characters and passed to [parse_manpage_lines]. *)
let parse_manpage_string contents =
  contents |> String.split_on_char '\n' |> parse_manpage_lines
(* Read the whole manpage at [path] into a string. Files whose name ends in
   .gz are decompressed with [Gzip]; anything else is read as raw bytes.
   The channel is closed on every exit path, including when reading raises.
   Exceptions from opening or reading the file are not caught here. *)
let read_manpage_file path =
  if Filename.check_suffix path ".gz" then begin
    let ic = Gzip.open_in path in
    Fun.protect
      ~finally:(fun () -> Gzip.close_in ic)
      (fun () ->
        let buf = Buffer.create 8192 in
        let chunk = Bytes.create 8192 in
        (* A read of zero bytes marks the end of the stream; End_of_file is
           treated the same way. *)
        let rec drain () =
          match Gzip.input ic chunk 0 (Bytes.length chunk) with
          | 0 -> ()
          | n ->
            Buffer.add_subbytes buf chunk 0 n;
            drain ()
          | exception End_of_file -> ()
        in
        drain ();
        Buffer.contents buf)
  end else begin
    (* Binary mode keeps the byte count reported by [in_channel_length]
       consistent with what is actually read. *)
    let ic = open_in_bin path in
    Fun.protect
      ~finally:(fun () -> close_in_noerr ic)
      (fun () -> really_input_string ic (in_channel_length ic))
  end
(* Read the manpage stored at [path] with [read_manpage_file] and parse its
   contents with [parse_manpage_string]. *)
let parse_manpage_file path = parse_manpage_string (read_manpage_file path)