1145 lines
47 KiB
OCaml
1145 lines
47 KiB
OCaml
(* manpage.ml — parse unix manpages (groff/mdoc format) into help_result.
|
|
*
|
|
* manpages are written in roff/groff markup — a decades-old typesetting language
|
|
* used by man(1). this module strips the formatting and extracts structured data
|
|
* (flags, subcommands, positionals) from the raw groff source.
|
|
*
|
|
* there are two major manpage macro packages:
|
|
* - man (groff) — used by gnu/linux tools. uses macros like .SH, .TP, .IP, .PP
|
|
* - mdoc (bsd) — used by bsd tools. uses .Sh, .Fl, .Ar, .Op, .It, .Bl/.El
|
|
*
|
|
* this module handles both, auto-detecting the format by checking for .Sh macros.
|
|
*
|
|
* for groff manpages, flag extraction uses multiple "strategies" that target
|
|
* different common formatting patterns:
|
|
* - strategy_tp: .TP tagged paragraphs (gnu coreutils, help2man)
|
|
* - strategy_ip: .IP indented paragraphs (curl, hand-written)
|
|
* - strategy_pp_rs: .PP + .RS/.RE blocks (git, docbook)
|
|
* - strategy_nix: nix3-style bullet .IP with .UR/.UE hyperlinks
|
|
* - strategy_deroff: fallback — strip all groff, feed to help text parser
|
|
*
|
|
* the module tries all applicable strategies and picks the one that extracts
|
|
* the most flag entries, on the theory that more results = better match.
|
|
*
|
|
* key peculiarities:
|
|
* - groff has an enormous escape syntax (font changes, named characters,
|
|
* size changes, color, string variables, etc.) — strip_groff_escapes
|
|
* handles the common cases but is not exhaustive
|
|
* - font escapes like \fI (italic) need to insert spaces at word boundaries
|
|
* to prevent flag names from fusing with their parameter names
|
|
* - the strategies share the angstrom-based switch_parser from parser.ml
|
|
* for parsing the actual flag syntax out of the stripped text
|
|
*)
|
|
|
|
open Parser
|
|
|
|
(* --- shared helpers for imperative string scanning ---
|
|
* many groff parsing routines use an imperative cursor (ref int) walking
|
|
* through a string. these helpers factor out common scanning patterns. *)
|
|
|
|
(* advance pos past all characters until the delimiter is found.
|
|
* leaves pos pointing at the delimiter character, or at len if not found. *)
|
|
let skip_to_char source len pos delim =
|
|
while !pos < len && source.[!pos] <> delim do incr pos done
|
|
|
|
(* translate a groff named character escape to its text equivalent.
|
|
* groff uses two-letter codes like "aq" for apostrophe, "lq"/"rq" for
|
|
* left/right quotes, "em"/"en" for dashes. returns None for unknown names. *)
|
|
let named_char_of = function
|
|
| "aq" -> Some '\''
|
|
| "lq" | "Lq" -> Some '\x22' (* left double quote *)
|
|
| "rq" | "Rq" -> Some '\x22' (* right double quote *)
|
|
| "em" | "en" -> Some '-'
|
|
| _ -> None
|
|
|
|
(* skip a groff reference that uses one of three sub-forms:
|
|
* single char — e.g. \*X or \nX
|
|
* ( + 2 chars — e.g. \*(XX or \n(XX
|
|
* [ to ] — e.g. \*[name] or \n[name]
|
|
* used for \* (string variable) and \n (number register) escapes.
|
|
* advances pos past the consumed characters. *)
|
|
let skip_groff_reference source len pos =
|
|
if !pos < len then begin
|
|
if source.[!pos] = '(' then
|
|
pos := !pos + 3 (* skip past '(' + two-character name *)
|
|
else if source.[!pos] = '[' then begin
|
|
incr pos;
|
|
skip_to_char source len pos ']';
|
|
if !pos < len then incr pos
|
|
end else
|
|
incr pos
|
|
end
|
|
|
|
(* --- groff escape/formatting stripper ---
|
|
* groff escapes start with backslash and use various continuation syntaxes.
|
|
* this function strips them, replacing named characters (like \(aq for
|
|
* apostrophe) with their text equivalents and discarding formatting directives. *)
|
|
|
|
let strip_groff_escapes source =
|
|
let buffer = Buffer.create (String.length source) in
|
|
let len = String.length source in
|
|
let pos = ref 0 in
|
|
let prev_char = ref '\000' in
|
|
(* emit a character into the output buffer and track it as previous *)
|
|
let put char_val = Buffer.add_char buffer char_val; prev_char := char_val in
|
|
let is_alnum char_val =
|
|
(char_val >= 'a' && char_val <= 'z')
|
|
|| (char_val >= 'A' && char_val <= 'Z')
|
|
|| (char_val >= '0' && char_val <= '9')
|
|
in
|
|
while !pos < len do
|
|
if source.[!pos] = '\\' && !pos + 1 < len then begin
|
|
let next = source.[!pos + 1] in
|
|
match next with
|
|
| 'f' ->
|
|
(* font escape: \fB, \fI, \fP, \fR, \f(XX, \f[...] *)
|
|
if !pos + 2 < len then begin
|
|
let font_char = source.[!pos + 2] in
|
|
(* insert space before italic font to preserve word boundaries
|
|
e.g. \fB--max-results\fR\fIcount\fR -> "--max-results count" *)
|
|
if font_char = 'I' && is_alnum !prev_char then put ' ';
|
|
if font_char = '(' then
|
|
pos := !pos + 5 (* \f(XX — two-character font name *)
|
|
else if font_char = '[' then begin
|
|
pos := !pos + 3;
|
|
skip_to_char source len pos ']';
|
|
if !pos < len then incr pos
|
|
end else
|
|
pos := !pos + 3 (* \fX — single-character font selector *)
|
|
end else
|
|
pos := !pos + 2
|
|
| '-' ->
|
|
(* escaped hyphen-minus — emit a plain hyphen *)
|
|
put '-';
|
|
pos := !pos + 2
|
|
| '&' | '/' | ',' ->
|
|
(* zero-width characters — discard without output *)
|
|
pos := !pos + 2
|
|
| '(' ->
|
|
(* two-char named character: \(aq, \(lq, \(rq, etc. *)
|
|
if !pos + 3 < len then begin
|
|
let name = String.sub source (!pos + 2) 2 in
|
|
(match named_char_of name with
|
|
| Some char_val -> put char_val
|
|
| None -> ());
|
|
pos := !pos + 4
|
|
end else
|
|
pos := !pos + 2
|
|
| '[' ->
|
|
(* bracketed named character: \[aq], \[lq], etc. *)
|
|
pos := !pos + 2;
|
|
let start = !pos in
|
|
skip_to_char source len pos ']';
|
|
if !pos < len then begin
|
|
let name = String.sub source start (!pos - start) in
|
|
(match named_char_of name with
|
|
| Some char_val -> put char_val
|
|
| None -> ());
|
|
incr pos
|
|
end
|
|
| 's' ->
|
|
(* size escape: \sN, \s+N, \s-N — skip the numeric argument *)
|
|
pos := !pos + 2;
|
|
if !pos < len && (source.[!pos] = '+' || source.[!pos] = '-') then incr pos;
|
|
if !pos < len && source.[!pos] >= '0' && source.[!pos] <= '9' then incr pos;
|
|
if !pos < len && source.[!pos] >= '0' && source.[!pos] <= '9' then incr pos
|
|
| 'm' ->
|
|
(* color escape: \m[...] — skip the bracketed color name *)
|
|
pos := !pos + 2;
|
|
if !pos < len && source.[!pos] = '[' then begin
|
|
incr pos;
|
|
skip_to_char source len pos ']';
|
|
if !pos < len then incr pos
|
|
end
|
|
| 'X' ->
|
|
(* device control: \X'...' — skip the single-quoted payload *)
|
|
pos := !pos + 2;
|
|
if !pos < len && source.[!pos] = '\'' then begin
|
|
incr pos;
|
|
skip_to_char source len pos '\'';
|
|
if !pos < len then incr pos
|
|
end
|
|
| '*' ->
|
|
(* string variable: \*X or \*(XX or \*[...] — skip the reference *)
|
|
pos := !pos + 2;
|
|
skip_groff_reference source len pos
|
|
| 'n' ->
|
|
(* number register: \nX or \n(XX or \n[...] — skip the reference *)
|
|
pos := !pos + 2;
|
|
skip_groff_reference source len pos
|
|
| 'e' ->
|
|
(* escaped backslash literal *)
|
|
put '\\';
|
|
pos := !pos + 2
|
|
| '\\' ->
|
|
(* double backslash — emit one *)
|
|
put '\\';
|
|
pos := !pos + 2
|
|
| ' ' ->
|
|
(* escaped space — emit a regular space *)
|
|
put ' ';
|
|
pos := !pos + 2
|
|
| _ ->
|
|
(* unknown escape — skip the two-character sequence *)
|
|
pos := !pos + 2
|
|
end else begin
|
|
put source.[!pos];
|
|
incr pos
|
|
end
|
|
done;
|
|
Buffer.contents buffer
|
|
|
|
(* strip inline macro formatting: .BI, .BR, .IR, etc.
|
|
* these macros alternate between fonts for their arguments, e.g.:
|
|
* .BI "--output " "FILE"
|
|
* becomes "--outputFILE" (arguments concatenated without spaces).
|
|
*
|
|
* quoted strings are kept together (quotes stripped), but unquoted spaces
|
|
* are consumed. this matches groff's actual rendering of these macros,
|
|
* where alternating-font arguments are concatenated. *)
|
|
let strip_inline_macro_args text =
|
|
let buffer = Buffer.create (String.length text) in
|
|
let len = String.length text in
|
|
let pos = ref 0 in
|
|
while !pos < len do
|
|
if text.[!pos] = '"' then begin
|
|
(* quoted argument — copy characters up to the closing quote *)
|
|
incr pos;
|
|
while !pos < len && text.[!pos] <> '"' do
|
|
Buffer.add_char buffer text.[!pos];
|
|
incr pos
|
|
done;
|
|
if !pos < len then incr pos
|
|
end else if text.[!pos] = ' ' || text.[!pos] = '\t' then begin
|
|
(* unquoted whitespace — skip (arguments are concatenated) *)
|
|
incr pos
|
|
end else begin
|
|
(* regular character — copy to output *)
|
|
Buffer.add_char buffer text.[!pos];
|
|
incr pos
|
|
end
|
|
done;
|
|
Buffer.contents buffer
|
|
|
|
(* convenience: strip escapes and trim whitespace *)
|
|
let strip_groff line =
|
|
let text = strip_groff_escapes line in
|
|
String.trim text
|
|
|
|
(* --- line classification ---
|
|
* every line in a manpage is classified as one of four types.
|
|
* this classification drives all subsequent parsing — strategies
|
|
* pattern-match on sequences of classified lines. *)
|
|
|
|
type groff_line =
|
|
| Macro of string * string (* macro name + args, e.g. ("SH", "OPTIONS") or ("TP", "") *)
|
|
| Text of string (* plain text after groff stripping *)
|
|
| Blank (* empty line *)
|
|
| Comment (* groff comment: .backslash-quote or backslash-quote *)
|
|
|
|
(* classify a single line of manpage source.
|
|
* macro lines start with '.' or '\'' (groff alternate control char).
|
|
* the macro name is split from its arguments at the first space/tab.
|
|
* arguments wrapped in double quotes are unquoted. *)
|
|
let classify_line line =
|
|
let len = String.length line in
|
|
if len = 0 then Blank
|
|
else if len >= 2 && line.[0] = '.' && line.[1] = '\\' && (len < 3 || line.[2] = '"') then
|
|
Comment
|
|
else if len >= 3 && line.[0] = '\\' && line.[1] = '"' then
|
|
Comment
|
|
else if line.[0] = '.' || line.[0] = '\'' then begin
|
|
(* macro line — extract macro name and arguments *)
|
|
let rest = String.sub line 1 (len - 1) in
|
|
let rest = String.trim rest in
|
|
(* split into macro name and arguments at the first whitespace *)
|
|
let space_pos =
|
|
try Some (String.index rest ' ')
|
|
with Not_found ->
|
|
try Some (String.index rest '\t')
|
|
with Not_found -> None
|
|
in
|
|
match space_pos with
|
|
| Some split_at ->
|
|
let name = String.sub rest 0 split_at in
|
|
let args = String.trim (String.sub rest (split_at + 1) (String.length rest - split_at - 1)) in
|
|
(* strip surrounding quotes from arguments *)
|
|
let args =
|
|
let alen = String.length args in
|
|
if alen >= 2 && args.[0] = '"' && args.[alen - 1] = '"' then
|
|
String.sub args 1 (alen - 2)
|
|
else args
|
|
in
|
|
Macro (name, args)
|
|
| None ->
|
|
Macro (rest, "")
|
|
end else begin
|
|
let stripped = strip_groff line in
|
|
if String.length stripped = 0 then Blank
|
|
else Text stripped
|
|
end
|
|
|
|
(* refined comment detection — the base classify_line may miss some comment
|
|
* forms, so this wrapper checks more carefully before falling through to
|
|
* the general classifier. *)
|
|
let is_comment_line line =
|
|
let len = String.length line in
|
|
(len >= 3 && line.[0] = '.' && line.[1] = '\\' && line.[2] = '"')
|
|
|| (len >= 2 && line.[0] = '\\' && line.[1] = '"')
|
|
|
|
let classify_line line =
|
|
if is_comment_line line then Comment
|
|
else classify_line line
|
|
|
|
(* --- section extraction ---
|
|
* manpages are divided into sections by .SH macros. the OPTIONS section
|
|
* contains the flag definitions we want. if there's no OPTIONS section,
|
|
* we fall back to DESCRIPTION (some simple tools put flags there).
|
|
*
|
|
* old-style nix manpages (nix-build, nix-env-install, etc.) split flags
|
|
* across multiple .SH sections with option-like names: e.g. "Options" for
|
|
* command-specific flags and "Common Options" for flags shared by all nix
|
|
* commands. collecting only the first such section misses the majority of
|
|
* flags, so we collect and concatenate all option-like sections. *)
|
|
|
|
let extract_options_section lines =
|
|
let classified = List.map classify_line lines in
|
|
(* collect lines until the next .SH header, returning (content, rest)
|
|
* where rest starts at the .SH line (or is empty if at end of file). *)
|
|
let rec collect_section lines acc =
|
|
match lines with
|
|
| [] -> (List.rev acc, [])
|
|
| Macro ("SH", _) :: _ -> (List.rev acc, lines)
|
|
| line :: rest -> collect_section rest (line :: acc)
|
|
in
|
|
(* test whether a section name looks like an options section.
|
|
* matches "OPTIONS", "COMMON OPTIONS", "GLOBAL OPTIONS", etc. *)
|
|
let is_options_section name =
|
|
let upper = String.uppercase_ascii (String.trim name) in
|
|
upper = "OPTIONS"
|
|
|| (String.length upper > 0 &&
|
|
try let _ = Str.search_forward (Str.regexp_string "OPTION") upper 0 in true
|
|
with Not_found -> false)
|
|
in
|
|
(* collect from all option-like .SH sections and concatenate them.
|
|
* handles the common nix pattern where "Options" and "Common Options"
|
|
* are separate .SH sections but both contain relevant flags.
|
|
*
|
|
* a synthetic Macro("SH","") separator is inserted between sections so
|
|
* that collect_desc_text (which stops on SH/SS) does not let a description
|
|
* from the last entry in one section bleed into the intro text of the next. *)
|
|
let rec find_all_options lines acc =
|
|
match lines with
|
|
| [] -> acc
|
|
| Macro ("SH", args) :: rest when is_options_section args ->
|
|
let (section, remaining) = collect_section rest [] in
|
|
let sep = if acc = [] then [] else [Macro ("SH", "")] in
|
|
find_all_options remaining (acc @ sep @ section)
|
|
| _ :: rest -> find_all_options rest acc
|
|
in
|
|
(* fallback: DESCRIPTION section for simple tools that put flags there *)
|
|
let rec find_description = function
|
|
| [] -> []
|
|
| Macro ("SH", args) :: rest
|
|
when String.uppercase_ascii (String.trim args) = "DESCRIPTION" ->
|
|
fst (collect_section rest [])
|
|
| _ :: rest -> find_description rest
|
|
in
|
|
match find_all_options classified [] with
|
|
| [] -> find_description classified
|
|
| sections -> sections
|
|
|
|
(* --- strategy-based entry extraction ---
|
|
* rather than a single monolithic parser, we use multiple "strategies" that
|
|
* each target a specific groff formatting pattern. this is necessary because
|
|
* manpage authors use very different macro combinations for the same purpose.
|
|
*
|
|
* the shared building blocks:
|
|
* - collect_text_lines: gather consecutive Text lines into one description string
|
|
* - parse_tag_to_entry: run the angstrom switch parser on a tag string to
|
|
* extract the flag definition. this reuses the same parser that handles
|
|
* --help output, giving consistent extraction across both sources.
|
|
* - tag_of_macro: extract the "tag" text from formatting macros like .B, .BI, etc.
|
|
*)
|
|
|
|
(* collect consecutive text lines, joining them with spaces *)
|
|
let rec collect_text_lines lines acc =
|
|
match lines with
|
|
| Text text :: rest -> collect_text_lines rest (text :: acc)
|
|
| _ -> (String.concat " " (List.rev acc), lines)
|
|
|
|
(* attempt to parse a tag string (e.g. "-v, --verbose FILE") into an entry.
|
|
* uses the angstrom switch_parser + param_parser from parser.ml.
|
|
* returns None if the tag doesn't look like a flag definition. *)
|
|
let parse_tag_to_entry tag desc =
|
|
let tag = strip_groff_escapes tag in
|
|
let tag = String.trim tag in
|
|
match Angstrom.parse_string ~consume:Angstrom.Consume.Prefix
|
|
(Angstrom.lift2 (fun sw p -> (sw, p)) switch_parser param_parser) tag with
|
|
| Ok (switch, param) -> Some { switch; param; desc }
|
|
| Error _ -> None
|
|
|
|
(* extract tag text from a macro line.
|
|
* .B and .I preserve spaces (single argument); .BI, .BR, .IR alternate
|
|
* fonts and concatenate arguments. *)
|
|
let tag_of_macro name args =
|
|
match name with
|
|
| "B" | "I" -> strip_groff_escapes args |> String.trim
|
|
| _ -> strip_inline_macro_args args |> strip_groff_escapes |> String.trim
|
|
|
|
(* strategy a: .TP style (most common — gnu coreutils, help2man).
|
|
* .TP introduces a tagged paragraph: the next line is the "tag" (flag name)
|
|
* and subsequent text lines are the description. the tag can be plain text
|
|
* or wrapped in a formatting macro (.B, .BI, etc.).
|
|
*
|
|
* example groff:
|
|
* .TP
|
|
* \fB\-v\fR, \fB\-\-verbose\fR
|
|
* increase verbosity *)
|
|
let strategy_tp lines =
|
|
let rec walk lines acc =
|
|
match lines with
|
|
| [] -> List.rev acc
|
|
| Macro ("TP", _) :: rest ->
|
|
(* next line is the tag — could be Text or a formatting macro *)
|
|
begin match rest with
|
|
| Text tag :: rest2 ->
|
|
let (desc, rest3) = collect_text_lines rest2 [] in
|
|
let entry = parse_tag_to_entry tag desc in
|
|
walk rest3 (match entry with Some e -> e :: acc | None -> acc)
|
|
| Macro (("B" | "I" | "BI" | "BR" | "IR") as macro_name, args) :: rest2 ->
|
|
let tag = tag_of_macro macro_name args in
|
|
let (desc, rest3) = collect_text_lines rest2 [] in
|
|
let entry = parse_tag_to_entry tag desc in
|
|
walk rest3 (match entry with Some e -> e :: acc | None -> acc)
|
|
| _ -> walk rest acc
|
|
end
|
|
| _ :: rest -> walk rest acc
|
|
in
|
|
walk lines []
|
|
|
|
(* strategy b: .IP style (curl, hand-written manpages).
|
|
* .IP takes an inline tag argument: .IP "-v, --verbose"
|
|
* the description follows as text lines. simpler than .TP because
|
|
* the tag is on the macro line itself. *)
|
|
let strategy_ip lines =
|
|
let rec walk lines acc =
|
|
match lines with
|
|
| [] -> List.rev acc
|
|
| Macro ("IP", tag) :: rest ->
|
|
let tag = strip_groff_escapes tag in
|
|
let (desc, rest2) = collect_text_lines rest [] in
|
|
let entry = parse_tag_to_entry tag desc in
|
|
walk rest2 (match entry with Some e -> e :: acc | None -> acc)
|
|
| _ :: rest -> walk rest acc
|
|
in
|
|
walk lines []
|
|
|
|
(* strategy c: .PP + .RS/.RE style (git, docbook-generated manpages).
|
|
* flag entries are introduced by .PP (paragraph), with the flag name as
|
|
* plain text, followed by a .RS (indent) block containing the description,
|
|
* closed by .RE (de-indent). this is common in docbook-to-manpage toolchains. *)
|
|
let strategy_pp_rs lines =
|
|
let rec walk lines acc =
|
|
match lines with
|
|
| [] -> List.rev acc
|
|
| Macro ("PP", _) :: rest ->
|
|
begin match rest with
|
|
| Text tag :: rest2 ->
|
|
(* look for .RS ... text ... .RE *)
|
|
let rec collect_rs lines desc_acc =
|
|
match lines with
|
|
| Macro ("RS", _) :: rest3 ->
|
|
collect_in_rs rest3 desc_acc
|
|
| Text text :: rest3 ->
|
|
(* sometimes description follows directly *)
|
|
collect_rs rest3 (text :: desc_acc)
|
|
| _ -> (String.concat " " (List.rev desc_acc), lines)
|
|
and collect_in_rs lines desc_acc =
|
|
match lines with
|
|
| Macro ("RE", _) :: rest3 ->
|
|
(String.concat " " (List.rev desc_acc), rest3)
|
|
| Text text :: rest3 ->
|
|
collect_in_rs rest3 (text :: desc_acc)
|
|
| Macro ("PP", _) :: _ | Macro ("SH", _) :: _ ->
|
|
(String.concat " " (List.rev desc_acc), lines)
|
|
| _ :: rest3 -> collect_in_rs rest3 desc_acc
|
|
| [] -> (String.concat " " (List.rev desc_acc), [])
|
|
in
|
|
let (desc, rest3) = collect_rs rest2 [] in
|
|
let entry = parse_tag_to_entry tag desc in
|
|
walk rest3 (match entry with Some e -> e :: acc | None -> acc)
|
|
| _ -> walk rest acc
|
|
end
|
|
| _ :: rest -> walk rest acc
|
|
in
|
|
walk lines []
|
|
|
|
(* strategy d: deroff fallback — strip all groff markup, then feed the
|
|
* resulting plain text through the --help parser from parser.ml.
|
|
* this is the last resort when no structured macro pattern is recognized.
|
|
* it works surprisingly well for simple manpages but may miss entries
|
|
* in heavily formatted ones. *)
|
|
let strategy_deroff_lines lines =
|
|
let buffer = Buffer.create 256 in
|
|
List.iter (fun line ->
|
|
match line with
|
|
| Text text ->
|
|
Buffer.add_string buffer text;
|
|
Buffer.add_char buffer '\n'
|
|
| Macro (("BI" | "BR" | "IR" | "B" | "I"), args) ->
|
|
let text = strip_inline_macro_args args in
|
|
let text = strip_groff_escapes text in
|
|
Buffer.add_string buffer text;
|
|
Buffer.add_char buffer '\n'
|
|
| Blank -> Buffer.add_char buffer '\n'
|
|
| _ -> ()
|
|
) lines;
|
|
let text = Buffer.contents buffer in
|
|
match parse_help text with
|
|
| Ok result -> result.entries
|
|
| Error _ -> []
|
|
|
|
(* strategy e: nix3-style bullet .IP with .UR/.UE hyperlinks.
|
|
* nix's manpages use .IP with bullet markers for flag entries, interleaved
|
|
* with .UR/.UE hyperlink macros. the flag tag is in text lines after the
|
|
* bullet .IP, and the description follows a non-bullet .IP marker.
|
|
*
|
|
* nix manpages nest .RS/.RE blocks inside descriptions for sub-examples.
|
|
* the skip_rs helper tracks nesting depth to skip these without losing
|
|
* the rest of the description. *)
|
|
let strategy_nix lines =
|
|
(* a bullet .IP has non-empty args (the bullet marker) *)
|
|
let is_bullet_ip args =
|
|
String.length (String.trim args) > 0
|
|
in
|
|
let rec walk lines acc =
|
|
match lines with
|
|
| [] -> List.rev acc
|
|
| Macro ("IP", args) :: rest when is_bullet_ip args ->
|
|
(* collect tag: skip .UR/.UE macros, collect Text lines *)
|
|
let rec collect_tag lines parts =
|
|
match lines with
|
|
| Macro ("UR", _) :: rest2 -> collect_tag rest2 parts
|
|
| Macro ("UE", _) :: rest2 -> collect_tag rest2 parts
|
|
| Text text :: rest2 -> collect_tag rest2 (text :: parts)
|
|
| _ -> (String.concat " " (List.rev parts), lines)
|
|
in
|
|
let (tag, rest2) = collect_tag rest [] in
|
|
(* collect description after the description .IP marker *)
|
|
let rec collect_desc lines parts =
|
|
match lines with
|
|
| Macro ("IP", dargs) :: rest3 when not (is_bullet_ip dargs) ->
|
|
collect_desc_text rest3 parts
|
|
| _ -> (String.concat " " (List.rev parts), lines)
|
|
and collect_desc_text lines parts =
|
|
match lines with
|
|
| Text text :: rest3 -> collect_desc_text rest3 (text :: parts)
|
|
| Macro ("IP", args2) :: _ when is_bullet_ip args2 ->
|
|
(* next bullet entry — stop collecting *)
|
|
(String.concat " " (List.rev parts), lines)
|
|
| Macro (("SS" | "SH"), _) :: _ ->
|
|
(* section boundary — stop collecting *)
|
|
(String.concat " " (List.rev parts), lines)
|
|
| Macro ("RS", _) :: rest3 ->
|
|
skip_rs rest3 parts 1
|
|
| Macro ("IP", _) :: rest3 ->
|
|
(* non-bullet .IP = continuation paragraph *)
|
|
collect_desc_text rest3 parts
|
|
| Macro _ :: rest3 -> collect_desc_text rest3 parts
|
|
| Blank :: rest3 -> collect_desc_text rest3 parts
|
|
| Comment :: rest3 -> collect_desc_text rest3 parts
|
|
| [] -> (String.concat " " (List.rev parts), [])
|
|
and skip_rs lines parts depth =
|
|
match lines with
|
|
| Macro ("RE", _) :: rest3 ->
|
|
if depth <= 1 then collect_desc_text rest3 parts
|
|
else skip_rs rest3 parts (depth - 1)
|
|
| Macro ("RS", _) :: rest3 -> skip_rs rest3 parts (depth + 1)
|
|
| _ :: rest3 -> skip_rs rest3 parts depth
|
|
| [] -> (String.concat " " (List.rev parts), [])
|
|
in
|
|
let (desc, rest3) = collect_desc rest2 [] in
|
|
let entry = parse_tag_to_entry tag desc in
|
|
walk rest3 (match entry with Some e -> e :: acc | None -> acc)
|
|
| _ :: rest -> walk rest acc
|
|
in
|
|
walk lines []
|
|
|
|
(* count occurrences of a specific macro in the section.
|
|
* used by extract_entries to decide which strategies are worth trying. *)
|
|
let count_macro name lines =
|
|
List.fold_left (fun count line ->
|
|
match line with Macro (macro_name, _) when macro_name = name -> count + 1 | _ -> count
|
|
) 0 lines
|
|
|
|
(* auto-detect and try strategies, return the one with most entries.
|
|
* first counts macros to determine which strategies are applicable,
|
|
* then runs all applicable ones and picks the winner by entry count.
|
|
* if no specialized strategy produces results, falls back to deroff.
|
|
*
|
|
* this "try everything, pick the best" approach is intentional.
|
|
* manpage formatting is too varied and inconsistent to reliably detect the
|
|
* format from macro counts alone. running multiple strategies and comparing
|
|
* results is more robust. *)
|
|
let extract_entries lines =
|
|
let tp = count_macro "TP" lines
|
|
and ip = count_macro "IP" lines
|
|
and pp = count_macro "PP" lines
|
|
and rs = count_macro "RS" lines
|
|
and ur = count_macro "UR" lines in
|
|
(* build a list of (label, entries) for each applicable strategy *)
|
|
let specialized = List.filter_map Fun.id [
|
|
(if tp > 0 then Some ("TP", strategy_tp lines) else None);
|
|
(if ip > 0 then Some ("IP", strategy_ip lines) else None);
|
|
(if pp > 0 && rs > 0 then Some ("PP+RS", strategy_pp_rs lines) else None);
|
|
(if ur > 0 && ip > 0 then Some ("nix", strategy_nix lines) else None);
|
|
] in
|
|
(* filter to strategies that found at least one entry, fall back to deroff *)
|
|
let candidates = match List.filter (fun (_, entries) -> entries <> []) specialized with
|
|
| [] -> [("deroff", strategy_deroff_lines lines)]
|
|
| filtered -> filtered
|
|
in
|
|
(* pick the strategy with the most entries *)
|
|
List.fold_left (fun (_, best) (name, entries) ->
|
|
if List.length entries >= List.length best then (name, entries)
|
|
else (name, best)
|
|
) ("none", []) candidates |> snd
|
|
|
|
(* --- NAME section description extraction ---
|
|
* the NAME section in manpages follows the convention:
|
|
* "command \- short description"
|
|
* we extract the part after "\-" as the command's description.
|
|
* handles both "\-" (groff) and " - " (plain text) separators. *)
|
|
|
|
let extract_name_description contents =
|
|
let lines = String.split_on_char '\n' contents in
|
|
let classified = List.map classify_line lines in
|
|
let rec find = function
|
|
| [] -> None
|
|
| Macro ("SH", args) :: rest
|
|
when String.uppercase_ascii (String.trim args) = "NAME" ->
|
|
collect rest []
|
|
| _ :: rest -> find rest
|
|
and collect lines acc =
|
|
match lines with
|
|
| Macro ("SH", _) :: _ | [] -> finish acc
|
|
| Text text :: rest -> collect rest (text :: acc)
|
|
| Macro (("B" | "BI" | "BR" | "I" | "IR"), args) :: rest ->
|
|
let text = strip_inline_macro_args args |> strip_groff_escapes |> String.trim in
|
|
collect rest (if String.length text > 0 then text :: acc else acc)
|
|
| Macro ("Nm", args) :: rest ->
|
|
let text = strip_groff_escapes args |> String.trim in
|
|
collect rest (if String.length text > 0 then text :: acc else acc)
|
|
| Macro ("Nd", args) :: rest ->
|
|
let text = strip_groff_escapes args |> String.trim in
|
|
collect rest (if String.length text > 0 then ("\\- " ^ text) :: acc else acc)
|
|
| _ :: rest -> collect rest acc
|
|
and finish acc =
|
|
let full = String.concat " " (List.rev acc) |> String.trim in
|
|
(* NAME lines look like: "git-add \- Add file contents to the index" *)
|
|
let sep = Str.regexp {| *\\- *\| +- +|} in
|
|
match Str.bounded_split sep full 2 with
|
|
| [_; desc] -> Some (String.trim desc)
|
|
| _ -> None
|
|
in
|
|
find classified
|
|
|
|
(* --- SYNOPSIS command name extraction ---
|
|
* the SYNOPSIS section shows how to invoke the command:
|
|
* .SH SYNOPSIS
|
|
* .B git add
|
|
* [\fIOPTIONS\fR] [\fB\-\-\fR] [\fI<pathspec>\fR...]
|
|
*
|
|
* we extract the command name by taking consecutive "word" tokens until
|
|
* we hit something that looks like an argument (starts with [, <, -, etc.). *)
|
|
|
|
let extract_synopsis_command_lines lines =
|
|
(* replace italic text (\fI...\fR) with angle-bracketed placeholders
|
|
* before classification strips the font info. italic in groff indicates
|
|
* a parameter/placeholder (e.g. \fIoperation\fR), not a command word.
|
|
* the angle brackets cause extract_cmd to stop at these tokens since
|
|
* '<' is in its stop set. without this, "nix-env \fIoperation\fR"
|
|
* would be parsed as command "nix-env operation" instead of "nix-env". *)
|
|
let lines = List.map (fun line ->
|
|
Str.global_replace (Str.regexp {|\\fI\([^\\]*\)\\f[RP]|}) {|<\1>|} line
|
|
) lines in
|
|
let classified = List.map classify_line lines in
|
|
let is_synopsis name =
|
|
String.uppercase_ascii (String.trim name) = "SYNOPSIS"
|
|
in
|
|
(* extract the command name from a line by taking leading word tokens *)
|
|
let extract_cmd line =
|
|
let words = String.split_on_char ' ' (String.trim line) in
|
|
let words = List.filter (fun word -> String.length word > 0) words in
|
|
let is_cmd_char = function
|
|
| 'a'..'z' | 'A'..'Z' | '0'..'9' | '-' | '_' | '.' -> true
|
|
| _ -> false
|
|
in
|
|
(* take words that look like command name parts, stop at arguments *)
|
|
let rec take = function
|
|
| [] -> []
|
|
| word :: rest ->
|
|
if String.length word > 0
|
|
&& (word.[0] = '[' || word.[0] = '-' || word.[0] = '<'
|
|
|| word.[0] = '(' || word.[0] = '{')
|
|
then []
|
|
else if String.for_all is_cmd_char word then
|
|
word :: take rest
|
|
else []
|
|
in
|
|
match take words with
|
|
| [] -> None
|
|
| cmd -> Some (String.concat " " cmd)
|
|
in
|
|
let rec find = function
|
|
| [] -> None
|
|
| Macro ("SH", args) :: rest when is_synopsis args -> collect rest
|
|
| _ :: rest -> find rest
|
|
and collect = function
|
|
| [] -> None
|
|
| Macro ("SH", _) :: _ -> None
|
|
| Text text :: _ ->
|
|
let text = String.trim text in
|
|
if String.length text > 0 then extract_cmd text else None
|
|
| Macro (("B" | "BI" | "BR"), args) :: _ ->
|
|
let text = strip_inline_macro_args args |> strip_groff_escapes |> String.trim in
|
|
if String.length text > 0 then extract_cmd text else None
|
|
| _ :: rest -> collect rest
|
|
in
|
|
find classified
|
|
|
|
let extract_synopsis_command contents =
|
|
let lines = String.split_on_char '\n' contents in
|
|
extract_synopsis_command_lines lines
|
|
|
|
(* --- SYNOPSIS positional extraction ---
|
|
* extract positional arguments from the SYNOPSIS section by collecting
|
|
* all text/formatting macro lines, joining them, skipping the command
|
|
* name prefix, then running parse_usage_args from parser.ml on the remainder. *)
|
|
|
|
let extract_synopsis_positionals_lines lines =
|
|
let classified = List.map classify_line lines in
|
|
let is_synopsis name =
|
|
String.uppercase_ascii (String.trim name) = "SYNOPSIS"
|
|
in
|
|
let rec find = function
|
|
| [] -> []
|
|
| Macro ("SH", args) :: rest when is_synopsis args -> collect rest []
|
|
| _ :: rest -> find rest
|
|
and collect lines acc =
|
|
match lines with
|
|
| [] -> finish acc
|
|
| Macro ("SH", _) :: _ -> finish acc
|
|
| Macro ("SS", _) :: _ -> finish acc
|
|
| Macro ("br", _) :: _ -> finish acc
|
|
| Text text :: rest ->
|
|
let text = strip_groff_escapes text |> String.trim in
|
|
collect rest (if String.length text > 0 then text :: acc else acc)
|
|
| Macro (("B" | "BI" | "BR" | "I" | "IR" | "IB" | "RB" | "RI"), args) :: rest ->
|
|
let text = strip_inline_macro_args args |> strip_groff_escapes |> String.trim in
|
|
collect rest (if String.length text > 0 then text :: acc else acc)
|
|
| _ :: rest -> collect rest acc
|
|
and finish acc =
|
|
let parts = List.rev acc in
|
|
let full = String.concat " " parts |> String.trim in
|
|
if String.length full = 0 then []
|
|
else
|
|
let cmd_end = skip_command_prefix full in
|
|
let args = String.sub full cmd_end (String.length full - cmd_end) in
|
|
parse_usage_args args
|
|
in
|
|
find classified
|
|
|
|
(* --- mdoc (bsd) format support ---
|
|
* mdoc is the bsd manpage macro package. it uses semantic macros rather than
|
|
* presentation macros:
|
|
* .Fl v -> flag: -v
|
|
* .Ar file -> argument: file
|
|
* .Op ... -> optional: [...]
|
|
* .Bl/.It/.El -> list begin/item/end
|
|
* .Sh -> section header (note lowercase 'h', vs groff's .SH)
|
|
*
|
|
* the parser walks through classified lines looking for .Bl (list begin)
|
|
* blocks containing .It (items) with .Fl (flag) entries. *)
|
|
|
|
let is_mdoc lines =
|
|
List.exists (fun line ->
|
|
match classify_line line with Macro ("Sh", _) -> true | _ -> false
|
|
) lines
|
|
|
|
(* extract renderable text from an mdoc line, skipping structural macros *)
|
|
let mdoc_text_of line =
|
|
match line with
|
|
| Text text -> Some (strip_groff_escapes text)
|
|
| Macro (macro_name, args) ->
|
|
(match macro_name with
|
|
| "Pp" | "Bl" | "El" | "Sh" | "Ss" | "Os" | "Dd" | "Dt"
|
|
| "Oo" | "Oc" | "Op" -> None
|
|
| _ ->
|
|
let text = strip_groff_escapes args |> String.trim in
|
|
if text = "" then None else Some text)
|
|
| _ -> None
|
|
|
|
(* parse an mdoc .It (list item) line that contains flag definitions.
|
|
* mdoc .It lines look like: ".It Fl v Ar file"
|
|
* where Fl = flag, Ar = argument. we extract the flag name and parameter.
|
|
*
|
|
* only handles single-char short flags and long flags starting with '-'.
|
|
* mdoc's .Fl macro automatically prepends '-', so "Fl v" means "-v"
|
|
* and "Fl -verbose" means "--verbose". *)
|
|
let parse_mdoc_it args =
|
|
let words = String.split_on_char ' ' args
|
|
|> List.filter (fun word -> word <> "" && word <> "Ns") in
|
|
let param = match words with
|
|
| _ :: _ :: "Ar" :: param_name :: _ -> Some (Mandatory param_name)
|
|
| _ -> None
|
|
in
|
|
match words with
|
|
| "Fl" :: char_str :: _ when String.length char_str = 1 && is_alphanumeric char_str.[0] ->
|
|
Some { switch = Short char_str.[0]; param; desc = "" }
|
|
| "Fl" :: name :: _ when String.length name > 1 && name.[0] = '-' ->
|
|
Some { switch = Long (String.sub name 1 (String.length name - 1)); param; desc = "" }
|
|
| _ -> None
|
|
|
|
(* extract a positional argument from an mdoc line (.Ar or .Op Ar) *)
|
|
let positional_of_mdoc_line optional args =
|
|
let words = String.split_on_char ' ' args
|
|
|> List.filter (fun word -> word <> "") in
|
|
match words with
|
|
| name :: _ when String.length name >= 2 ->
|
|
Some { pos_name = String.lowercase_ascii name;
|
|
optional; variadic = List.mem "..." words }
|
|
| _ -> None
|
|
|
|
(* parse an entire mdoc-format manpage.
|
|
* walks through all classified lines looking for:
|
|
* 1. .Bl/.It/.El list blocks containing flag definitions
|
|
* 2. .Sh SYNOPSIS sections containing positional arguments (.Ar, .Op Ar)
|
|
*
|
|
* the scan function handles nested .Bl blocks — if the first .It in a .Bl
|
|
* starts with .Fl (a flag), the entire list is parsed as options. otherwise
|
|
* the list is skipped (it might be an example list or a description list). *)
|
|
let parse_mdoc_lines lines =
|
|
let classified = List.map classify_line lines in
|
|
(* skip lines until the matching .El closing tag *)
|
|
let rec skip_to_el = function
|
|
| [] -> []
|
|
| Macro ("El", _) :: rest -> rest
|
|
| _ :: rest -> skip_to_el rest
|
|
in
|
|
(* collect description text lines until the next structural macro *)
|
|
let rec collect_desc acc = function
|
|
| [] -> (acc, [])
|
|
| (Macro ("It", _) | Macro ("El", _)
|
|
| Macro ("Sh", _) | Macro ("Ss", _)) :: _ as rest -> (acc, rest)
|
|
| line :: rest ->
|
|
collect_desc (match mdoc_text_of line with Some text -> text :: acc | None -> acc) rest
|
|
in
|
|
(* convenience: collect desc and join into a trimmed string *)
|
|
let desc_of rest =
|
|
let parts, rest = collect_desc [] rest in
|
|
(String.concat " " (List.rev parts) |> String.trim, rest)
|
|
in
|
|
(* parse a single .It entry: extract flag, collect description *)
|
|
let parse_it args rest entries =
|
|
let desc, rest = desc_of rest in
|
|
let entries = match parse_mdoc_it args with
|
|
| Some entry -> { entry with desc } :: entries
|
|
| None -> entries
|
|
in
|
|
(entries, rest)
|
|
in
|
|
(* parse all .It entries within a .Bl/.El option list *)
|
|
let rec parse_option_list entries = function
|
|
| [] -> (entries, [])
|
|
| Macro ("El", _) :: rest -> (entries, rest)
|
|
| Macro ("It", args) :: rest ->
|
|
let entries, rest = parse_it args rest entries in
|
|
parse_option_list entries rest
|
|
| _ :: rest -> parse_option_list entries rest
|
|
in
|
|
(* main scan: walk through all lines, collecting flags and positionals *)
|
|
let rec scan entries positionals = function
|
|
| [] -> (entries, positionals)
|
|
| Macro ("Bl", _) :: Macro ("It", it_args) :: rest ->
|
|
(* peek at first .It to decide if this is a flag list *)
|
|
let words = String.split_on_char ' ' it_args
|
|
|> List.filter (fun word -> word <> "") in
|
|
if (match words with "Fl" :: _ -> true | _ -> false) then
|
|
let entries, rest = parse_it it_args rest entries in
|
|
let entries, rest = parse_option_list entries rest in
|
|
scan entries positionals rest
|
|
else
|
|
scan entries positionals (skip_to_el rest)
|
|
| Macro ("Bl", _) :: rest -> scan entries positionals (skip_to_el rest)
|
|
| Macro ("Sh", args) :: rest
|
|
when String.uppercase_ascii (String.trim args) = "SYNOPSIS" ->
|
|
let positionals, rest = parse_synopsis positionals rest in
|
|
scan entries positionals rest
|
|
| _ :: rest -> scan entries positionals rest
|
|
and parse_synopsis positionals = function
|
|
| [] -> (positionals, [])
|
|
| Macro ("Sh", _) :: _ as rest -> (positionals, rest)
|
|
| Macro ("Ar", args) :: rest ->
|
|
let positionals = match positional_of_mdoc_line false args with
|
|
| Some p -> p :: positionals | None -> positionals in
|
|
parse_synopsis positionals rest
|
|
| Macro ("Op", args) :: rest ->
|
|
let words = String.split_on_char ' ' args
|
|
|> List.filter (fun word -> word <> "") in
|
|
let positionals = match words with
|
|
| "Ar" :: _ ->
|
|
(match positional_of_mdoc_line true args with
|
|
| Some p -> p :: positionals | None -> positionals)
|
|
| _ -> positionals in
|
|
parse_synopsis positionals rest
|
|
| _ :: rest -> parse_synopsis positionals rest
|
|
in
|
|
let entries, positionals = scan [] [] classified in
|
|
(* deduplicate positionals by name, preserving order *)
|
|
let positionals =
|
|
List.rev positionals
|
|
|> List.fold_left (fun (seen, acc) p ->
|
|
if List.mem p.pos_name seen then (seen, acc)
|
|
else (p.pos_name :: seen, p :: acc)
|
|
) ([], [])
|
|
|> snd |> List.rev
|
|
in
|
|
{ entries = List.rev entries; subcommands = []; positionals; description = "" }
|
|
|
|
(* --- COMMANDS section subcommand extraction ---
|
|
* some manpages (notably systemctl) have a dedicated COMMANDS section
|
|
* listing subcommands with descriptions. these use .PP + bold name +
|
|
* .RS/.RE blocks:
|
|
* .PP
|
|
* \fBstart\fR \fIUNIT\fR...
|
|
* .RS 4
|
|
* Start (activate) one or more units.
|
|
* .RE
|
|
*
|
|
* we extract the bold command name and first sentence of description. *)
|
|
|
|
let extract_commands_section lines =
|
|
let classified = List.map classify_line lines in
|
|
(* collect all lines from the current position until the next .SH *)
|
|
let rec collect_until_next_sh lines acc =
|
|
match lines with
|
|
| [] -> List.rev acc
|
|
| Macro ("SH", _) :: _ -> List.rev acc
|
|
| line :: rest -> collect_until_next_sh rest (line :: acc)
|
|
in
|
|
let is_commands_section name =
|
|
let upper = String.uppercase_ascii (String.trim name) in
|
|
upper = "COMMANDS" || upper = "COMMAND"
|
|
in
|
|
(* find all COMMANDS/.COMMAND sections and collect their lines *)
|
|
let rec find_commands acc = function
|
|
| [] -> List.rev acc
|
|
| Macro ("SH", args) :: rest when is_commands_section args ->
|
|
find_commands (collect_until_next_sh rest [] :: acc) rest
|
|
| _ :: rest -> find_commands acc rest
|
|
in
|
|
let sections = find_commands [] classified in
|
|
List.concat sections
|
|
|
|
(* extract subcommand name from a bold groff text like
|
|
* "\fBlist\-units\fR [\fIPATTERN\fR...]" -> "list-units"
|
|
*
|
|
* validates that the extracted name looks like a subcommand: lowercase,
|
|
* at least 2 chars, no leading dash. falls back to stripping all groff
|
|
* and taking the first word if no \fB...\fR wrapper is found. *)
|
|
let extract_bold_command_name text =
|
|
let trimmed = String.trim text in
|
|
(* check whether a string looks like a valid subcommand name *)
|
|
let is_valid_subcmd name =
|
|
String.length name >= 2
|
|
&& name.[0] <> '-'
|
|
&& String.for_all (fun char_val ->
|
|
(char_val >= 'a' && char_val <= 'z')
|
|
|| (char_val >= '0' && char_val <= '9')
|
|
|| char_val = '-' || char_val = '_'
|
|
) name
|
|
in
|
|
(* look for \fB...\fR at the start *)
|
|
if String.length trimmed >= 4
|
|
&& trimmed.[0] = '\\' && trimmed.[1] = 'f' && trimmed.[2] = 'B' then
|
|
let start = 3 in
|
|
let end_marker = "\\fR" in
|
|
match String.split_on_char '\\' (String.sub trimmed start (String.length trimmed - start)) with
|
|
| name_part :: _ ->
|
|
let name = strip_groff_escapes ("\\fB" ^ name_part ^ end_marker) |> String.trim in
|
|
if is_valid_subcmd name then Some name else None
|
|
| [] -> None
|
|
else
|
|
(* try already-stripped text — take the first word *)
|
|
let stripped = strip_groff_escapes trimmed in
|
|
let first_word = match String.split_on_char ' ' stripped with
|
|
| word :: _ -> word | [] -> "" in
|
|
if is_valid_subcmd first_word then Some first_word else None
|
|
|
|
(* walk through commands section lines, extracting subcommand name+description
|
|
* pairs from .PP + Text + .RS/.RE blocks *)
|
|
let extract_subcommands_from_commands lines =
|
|
let rec walk lines acc =
|
|
match lines with
|
|
| [] -> List.rev acc
|
|
| Macro ("PP", _) :: rest ->
|
|
begin match rest with
|
|
| Text tag :: rest2 ->
|
|
(* check if this is a subcommand (bold name, not a flag) *)
|
|
begin match extract_bold_command_name tag with
|
|
| Some name ->
|
|
(* collect description from .RS/.RE block *)
|
|
let rec collect_desc lines desc_acc =
|
|
match lines with
|
|
| Macro ("RS", _) :: rest3 ->
|
|
collect_in_rs rest3 desc_acc
|
|
| Text text :: rest3 ->
|
|
collect_desc rest3 (text :: desc_acc)
|
|
| _ -> (String.concat " " (List.rev desc_acc), lines)
|
|
and collect_in_rs lines desc_acc =
|
|
match lines with
|
|
| Macro ("RE", _) :: rest3 ->
|
|
(String.concat " " (List.rev desc_acc), rest3)
|
|
| Text text :: rest3 ->
|
|
collect_in_rs rest3 (text :: desc_acc)
|
|
| Macro ("PP", _) :: _ | Macro ("SH", _) :: _ | Macro ("SS", _) :: _ ->
|
|
(String.concat " " (List.rev desc_acc), lines)
|
|
| _ :: rest3 -> collect_in_rs rest3 desc_acc
|
|
| [] -> (String.concat " " (List.rev desc_acc), [])
|
|
in
|
|
let (desc, rest3) = collect_desc rest2 [] in
|
|
let desc = String.trim desc in
|
|
(* take first sentence as description *)
|
|
let short_desc = match String.split_on_char '.' desc with
|
|
| first :: _ when String.length first > 0 -> String.trim first
|
|
| _ -> desc in
|
|
let sc : subcommand = { name; desc = short_desc } in
|
|
walk rest3 (sc :: acc)
|
|
| None -> walk rest2 acc
|
|
end
|
|
| _ -> walk rest acc
|
|
end
|
|
| _ :: rest -> walk rest acc
|
|
in
|
|
walk lines []
|
|
|
|
(* --- top-level api --- *)
|
|
|
|
(* parse a manpage from its classified lines.
|
|
* auto-detects mdoc vs groff format. for groff, runs the multi-strategy
|
|
* extraction pipeline: extract OPTIONS section -> try all strategies ->
|
|
* pick best -> extract SYNOPSIS positionals -> extract COMMANDS subcommands. *)
|
|
let parse_manpage_lines lines =
|
|
if is_mdoc lines then
|
|
parse_mdoc_lines lines
|
|
else begin
|
|
let options_section = extract_options_section lines in
|
|
let entries = extract_entries options_section in
|
|
let positionals = extract_synopsis_positionals_lines lines in
|
|
let commands_section = extract_commands_section lines in
|
|
let subcommands = extract_subcommands_from_commands commands_section in
|
|
{ entries; subcommands; positionals; description = "" }
|
|
end
|
|
|
|
(* parse a manpage from its raw string contents.
|
|
* splits into lines, parses, then extracts the NAME section description. *)
|
|
let parse_manpage_string contents =
|
|
let lines = String.split_on_char '\n' contents in
|
|
let result = parse_manpage_lines lines in
|
|
let description = match extract_name_description contents with
|
|
| Some desc -> desc | None -> "" in
|
|
{ result with description }
|
|
|
|
(* --- clap-style SUBCOMMAND section extraction ---
|
|
* manpages generated by clap (rust's cli arg parser) put each subcommand
|
|
* under its own .SH SUBCOMMAND header with a Usage: line giving the name.
|
|
* this is unusual — most tools list subcommands under a single COMMANDS section.
|
|
*
|
|
* we collect all .SH SUBCOMMAND/SUBCOMMANDS sections, find the Usage: line
|
|
* in each to get the subcommand name, then extract flag entries from the
|
|
* section body. returns triples of (name, description, help_result). *)
|
|
let extract_subcommand_sections contents =
|
|
let lines = String.split_on_char '\n' contents in
|
|
let classified = List.map classify_line lines in
|
|
(* split into sections at .SH boundaries, keeping only SUBCOMMAND(S) sections *)
|
|
let rec collect_sections acc current_name current_lines = function
|
|
| [] ->
|
|
let acc = match current_name with
|
|
| Some section_name -> (section_name, List.rev current_lines) :: acc
|
|
| None -> acc in
|
|
List.rev acc
|
|
| Macro ("SH", args) :: rest ->
|
|
let acc = match current_name with
|
|
| Some section_name -> (section_name, List.rev current_lines) :: acc
|
|
| None -> acc in
|
|
let name = String.uppercase_ascii (String.trim args) in
|
|
if name = "SUBCOMMAND" || name = "SUBCOMMANDS" then
|
|
collect_sections acc (Some name) [] rest
|
|
else
|
|
collect_sections acc None [] rest
|
|
| line :: rest ->
|
|
collect_sections acc current_name (line :: current_lines) rest
|
|
in
|
|
let sections = collect_sections [] None [] classified in
|
|
(* for each SUBCOMMAND section, extract name from Usage: line and parse entries *)
|
|
let usage_re = Str.regexp {|Usage: \([a-zA-Z0-9_-]+\)|} in
|
|
let matches_usage text =
|
|
try ignore (Str.search_forward usage_re text 0); Some (Str.matched_group 1 text)
|
|
with Not_found -> None in
|
|
List.filter_map (fun (_header, section_lines) ->
|
|
(* scan section lines for the Usage: line to get the subcommand name *)
|
|
let name, desc_lines =
|
|
List.fold_left (fun (name, desc_lines) line ->
|
|
match name with
|
|
| Some _ -> (name, desc_lines)
|
|
| None ->
|
|
match line with
|
|
| Text text ->
|
|
(match matches_usage text with
|
|
| Some _ as found -> (found, desc_lines)
|
|
| None -> (None, text :: desc_lines))
|
|
| Macro (("TP" | "B" | "BI" | "BR"), args) ->
|
|
let text = strip_inline_macro_args args |> strip_groff_escapes |> String.trim in
|
|
(matches_usage text, desc_lines)
|
|
| _ -> (None, desc_lines)
|
|
) (None, []) section_lines in
|
|
match name with
|
|
| None -> None
|
|
| Some subcmd_name ->
|
|
let entries = extract_entries section_lines in
|
|
let desc = String.concat " " (List.rev desc_lines)
|
|
|> strip_groff_escapes |> String.trim in
|
|
(* strip backtick-quoted words *)
|
|
let desc = Str.global_replace (Str.regexp "`\\([^`]*\\)`") "\\1" desc in
|
|
Some (subcmd_name, desc, { entries; subcommands = []; positionals = []; description = desc })
|
|
) sections
|
|
|
|
(* read a manpage file from disk. handles .gz compressed files (the common
|
|
* case — most installed manpages are gzipped) using the Gzip library.
|
|
* plain text files are read directly. *)
|
|
let read_manpage_file path =
|
|
if Filename.check_suffix path ".gz" then begin
|
|
let ic = Gzip.open_in path in
|
|
let buffer = Buffer.create 8192 in
|
|
let chunk = Bytes.create 8192 in
|
|
(try while true do
|
|
let bytes_read = Gzip.input ic chunk 0 8192 in
|
|
if bytes_read = 0 then raise Exit
|
|
else Buffer.add_subbytes buffer chunk 0 bytes_read
|
|
done with Exit | End_of_file -> ());
|
|
Gzip.close_in ic;
|
|
Buffer.contents buffer
|
|
end else begin
|
|
let ic = open_in path in
|
|
let size = in_channel_length ic in
|
|
let bytes = Bytes.create size in
|
|
really_input ic bytes 0 size;
|
|
close_in ic;
|
|
Bytes.to_string bytes
|
|
end
|
|
|
|
(* convenience: read + parse a manpage file in one step *)
|
|
let parse_manpage_file path =
|
|
read_manpage_file path |> parse_manpage_string
|