(* manpage.ml — parse unix manpages (groff/mdoc format) into help_result. * * manpages are written in roff/groff markup — a decades-old typesetting language * used by man(1). this module strips the formatting and extracts structured data * (flags, subcommands, positionals) from the raw groff source. * * there are two major manpage macro packages: * - man (groff) — used by gnu/linux tools. uses macros like .SH, .TP, .IP, .PP * - mdoc (bsd) — used by bsd tools. uses .Sh, .Fl, .Ar, .Op, .It, .Bl/.El * * this module handles both, auto-detecting the format by checking for .Sh macros. * * for groff manpages, flag extraction uses multiple "strategies" that target * different common formatting patterns: * - strategy_tp: .TP tagged paragraphs (gnu coreutils, help2man) * - strategy_ip: .IP indented paragraphs (curl, hand-written) * - strategy_pp_rs: .PP + .RS/.RE blocks (git, docbook) * - strategy_nix: nix3-style bullet .IP with .UR/.UE hyperlinks * - strategy_deroff: fallback — strip all groff, feed to help text parser * * the module tries all applicable strategies and picks the one that extracts * the most flag entries, on the theory that more results = better match. * * key peculiarities: * - groff has an enormous escape syntax (font changes, named characters, * size changes, color, string variables, etc.) — strip_groff_escapes * handles the common cases but is not exhaustive * - font escapes like \fI (italic) need to insert spaces at word boundaries * to prevent flag names from fusing with their parameter names * - the strategies share the angstrom-based switch_parser from parser.ml * for parsing the actual flag syntax out of the stripped text *) open Parser (* --- shared helpers for imperative string scanning --- * many groff parsing routines use an imperative cursor (ref int) walking * through a string. these helpers factor out common scanning patterns. *) (* advance pos past all characters until the delimiter is found. * leaves pos pointing at the delimiter character, or at len if not found. *) let skip_to_char source len pos delim = while !pos < len && source.[!pos] <> delim do incr pos done (* translate a groff named character escape to its text equivalent. * groff uses two-letter codes like "aq" for apostrophe, "lq"/"rq" for * left/right quotes, "em"/"en" for dashes. returns None for unknown names. *) let named_char_of = function | "aq" -> Some '\'' | "lq" | "Lq" -> Some '\x22' (* left double quote *) | "rq" | "Rq" -> Some '\x22' (* right double quote *) | "em" | "en" -> Some '-' | _ -> None (* skip a groff reference that uses one of three sub-forms: * single char — e.g. \*X or \nX * ( + 2 chars — e.g. \*(XX or \n(XX * [ to ] — e.g. \*[name] or \n[name] * used for \* (string variable) and \n (number register) escapes. * advances pos past the consumed characters. *) let skip_groff_reference source len pos = if !pos < len then begin if source.[!pos] = '(' then pos := !pos + 3 (* skip past '(' + two-character name *) else if source.[!pos] = '[' then begin incr pos; skip_to_char source len pos ']'; if !pos < len then incr pos end else incr pos end (* --- groff escape/formatting stripper --- * groff escapes start with backslash and use various continuation syntaxes. * this function strips them, replacing named characters (like \(aq for * apostrophe) with their text equivalents and discarding formatting directives. *) let strip_groff_escapes source = let buffer = Buffer.create (String.length source) in let len = String.length source in let pos = ref 0 in let prev_char = ref '\000' in (* emit a character into the output buffer and track it as previous *) let put char_val = Buffer.add_char buffer char_val; prev_char := char_val in let is_alnum char_val = (char_val >= 'a' && char_val <= 'z') || (char_val >= 'A' && char_val <= 'Z') || (char_val >= '0' && char_val <= '9') in while !pos < len do if source.[!pos] = '\\' && !pos + 1 < len then begin let next = source.[!pos + 1] in match next with | 'f' -> (* font escape: \fB, \fI, \fP, \fR, \f(XX, \f[...] *) if !pos + 2 < len then begin let font_char = source.[!pos + 2] in (* insert space before italic font to preserve word boundaries e.g. \fB--max-results\fR\fIcount\fR -> "--max-results count" *) if font_char = 'I' && is_alnum !prev_char then put ' '; if font_char = '(' then pos := !pos + 5 (* \f(XX — two-character font name *) else if font_char = '[' then begin pos := !pos + 3; skip_to_char source len pos ']'; if !pos < len then incr pos end else pos := !pos + 3 (* \fX — single-character font selector *) end else pos := !pos + 2 | '-' -> (* escaped hyphen-minus — emit a plain hyphen *) put '-'; pos := !pos + 2 | '&' | '/' | ',' -> (* zero-width characters — discard without output *) pos := !pos + 2 | '(' -> (* two-char named character: \(aq, \(lq, \(rq, etc. *) if !pos + 3 < len then begin let name = String.sub source (!pos + 2) 2 in (match named_char_of name with | Some char_val -> put char_val | None -> ()); pos := !pos + 4 end else pos := !pos + 2 | '[' -> (* bracketed named character: \[aq], \[lq], etc. *) pos := !pos + 2; let start = !pos in skip_to_char source len pos ']'; if !pos < len then begin let name = String.sub source start (!pos - start) in (match named_char_of name with | Some char_val -> put char_val | None -> ()); incr pos end | 's' -> (* size escape: \sN, \s+N, \s-N — skip the numeric argument *) pos := !pos + 2; if !pos < len && (source.[!pos] = '+' || source.[!pos] = '-') then incr pos; if !pos < len && source.[!pos] >= '0' && source.[!pos] <= '9' then incr pos; if !pos < len && source.[!pos] >= '0' && source.[!pos] <= '9' then incr pos | 'm' -> (* color escape: \m[...] — skip the bracketed color name *) pos := !pos + 2; if !pos < len && source.[!pos] = '[' then begin incr pos; skip_to_char source len pos ']'; if !pos < len then incr pos end | 'X' -> (* device control: \X'...' — skip the single-quoted payload *) pos := !pos + 2; if !pos < len && source.[!pos] = '\'' then begin incr pos; skip_to_char source len pos '\''; if !pos < len then incr pos end | '*' -> (* string variable: \*X or \*(XX or \*[...] — skip the reference *) pos := !pos + 2; skip_groff_reference source len pos | 'n' -> (* number register: \nX or \n(XX or \n[...] — skip the reference *) pos := !pos + 2; skip_groff_reference source len pos | 'e' -> (* escaped backslash literal *) put '\\'; pos := !pos + 2 | '\\' -> (* double backslash — emit one *) put '\\'; pos := !pos + 2 | ' ' -> (* escaped space — emit a regular space *) put ' '; pos := !pos + 2 | _ -> (* unknown escape — skip the two-character sequence *) pos := !pos + 2 end else begin put source.[!pos]; incr pos end done; Buffer.contents buffer (* strip inline macro formatting: .BI, .BR, .IR, etc. * these macros alternate between fonts for their arguments, e.g.: * .BI "--output " "FILE" * becomes "--outputFILE" (arguments concatenated without spaces). * * quoted strings are kept together (quotes stripped), but unquoted spaces * are consumed. this matches groff's actual rendering of these macros, * where alternating-font arguments are concatenated. *) let strip_inline_macro_args text = let buffer = Buffer.create (String.length text) in let len = String.length text in let pos = ref 0 in while !pos < len do if text.[!pos] = '"' then begin (* quoted argument — copy characters up to the closing quote *) incr pos; while !pos < len && text.[!pos] <> '"' do Buffer.add_char buffer text.[!pos]; incr pos done; if !pos < len then incr pos end else if text.[!pos] = ' ' || text.[!pos] = '\t' then begin (* unquoted whitespace — skip (arguments are concatenated) *) incr pos end else begin (* regular character — copy to output *) Buffer.add_char buffer text.[!pos]; incr pos end done; Buffer.contents buffer (* convenience: strip escapes and trim whitespace *) let strip_groff line = let text = strip_groff_escapes line in String.trim text (* --- line classification --- * every line in a manpage is classified as one of four types. * this classification drives all subsequent parsing — strategies * pattern-match on sequences of classified lines. *) type groff_line = | Macro of string * string (* macro name + args, e.g. ("SH", "OPTIONS") or ("TP", "") *) | Text of string (* plain text after groff stripping *) | Blank (* empty line *) | Comment (* groff comment: .backslash-quote or backslash-quote *) (* classify a single line of manpage source. * macro lines start with '.' or '\'' (groff alternate control char). * the macro name is split from its arguments at the first space/tab. * arguments wrapped in double quotes are unquoted. *) let classify_line line = let len = String.length line in if len = 0 then Blank else if len >= 2 && line.[0] = '.' && line.[1] = '\\' && (len < 3 || line.[2] = '"') then Comment else if len >= 3 && line.[0] = '\\' && line.[1] = '"' then Comment else if line.[0] = '.' || line.[0] = '\'' then begin (* macro line — extract macro name and arguments *) let rest = String.sub line 1 (len - 1) in let rest = String.trim rest in (* split into macro name and arguments at the first whitespace *) let space_pos = try Some (String.index rest ' ') with Not_found -> try Some (String.index rest '\t') with Not_found -> None in match space_pos with | Some split_at -> let name = String.sub rest 0 split_at in let args = String.trim (String.sub rest (split_at + 1) (String.length rest - split_at - 1)) in (* strip surrounding quotes from arguments *) let args = let alen = String.length args in if alen >= 2 && args.[0] = '"' && args.[alen - 1] = '"' then String.sub args 1 (alen - 2) else args in Macro (name, args) | None -> Macro (rest, "") end else begin let stripped = strip_groff line in if String.length stripped = 0 then Blank else Text stripped end (* refined comment detection — the base classify_line may miss some comment * forms, so this wrapper checks more carefully before falling through to * the general classifier. *) let is_comment_line line = let len = String.length line in (len >= 3 && line.[0] = '.' && line.[1] = '\\' && line.[2] = '"') || (len >= 2 && line.[0] = '\\' && line.[1] = '"') let classify_line line = if is_comment_line line then Comment else classify_line line (* --- section extraction --- * manpages are divided into sections by .SH macros. the OPTIONS section * contains the flag definitions we want. if there's no OPTIONS section, * we fall back to DESCRIPTION (some simple tools put flags there). * * old-style nix manpages (nix-build, nix-env-install, etc.) split flags * across multiple .SH sections with option-like names: e.g. "Options" for * command-specific flags and "Common Options" for flags shared by all nix * commands. collecting only the first such section misses the majority of * flags, so we collect and concatenate all option-like sections. *) let extract_options_section lines = let classified = List.map classify_line lines in (* collect lines until the next .SH header, returning (content, rest) * where rest starts at the .SH line (or is empty if at end of file). *) let rec collect_section lines acc = match lines with | [] -> (List.rev acc, []) | Macro ("SH", _) :: _ -> (List.rev acc, lines) | line :: rest -> collect_section rest (line :: acc) in (* test whether a section name looks like an options section. * matches "OPTIONS", "COMMON OPTIONS", "GLOBAL OPTIONS", etc. *) let is_options_section name = let upper = String.uppercase_ascii (String.trim name) in upper = "OPTIONS" || (String.length upper > 0 && try let _ = Str.search_forward (Str.regexp_string "OPTION") upper 0 in true with Not_found -> false) in (* collect from all option-like .SH sections and concatenate them. * handles the common nix pattern where "Options" and "Common Options" * are separate .SH sections but both contain relevant flags. * * a synthetic Macro("SH","") separator is inserted between sections so * that collect_desc_text (which stops on SH/SS) does not let a description * from the last entry in one section bleed into the intro text of the next. *) let rec find_all_options lines acc = match lines with | [] -> acc | Macro ("SH", args) :: rest when is_options_section args -> let (section, remaining) = collect_section rest [] in let sep = if acc = [] then [] else [Macro ("SH", "")] in find_all_options remaining (acc @ sep @ section) | _ :: rest -> find_all_options rest acc in (* fallback: DESCRIPTION section for simple tools that put flags there *) let rec find_description = function | [] -> [] | Macro ("SH", args) :: rest when String.uppercase_ascii (String.trim args) = "DESCRIPTION" -> fst (collect_section rest []) | _ :: rest -> find_description rest in match find_all_options classified [] with | [] -> find_description classified | sections -> sections (* --- strategy-based entry extraction --- * rather than a single monolithic parser, we use multiple "strategies" that * each target a specific groff formatting pattern. this is necessary because * manpage authors use very different macro combinations for the same purpose. * * the shared building blocks: * - collect_text_lines: gather consecutive Text lines into one description string * - parse_tag_to_entry: run the angstrom switch parser on a tag string to * extract the flag definition. this reuses the same parser that handles * --help output, giving consistent extraction across both sources. * - tag_of_macro: extract the "tag" text from formatting macros like .B, .BI, etc. *) (* collect consecutive text lines, joining them with spaces *) let rec collect_text_lines lines acc = match lines with | Text text :: rest -> collect_text_lines rest (text :: acc) | _ -> (String.concat " " (List.rev acc), lines) (* attempt to parse a tag string (e.g. "-v, --verbose FILE") into an entry. * uses the angstrom switch_parser + param_parser from parser.ml. * returns None if the tag doesn't look like a flag definition. *) let parse_tag_to_entry tag desc = let tag = strip_groff_escapes tag in let tag = String.trim tag in match Angstrom.parse_string ~consume:Angstrom.Consume.Prefix (Angstrom.lift2 (fun sw p -> (sw, p)) switch_parser param_parser) tag with | Ok (switch, param) -> Some { switch; param; desc } | Error _ -> None (* extract tag text from a macro line. * .B and .I preserve spaces (single argument); .BI, .BR, .IR alternate * fonts and concatenate arguments. *) let tag_of_macro name args = match name with | "B" | "I" -> strip_groff_escapes args |> String.trim | _ -> strip_inline_macro_args args |> strip_groff_escapes |> String.trim (* strategy a: .TP style (most common — gnu coreutils, help2man). * .TP introduces a tagged paragraph: the next line is the "tag" (flag name) * and subsequent text lines are the description. the tag can be plain text * or wrapped in a formatting macro (.B, .BI, etc.). * * example groff: * .TP * \fB\-v\fR, \fB\-\-verbose\fR * increase verbosity *) let strategy_tp lines = let rec walk lines acc = match lines with | [] -> List.rev acc | Macro ("TP", _) :: rest -> (* next line is the tag — could be Text or a formatting macro *) begin match rest with | Text tag :: rest2 -> let (desc, rest3) = collect_text_lines rest2 [] in let entry = parse_tag_to_entry tag desc in walk rest3 (match entry with Some e -> e :: acc | None -> acc) | Macro (("B" | "I" | "BI" | "BR" | "IR") as macro_name, args) :: rest2 -> let tag = tag_of_macro macro_name args in let (desc, rest3) = collect_text_lines rest2 [] in let entry = parse_tag_to_entry tag desc in walk rest3 (match entry with Some e -> e :: acc | None -> acc) | _ -> walk rest acc end | _ :: rest -> walk rest acc in walk lines [] (* strategy b: .IP style (curl, hand-written manpages). * .IP takes an inline tag argument: .IP "-v, --verbose" * the description follows as text lines. simpler than .TP because * the tag is on the macro line itself. *) let strategy_ip lines = let rec walk lines acc = match lines with | [] -> List.rev acc | Macro ("IP", tag) :: rest -> let tag = strip_groff_escapes tag in let (desc, rest2) = collect_text_lines rest [] in let entry = parse_tag_to_entry tag desc in walk rest2 (match entry with Some e -> e :: acc | None -> acc) | _ :: rest -> walk rest acc in walk lines [] (* strategy c: .PP + .RS/.RE style (git, docbook-generated manpages). * flag entries are introduced by .PP (paragraph), with the flag name as * plain text, followed by a .RS (indent) block containing the description, * closed by .RE (de-indent). this is common in docbook-to-manpage toolchains. *) let strategy_pp_rs lines = let rec walk lines acc = match lines with | [] -> List.rev acc | Macro ("PP", _) :: rest -> begin match rest with | Text tag :: rest2 -> (* look for .RS ... text ... .RE *) let rec collect_rs lines desc_acc = match lines with | Macro ("RS", _) :: rest3 -> collect_in_rs rest3 desc_acc | Text text :: rest3 -> (* sometimes description follows directly *) collect_rs rest3 (text :: desc_acc) | _ -> (String.concat " " (List.rev desc_acc), lines) and collect_in_rs lines desc_acc = match lines with | Macro ("RE", _) :: rest3 -> (String.concat " " (List.rev desc_acc), rest3) | Text text :: rest3 -> collect_in_rs rest3 (text :: desc_acc) | Macro ("PP", _) :: _ | Macro ("SH", _) :: _ -> (String.concat " " (List.rev desc_acc), lines) | _ :: rest3 -> collect_in_rs rest3 desc_acc | [] -> (String.concat " " (List.rev desc_acc), []) in let (desc, rest3) = collect_rs rest2 [] in let entry = parse_tag_to_entry tag desc in walk rest3 (match entry with Some e -> e :: acc | None -> acc) | _ -> walk rest acc end | _ :: rest -> walk rest acc in walk lines [] (* strategy d: deroff fallback — strip all groff markup, then feed the * resulting plain text through the --help parser from parser.ml. * this is the last resort when no structured macro pattern is recognized. * it works surprisingly well for simple manpages but may miss entries * in heavily formatted ones. *) let strategy_deroff_lines lines = let buffer = Buffer.create 256 in List.iter (fun line -> match line with | Text text -> Buffer.add_string buffer text; Buffer.add_char buffer '\n' | Macro (("BI" | "BR" | "IR" | "B" | "I"), args) -> let text = strip_inline_macro_args args in let text = strip_groff_escapes text in Buffer.add_string buffer text; Buffer.add_char buffer '\n' | Blank -> Buffer.add_char buffer '\n' | _ -> () ) lines; let text = Buffer.contents buffer in match parse_help text with | Ok result -> result.entries | Error _ -> [] (* strategy e: nix3-style bullet .IP with .UR/.UE hyperlinks. * nix's manpages use .IP with bullet markers for flag entries, interleaved * with .UR/.UE hyperlink macros. the flag tag is in text lines after the * bullet .IP, and the description follows a non-bullet .IP marker. * * nix manpages nest .RS/.RE blocks inside descriptions for sub-examples. * the skip_rs helper tracks nesting depth to skip these without losing * the rest of the description. *) let strategy_nix lines = (* a bullet .IP has non-empty args (the bullet marker) *) let is_bullet_ip args = String.length (String.trim args) > 0 in let rec walk lines acc = match lines with | [] -> List.rev acc | Macro ("IP", args) :: rest when is_bullet_ip args -> (* collect tag: skip .UR/.UE macros, collect Text lines *) let rec collect_tag lines parts = match lines with | Macro ("UR", _) :: rest2 -> collect_tag rest2 parts | Macro ("UE", _) :: rest2 -> collect_tag rest2 parts | Text text :: rest2 -> collect_tag rest2 (text :: parts) | _ -> (String.concat " " (List.rev parts), lines) in let (tag, rest2) = collect_tag rest [] in (* collect description after the description .IP marker *) let rec collect_desc lines parts = match lines with | Macro ("IP", dargs) :: rest3 when not (is_bullet_ip dargs) -> collect_desc_text rest3 parts | _ -> (String.concat " " (List.rev parts), lines) and collect_desc_text lines parts = match lines with | Text text :: rest3 -> collect_desc_text rest3 (text :: parts) | Macro ("IP", args2) :: _ when is_bullet_ip args2 -> (* next bullet entry — stop collecting *) (String.concat " " (List.rev parts), lines) | Macro (("SS" | "SH"), _) :: _ -> (* section boundary — stop collecting *) (String.concat " " (List.rev parts), lines) | Macro ("RS", _) :: rest3 -> skip_rs rest3 parts 1 | Macro ("IP", _) :: rest3 -> (* non-bullet .IP = continuation paragraph *) collect_desc_text rest3 parts | Macro _ :: rest3 -> collect_desc_text rest3 parts | Blank :: rest3 -> collect_desc_text rest3 parts | Comment :: rest3 -> collect_desc_text rest3 parts | [] -> (String.concat " " (List.rev parts), []) and skip_rs lines parts depth = match lines with | Macro ("RE", _) :: rest3 -> if depth <= 1 then collect_desc_text rest3 parts else skip_rs rest3 parts (depth - 1) | Macro ("RS", _) :: rest3 -> skip_rs rest3 parts (depth + 1) | _ :: rest3 -> skip_rs rest3 parts depth | [] -> (String.concat " " (List.rev parts), []) in let (desc, rest3) = collect_desc rest2 [] in let entry = parse_tag_to_entry tag desc in walk rest3 (match entry with Some e -> e :: acc | None -> acc) | _ :: rest -> walk rest acc in walk lines [] (* count occurrences of a specific macro in the section. * used by extract_entries to decide which strategies are worth trying. *) let count_macro name lines = List.fold_left (fun count line -> match line with Macro (macro_name, _) when macro_name = name -> count + 1 | _ -> count ) 0 lines (* auto-detect and try strategies, return the one with most entries. * first counts macros to determine which strategies are applicable, * then runs all applicable ones and picks the winner by entry count. * if no specialized strategy produces results, falls back to deroff. * * this "try everything, pick the best" approach is intentional. * manpage formatting is too varied and inconsistent to reliably detect the * format from macro counts alone. running multiple strategies and comparing * results is more robust. *) let extract_entries lines = let tp = count_macro "TP" lines and ip = count_macro "IP" lines and pp = count_macro "PP" lines and rs = count_macro "RS" lines and ur = count_macro "UR" lines in (* build a list of (label, entries) for each applicable strategy *) let specialized = List.filter_map Fun.id [ (if tp > 0 then Some ("TP", strategy_tp lines) else None); (if ip > 0 then Some ("IP", strategy_ip lines) else None); (if pp > 0 && rs > 0 then Some ("PP+RS", strategy_pp_rs lines) else None); (if ur > 0 && ip > 0 then Some ("nix", strategy_nix lines) else None); ] in (* filter to strategies that found at least one entry, fall back to deroff *) let candidates = match List.filter (fun (_, entries) -> entries <> []) specialized with | [] -> [("deroff", strategy_deroff_lines lines)] | filtered -> filtered in (* pick the strategy with the most entries *) List.fold_left (fun (_, best) (name, entries) -> if List.length entries >= List.length best then (name, entries) else (name, best) ) ("none", []) candidates |> snd (* --- NAME section description extraction --- * the NAME section in manpages follows the convention: * "command \- short description" * we extract the part after "\-" as the command's description. * handles both "\-" (groff) and " - " (plain text) separators. *) let extract_name_description contents = let lines = String.split_on_char '\n' contents in let classified = List.map classify_line lines in let rec find = function | [] -> None | Macro ("SH", args) :: rest when String.uppercase_ascii (String.trim args) = "NAME" -> collect rest [] | _ :: rest -> find rest and collect lines acc = match lines with | Macro ("SH", _) :: _ | [] -> finish acc | Text text :: rest -> collect rest (text :: acc) | Macro (("B" | "BI" | "BR" | "I" | "IR"), args) :: rest -> let text = strip_inline_macro_args args |> strip_groff_escapes |> String.trim in collect rest (if String.length text > 0 then text :: acc else acc) | Macro ("Nm", args) :: rest -> let text = strip_groff_escapes args |> String.trim in collect rest (if String.length text > 0 then text :: acc else acc) | Macro ("Nd", args) :: rest -> let text = strip_groff_escapes args |> String.trim in collect rest (if String.length text > 0 then ("\\- " ^ text) :: acc else acc) | _ :: rest -> collect rest acc and finish acc = let full = String.concat " " (List.rev acc) |> String.trim in (* NAME lines look like: "git-add \- Add file contents to the index" *) let sep = Str.regexp {| *\\- *\| +- +|} in match Str.bounded_split sep full 2 with | [_; desc] -> Some (String.trim desc) | _ -> None in find classified (* --- SYNOPSIS command name extraction --- * the SYNOPSIS section shows how to invoke the command: * .SH SYNOPSIS * .B git add * [\fIOPTIONS\fR] [\fB\-\-\fR] [\fI\fR...] * * we extract the command name by taking consecutive "word" tokens until * we hit something that looks like an argument (starts with [, <, -, etc.). *) let extract_synopsis_command_lines lines = let classified = List.map classify_line lines in let is_synopsis name = String.uppercase_ascii (String.trim name) = "SYNOPSIS" in (* extract the command name from a line by taking leading word tokens *) let extract_cmd line = let words = String.split_on_char ' ' (String.trim line) in let words = List.filter (fun word -> String.length word > 0) words in let is_cmd_char = function | 'a'..'z' | 'A'..'Z' | '0'..'9' | '-' | '_' | '.' -> true | _ -> false in (* take words that look like command name parts, stop at arguments *) let rec take = function | [] -> [] | word :: rest -> if String.length word > 0 && (word.[0] = '[' || word.[0] = '-' || word.[0] = '<' || word.[0] = '(' || word.[0] = '{') then [] else if String.for_all is_cmd_char word then word :: take rest else [] in match take words with | [] -> None | cmd -> Some (String.concat " " cmd) in let rec find = function | [] -> None | Macro ("SH", args) :: rest when is_synopsis args -> collect rest | _ :: rest -> find rest and collect = function | [] -> None | Macro ("SH", _) :: _ -> None | Text text :: _ -> let text = String.trim text in if String.length text > 0 then extract_cmd text else None | Macro (("B" | "BI" | "BR"), args) :: _ -> let text = strip_inline_macro_args args |> strip_groff_escapes |> String.trim in if String.length text > 0 then extract_cmd text else None | _ :: rest -> collect rest in find classified let extract_synopsis_command contents = let lines = String.split_on_char '\n' contents in extract_synopsis_command_lines lines (* --- SYNOPSIS positional extraction --- * extract positional arguments from the SYNOPSIS section by collecting * all text/formatting macro lines, joining them, skipping the command * name prefix, then running parse_usage_args from parser.ml on the remainder. *) let extract_synopsis_positionals_lines lines = let classified = List.map classify_line lines in let is_synopsis name = String.uppercase_ascii (String.trim name) = "SYNOPSIS" in let rec find = function | [] -> [] | Macro ("SH", args) :: rest when is_synopsis args -> collect rest [] | _ :: rest -> find rest and collect lines acc = match lines with | [] -> finish acc | Macro ("SH", _) :: _ -> finish acc | Macro ("SS", _) :: _ -> finish acc | Macro ("br", _) :: _ -> finish acc | Text text :: rest -> let text = strip_groff_escapes text |> String.trim in collect rest (if String.length text > 0 then text :: acc else acc) | Macro (("B" | "BI" | "BR" | "I" | "IR" | "IB" | "RB" | "RI"), args) :: rest -> let text = strip_inline_macro_args args |> strip_groff_escapes |> String.trim in collect rest (if String.length text > 0 then text :: acc else acc) | _ :: rest -> collect rest acc and finish acc = let parts = List.rev acc in let full = String.concat " " parts |> String.trim in if String.length full = 0 then [] else let cmd_end = skip_command_prefix full in let args = String.sub full cmd_end (String.length full - cmd_end) in parse_usage_args args in find classified (* --- mdoc (bsd) format support --- * mdoc is the bsd manpage macro package. it uses semantic macros rather than * presentation macros: * .Fl v -> flag: -v * .Ar file -> argument: file * .Op ... -> optional: [...] * .Bl/.It/.El -> list begin/item/end * .Sh -> section header (note lowercase 'h', vs groff's .SH) * * the parser walks through classified lines looking for .Bl (list begin) * blocks containing .It (items) with .Fl (flag) entries. *) let is_mdoc lines = List.exists (fun line -> match classify_line line with Macro ("Sh", _) -> true | _ -> false ) lines (* extract renderable text from an mdoc line, skipping structural macros *) let mdoc_text_of line = match line with | Text text -> Some (strip_groff_escapes text) | Macro (macro_name, args) -> (match macro_name with | "Pp" | "Bl" | "El" | "Sh" | "Ss" | "Os" | "Dd" | "Dt" | "Oo" | "Oc" | "Op" -> None | _ -> let text = strip_groff_escapes args |> String.trim in if text = "" then None else Some text) | _ -> None (* parse an mdoc .It (list item) line that contains flag definitions. * mdoc .It lines look like: ".It Fl v Ar file" * where Fl = flag, Ar = argument. we extract the flag name and parameter. * * only handles single-char short flags and long flags starting with '-'. * mdoc's .Fl macro automatically prepends '-', so "Fl v" means "-v" * and "Fl -verbose" means "--verbose". *) let parse_mdoc_it args = let words = String.split_on_char ' ' args |> List.filter (fun word -> word <> "" && word <> "Ns") in let param = match words with | _ :: _ :: "Ar" :: param_name :: _ -> Some (Mandatory param_name) | _ -> None in match words with | "Fl" :: char_str :: _ when String.length char_str = 1 && is_alphanumeric char_str.[0] -> Some { switch = Short char_str.[0]; param; desc = "" } | "Fl" :: name :: _ when String.length name > 1 && name.[0] = '-' -> Some { switch = Long (String.sub name 1 (String.length name - 1)); param; desc = "" } | _ -> None (* extract a positional argument from an mdoc line (.Ar or .Op Ar) *) let positional_of_mdoc_line optional args = let words = String.split_on_char ' ' args |> List.filter (fun word -> word <> "") in match words with | name :: _ when String.length name >= 2 -> Some { pos_name = String.lowercase_ascii name; optional; variadic = List.mem "..." words } | _ -> None (* parse an entire mdoc-format manpage. * walks through all classified lines looking for: * 1. .Bl/.It/.El list blocks containing flag definitions * 2. .Sh SYNOPSIS sections containing positional arguments (.Ar, .Op Ar) * * the scan function handles nested .Bl blocks — if the first .It in a .Bl * starts with .Fl (a flag), the entire list is parsed as options. otherwise * the list is skipped (it might be an example list or a description list). *) let parse_mdoc_lines lines = let classified = List.map classify_line lines in (* skip lines until the matching .El closing tag *) let rec skip_to_el = function | [] -> [] | Macro ("El", _) :: rest -> rest | _ :: rest -> skip_to_el rest in (* collect description text lines until the next structural macro *) let rec collect_desc acc = function | [] -> (acc, []) | (Macro ("It", _) | Macro ("El", _) | Macro ("Sh", _) | Macro ("Ss", _)) :: _ as rest -> (acc, rest) | line :: rest -> collect_desc (match mdoc_text_of line with Some text -> text :: acc | None -> acc) rest in (* convenience: collect desc and join into a trimmed string *) let desc_of rest = let parts, rest = collect_desc [] rest in (String.concat " " (List.rev parts) |> String.trim, rest) in (* parse a single .It entry: extract flag, collect description *) let parse_it args rest entries = let desc, rest = desc_of rest in let entries = match parse_mdoc_it args with | Some entry -> { entry with desc } :: entries | None -> entries in (entries, rest) in (* parse all .It entries within a .Bl/.El option list *) let rec parse_option_list entries = function | [] -> (entries, []) | Macro ("El", _) :: rest -> (entries, rest) | Macro ("It", args) :: rest -> let entries, rest = parse_it args rest entries in parse_option_list entries rest | _ :: rest -> parse_option_list entries rest in (* main scan: walk through all lines, collecting flags and positionals *) let rec scan entries positionals = function | [] -> (entries, positionals) | Macro ("Bl", _) :: Macro ("It", it_args) :: rest -> (* peek at first .It to decide if this is a flag list *) let words = String.split_on_char ' ' it_args |> List.filter (fun word -> word <> "") in if (match words with "Fl" :: _ -> true | _ -> false) then let entries, rest = parse_it it_args rest entries in let entries, rest = parse_option_list entries rest in scan entries positionals rest else scan entries positionals (skip_to_el rest) | Macro ("Bl", _) :: rest -> scan entries positionals (skip_to_el rest) | Macro ("Sh", args) :: rest when String.uppercase_ascii (String.trim args) = "SYNOPSIS" -> let positionals, rest = parse_synopsis positionals rest in scan entries positionals rest | _ :: rest -> scan entries positionals rest and parse_synopsis positionals = function | [] -> (positionals, []) | Macro ("Sh", _) :: _ as rest -> (positionals, rest) | Macro ("Ar", args) :: rest -> let positionals = match positional_of_mdoc_line false args with | Some p -> p :: positionals | None -> positionals in parse_synopsis positionals rest | Macro ("Op", args) :: rest -> let words = String.split_on_char ' ' args |> List.filter (fun word -> word <> "") in let positionals = match words with | "Ar" :: _ -> (match positional_of_mdoc_line true args with | Some p -> p :: positionals | None -> positionals) | _ -> positionals in parse_synopsis positionals rest | _ :: rest -> parse_synopsis positionals rest in let entries, positionals = scan [] [] classified in (* deduplicate positionals by name, preserving order *) let positionals = List.rev positionals |> List.fold_left (fun (seen, acc) p -> if List.mem p.pos_name seen then (seen, acc) else (p.pos_name :: seen, p :: acc) ) ([], []) |> snd |> List.rev in { entries = List.rev entries; subcommands = []; positionals; description = "" } (* --- COMMANDS section subcommand extraction --- * some manpages (notably systemctl) have a dedicated COMMANDS section * listing subcommands with descriptions. these use .PP + bold name + * .RS/.RE blocks: * .PP * \fBstart\fR \fIUNIT\fR... * .RS 4 * Start (activate) one or more units. * .RE * * we extract the bold command name and first sentence of description. *) let extract_commands_section lines = let classified = List.map classify_line lines in (* collect all lines from the current position until the next .SH *) let rec collect_until_next_sh lines acc = match lines with | [] -> List.rev acc | Macro ("SH", _) :: _ -> List.rev acc | line :: rest -> collect_until_next_sh rest (line :: acc) in let is_commands_section name = let upper = String.uppercase_ascii (String.trim name) in upper = "COMMANDS" || upper = "COMMAND" in (* find all COMMANDS/.COMMAND sections and collect their lines *) let rec find_commands acc = function | [] -> List.rev acc | Macro ("SH", args) :: rest when is_commands_section args -> find_commands (collect_until_next_sh rest [] :: acc) rest | _ :: rest -> find_commands acc rest in let sections = find_commands [] classified in List.concat sections (* extract subcommand name from a bold groff text like * "\fBlist\-units\fR [\fIPATTERN\fR...]" -> "list-units" * * validates that the extracted name looks like a subcommand: lowercase, * at least 2 chars, no leading dash. falls back to stripping all groff * and taking the first word if no \fB...\fR wrapper is found. *) let extract_bold_command_name text = let trimmed = String.trim text in (* check whether a string looks like a valid subcommand name *) let is_valid_subcmd name = String.length name >= 2 && name.[0] <> '-' && String.for_all (fun char_val -> (char_val >= 'a' && char_val <= 'z') || (char_val >= '0' && char_val <= '9') || char_val = '-' || char_val = '_' ) name in (* look for \fB...\fR at the start *) if String.length trimmed >= 4 && trimmed.[0] = '\\' && trimmed.[1] = 'f' && trimmed.[2] = 'B' then let start = 3 in let end_marker = "\\fR" in match String.split_on_char '\\' (String.sub trimmed start (String.length trimmed - start)) with | name_part :: _ -> let name = strip_groff_escapes ("\\fB" ^ name_part ^ end_marker) |> String.trim in if is_valid_subcmd name then Some name else None | [] -> None else (* try already-stripped text — take the first word *) let stripped = strip_groff_escapes trimmed in let first_word = match String.split_on_char ' ' stripped with | word :: _ -> word | [] -> "" in if is_valid_subcmd first_word then Some first_word else None (* walk through commands section lines, extracting subcommand name+description * pairs from .PP + Text + .RS/.RE blocks *) let extract_subcommands_from_commands lines = let rec walk lines acc = match lines with | [] -> List.rev acc | Macro ("PP", _) :: rest -> begin match rest with | Text tag :: rest2 -> (* check if this is a subcommand (bold name, not a flag) *) begin match extract_bold_command_name tag with | Some name -> (* collect description from .RS/.RE block *) let rec collect_desc lines desc_acc = match lines with | Macro ("RS", _) :: rest3 -> collect_in_rs rest3 desc_acc | Text text :: rest3 -> collect_desc rest3 (text :: desc_acc) | _ -> (String.concat " " (List.rev desc_acc), lines) and collect_in_rs lines desc_acc = match lines with | Macro ("RE", _) :: rest3 -> (String.concat " " (List.rev desc_acc), rest3) | Text text :: rest3 -> collect_in_rs rest3 (text :: desc_acc) | Macro ("PP", _) :: _ | Macro ("SH", _) :: _ | Macro ("SS", _) :: _ -> (String.concat " " (List.rev desc_acc), lines) | _ :: rest3 -> collect_in_rs rest3 desc_acc | [] -> (String.concat " " (List.rev desc_acc), []) in let (desc, rest3) = collect_desc rest2 [] in let desc = String.trim desc in (* take first sentence as description *) let short_desc = match String.split_on_char '.' desc with | first :: _ when String.length first > 0 -> String.trim first | _ -> desc in let sc : subcommand = { name; desc = short_desc } in walk rest3 (sc :: acc) | None -> walk rest2 acc end | _ -> walk rest acc end | _ :: rest -> walk rest acc in walk lines [] (* --- top-level api --- *) (* parse a manpage from its classified lines. * auto-detects mdoc vs groff format. for groff, runs the multi-strategy * extraction pipeline: extract OPTIONS section -> try all strategies -> * pick best -> extract SYNOPSIS positionals -> extract COMMANDS subcommands. *) let parse_manpage_lines lines = if is_mdoc lines then parse_mdoc_lines lines else begin let options_section = extract_options_section lines in let entries = extract_entries options_section in let positionals = extract_synopsis_positionals_lines lines in let commands_section = extract_commands_section lines in let subcommands = extract_subcommands_from_commands commands_section in { entries; subcommands; positionals; description = "" } end (* parse a manpage from its raw string contents. * splits into lines, parses, then extracts the NAME section description. *) let parse_manpage_string contents = let lines = String.split_on_char '\n' contents in let result = parse_manpage_lines lines in let description = match extract_name_description contents with | Some desc -> desc | None -> "" in { result with description } (* --- clap-style SUBCOMMAND section extraction --- * manpages generated by clap (rust's cli arg parser) put each subcommand * under its own .SH SUBCOMMAND header with a Usage: line giving the name. * this is unusual — most tools list subcommands under a single COMMANDS section. * * we collect all .SH SUBCOMMAND/SUBCOMMANDS sections, find the Usage: line * in each to get the subcommand name, then extract flag entries from the * section body. returns triples of (name, description, help_result). *) let extract_subcommand_sections contents = let lines = String.split_on_char '\n' contents in let classified = List.map classify_line lines in (* split into sections at .SH boundaries, keeping only SUBCOMMAND(S) sections *) let rec collect_sections acc current_name current_lines = function | [] -> let acc = match current_name with | Some section_name -> (section_name, List.rev current_lines) :: acc | None -> acc in List.rev acc | Macro ("SH", args) :: rest -> let acc = match current_name with | Some section_name -> (section_name, List.rev current_lines) :: acc | None -> acc in let name = String.uppercase_ascii (String.trim args) in if name = "SUBCOMMAND" || name = "SUBCOMMANDS" then collect_sections acc (Some name) [] rest else collect_sections acc None [] rest | line :: rest -> collect_sections acc current_name (line :: current_lines) rest in let sections = collect_sections [] None [] classified in (* for each SUBCOMMAND section, extract name from Usage: line and parse entries *) let usage_re = Str.regexp {|Usage: \([a-zA-Z0-9_-]+\)|} in let matches_usage text = try ignore (Str.search_forward usage_re text 0); Some (Str.matched_group 1 text) with Not_found -> None in List.filter_map (fun (_header, section_lines) -> (* scan section lines for the Usage: line to get the subcommand name *) let name, desc_lines = List.fold_left (fun (name, desc_lines) line -> match name with | Some _ -> (name, desc_lines) | None -> match line with | Text text -> (match matches_usage text with | Some _ as found -> (found, desc_lines) | None -> (None, text :: desc_lines)) | Macro (("TP" | "B" | "BI" | "BR"), args) -> let text = strip_inline_macro_args args |> strip_groff_escapes |> String.trim in (matches_usage text, desc_lines) | _ -> (None, desc_lines) ) (None, []) section_lines in match name with | None -> None | Some subcmd_name -> let entries = extract_entries section_lines in let desc = String.concat " " (List.rev desc_lines) |> strip_groff_escapes |> String.trim in (* strip backtick-quoted words *) let desc = Str.global_replace (Str.regexp "`\\([^`]*\\)`") "\\1" desc in Some (subcmd_name, desc, { entries; subcommands = []; positionals = []; description = desc }) ) sections (* read a manpage file from disk. handles .gz compressed files (the common * case — most installed manpages are gzipped) using the Gzip library. * plain text files are read directly. *) let read_manpage_file path = if Filename.check_suffix path ".gz" then begin let ic = Gzip.open_in path in let buffer = Buffer.create 8192 in let chunk = Bytes.create 8192 in (try while true do let bytes_read = Gzip.input ic chunk 0 8192 in if bytes_read = 0 then raise Exit else Buffer.add_subbytes buffer chunk 0 bytes_read done with Exit | End_of_file -> ()); Gzip.close_in ic; Buffer.contents buffer end else begin let ic = open_in path in let size = in_channel_length ic in let bytes = Bytes.create size in really_input ic bytes 0 size; close_in ic; Bytes.to_string bytes end (* convenience: read + parse a manpage file in one step *) let parse_manpage_file path = read_manpage_file path |> parse_manpage_string