inshellah/lib/manpage.ml

(* manpage.ml — parse unix manpages (groff/mdoc format) into help_result.
 *
 * manpages are written in roff/groff markup — a decades-old typesetting language
 * used by man(1). this module strips the formatting and extracts structured data
 * (flags, subcommands, positionals) from the raw groff source.
 *
 * there are two major manpage macro packages:
 *   - man (groff) — used by gnu/linux tools. uses macros like .SH, .TP, .IP, .PP
 *   - mdoc (bsd) — used by bsd tools. uses .Sh, .Fl, .Ar, .Op, .It, .Bl/.El
 *
 * this module handles both, auto-detecting the format by checking for .Sh macros.
 *
 * for groff manpages, flag extraction uses multiple "strategies" that target
 * different common formatting patterns:
 *   - strategy_tp: .TP tagged paragraphs (gnu coreutils, help2man)
 *   - strategy_ip: .IP indented paragraphs (curl, hand-written)
 *   - strategy_pp_rs: .PP + .RS/.RE blocks (git, docbook)
 *   - strategy_nix: nix3-style bullet .IP with .UR/.UE hyperlinks
 *   - strategy_deroff: fallback — strip all groff, feed to help text parser
 *
 * the module tries all applicable strategies and picks the one that extracts
 * the most flag entries, on the theory that more results = better match.
 *
 * key peculiarities:
 *   - groff has an enormous escape syntax (font changes, named characters,
 *     size changes, color, string variables, etc.) — strip_groff_escapes
 *     handles the common cases but is not exhaustive
 *   - font escapes like \fI (italic) need to insert spaces at word boundaries
 *     to prevent flag names from fusing with their parameter names
 *   - the strategies share the angstrom-based switch_parser from parser.ml
 *     for parsing the actual flag syntax out of the stripped text
 *)

open Parser

(* --- shared helpers for imperative string scanning ---
 * many groff parsing routines use an imperative cursor (ref int) walking
 * through a string. these helpers factor out common scanning patterns. *)

(* advance pos past all characters until the delimiter is found.
 * leaves pos pointing at the delimiter character, or at len if not found. *)
let skip_to_char source len pos delim =
  while !pos < len && source.[!pos] <> delim do incr pos done

(* translate a groff named character escape to its text equivalent.
 * groff uses two-letter codes like "aq" for apostrophe, "lq"/"rq" for
 * left/right quotes, "em"/"en" for dashes. returns None for unknown names. *)
let named_char_of = function
  | "aq"         -> Some '\''
  | "lq" | "Lq" -> Some '\x22'  (* left double quote *)
  | "rq" | "Rq" -> Some '\x22'  (* right double quote *)
  | "em" | "en"  -> Some '-'
  | _            -> None

(* skip a groff reference that uses one of three sub-forms:
 *   single char  — e.g. \*X or \nX
 *   ( + 2 chars  — e.g. \*(XX or \n(XX
 *   [ to ]       — e.g. \*[name] or \n[name]
 * used for \* (string variable) and \n (number register) escapes.
 * advances pos past the consumed characters. *)
let skip_groff_reference source len pos =
  if !pos < len then begin
    if source.[!pos] = '(' then
      pos := !pos + 3  (* skip past '(' + two-character name *)
    else if source.[!pos] = '[' then begin
      incr pos;
      skip_to_char source len pos ']';
      if !pos < len then incr pos
    end else
      incr pos
  end

(* --- groff escape/formatting stripper ---
 * groff escapes start with backslash and use various continuation syntaxes.
 * this function strips them, replacing named characters (like \(aq for
 * apostrophe) with their text equivalents and discarding formatting directives. *)

let strip_groff_escapes source =
  let buffer = Buffer.create (String.length source) in
  let len = String.length source in
  let pos = ref 0 in
  let prev_char = ref '\000' in
  (* emit a character into the output buffer and track it as previous *)
  let put char_val = Buffer.add_char buffer char_val; prev_char := char_val in
  let is_alnum char_val =
    (char_val >= 'a' && char_val <= 'z')
    || (char_val >= 'A' && char_val <= 'Z')
    || (char_val >= '0' && char_val <= '9')
  in
  while !pos < len do
    if source.[!pos] = '\\' && !pos + 1 < len then begin
      let next = source.[!pos + 1] in
      match next with
      | 'f' ->
        (* font escape: \fB, \fI, \fP, \fR, \f(XX, \f[...] *)
        if !pos + 2 < len then begin
          let font_char = source.[!pos + 2] in
          (* insert space before italic font to preserve word boundaries
             e.g. \fB--max-results\fR\fIcount\fR -> "--max-results count" *)
          if font_char = 'I' && is_alnum !prev_char then put ' ';
          if font_char = '(' then
            pos := !pos + 5  (* \f(XX — two-character font name *)
          else if font_char = '[' then begin
            pos := !pos + 3;
            skip_to_char source len pos ']';
            if !pos < len then incr pos
          end else
            pos := !pos + 3  (* \fX — single-character font selector *)
        end else
          pos := !pos + 2
      | '-' ->
        (* escaped hyphen-minus — emit a plain hyphen *)
        put '-';
        pos := !pos + 2
      | '&' | '/' | ',' ->
        (* zero-width characters — discard without output *)
        pos := !pos + 2
      | '(' ->
        (* two-char named character: \(aq, \(lq, \(rq, etc. *)
        if !pos + 3 < len then begin
          let name = String.sub source (!pos + 2) 2 in
          (match named_char_of name with
           | Some char_val -> put char_val
           | None -> ());
          pos := !pos + 4
        end else
          pos := !pos + 2
      | '[' ->
        (* bracketed named character: \[aq], \[lq], etc. *)
        pos := !pos + 2;
        let start = !pos in
        skip_to_char source len pos ']';
        if !pos < len then begin
          let name = String.sub source start (!pos - start) in
          (match named_char_of name with
           | Some char_val -> put char_val
           | None -> ());
          incr pos
        end
      | 's' ->
        (* size escape: \sN, \s+N, \s-N — skip the numeric argument *)
        pos := !pos + 2;
        if !pos < len && (source.[!pos] = '+' || source.[!pos] = '-') then incr pos;
        if !pos < len && source.[!pos] >= '0' && source.[!pos] <= '9' then incr pos;
        if !pos < len && source.[!pos] >= '0' && source.[!pos] <= '9' then incr pos
      | 'm' ->
        (* color escape: \m[...] — skip the bracketed color name *)
        pos := !pos + 2;
        if !pos < len && source.[!pos] = '[' then begin
          incr pos;
          skip_to_char source len pos ']';
          if !pos < len then incr pos
        end
      | 'X' ->
        (* device control: \X'...' — skip the single-quoted payload *)
        pos := !pos + 2;
        if !pos < len && source.[!pos] = '\'' then begin
          incr pos;
          skip_to_char source len pos '\'';
          if !pos < len then incr pos
        end
      | '*' ->
        (* string variable: \*X or \*(XX or \*[...] — skip the reference *)
        pos := !pos + 2;
        skip_groff_reference source len pos
      | 'n' ->
        (* number register: \nX or \n(XX or \n[...] — skip the reference *)
        pos := !pos + 2;
        skip_groff_reference source len pos
      | 'e' ->
        (* escaped backslash literal *)
        put '\\';
        pos := !pos + 2
      | '\\' ->
        (* double backslash — emit one *)
        put '\\';
        pos := !pos + 2
      | ' ' ->
        (* escaped space — emit a regular space *)
        put ' ';
        pos := !pos + 2
      | _ ->
        (* unknown escape — skip the two-character sequence *)
        pos := !pos + 2
    end else begin
      put source.[!pos];
      incr pos
    end
  done;
  Buffer.contents buffer

(* strip inline macro formatting: .BI, .BR, .IR, etc.
 * these macros alternate between fonts for their arguments, e.g.:
 *   .BI "--output " "FILE"
 * becomes "--outputFILE" (arguments concatenated without spaces).
 *
 * quoted strings are kept together (quotes stripped), but unquoted spaces
 * are consumed. this matches groff's actual rendering of these macros,
 * where alternating-font arguments are concatenated. *)
let strip_inline_macro_args text =
  let buffer = Buffer.create (String.length text) in
  let len = String.length text in
  let pos = ref 0 in
  while !pos < len do
    if text.[!pos] = '"' then begin
      (* quoted argument — copy characters up to the closing quote *)
      incr pos;
      while !pos < len && text.[!pos] <> '"' do
        Buffer.add_char buffer text.[!pos];
        incr pos
      done;
      if !pos < len then incr pos
    end else if text.[!pos] = ' ' || text.[!pos] = '\t' then begin
      (* unquoted whitespace — skip (arguments are concatenated) *)
      incr pos
    end else begin
      (* regular character — copy to output *)
      Buffer.add_char buffer text.[!pos];
      incr pos
    end
  done;
  Buffer.contents buffer

(* convenience: strip escapes and trim whitespace *)
let strip_groff line =
  let text = strip_groff_escapes line in
  String.trim text

(* --- line classification ---
 * every line in a manpage is classified as one of four types.
 * this classification drives all subsequent parsing — strategies
 * pattern-match on sequences of classified lines. *)

type groff_line =
  | Macro of string * string   (* macro name + args, e.g. ("SH", "OPTIONS") or ("TP", "") *)
  | Text of string             (* plain text after groff stripping *)
  | Blank                      (* empty line *)
  | Comment                    (* groff comment: .backslash-quote or backslash-quote *)

(* classify a single line of manpage source.
 * macro lines start with '.' or '\'' (groff alternate control char).
 * the macro name is split from its arguments at the first space/tab.
 * arguments wrapped in double quotes are unquoted. *)
let classify_line line =
  let len = String.length line in
  if len = 0 then Blank
  else if len >= 2 && line.[0] = '.' && line.[1] = '\\' && (len < 3 || line.[2] = '"') then
    Comment
  else if len >= 3 && line.[0] = '\\' && line.[1] = '"' then
    Comment
  else if line.[0] = '.' || line.[0] = '\'' then begin
    (* macro line — extract macro name and arguments *)
    let rest = String.sub line 1 (len - 1) in
    let rest = String.trim rest in
    (* split into macro name and arguments at the first whitespace *)
    let space_pos =
      try Some (String.index rest ' ')
      with Not_found ->
        try Some (String.index rest '\t')
        with Not_found -> None
    in
    match space_pos with
    | Some split_at ->
      let name = String.sub rest 0 split_at in
      let args = String.trim (String.sub rest (split_at + 1) (String.length rest - split_at - 1)) in
      (* strip surrounding quotes from arguments *)
      let args =
        let alen = String.length args in
        if alen >= 2 && args.[0] = '"' && args.[alen - 1] = '"' then
          String.sub args 1 (alen - 2)
        else args
      in
      Macro (name, args)
    | None ->
      Macro (rest, "")
  end else begin
    let stripped = strip_groff line in
    if String.length stripped = 0 then Blank
    else Text stripped
  end

(* refined comment detection — the base classify_line may miss some comment
 * forms, so this wrapper checks more carefully before falling through to
 * the general classifier. *)
let is_comment_line line =
  let len = String.length line in
  (len >= 3 && line.[0] = '.' && line.[1] = '\\' && line.[2] = '"')
  || (len >= 2 && line.[0] = '\\' && line.[1] = '"')

let classify_line line =
  if is_comment_line line then Comment
  else classify_line line

(* --- section extraction ---
 * manpages are divided into sections by .SH macros. the OPTIONS section
 * contains the flag definitions we want. if there's no OPTIONS section,
 * we fall back to DESCRIPTION (some simple tools put flags there).
 *
 * old-style nix manpages (nix-build, nix-env-install, etc.) split flags
 * across multiple .SH sections with option-like names: e.g. "Options" for
 * command-specific flags and "Common Options" for flags shared by all nix
 * commands. collecting only the first such section misses the majority of
 * flags, so we collect and concatenate all option-like sections. *)

let extract_options_section lines =
  let classified = List.map classify_line lines in
  (* collect lines until the next .SH header, returning (content, rest)
   * where rest starts at the .SH line (or is empty if at end of file). *)
  let rec collect_section lines acc =
    match lines with
    | [] -> (List.rev acc, [])
    | Macro ("SH", _) :: _ -> (List.rev acc, lines)
    | line :: rest -> collect_section rest (line :: acc)
  in
  (* test whether a section name looks like an options section.
   * matches "OPTIONS", "COMMON OPTIONS", "GLOBAL OPTIONS", etc. *)
  let is_options_section name =
    let upper = String.uppercase_ascii (String.trim name) in
    upper = "OPTIONS"
    || (String.length upper > 0 &&
        try let _ = Str.search_forward (Str.regexp_string "OPTION") upper 0 in true
        with Not_found -> false)
  in
  (* collect from all option-like .SH sections and concatenate them.
   * handles the common nix pattern where "Options" and "Common Options"
   * are separate .SH sections but both contain relevant flags.
   *
   * a synthetic Macro("SH","") separator is inserted between sections so
   * that collect_desc_text (which stops on SH/SS) does not let a description
   * from the last entry in one section bleed into the intro text of the next. *)
  let rec find_all_options lines acc =
    match lines with
    | [] -> acc
    | Macro ("SH", args) :: rest when is_options_section args ->
      let (section, remaining) = collect_section rest [] in
      let sep = if acc = [] then [] else [Macro ("SH", "")] in
      find_all_options remaining (acc @ sep @ section)
    | _ :: rest -> find_all_options rest acc
  in
  (* fallback: DESCRIPTION section for simple tools that put flags there *)
  let rec find_description = function
    | [] -> []
    | Macro ("SH", args) :: rest
      when String.uppercase_ascii (String.trim args) = "DESCRIPTION" ->
      fst (collect_section rest [])
    | _ :: rest -> find_description rest
  in
  match find_all_options classified [] with
  | [] -> find_description classified
  | sections -> sections

(* --- strategy-based entry extraction ---
 * rather than a single monolithic parser, we use multiple "strategies" that
 * each target a specific groff formatting pattern. this is necessary because
 * manpage authors use very different macro combinations for the same purpose.
 *
 * the shared building blocks:
 *   - collect_text_lines: gather consecutive Text lines into one description string
 *   - parse_tag_to_entry: run the angstrom switch parser on a tag string to
 *     extract the flag definition. this reuses the same parser that handles
 *     --help output, giving consistent extraction across both sources.
 *   - tag_of_macro: extract the "tag" text from formatting macros like .B, .BI, etc.
 *)

(* collect consecutive text lines, joining them with spaces *)
let rec collect_text_lines lines acc =
  match lines with
  | Text text :: rest -> collect_text_lines rest (text :: acc)
  | _ -> (String.concat " " (List.rev acc), lines)

(* attempt to parse a tag string (e.g. "-v, --verbose FILE") into an entry.
 * uses the angstrom switch_parser + param_parser from parser.ml.
 * returns None if the tag doesn't look like a flag definition. *)
let parse_tag_to_entry tag desc =
  let tag = strip_groff_escapes tag in
  let tag = String.trim tag in
  match Angstrom.parse_string ~consume:Angstrom.Consume.Prefix
          (Angstrom.lift2 (fun sw p -> (sw, p)) switch_parser param_parser) tag with
  | Ok (switch, param) -> Some { switch; param; desc }
  | Error _ -> None

(* extract tag text from a macro line.
 * .B and .I preserve spaces (single argument); .BI, .BR, .IR alternate
 * fonts and concatenate arguments. *)
let tag_of_macro name args =
  match name with
  | "B" | "I" -> strip_groff_escapes args |> String.trim
  | _ -> strip_inline_macro_args args |> strip_groff_escapes |> String.trim

(* strategy a: .TP style (most common — gnu coreutils, help2man).
 * .TP introduces a tagged paragraph: the next line is the "tag" (flag name)
 * and subsequent text lines are the description. the tag can be plain text
 * or wrapped in a formatting macro (.B, .BI, etc.).
 *
 * example groff:
 *   .TP
 *   \fB\-v\fR, \fB\-\-verbose\fR
 *   increase verbosity *)
let strategy_tp lines =
  let rec walk lines acc =
    match lines with
    | [] -> List.rev acc
    | Macro ("TP", _) :: rest ->
      (* next line is the tag — could be Text or a formatting macro *)
      begin match rest with
        | Text tag :: rest2 ->
          let (desc, rest3) = collect_text_lines rest2 [] in
          let entry = parse_tag_to_entry tag desc in
          walk rest3 (match entry with Some e -> e :: acc | None -> acc)
        | Macro (("B" | "I" | "BI" | "BR" | "IR") as macro_name, args) :: rest2 ->
          let tag = tag_of_macro macro_name args in
          let (desc, rest3) = collect_text_lines rest2 [] in
          let entry = parse_tag_to_entry tag desc in
          walk rest3 (match entry with Some e -> e :: acc | None -> acc)
        | _ -> walk rest acc
      end
    | _ :: rest -> walk rest acc
  in
  walk lines []

(* strategy b: .IP style (curl, hand-written manpages).
 * .IP takes an inline tag argument: .IP "-v, --verbose"
 * the description follows as text lines. simpler than .TP because
 * the tag is on the macro line itself. *)
let strategy_ip lines =
  let rec walk lines acc =
    match lines with
    | [] -> List.rev acc
    | Macro ("IP", tag) :: rest ->
      let tag = strip_groff_escapes tag in
      let (desc, rest2) = collect_text_lines rest [] in
      let entry = parse_tag_to_entry tag desc in
      walk rest2 (match entry with Some e -> e :: acc | None -> acc)
    | _ :: rest -> walk rest acc
  in
  walk lines []

(* strategy c: .PP + .RS/.RE style (git, docbook-generated manpages).
 * flag entries are introduced by .PP (paragraph), with the flag name as
 * plain text, followed by a .RS (indent) block containing the description,
 * closed by .RE (de-indent). this is common in docbook-to-manpage toolchains. *)
let strategy_pp_rs lines =
  let rec walk lines acc =
    match lines with
    | [] -> List.rev acc
    | Macro ("PP", _) :: rest ->
      begin match rest with
        | Text tag :: rest2 ->
          (* look for .RS ... text ... .RE *)
          let rec collect_rs lines desc_acc =
            match lines with
            | Macro ("RS", _) :: rest3 ->
              collect_in_rs rest3 desc_acc
            | Text text :: rest3 ->
              (* sometimes description follows directly *)
              collect_rs rest3 (text :: desc_acc)
            | _ -> (String.concat " " (List.rev desc_acc), lines)
          and collect_in_rs lines desc_acc =
            match lines with
            | Macro ("RE", _) :: rest3 ->
              (String.concat " " (List.rev desc_acc), rest3)
            | Text text :: rest3 ->
              collect_in_rs rest3 (text :: desc_acc)
            | Macro ("PP", _) :: _ | Macro ("SH", _) :: _ ->
              (String.concat " " (List.rev desc_acc), lines)
            | _ :: rest3 -> collect_in_rs rest3 desc_acc
            | [] -> (String.concat " " (List.rev desc_acc), [])
          in
          let (desc, rest3) = collect_rs rest2 [] in
          let entry = parse_tag_to_entry tag desc in
          walk rest3 (match entry with Some e -> e :: acc | None -> acc)
        | _ -> walk rest acc
      end
    | _ :: rest -> walk rest acc
  in
  walk lines []

(* strategy d: deroff fallback — strip all groff markup, then feed the
 * resulting plain text through the --help parser from parser.ml.
 * this is the last resort when no structured macro pattern is recognized.
 * it works surprisingly well for simple manpages but may miss entries
 * in heavily formatted ones. *)
let strategy_deroff_lines lines =
  let buffer = Buffer.create 256 in
  List.iter (fun line ->
    match line with
    | Text text ->
      Buffer.add_string buffer text;
      Buffer.add_char buffer '\n'
    | Macro (("BI" | "BR" | "IR" | "B" | "I"), args) ->
      let text = strip_inline_macro_args args in
      let text = strip_groff_escapes text in
      Buffer.add_string buffer text;
      Buffer.add_char buffer '\n'
    | Blank -> Buffer.add_char buffer '\n'
    | _ -> ()
  ) lines;
  let text = Buffer.contents buffer in
  match parse_help text with
  | Ok result -> result.entries
  | Error _ -> []

(* strategy e: nix3-style bullet .IP with .UR/.UE hyperlinks.
 * nix's manpages use .IP with bullet markers for flag entries, interleaved
 * with .UR/.UE hyperlink macros. the flag tag is in text lines after the
 * bullet .IP, and the description follows a non-bullet .IP marker.
 *
 * nix manpages nest .RS/.RE blocks inside descriptions for sub-examples.
 * the skip_rs helper tracks nesting depth to skip these without losing
 * the rest of the description. *)
let strategy_nix lines =
  (* a bullet .IP has non-empty args (the bullet marker) *)
  let is_bullet_ip args =
    String.length (String.trim args) > 0
  in
  let rec walk lines acc =
    match lines with
    | [] -> List.rev acc
    | Macro ("IP", args) :: rest when is_bullet_ip args ->
      (* collect tag: skip .UR/.UE macros, collect Text lines *)
      let rec collect_tag lines parts =
        match lines with
        | Macro ("UR", _) :: rest2 -> collect_tag rest2 parts
        | Macro ("UE", _) :: rest2 -> collect_tag rest2 parts
        | Text text :: rest2 -> collect_tag rest2 (text :: parts)
        | _ -> (String.concat " " (List.rev parts), lines)
      in
      let (tag, rest2) = collect_tag rest [] in
      (* collect description after the description .IP marker *)
      let rec collect_desc lines parts =
        match lines with
        | Macro ("IP", dargs) :: rest3 when not (is_bullet_ip dargs) ->
          collect_desc_text rest3 parts
        | _ -> (String.concat " " (List.rev parts), lines)
      and collect_desc_text lines parts =
        match lines with
        | Text text :: rest3 -> collect_desc_text rest3 (text :: parts)
        | Macro ("IP", args2) :: _ when is_bullet_ip args2 ->
          (* next bullet entry — stop collecting *)
          (String.concat " " (List.rev parts), lines)
        | Macro (("SS" | "SH"), _) :: _ ->
          (* section boundary — stop collecting *)
          (String.concat " " (List.rev parts), lines)
        | Macro ("RS", _) :: rest3 ->
          skip_rs rest3 parts 1
        | Macro ("IP", _) :: rest3 ->
          (* non-bullet .IP = continuation paragraph *)
          collect_desc_text rest3 parts
        | Macro _ :: rest3 -> collect_desc_text rest3 parts
        | Blank :: rest3 -> collect_desc_text rest3 parts
        | Comment :: rest3 -> collect_desc_text rest3 parts
        | [] -> (String.concat " " (List.rev parts), [])
      and skip_rs lines parts depth =
        match lines with
        | Macro ("RE", _) :: rest3 ->
          if depth <= 1 then collect_desc_text rest3 parts
          else skip_rs rest3 parts (depth - 1)
        | Macro ("RS", _) :: rest3 -> skip_rs rest3 parts (depth + 1)
        | _ :: rest3 -> skip_rs rest3 parts depth
        | [] -> (String.concat " " (List.rev parts), [])
      in
      let (desc, rest3) = collect_desc rest2 [] in
      let entry = parse_tag_to_entry tag desc in
      walk rest3 (match entry with Some e -> e :: acc | None -> acc)
    | _ :: rest -> walk rest acc
  in
  walk lines []

(* count occurrences of a specific macro in the section.
 * used by extract_entries to decide which strategies are worth trying. *)
let count_macro name lines =
  List.fold_left (fun count line ->
    match line with Macro (macro_name, _) when macro_name = name -> count + 1 | _ -> count
  ) 0 lines

(* auto-detect and try strategies, return the one with most entries.
 * first counts macros to determine which strategies are applicable,
 * then runs all applicable ones and picks the winner by entry count.
 * if no specialized strategy produces results, falls back to deroff.
 *
 * this "try everything, pick the best" approach is intentional.
 * manpage formatting is too varied and inconsistent to reliably detect the
 * format from macro counts alone. running multiple strategies and comparing
 * results is more robust. *)
let extract_entries lines =
  let tp = count_macro "TP" lines
  and ip = count_macro "IP" lines
  and pp = count_macro "PP" lines
  and rs = count_macro "RS" lines
  and ur = count_macro "UR" lines in
  (* build a list of (label, entries) for each applicable strategy *)
  let specialized = List.filter_map Fun.id [
    (if tp > 0 then Some ("TP", strategy_tp lines) else None);
    (if ip > 0 then Some ("IP", strategy_ip lines) else None);
    (if pp > 0 && rs > 0 then Some ("PP+RS", strategy_pp_rs lines) else None);
    (if ur > 0 && ip > 0 then Some ("nix", strategy_nix lines) else None);
  ] in
  (* filter to strategies that found at least one entry, fall back to deroff *)
  let candidates = match List.filter (fun (_, entries) -> entries <> []) specialized with
    | [] -> [("deroff", strategy_deroff_lines lines)]
    | filtered -> filtered
  in
  (* pick the strategy with the most entries *)
  List.fold_left (fun (_, best) (name, entries) ->
    if List.length entries >= List.length best then (name, entries)
    else (name, best)
  ) ("none", []) candidates |> snd

(* --- NAME section description extraction ---
 * the NAME section in manpages follows the convention:
 *   "command \- short description"
 * we extract the part after "\-" as the command's description.
 * handles both "\-" (groff) and " - " (plain text) separators. *)

let extract_name_description contents =
  let lines = String.split_on_char '\n' contents in
  let classified = List.map classify_line lines in
  let rec find = function
    | [] -> None
    | Macro ("SH", args) :: rest
      when String.uppercase_ascii (String.trim args) = "NAME" ->
      collect rest []
    | _ :: rest -> find rest
  and collect lines acc =
    match lines with
    | Macro ("SH", _) :: _ | [] -> finish acc
    | Text text :: rest -> collect rest (text :: acc)
    | Macro (("B" | "BI" | "BR" | "I" | "IR"), args) :: rest ->
      let text = strip_inline_macro_args args |> strip_groff_escapes |> String.trim in
      collect rest (if String.length text > 0 then text :: acc else acc)
    | Macro ("Nm", args) :: rest ->
      let text = strip_groff_escapes args |> String.trim in
      collect rest (if String.length text > 0 then text :: acc else acc)
    | Macro ("Nd", args) :: rest ->
      let text = strip_groff_escapes args |> String.trim in
      collect rest (if String.length text > 0 then ("\\- " ^ text) :: acc else acc)
    | _ :: rest -> collect rest acc
  and finish acc =
    let full = String.concat " " (List.rev acc) |> String.trim in
    (* NAME lines look like: "git-add \- Add file contents to the index" *)
    let sep = Str.regexp {| *\\- *\| +- +|} in
    match Str.bounded_split sep full 2 with
    | [_; desc] -> Some (String.trim desc)
    | _ -> None
  in
  find classified

(* --- SYNOPSIS command name extraction ---
 * the SYNOPSIS section shows how to invoke the command:
 *   .SH SYNOPSIS
 *   .B git add
 *   [\fIOPTIONS\fR] [\fB\-\-\fR] [\fI<pathspec>\fR...]
 *
 * we extract the command name by taking consecutive "word" tokens until
 * we hit something that looks like an argument (starts with [, <, -, etc.). *)

let extract_synopsis_command_lines lines =
  (* replace italic text (\fI...\fR) with angle-bracketed placeholders
   * before classification strips the font info. italic in groff indicates
   * a parameter/placeholder (e.g. \fIoperation\fR), not a command word.
   * the angle brackets cause extract_cmd to stop at these tokens since
   * '<' is in its stop set. without this, "nix-env \fIoperation\fR"
   * would be parsed as command "nix-env operation" instead of "nix-env". *)
  let lines = List.map (fun line ->
    Str.global_replace (Str.regexp {|\\fI\([^\\]*\)\\f[RP]|}) {|<\1>|} line
  ) lines in
  let classified = List.map classify_line lines in
  let is_synopsis name =
    String.uppercase_ascii (String.trim name) = "SYNOPSIS"
  in
  (* extract the command name from a line by taking leading word tokens *)
  let extract_cmd line =
    let words = String.split_on_char ' ' (String.trim line) in
    let words = List.filter (fun word -> String.length word > 0) words in
    let is_cmd_char = function
      | 'a'..'z' | 'A'..'Z' | '0'..'9' | '-' | '_' | '.' -> true
      | _ -> false
    in
    (* take words that look like command name parts, stop at arguments *)
    let rec take = function
      | [] -> []
      | word :: rest ->
        if String.length word > 0
           && (word.[0] = '[' || word.[0] = '-' || word.[0] = '<'
               || word.[0] = '(' || word.[0] = '{')
        then []
        else if String.for_all is_cmd_char word then
          word :: take rest
        else []
    in
    match take words with
    | [] -> None
    | cmd -> Some (String.concat " " cmd)
  in
  let rec find = function
    | [] -> None
    | Macro ("SH", args) :: rest when is_synopsis args -> collect rest
    | _ :: rest -> find rest
  and collect = function
    | [] -> None
    | Macro ("SH", _) :: _ -> None
    | Text text :: _ ->
      let text = String.trim text in
      if String.length text > 0 then extract_cmd text else None
    | Macro (("B" | "BI" | "BR"), args) :: _ ->
      let text = strip_inline_macro_args args |> strip_groff_escapes |> String.trim in
      if String.length text > 0 then extract_cmd text else None
    | _ :: rest -> collect rest
  in
  find classified

let extract_synopsis_command contents =
  let lines = String.split_on_char '\n' contents in
  extract_synopsis_command_lines lines

(* --- SYNOPSIS positional extraction ---
 * extract positional arguments from the SYNOPSIS section by collecting
 * all text/formatting macro lines, joining them, skipping the command
 * name prefix, then running parse_usage_args from parser.ml on the remainder. *)

let extract_synopsis_positionals_lines lines =
  let classified = List.map classify_line lines in
  let is_synopsis name =
    String.uppercase_ascii (String.trim name) = "SYNOPSIS"
  in
  let rec find = function
    | [] -> []
    | Macro ("SH", args) :: rest when is_synopsis args -> collect rest []
    | _ :: rest -> find rest
  and collect lines acc =
    match lines with
    | [] -> finish acc
    | Macro ("SH", _) :: _ -> finish acc
    | Macro ("SS", _) :: _ -> finish acc
    | Macro ("br", _) :: _ -> finish acc
    | Text text :: rest ->
      let text = strip_groff_escapes text |> String.trim in
      collect rest (if String.length text > 0 then text :: acc else acc)
    | Macro (("B" | "BI" | "BR" | "I" | "IR" | "IB" | "RB" | "RI"), args) :: rest ->
      let text = strip_inline_macro_args args |> strip_groff_escapes |> String.trim in
      collect rest (if String.length text > 0 then text :: acc else acc)
    | _ :: rest -> collect rest acc
  and finish acc =
    let parts = List.rev acc in
    let full = String.concat " " parts |> String.trim in
    if String.length full = 0 then []
    else
      let cmd_end = skip_command_prefix full in
      let args = String.sub full cmd_end (String.length full - cmd_end) in
      parse_usage_args args
  in
  find classified

(* --- mdoc (bsd) format support ---
 * mdoc is the bsd manpage macro package. it uses semantic macros rather than
 * presentation macros:
 *   .Fl v    -> flag: -v
 *   .Ar file -> argument: file
 *   .Op ...  -> optional: [...]
 *   .Bl/.It/.El -> list begin/item/end
 *   .Sh      -> section header (note lowercase 'h', vs groff's .SH)
 *
 * the parser walks through classified lines looking for .Bl (list begin)
 * blocks containing .It (items) with .Fl (flag) entries. *)

let is_mdoc lines =
  List.exists (fun line ->
    match classify_line line with Macro ("Sh", _) -> true | _ -> false
  ) lines

(* extract renderable text from an mdoc line, skipping structural macros *)
let mdoc_text_of line =
  match line with
  | Text text -> Some (strip_groff_escapes text)
  | Macro (macro_name, args) ->
    (match macro_name with
     | "Pp" | "Bl" | "El" | "Sh" | "Ss" | "Os" | "Dd" | "Dt"
     | "Oo" | "Oc" | "Op" -> None
     | _ ->
       let text = strip_groff_escapes args |> String.trim in
       if text = "" then None else Some text)
  | _ -> None

(* parse an mdoc .It (list item) line that contains flag definitions.
 * mdoc .It lines look like: ".It Fl v Ar file"
 * where Fl = flag, Ar = argument. we extract the flag name and parameter.
 *
 * only handles single-char short flags and long flags starting with '-'.
 * mdoc's .Fl macro automatically prepends '-', so "Fl v" means "-v"
 * and "Fl -verbose" means "--verbose". *)
let parse_mdoc_it args =
  let words = String.split_on_char ' ' args
              |> List.filter (fun word -> word <> "" && word <> "Ns") in
  let param = match words with
    | _ :: _ :: "Ar" :: param_name :: _ -> Some (Mandatory param_name)
    | _ -> None
  in
  match words with
  | "Fl" :: char_str :: _ when String.length char_str = 1 && is_alphanumeric char_str.[0] ->
    Some { switch = Short char_str.[0]; param; desc = "" }
  | "Fl" :: name :: _ when String.length name > 1 && name.[0] = '-' ->
    Some { switch = Long (String.sub name 1 (String.length name - 1)); param; desc = "" }
  | _ -> None

(* extract a positional argument from an mdoc line (.Ar or .Op Ar) *)
let positional_of_mdoc_line optional args =
  let words = String.split_on_char ' ' args
              |> List.filter (fun word -> word <> "") in
  match words with
  | name :: _ when String.length name >= 2 ->
    Some { pos_name = String.lowercase_ascii name;
           optional; variadic = List.mem "..." words }
  | _ -> None

(* parse an entire mdoc-format manpage.
 * walks through all classified lines looking for:
 *   1. .Bl/.It/.El list blocks containing flag definitions
 *   2. .Sh SYNOPSIS sections containing positional arguments (.Ar, .Op Ar)
 *
 * the scan function handles nested .Bl blocks — if the first .It in a .Bl
 * starts with .Fl (a flag), the entire list is parsed as options. otherwise
 * the list is skipped (it might be an example list or a description list). *)
let parse_mdoc_lines lines =
  let classified = List.map classify_line lines in
  (* skip lines until the matching .El closing tag *)
  let rec skip_to_el = function
    | [] -> []
    | Macro ("El", _) :: rest -> rest
    | _ :: rest -> skip_to_el rest
  in
  (* collect description text lines until the next structural macro *)
  let rec collect_desc acc = function
    | [] -> (acc, [])
    | (Macro ("It", _) | Macro ("El", _)
      | Macro ("Sh", _) | Macro ("Ss", _)) :: _ as rest -> (acc, rest)
    | line :: rest ->
      collect_desc (match mdoc_text_of line with Some text -> text :: acc | None -> acc) rest
  in
  (* convenience: collect desc and join into a trimmed string *)
  let desc_of rest =
    let parts, rest = collect_desc [] rest in
    (String.concat " " (List.rev parts) |> String.trim, rest)
  in
  (* parse a single .It entry: extract flag, collect description *)
  let parse_it args rest entries =
    let desc, rest = desc_of rest in
    let entries = match parse_mdoc_it args with
      | Some entry -> { entry with desc } :: entries
      | None -> entries
    in
    (entries, rest)
  in
  (* parse all .It entries within a .Bl/.El option list *)
  let rec parse_option_list entries = function
    | [] -> (entries, [])
    | Macro ("El", _) :: rest -> (entries, rest)
    | Macro ("It", args) :: rest ->
      let entries, rest = parse_it args rest entries in
      parse_option_list entries rest
    | _ :: rest -> parse_option_list entries rest
  in
  (* main scan: walk through all lines, collecting flags and positionals *)
  let rec scan entries positionals = function
    | [] -> (entries, positionals)
    | Macro ("Bl", _) :: Macro ("It", it_args) :: rest ->
      (* peek at first .It to decide if this is a flag list *)
      let words = String.split_on_char ' ' it_args
                  |> List.filter (fun word -> word <> "") in
      if (match words with "Fl" :: _ -> true | _ -> false) then
        let entries, rest = parse_it it_args rest entries in
        let entries, rest = parse_option_list entries rest in
        scan entries positionals rest
      else
        scan entries positionals (skip_to_el rest)
    | Macro ("Bl", _) :: rest -> scan entries positionals (skip_to_el rest)
    | Macro ("Sh", args) :: rest
      when String.uppercase_ascii (String.trim args) = "SYNOPSIS" ->
      let positionals, rest = parse_synopsis positionals rest in
      scan entries positionals rest
    | _ :: rest -> scan entries positionals rest
  and parse_synopsis positionals = function
    | [] -> (positionals, [])
    | Macro ("Sh", _) :: _ as rest -> (positionals, rest)
    | Macro ("Ar", args) :: rest ->
      let positionals = match positional_of_mdoc_line false args with
        | Some p -> p :: positionals | None -> positionals in
      parse_synopsis positionals rest
    | Macro ("Op", args) :: rest ->
      let words = String.split_on_char ' ' args
                  |> List.filter (fun word -> word <> "") in
      let positionals = match words with
        | "Ar" :: _ ->
          (match positional_of_mdoc_line true args with
           | Some p -> p :: positionals | None -> positionals)
        | _ -> positionals in
      parse_synopsis positionals rest
    | _ :: rest -> parse_synopsis positionals rest
  in
  let entries, positionals = scan [] [] classified in
  (* deduplicate positionals by name, preserving order *)
  let positionals =
    List.rev positionals
    |> List.fold_left (fun (seen, acc) p ->
         if List.mem p.pos_name seen then (seen, acc)
         else (p.pos_name :: seen, p :: acc)
       ) ([], [])
    |> snd |> List.rev
  in
  { entries = List.rev entries; subcommands = []; positionals; description = "" }

(* --- COMMANDS section subcommand extraction ---
 * some manpages (notably systemctl) have a dedicated COMMANDS section
 * listing subcommands with descriptions. these use .PP + bold name +
 * .RS/.RE blocks:
 *   .PP
 *   \fBstart\fR \fIUNIT\fR...
 *   .RS 4
 *   Start (activate) one or more units.
 *   .RE
 *
 * we extract the bold command name and first sentence of description. *)

let extract_commands_section lines =
  let classified = List.map classify_line lines in
  (* collect all lines from the current position until the next .SH *)
  let rec collect_until_next_sh lines acc =
    match lines with
    | [] -> List.rev acc
    | Macro ("SH", _) :: _ -> List.rev acc
    | line :: rest -> collect_until_next_sh rest (line :: acc)
  in
  let is_commands_section name =
    let upper = String.uppercase_ascii (String.trim name) in
    upper = "COMMANDS" || upper = "COMMAND"
  in
  (* find all COMMANDS/.COMMAND sections and collect their lines *)
  let rec find_commands acc = function
    | [] -> List.rev acc
    | Macro ("SH", args) :: rest when is_commands_section args ->
      find_commands (collect_until_next_sh rest [] :: acc) rest
    | _ :: rest -> find_commands acc rest
  in
  let sections = find_commands [] classified in
  List.concat sections

(* extract subcommand name from a bold groff text like
 *   "\fBlist\-units\fR [\fIPATTERN\fR...]" -> "list-units"
 *
 * validates that the extracted name looks like a subcommand: lowercase,
 * at least 2 chars, no leading dash. falls back to stripping all groff
 * and taking the first word if no \fB...\fR wrapper is found. *)
let extract_bold_command_name text =
  let trimmed = String.trim text in
  (* check whether a string looks like a valid subcommand name *)
  let is_valid_subcmd name =
    String.length name >= 2
    && name.[0] <> '-'
    && String.for_all (fun char_val ->
         (char_val >= 'a' && char_val <= 'z')
         || (char_val >= '0' && char_val <= '9')
         || char_val = '-' || char_val = '_'
       ) name
  in
  (* look for \fB...\fR at the start *)
  if String.length trimmed >= 4
     && trimmed.[0] = '\\' && trimmed.[1] = 'f' && trimmed.[2] = 'B' then
    let start = 3 in
    let end_marker = "\\fR" in
    match String.split_on_char '\\' (String.sub trimmed start (String.length trimmed - start)) with
    | name_part :: _ ->
      let name = strip_groff_escapes ("\\fB" ^ name_part ^ end_marker) |> String.trim in
      if is_valid_subcmd name then Some name else None
    | [] -> None
  else
    (* try already-stripped text — take the first word *)
    let stripped = strip_groff_escapes trimmed in
    let first_word = match String.split_on_char ' ' stripped with
      | word :: _ -> word | [] -> "" in
    if is_valid_subcmd first_word then Some first_word else None

(* walk through commands section lines, extracting subcommand name+description
 * pairs from .PP + Text + .RS/.RE blocks *)
let extract_subcommands_from_commands lines =
  let rec walk lines acc =
    match lines with
    | [] -> List.rev acc
    | Macro ("PP", _) :: rest ->
      begin match rest with
        | Text tag :: rest2 ->
          (* check if this is a subcommand (bold name, not a flag) *)
          begin match extract_bold_command_name tag with
            | Some name ->
              (* collect description from .RS/.RE block *)
              let rec collect_desc lines desc_acc =
                match lines with
                | Macro ("RS", _) :: rest3 ->
                  collect_in_rs rest3 desc_acc
                | Text text :: rest3 ->
                  collect_desc rest3 (text :: desc_acc)
                | _ -> (String.concat " " (List.rev desc_acc), lines)
              and collect_in_rs lines desc_acc =
                match lines with
                | Macro ("RE", _) :: rest3 ->
                  (String.concat " " (List.rev desc_acc), rest3)
                | Text text :: rest3 ->
                  collect_in_rs rest3 (text :: desc_acc)
                | Macro ("PP", _) :: _ | Macro ("SH", _) :: _ | Macro ("SS", _) :: _ ->
                  (String.concat " " (List.rev desc_acc), lines)
                | _ :: rest3 -> collect_in_rs rest3 desc_acc
                | [] -> (String.concat " " (List.rev desc_acc), [])
              in
              let (desc, rest3) = collect_desc rest2 [] in
              let desc = String.trim desc in
              (* take first sentence as description *)
              let short_desc = match String.split_on_char '.' desc with
                | first :: _ when String.length first > 0 -> String.trim first
                | _ -> desc in
              let sc : subcommand = { name; desc = short_desc } in
              walk rest3 (sc :: acc)
            | None -> walk rest2 acc
          end
        | _ -> walk rest acc
      end
    | _ :: rest -> walk rest acc
  in
  walk lines []

(* --- top-level api --- *)

(* parse a manpage from its classified lines.
 * auto-detects mdoc vs groff format. for groff, runs the multi-strategy
 * extraction pipeline: extract OPTIONS section -> try all strategies ->
 * pick best -> extract SYNOPSIS positionals -> extract COMMANDS subcommands. *)
let parse_manpage_lines lines =
  if is_mdoc lines then
    parse_mdoc_lines lines
  else begin
    let options_section = extract_options_section lines in
    let entries = extract_entries options_section in
    let positionals = extract_synopsis_positionals_lines lines in
    let commands_section = extract_commands_section lines in
    let subcommands = extract_subcommands_from_commands commands_section in
    { entries; subcommands; positionals; description = "" }
  end

(* parse a manpage from its raw string contents.
 * splits into lines, parses, then extracts the NAME section description. *)
let parse_manpage_string contents =
  let lines = String.split_on_char '\n' contents in
  let result = parse_manpage_lines lines in
  let description = match extract_name_description contents with
    | Some desc -> desc | None -> "" in
  { result with description }

(* --- clap-style SUBCOMMAND section extraction ---
 * manpages generated by clap (rust's cli arg parser) put each subcommand
 * under its own .SH SUBCOMMAND header with a Usage: line giving the name.
 * this is unusual — most tools list subcommands under a single COMMANDS section.
 *
 * we collect all .SH SUBCOMMAND/SUBCOMMANDS sections, find the Usage: line
 * in each to get the subcommand name, then extract flag entries from the
 * section body. returns triples of (name, description, help_result). *)
let extract_subcommand_sections contents =
  let lines = String.split_on_char '\n' contents in
  let classified = List.map classify_line lines in
  (* split into sections at .SH boundaries, keeping only SUBCOMMAND(S) sections *)
  let rec collect_sections acc current_name current_lines = function
    | [] ->
      let acc = match current_name with
        | Some section_name -> (section_name, List.rev current_lines) :: acc
        | None -> acc in
      List.rev acc
    | Macro ("SH", args) :: rest ->
      let acc = match current_name with
        | Some section_name -> (section_name, List.rev current_lines) :: acc
        | None -> acc in
      let name = String.uppercase_ascii (String.trim args) in
      if name = "SUBCOMMAND" || name = "SUBCOMMANDS" then
        collect_sections acc (Some name) [] rest
      else
        collect_sections acc None [] rest
    | line :: rest ->
      collect_sections acc current_name (line :: current_lines) rest
  in
  let sections = collect_sections [] None [] classified in
  (* for each SUBCOMMAND section, extract name from Usage: line and parse entries *)
  let usage_re = Str.regexp {|Usage: \([a-zA-Z0-9_-]+\)|} in
  let matches_usage text =
    try ignore (Str.search_forward usage_re text 0); Some (Str.matched_group 1 text)
    with Not_found -> None in
  List.filter_map (fun (_header, section_lines) ->
    (* scan section lines for the Usage: line to get the subcommand name *)
    let name, desc_lines =
      List.fold_left (fun (name, desc_lines) line ->
        match name with
        | Some _ -> (name, desc_lines)
        | None ->
          match line with
          | Text text ->
            (match matches_usage text with
             | Some _ as found -> (found, desc_lines)
             | None -> (None, text :: desc_lines))
          | Macro (("TP" | "B" | "BI" | "BR"), args) ->
            let text = strip_inline_macro_args args |> strip_groff_escapes |> String.trim in
            (matches_usage text, desc_lines)
          | _ -> (None, desc_lines)
      ) (None, []) section_lines in
    match name with
    | None -> None
    | Some subcmd_name ->
      let entries = extract_entries section_lines in
      let desc = String.concat " " (List.rev desc_lines)
                 |> strip_groff_escapes |> String.trim in
      (* strip backtick-quoted words *)
      let desc = Str.global_replace (Str.regexp "`\\([^`]*\\)`") "\\1" desc in
      Some (subcmd_name, desc, { entries; subcommands = []; positionals = []; description = desc })
  ) sections

(* read a manpage file from disk. handles .gz compressed files (the common
 * case — most installed manpages are gzipped) using the Gzip library.
 * plain text files are read directly. *)
let read_manpage_file path =
  if Filename.check_suffix path ".gz" then begin
    let ic = Gzip.open_in path in
    let buffer = Buffer.create 8192 in
    let chunk = Bytes.create 8192 in
    (try while true do
       let bytes_read = Gzip.input ic chunk 0 8192 in
       if bytes_read = 0 then raise Exit
       else Buffer.add_subbytes buffer chunk 0 bytes_read
     done with Exit | End_of_file -> ());
    Gzip.close_in ic;
    Buffer.contents buffer
  end else begin
    let ic = open_in path in
    let size = in_channel_length ic in
    let bytes = Bytes.create size in
    really_input ic bytes 0 size;
    close_in ic;
    Bytes.to_string bytes
  end

(* convenience: read + parse a manpage file in one step *)
let parse_manpage_file path =
  read_manpage_file path |> parse_manpage_string