415 lines
13 KiB
OCaml
415 lines
13 KiB
OCaml
open Parser
|
|
|
|
(* --- Groff escape/formatting stripper --- *)
|
|
|
|
(* Remove groff escape sequences from [s] and return the remaining text.

   Handled escapes:
   - font, string and number-register references (\f, \*, \n) in their
     one-character, two-character (XX after an opening parenthesis) and
     bracketed forms: skipped entirely;
   - \- becomes a plain dash; \e and a doubled backslash become one
     backslash; backslash-space becomes a space;
   - \&, \/ and \, are zero-width and dropped;
   - named characters in two-character or bracketed form: aq, lq, Lq, rq, Rq,
     em and en are translated, every other name is dropped;
   - size changes (\s) with an optional sign followed by up to two digits,
     a quoted value or a bracketed value;
   - \m[...] colour changes and \X'...' device controls: skipped;
   - any other escape: the backslash and the following character are dropped.

   A lone backslash at the very end of the input is kept as is. *)
let strip_groff_escapes s =
  let len = String.length s in
  let buf = Buffer.create len in
  let i = ref 0 in
  (* Move [i] just past the next [close] character, or to the end of input
     when there is none. *)
  let skip_past close =
    while !i < len && s.[!i] <> close do incr i done;
    if !i < len then incr i
  in
  (* With [i] on the first character after the escape letter, skip a name
     written as X, as XX after an opening parenthesis, or as [...]. *)
  let skip_name () =
    if !i < len then begin
      match s.[!i] with
      | '(' -> i := !i + 3 (* the parenthesis plus two name characters *)
      | '[' -> incr i; skip_past ']'
      | _ -> incr i
    end
  in
  (* Emit the plain-text replacement for a named special character. *)
  let emit_named name =
    match name with
    | "aq" -> Buffer.add_char buf '\''
    | "lq" | "Lq" | "rq" | "Rq" -> Buffer.add_char buf '"'
    | "em" | "en" -> Buffer.add_char buf '-'
    | _ -> ()
  in
  let is_digit c = c >= '0' && c <= '9' in
  while !i < len do
    if s.[!i] = '\\' && !i + 1 < len then begin
      let escape = s.[!i + 1] in
      (* From here on [i] points just after the escape letter. *)
      i := !i + 2;
      match escape with
      | 'f' | '*' | 'n' -> skip_name ()
      | '-' -> Buffer.add_char buf '-'
      | 'e' | '\\' -> Buffer.add_char buf '\\'
      | ' ' -> Buffer.add_char buf ' '
      | '&' | '/' | ',' -> ()
      | '(' ->
        (* Two-character name; if fewer than two characters remain only the
           escape itself is dropped. *)
        if !i + 1 < len then begin
          emit_named (String.sub s !i 2);
          i := !i + 2
        end
      | '[' ->
        let start = !i in
        while !i < len && s.[!i] <> ']' do incr i done;
        (* An unterminated bracket swallows the rest of the input. *)
        if !i < len then begin
          emit_named (String.sub s start (!i - start));
          incr i
        end
      | 's' ->
        if !i < len && (s.[!i] = '+' || s.[!i] = '-') then incr i;
        if !i < len && s.[!i] = '\'' then begin
          incr i;
          skip_past '\''
        end else if !i < len && s.[!i] = '[' then begin
          incr i;
          skip_past ']'
        end else begin
          if !i < len && is_digit s.[!i] then incr i;
          if !i < len && is_digit s.[!i] then incr i
        end
      | 'm' ->
        if !i < len && s.[!i] = '[' then begin
          incr i;
          skip_past ']'
        end
      | 'X' ->
        if !i < len && s.[!i] = '\'' then begin
          incr i;
          skip_past '\''
        end
      | _ -> ()
    end else begin
      Buffer.add_char buf s.[!i];
      incr i
    end
  done;
  Buffer.contents buf
|
|
|
|
(* Flatten the argument string of a font-alternating macro (.BI, .BR, .IR,
   and so on) into the text it produces.

   Arguments are concatenated with nothing between them: spaces and tabs
   outside double quotes are dropped, double quotes themselves are removed,
   and everything inside a quoted argument (including blanks) is kept.  An
   unterminated quote extends to the end of the string. *)
let strip_inline_macro_args s =
  let out = Buffer.create (String.length s) in
  let in_quotes = ref false in
  String.iter
    (fun c ->
      match c, !in_quotes with
      | '"', _ -> in_quotes := not !in_quotes
      | (' ' | '\t'), false -> ()
      | _ -> Buffer.add_char out c)
    s;
  Buffer.contents out
|
|
|
|
(* Remove groff escapes from [line] and trim surrounding whitespace. *)
let strip_groff line = line |> strip_groff_escapes |> String.trim
|
|
|
|
(* --- Line classification --- *)
|
|
|
|
(* One classified line of man-page source. *)
type groff_line =
  | Macro of string * string
      (* request name and its argument text,
         e.g. ("SH", "OPTIONS") or ("TP", "") *)
  | Text of string
      (* plain text after groff stripping *)
  | Blank
      (* empty line, or a line that is empty once stripped *)
  | Comment
      (* groff comment line *)
|
|
|
|
(* Classify one raw line of man-page source.

   - The empty string is [Blank].
   - A control character (dot or apostrophe) followed by backslash-quote, or
     followed by a lone trailing backslash, is a [Comment]; so is a line that
     starts directly with backslash-quote.
   - Any other line starting with a control character is a [Macro]: the name
     runs up to the first space or tab (whichever comes first), the rest is
     the trimmed argument text, with one pair of enclosing double quotes
     removed when the arguments both start and end with a quote.
   - Everything else is passed through [strip_groff] and becomes [Text], or
     [Blank] when nothing is left. *)
let classify_line line =
  let len = String.length line in
  let is_control c = c = '.' || c = '\'' in
  (* True when backslash-quote starts at offset [k]. *)
  let quote_escape_at k =
    len >= k + 2 && line.[k] = '\\' && line.[k + 1] = '"'
  in
  if len = 0 then Blank
  else if
    is_control line.[0]
    && (quote_escape_at 1 || (len = 2 && line.[1] = '\\'))
  then Comment
  else if quote_escape_at 0 then Comment
  else if is_control line.[0] then begin
    let rest = String.trim (String.sub line 1 (len - 1)) in
    let rlen = String.length rest in
    (* Index of the first space or tab in [rest], if any. *)
    let rec first_blank k =
      if k >= rlen then None
      else
        match rest.[k] with
        | ' ' | '\t' -> Some k
        | _ -> first_blank (k + 1)
    in
    match first_blank 0 with
    | None -> Macro (rest, "")
    | Some pos ->
      let name = String.sub rest 0 pos in
      let args = String.trim (String.sub rest (pos + 1) (rlen - pos - 1)) in
      let alen = String.length args in
      let args =
        if alen >= 2 && args.[0] = '"' && args.[alen - 1] = '"' then
          String.sub args 1 (alen - 2)
        else args
      in
      Macro (name, args)
  end else begin
    match strip_groff line with
    | "" -> Blank
    | stripped -> Text stripped
  end
|
|
|
|
(* Tell whether [line] is a groff comment line: backslash-quote at the very
   start, or right after a control character.  Both control characters are
   recognised: the dot and the apostrophe (the latter is what the
   preprocessor hint on the first line of many man pages uses). *)
let is_comment_line line =
  let len = String.length line in
  (* True when backslash-quote starts at offset [k]. *)
  let quote_escape_at k =
    len >= k + 2 && line.[k] = '\\' && line.[k + 1] = '"'
  in
  quote_escape_at 0
  || (len >= 1 && (line.[0] = '.' || line.[0] = '\'') && quote_escape_at 1)
|
|
|
|
(* Shadows the earlier [classify_line]: comment lines are detected with
   [is_comment_line] first, everything else is delegated to the previous
   definition (this binding is deliberately not recursive). *)
let classify_line line =
  match is_comment_line line with
  | true -> Comment
  | false -> classify_line line
|
|
|
|
(* --- Section extraction --- *)
|
|
|
|
(* Classify [lines] and return the body of the section that lists options.

   The first .SH section whose upper-cased, trimmed title contains OPTION is
   used.  If there is none, the first section titled exactly DESCRIPTION
   (case-insensitive) is used instead.  The body is every classified line
   after the heading up to, but not including, the next .SH.  Returns the
   empty list when neither section exists. *)
let extract_options_section lines =
  let option_word = Str.regexp_string "OPTION" in
  let normalise title = String.uppercase_ascii (String.trim title) in
  let mentions_option title =
    match Str.search_forward option_word (normalise title) 0 with
    | _ -> true
    | exception Not_found -> false
  in
  let is_description title = normalise title = "DESCRIPTION" in
  (* Lines of the current section, stopping at the next .SH or end of input. *)
  let rec section_body collected = function
    | [] | Macro ("SH", _) :: _ -> List.rev collected
    | item :: tl -> section_body (item :: collected) tl
  in
  (* Body of the first section whose title satisfies [wanted]. *)
  let rec first_section wanted = function
    | [] -> None
    | Macro ("SH", title) :: tl when wanted title -> Some (section_body [] tl)
    | _ :: tl -> first_section wanted tl
  in
  let classified = List.map classify_line lines in
  match first_section mentions_option classified with
  | Some body -> body
  | None ->
    (match first_section is_description classified with
     | Some body -> body
     | None -> [])
|
|
|
|
(* --- Strategy-based entry extraction --- *)
|
|
|
|
(* Consume the leading run of [Text] lines of [lines].

   Returns the collected strings joined with single spaces, together with the
   lines that follow the run.  [acc] holds strings gathered so far, most
   recent first; pass the empty list to start a fresh description. *)
let collect_text_lines lines acc =
  let rec gather seen = function
    | Text chunk :: tl -> gather (chunk :: seen) tl
    | remaining -> (String.concat " " (List.rev seen), remaining)
  in
  gather acc lines
|
|
|
|
(* Turn an option tag and its description into an entry.

   The tag is stripped of groff escapes and trimmed, then a prefix of it is
   parsed with [switch_parser] followed by [param_parser] (both come from the
   opened Parser module).  Returns [None] when the tag does not parse. *)
let parse_tag_to_entry tag desc =
  let cleaned = tag |> strip_groff_escapes |> String.trim in
  let switch_then_param =
    Angstrom.lift2 (fun sw p -> (sw, p)) switch_parser param_parser
  in
  let outcome =
    Angstrom.parse_string ~consume:Angstrom.Consume.Prefix
      switch_then_param cleaned
  in
  match outcome with
  | Error _ -> None
  | Ok (switch, param) -> Some { switch; param; desc }
|
|
|
|
(* Strategy A: .TP style (most common: GNU coreutils, help2man).

   Each .TP that is immediately followed by a text line yields a candidate:
   that text line is the tag, and the run of text lines after it is the
   description.  Candidates whose tag does not parse are dropped.  A .TP not
   followed by a text line is skipped.  Entries are returned in document
   order. *)
let strategy_tp lines =
  let rec walk found = function
    | [] -> List.rev found
    | Macro ("TP", _) :: Text tag :: after_tag ->
      let desc, remaining = collect_text_lines after_tag [] in
      let found =
        match parse_tag_to_entry tag desc with
        | Some entry -> entry :: found
        | None -> found
      in
      walk found remaining
    | _ :: tl -> walk found tl
  in
  walk [] lines
|
|
|
|
(* Strategy B: .IP style (curl, hand-written pages).

   The argument of each .IP is the tag and the run of text lines that follows
   is the description.  Candidates whose tag does not parse are dropped.
   Entries are returned in document order.

   The raw tag is handed to [parse_tag_to_entry], which removes groff escapes
   itself; unescaping here as well would process the text twice and mangle
   literal backslashes.  When the argument text still begins with a double
   quote (a quoted tag followed by something else, typically an indent
   width), only the quoted part is used as the tag. *)
let strategy_ip lines =
  (* Contents of a leading double-quoted argument, or [args] unchanged when
     it does not start with a quote.  An unterminated quote runs to the end. *)
  let first_argument args =
    let n = String.length args in
    if n > 0 && args.[0] = '"' then
      match String.index_from args 1 '"' with
      | close -> String.sub args 1 (close - 1)
      | exception Not_found -> String.sub args 1 (n - 1)
    else args
  in
  let rec walk found = function
    | [] -> List.rev found
    | Macro ("IP", args) :: after_tag ->
      let desc, remaining = collect_text_lines after_tag [] in
      let found =
        match parse_tag_to_entry (first_argument args) desc with
        | Some entry -> entry :: found
        | None -> found
      in
      walk found remaining
    | _ :: tl -> walk found tl
  in
  walk [] lines
|
|
|
|
(* Strategy C: .PP + .RS/.RE style (git, DocBook output).

   A .PP immediately followed by a text line starts a candidate whose tag is
   that text line.  The description is made of the text lines that follow
   directly plus, once an .RS is met, the text lines up to the matching .RE.
   Inside the .RS block other macros and blank lines are skipped, but a .PP
   or .SH ends the block without being consumed.  Candidates whose tag does
   not parse are dropped.  Entries are returned in document order. *)
let strategy_pp_rs lines =
  let join pieces = String.concat " " (List.rev pieces) in
  (* Gather description text inside an .RS block. *)
  let rec inside_rs pieces = function
    | [] -> (join pieces, [])
    | Macro ("RE", _) :: tl -> (join pieces, tl)
    | Text chunk :: tl -> inside_rs (chunk :: pieces) tl
    | (Macro (("PP" | "SH"), _) :: _) as remaining -> (join pieces, remaining)
    | _ :: tl -> inside_rs pieces tl
  in
  (* Gather description text that precedes the .RS, then enter the block. *)
  let rec before_rs pieces = function
    | Macro ("RS", _) :: tl -> inside_rs pieces tl
    | Text chunk :: tl -> before_rs (chunk :: pieces) tl
    | remaining -> (join pieces, remaining)
  in
  let rec walk found = function
    | [] -> List.rev found
    | Macro ("PP", _) :: Text tag :: after_tag ->
      let desc, remaining = before_rs [] after_tag in
      let found =
        match parse_tag_to_entry tag desc with
        | Some entry -> entry :: found
        | None -> found
      in
      walk found remaining
    | _ :: tl -> walk found tl
  in
  walk [] lines
|
|
|
|
(* Strategy D: deroff fallback.  Rebuild plain text from the classified
   lines and hand it to [parse_help] (from the opened Parser module).

   - [Text] lines are copied as is.
   - Single-font macros (.B, .I) keep their arguments separated as written;
     only the double quotes are removed.
   - Font-alternating macros (.BI, .BR, .IB, .IR, .RB, .RI) have their
     arguments concatenated without separators.
   - [Blank] lines become empty lines.
   - Every other macro and all comments are ignored.

   Returns the entries found by [parse_help], or the empty list when it
   reports an error. *)
let strategy_deroff_lines lines =
  let buf = Buffer.create 256 in
  let add_line text =
    Buffer.add_string buf text;
    Buffer.add_char buf '\n'
  in
  let drop_quotes args = String.concat "" (String.split_on_char '"' args) in
  List.iter
    (function
      | Text s -> add_line s
      | Macro (("B" | "I"), args) ->
        add_line (strip_groff_escapes (drop_quotes args))
      | Macro (("BI" | "BR" | "IB" | "IR" | "RB" | "RI"), args) ->
        add_line (strip_groff_escapes (strip_inline_macro_args args))
      | Blank -> Buffer.add_char buf '\n'
      | Macro _ | Comment -> ())
    lines;
  match parse_help (Buffer.contents buf) with
  | Ok result -> result.entries
  | Error _ -> []
|
|
|
|
(* Number of [Macro] lines in [lines] whose request name equals [name]. *)
let count_macro name lines =
  lines
  |> List.filter (function
       | Macro (m, _) -> m = name
       | Text _ | Blank | Comment -> false)
  |> List.length
|
|
|
|
(* Run every applicable extraction strategy on [lines] and return the entry
   list of the one that found the most entries.

   The .TP, .IP and .PP/.RS strategies only run when the corresponding macros
   occur in [lines]; the deroff fallback always runs.  On a tie the strategy
   that was tried earlier wins, so the preference order is TP, IP, PP+RS,
   deroff. *)
let extract_entries lines =
  let has name = count_macro name lines > 0 in
  (* Push the result of [strategy] on [tried] when [enabled]. *)
  let attempt enabled label strategy tried =
    if enabled then (label, strategy lines) :: tried else tried
  in
  (* Most recently tried strategy first, exactly like a stack of results. *)
  let candidates =
    []
    |> attempt (has "TP") "TP" strategy_tp
    |> attempt (has "IP") "IP" strategy_ip
    |> attempt (has "PP" && has "RS") "PP+RS" strategy_pp_rs
    |> attempt true "deroff" strategy_deroff_lines
  in
  let pick ((_, best) as current) ((_, entries) as candidate) =
    if List.length entries >= List.length best then candidate else current
  in
  snd (List.fold_left pick ("none", []) candidates)
|
|
|
|
(* --- Top-level API --- *)
|
|
|
|
(* Extract option entries from a man page given as a list of raw source
   lines: isolate the options section, then run the extraction strategies
   on it. *)
let parse_manpage_lines lines =
  lines |> extract_options_section |> extract_entries
|
|
|
|
(* Extract option entries from the full text of a man page.  The text is
   split on newline characters before being processed line by line. *)
let parse_manpage_string contents =
  contents |> String.split_on_char '\n' |> parse_manpage_lines
|
|
|
|
(* Extract option entries from a gzip-compressed man page at [path].

   The file is decompressed by running gzip as a child process (the path is
   shell-quoted) and reading its standard output line by line.  The exit
   status of gzip is deliberately ignored: whatever output was produced is
   parsed, so a failing gzip simply yields fewer (possibly zero) entries.
   The child process is always reaped, even when reading raises an
   unexpected exception, which is then re-raised. *)
let parse_manpage_gzipped_file path =
  let command = Printf.sprintf "gzip -dc %s" (Filename.quote path) in
  let ic = Unix.open_process_in command in
  let reap () = ignore (Unix.close_process_in ic : Unix.process_status) in
  let buf = Buffer.create 4096 in
  let rec slurp () =
    match input_line ic with
    | line ->
      Buffer.add_string buf line;
      Buffer.add_char buf '\n';
      slurp ()
    | exception End_of_file -> ()
  in
  (try slurp ()
   with e ->
     reap ();
     raise e);
  reap ();
  parse_manpage_string (Buffer.contents buf)
|
|
|
|
(* Extract option entries from the man page stored at [path].

   Paths ending in .gz are decompressed through
   [parse_manpage_gzipped_file]; any other file is read whole.  The file is
   opened in binary mode so that the length reported for the channel matches
   the number of bytes actually read on every platform, and the channel is
   closed even when reading fails (the exception is then re-raised).
   Failure to open the file raises [Sys_error] as before. *)
let parse_manpage_file path =
  if Filename.check_suffix path ".gz" then parse_manpage_gzipped_file path
  else begin
    let ic = open_in_bin path in
    let contents =
      try really_input_string ic (in_channel_length ic)
      with e ->
        close_in_noerr ic;
        raise e
    in
    close_in ic;
    parse_manpage_string contents
  end
|