inshellah/src/parsers/manpage.rs
2026-05-20 23:39:15 +10:00

327 lines
12 KiB
Rust

//! parse unix manpages (groff/mdoc format) into a structured result.
//!
//! manpages are written in roff/groff markup — a decades-old typesetting language
//! used by man(1). this module strips the formatting and extracts structured data
//! (flags, subcommands, positionals) from the raw groff source.
//!
//! there are two major manpage macro packages:
//! - man (groff) — used by gnu/linux tools. uses macros like .SH, .TP, .IP, .PP
//! - mdoc (bsd) — used by bsd tools. uses .Sh, .Fl, .Ar, .Op, .It, .Bl/.El
//!
//! this module handles both, auto-detecting the format by checking for .Sh macros.
//!
//! for groff manpages, flag extraction uses multiple "strategies" that target
//! different common formatting patterns:
//! - strategy_tp: .TP tagged paragraphs (gnu coreutils, help2man)
//! - strategy_ip: .IP indented paragraphs (curl, hand-written)
//! - strategy_pp_rs: .PP + .RS/.RE blocks (git, docbook)
//! - strategy_nix: nix3-style bullet .IP with .UR/.UE hyperlinks
//! - strategy_deroff: fallback — strip all groff, feed to help text parser
//!
//! the module tries all applicable strategies and picks the one that extracts
//! the most flag entries, on the theory that more results = better match.
mod commands;
mod groff;
mod mdoc;
mod sections;
mod strategies;
use std::io::{self, Read};
use std::path::Path;
use crate::types::{HelpResult, OptionEntry, Param, Positional, Subcommand, Switch};
pub use self::groff::{GroffLine, classify_line, strip_groff_escapes};
pub use self::sections::{extract_subcommand_sections, extract_synopsis_command};
/// owned counterpart of the borrowed [`Switch`] from `crate::types`:
/// the same three shapes, but holding a `String` instead of a borrowed
/// name so the value can outlive the text it was parsed from.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum OwnedSwitch {
/// flag that only has a single-character form.
Short(char),
/// flag that only has a long form.
Long(String),
/// flag with both forms, e.g. `-v, --verbose` is `Both('v', "verbose")`.
Both(char, String),
}
/// owned counterpart of the borrowed [`Param`] from `crate::types`.
/// the payload is a copy of the string carried by the source `Param`
/// (presumably the placeholder name such as `FILE` — confirm against
/// `crate::types::Param`).
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum OwnedParam {
/// the flag requires this parameter.
Mandatory(String),
/// the flag accepts this parameter but does not require it.
Optional(String),
}
/// one extracted flag: its switch, its optional parameter and its
/// description text.
#[derive(Debug, Clone)]
pub struct ManpageEntry {
/// short and/or long form of the flag.
pub switch: OwnedSwitch,
/// parameter taken by the flag, if any.
pub param: Option<OwnedParam>,
/// description text. when converted from an `OptionEntry` this is the
/// trimmed, non-empty description pieces joined with single spaces.
pub desc: String,
}
/// one extracted subcommand: its name and a description.
#[derive(Debug, Clone)]
pub struct ManpageSubcommand {
/// subcommand name. the constructors in this file store it
/// ascii-lowercased.
pub name: String,
/// description text for the subcommand.
pub desc: String,
}
/// everything extracted from one manpage (or one help text converted via
/// `From<&HelpResult>`). `Default` yields empty lists and an empty
/// description.
#[derive(Debug, Clone, Default)]
pub struct ManpageResult {
/// extracted flags.
pub entries: Vec<ManpageEntry>,
/// extracted subcommands.
pub subcommands: Vec<ManpageSubcommand>,
/// positional arguments as (name, positional) pairs. names are
/// ascii-lowercased when converted from a `HelpResult`.
pub positionals: Vec<(String, Positional)>,
/// one-line description of the command. `parse_manpage_string` fills it
/// from the NAME section when one is found; the groff path of
/// `parse_manpage_lines` leaves it empty.
pub description: String,
}
/// builds an [`OwnedSwitch`] from a borrowed [`Switch`]: the short
/// character is copied as-is and the long name is copied into a new
/// `String`.
impl From<&Switch<'_>> for OwnedSwitch {
    fn from(s: &Switch<'_>) -> Self {
        match s {
            Switch::Both(short, long) => Self::Both(*short, (*long).to_string()),
            Switch::Long(long) => Self::Long((*long).to_string()),
            Switch::Short(short) => Self::Short(*short),
        }
    }
}
/// builds an [`OwnedParam`] from a borrowed [`Param`], keeping the
/// mandatory/optional distinction and copying the payload into a `String`.
impl From<&Param<'_>> for OwnedParam {
    fn from(p: &Param<'_>) -> Self {
        match p {
            Param::Optional(name) => Self::Optional((*name).to_string()),
            Param::Mandatory(name) => Self::Mandatory((*name).to_string()),
        }
    }
}
impl From<&OptionEntry<'_>> for ManpageEntry {
fn from(e: &OptionEntry<'_>) -> Self {
let desc: String = e
.desc
.iter()
.map(|s| s.trim())
.filter(|s| !s.is_empty())
.collect::<Vec<_>>()
.join(" ");
ManpageEntry {
switch: (&e.switch).into(),
param: e.param.as_ref().map(Into::into),
desc,
}
}
}
impl From<&Subcommand<'_>> for ManpageSubcommand {
fn from(sc: &Subcommand<'_>) -> Self {
// lowercase the subcommand name here so (a) file naming is
// consistent (meat_yum.json vs meat_YUM.json) and (b) recursive
// --help probes use the lowercase form, which is what most real
// CLIs accept — even tools like meat that DISPLAY uppercase
// names in their help text dispatch on the lowercased argument.
ManpageSubcommand {
name: sc.name.to_ascii_lowercase(),
desc: sc.desc.to_string(),
}
}
}
impl From<&HelpResult<'_>> for ManpageResult {
fn from(r: &HelpResult<'_>) -> Self {
ManpageResult {
entries: r.entries.iter().map(Into::into).collect(),
subcommands: r.subcommands.iter().map(Into::into).collect(),
// positional names are stored lowercased so output is
// stable across the various places we extract them from
// (synopsis, usage, cli11 sections).
positionals: r
.positionals
.iter()
.map(|(k, v)| (k.to_ascii_lowercase(), v.clone()))
.collect(),
description: r.desc.to_string(),
}
}
}
/// turn already-classified manpage lines into a [`ManpageResult`].
///
/// input that `mdoc::is_mdoc` recognises is handed to the mdoc parser
/// as-is. everything else is treated as groff `man` markup: flags come
/// from the OPTIONS section via the strategy picker, flags that appear
/// only in the SYNOPSIS are appended, and positionals / subcommands are
/// taken from the SYNOPSIS and COMMANDS sections. the groff path always
/// leaves `description` empty.
pub fn parse_manpage_lines(lines: &[GroffLine]) -> ManpageResult {
    if mdoc::is_mdoc(lines) {
        return mdoc::parse_mdoc_lines(lines);
    }

    let options_section = sections::extract_options_section(lines);
    let mut entries = strategies::extract_entries(&options_section);

    // some pages declare a flag in the SYNOPSIS without ever listing it
    // in the OPTIONS body (nix-env's `[{--profile | -p} path]`). those
    // are appended here. a body entry wins over a synopsis flag of the
    // same name because the body entry is the one with a description.
    let synopsis_flags = sections::extract_synopsis_flags(lines);
    if !synopsis_flags.is_empty() {
        // names already claimed by body entries: long names compare
        // ascii-case-insensitively, short characters compare exactly.
        let mut known_long: std::collections::HashSet<String> = std::collections::HashSet::new();
        let mut known_short: std::collections::HashSet<char> = std::collections::HashSet::new();
        for entry in entries.iter() {
            match &entry.switch {
                OwnedSwitch::Short(c) => {
                    known_short.insert(*c);
                }
                OwnedSwitch::Long(l) => {
                    known_long.insert(l.to_ascii_lowercase());
                }
                OwnedSwitch::Both(c, l) => {
                    known_short.insert(*c);
                    known_long.insert(l.to_ascii_lowercase());
                }
            }
        }

        for flag in synopsis_flags {
            // a two-form flag counts as listed if either form is known.
            let already_listed = match &flag.switch {
                OwnedSwitch::Short(c) => known_short.contains(c),
                OwnedSwitch::Long(l) => known_long.contains(&l.to_ascii_lowercase()),
                OwnedSwitch::Both(c, l) => {
                    known_short.contains(c) || known_long.contains(&l.to_ascii_lowercase())
                }
            };
            if already_listed {
                continue;
            }
            entries.push(flag);
        }
    }

    let positionals = sections::extract_synopsis_positionals(lines);
    let commands_section = sections::extract_commands_section(lines);
    let subcommands = commands::extract_subcommands_from_commands(&commands_section);

    ManpageResult {
        entries,
        subcommands,
        positionals,
        description: String::new(),
    }
}
/// parse a manpage from its raw string contents.
/// splits into lines, parses, then extracts the NAME section description.
pub fn parse_manpage_string(contents: &str) -> ManpageResult {
let lines: Vec<GroffLine> = contents.split('\n').map(classify_line).collect();
let mut result = parse_manpage_lines(&lines);
if let Some(desc) = sections::extract_name_description(&lines) {
result.description = desc;
}
result
}
/// parse a manpage and also pull out clap-style `.SH SUBCOMMAND` sections
/// as separate per-subcommand results. each subcommand section in a
/// clap-generated manpage is its own command with its own flags; the
/// parent's subcommand list is populated from their names.
///
/// returns (main_result, sub_results) where each sub_result has
/// name=full_command ("nh os"), desc, and its own ManpageResult.
pub fn parse_manpage_with_subs(contents: &str) -> (ManpageResult, Vec<(String, ManpageResult)>) {
let lines: Vec<GroffLine> = contents.split('\n').map(classify_line).collect();
let mut result = parse_manpage_lines(&lines);
if let Some(desc) = sections::extract_name_description(&lines) {
result.description = desc;
}
let sub_sections = sections::extract_subcommand_sections(&lines);
if !sub_sections.is_empty() {
// overwrite subcommands with the SUBCOMMAND-section names —
// these are the authoritative list for clap-generated manpages.
result.subcommands = sub_sections
.iter()
.map(|(name, desc, _)| ManpageSubcommand {
name: name.to_ascii_lowercase(),
desc: desc.clone(),
})
.collect();
}
// each SUBCOMMAND section body is parsed via the same strategy-picker
// as the top-level OPTIONS section — clap puts flag definitions
// directly under the .SH SUBCOMMAND header with no inner .SH wrapping,
// so parse_manpage_lines (which looks for a child OPTIONS section)
// would come back empty.
let subs: Vec<(String, ManpageResult)> = sub_sections
.into_iter()
.map(|(name, desc, lines)| {
let entries = strategies::extract_entries(&lines);
let sub_result = ManpageResult {
entries,
subcommands: Vec::new(),
positionals: Default::default(),
description: desc,
};
(name, sub_result)
})
.collect();
(result, subs)
}
/// read a manpage from disk and return its text.
///
/// a path whose extension is exactly `gz` is gunzipped (the common case —
/// most installed manpages are gzipped); any other file is taken as
/// plain text.
///
/// # Errors
///
/// returns the underlying i/o error if the file cannot be read or
/// decompressed, and an `InvalidData` error if a plain-text file is not
/// valid utf-8.
pub fn read_manpage_file<P: AsRef<Path>>(path: P) -> io::Result<String> {
    let path = path.as_ref();
    let raw = std::fs::read(path)?;

    let gzipped = matches!(path.extension().and_then(|ext| ext.to_str()), Some("gz"));
    if !gzipped {
        return String::from_utf8(raw)
            .map_err(|err| io::Error::new(io::ErrorKind::InvalidData, err));
    }

    let mut text = String::new();
    flate2::read::GzDecoder::new(raw.as_slice()).read_to_string(&mut text)?;
    Ok(text)
}
/// read a manpage file (see [`read_manpage_file`]) and parse its contents
/// with [`parse_manpage_string`].
///
/// # Errors
///
/// propagates any error from reading the file; parsing itself does not
/// fail.
pub fn parse_manpage_file<P: AsRef<Path>>(path: P) -> io::Result<ManpageResult> {
    read_manpage_file(path).map(|contents| parse_manpage_string(&contents))
}
// unit tests for the public parsing entry points and two of the helper
// modules (mdoc detection, groff escape stripping).
#[cfg(test)]
mod tests {
use super::*;
// synthetic `man`-format page with NAME, SYNOPSIS and OPTIONS sections.
// the three options are written as .TP tagged paragraphs: two short+long
// pairs and one pair that takes a FILE parameter.
const TP_MANPAGE: &str = r#".TH FOO 1 "2024" "1.0" "User Commands"
.SH NAME
foo \- a synthetic test command
.SH SYNOPSIS
.B foo
[\fIOPTIONS\fR] <input> [output]
.SH OPTIONS
.TP
\fB\-v\fR, \fB\-\-verbose\fR
increase output verbosity
.TP
\fB\-o\fR \fIFILE\fR, \fB\-\-output\fR=\fIFILE\fR
write to FILE
.TP
\fB\-h\fR, \fB\-\-help\fR
show this help and exit
"#;
// end-to-end parse of TP_MANPAGE: all three options are found in order,
// the NAME description is picked up, and the first/last switches carry
// both their short and long forms. which strategy produced the entries
// is not asserted here.
#[test]
fn tp_strategy_extracts_flags() {
let r = parse_manpage_string(TP_MANPAGE);
assert_eq!(
r.entries.len(),
3,
"expected 3 entries, got {:?}",
r.entries
);
assert_eq!(r.description, "a synthetic test command");
assert!(matches!(
r.entries[0].switch,
OwnedSwitch::Both('v', ref l) if l == "verbose"
));
assert!(matches!(
r.entries[2].switch,
OwnedSwitch::Both('h', ref l) if l == "help"
));
assert!(r.entries[0].desc.contains("verbosity"));
}
// a page built from .Sh/.Nm/.Nd macros must be recognised as mdoc.
#[test]
fn mdoc_format_detected() {
let src = ".Sh NAME\n.Nm test\n.Nd a test\n.Sh DESCRIPTION\nstuff\n";
let lines: Vec<GroffLine> = src.split('\n').map(classify_line).collect();
assert!(mdoc::is_mdoc(&lines));
}
// font escapes (\fB, \fI, \fR) are removed and `\-` becomes a plain `-`.
#[test]
fn groff_escapes_stripped() {
let stripped = groff::strip_groff_escapes("\\fB\\-v\\fR \\fIfile\\fR");
assert_eq!(stripped.trim(), "-v file");
}
}