This commit is contained in:
atagen 2026-05-19 23:32:51 +10:00
parent da4bc139eb
commit 0aa6ae9fbf
49 changed files with 10554 additions and 5482 deletions

4
src/lib.rs Normal file
View file

@ -0,0 +1,4 @@
pub mod parsers;
pub mod pool;
pub mod store;
pub mod types;

2241
src/main.rs Normal file

File diff suppressed because it is too large Load diff

187
src/parsers/help.rs Normal file
View file

@ -0,0 +1,187 @@
mod description;
mod helpers;
mod options;
mod positionals;
mod subcommands;
pub use options::{param_parser, parse_usage_flags, switch_parser};
pub use positionals::{
extract_cli11_positionals, extract_usage_positionals, parse_usage_args, skip_command_name,
};
use std::collections::HashMap;
use crate::{
parsers::help::{description::description, helpers::get_indent, subcommands::subcommand_entry},
types::*,
};
use nom::{IResult, Parser, character::complete::space0, combinator::opt};
use crate::make_parser;
/// raw tuple produced by the combinator inside `entry`, before it is folded
/// into an `OptionEntry`:
/// (leading whitespace, (switch, optional param),
///  (first description line, continuation lines)).
type EntryParts<'a> = (
&'a str,
(Switch<'a>, Option<Param<'a>>),
(&'a str, Vec<&'a str>),
);
// parse a single flag entry: indent + switch + optional param + description.
// the description parser yields the text after the switch/param plus any
// continuation lines; lines that are empty after trimming are dropped, so
// `desc` only ever holds lines with content.
make_parser!(entry -> OptionEntry<'a>,
(
space0,
(switch_parser, opt(param_parser)),
description,
)
=> |(_, (switch, param), (first, cont))
: EntryParts<'a>|
{
// room for the first line plus every continuation line.
let mut desc: Vec<&str> = Vec::with_capacity(1 + cont.len());
if !first.trim().is_empty() { desc.push(first); }
desc.extend(cont.into_iter().filter(|l| !l.trim().is_empty()));
OptionEntry { switch, param, desc }
}
);
/// collapse subcommands whose names are equal ignoring ASCII case.
///
/// for each name the entry with the longest description wins; on a tie the
/// earliest one is kept. output order follows the first appearance of each
/// name in `raw`.
fn dedup_subcommands<'a>(raw: Vec<Subcommand<'a>>) -> Vec<Subcommand<'a>> {
    use std::collections::hash_map::Entry;

    // lowercased name -> position of the current winner inside `kept`.
    let mut slot_of: HashMap<String, usize> = HashMap::new();
    let mut kept: Vec<Subcommand<'a>> = Vec::new();
    for candidate in raw {
        let key = candidate.name.to_ascii_lowercase();
        match slot_of.entry(key) {
            Entry::Vacant(vacant) => {
                vacant.insert(kept.len());
                kept.push(candidate);
            }
            Entry::Occupied(occupied) => {
                // replace in place so the first-seen position is preserved.
                let winner = &mut kept[*occupied.get()];
                if candidate.desc.len() > winner.desc.len() {
                    *winner = candidate;
                }
            }
        }
    }
    kept
}
/// which part of the help text the scanner in `build_help_result` is
/// currently inside, as decided by `classify_section_line`.
#[derive(Clone, Copy, PartialEq, Eq)]
enum HelpSection {
// initial state, and the state after a "usage" header; option entries
// are still accepted here.
Unknown,
// header mentioning options/flags, or any other header ending in ':'.
Options,
// command / subcommand header; the only state in which subcommand
// entries are collected.
Commands,
// arguments, positionals, examples and similar; nothing is extracted.
Other,
}
/// decide whether `line` is a section header and, if so, which kind.
///
/// returns `None` for lines indented by more than 4 visual columns, for
/// blank lines, and for lines matching none of the header patterns.
/// matching is ASCII-case-insensitive and ignores trailing colons. the
/// checks run in a fixed order; the first one that matches wins.
fn classify_section_line(line: &str) -> Option<HelpSection> {
    let (start, indent) = get_indent(line);
    if indent > 4 {
        return None;
    }
    let body = line[start..].trim();
    if body.is_empty() {
        return None;
    }
    let lower = body.trim_end_matches(':').trim().to_ascii_lowercase();
    let mentions_flags = lower.contains("option") || lower.contains("flag");

    // a usage header puts the scanner back into the "nothing known" state.
    if lower.starts_with("usage") {
        return Some(HelpSection::Unknown);
    }

    // explanatory headers whose bodies are not flag or command listings.
    let is_prose_header = lower.starts_with("valid arguments")
        || lower.contains(" is one of the following")
        || lower.contains(" defaults to")
        || matches!(lower.as_str(), "examples" | "example");
    if is_prose_header {
        return Some(HelpSection::Other);
    }

    // command headers, unless the same line also talks about options/flags.
    let names_commands = matches!(lower.as_str(), "command" | "commands" | "subcommands")
        || lower.ends_with(" commands")
        || lower.ends_with(" subcommands");
    if names_commands && !mentions_flags {
        return Some(HelpSection::Commands);
    }

    let names_arguments = lower.contains("argument")
        || matches!(
            lower.as_str(),
            "args" | "positionals" | "positional arguments"
        );
    if names_arguments {
        return Some(HelpSection::Other);
    }

    // anything else that mentions options/flags, or merely ends in ':',
    // is treated as an options header.
    if mentions_flags || body.ends_with(':') {
        return Some(HelpSection::Options);
    }
    None
}
/// drop everything up to and including the first `'\n'` in `s`.
/// returns the empty string when `s` contains no newline.
fn consume_line(s: &str) -> &str {
    s.split_once('\n').map_or("", |(_, tail)| tail)
}
/// true when `rem` is strictly shorter (in bytes) than `original`, i.e. the
/// parser that returned `rem` consumed at least one byte of `original`.
fn parser_made_progress(original: &str, rem: &str) -> bool {
    original.len() > rem.len()
}
/// scan `original` line by line and assemble the `HelpResult`.
///
/// the scanner tracks the current section: a line recognised by
/// `classify_section_line` switches the section and is skipped. option
/// entries are parsed while the section is `Unknown` or `Options`,
/// subcommand entries only while it is `Commands`. a line nothing accepts
/// is skipped. positionals are extracted separately from the whole text.
/// `desc` is always left empty here.
fn build_help_result<'a>(original: &'a str) -> HelpResult<'a> {
    let mut entries = Vec::new();
    let mut raw_subcommands: Vec<Subcommand<'a>> = Vec::new();
    let mut section = HelpSection::Unknown;
    let mut cursor = original;

    while !cursor.is_empty() {
        let first_line = match cursor.find('\n') {
            Some(newline) => &cursor[..newline],
            None => cursor,
        };
        if let Some(header) = classify_section_line(first_line) {
            section = header;
            cursor = consume_line(cursor);
            continue;
        }

        // try the parser that fits the current section; a parse only counts
        // when it actually consumed input.
        let advanced: Option<&'a str> = match section {
            HelpSection::Unknown | HelpSection::Options => match entry(cursor) {
                Ok((next, parsed)) if parser_made_progress(cursor, next) => {
                    entries.push(parsed);
                    Some(next)
                }
                _ => None,
            },
            HelpSection::Commands => match subcommand_entry(cursor) {
                Ok((next, parsed)) if parser_made_progress(cursor, next) => {
                    raw_subcommands.push(parsed);
                    Some(next)
                }
                _ => None,
            },
            HelpSection::Other => None,
        };
        cursor = match advanced {
            Some(next) => next,
            None => consume_line(cursor),
        };
    }

    // cli11 positional section takes priority over the usage-line scan
    // when both are present — cli11 carries types and optionality.
    let cli11 = extract_cli11_positionals(original)
        .ok()
        .map(|(_, found)| found)
        .filter(|found| !found.is_empty());
    let positionals = match cli11 {
        Some(found) => found,
        None => extract_usage_positionals(original)
            .map(|(_, found)| found)
            .unwrap_or_default(),
    };

    HelpResult {
        entries,
        subcommands: dedup_subcommands(raw_subcommands),
        positionals,
        desc: "",
    }
}
/// top-level help parser. never fails: the whole input is handed to
/// `build_help_result` and the remaining input is reported as empty.
pub fn help_parser(s: &str) -> IResult<&str, HelpResult<'_>> {
    let result = build_help_result(s);
    Ok(("", result))
}

View file

@ -0,0 +1,37 @@
use nom::{
IResult, Parser,
character::complete::space0,
combinator::verify,
multi::many0,
sequence::{preceded, terminated},
};
use crate::make_parser;
use crate::parsers::help::helpers::{at_least_indent, eol, rest_of_line};
// continuation line: an indented (≥8 visual cols), non-flag-shaped line
// belonging to the previous flag's description. blank-but-indented lines
// are accepted (content = ""), filtered out by the caller's join.
// yields the line's text with the leading whitespace removed; the line
// terminator (or end of input) is consumed.
make_parser!(continuation_line -> &'a str,
verify(
preceded(
// assert ≥8 visual cols of leading horizontal whitespace
// without consuming — space0 inside `rest_of_line`'s preceded
// will eat them next.
at_least_indent(8),
terminated(preceded(space0, rest_of_line), eol)
),
// reject lines whose first non-space char is '-' — that's a new
// flag entry, not a continuation of the previous one.
|content: &&str| !content.starts_with('-')
)
);
// description: the line of text after the switch+param, plus any
// continuation lines. always succeeds — first line may be empty (when
// the switch is followed immediately by a newline, "clap long" style).
// output is (first line, continuation lines); leading whitespace is
// stripped from the first line and each line's terminator is consumed.
make_parser!(pub description -> (&'a str, Vec<&'a str>),
(
terminated(preceded(space0, rest_of_line), eol),
many0(continuation_line),
));

105
src/parsers/help/helpers.rs Normal file
View file

@ -0,0 +1,105 @@
use nom::{
AsChar, IResult, Parser, branch::alt, bytes::complete::take_till,
character::complete::line_ending, combinator::eof,
};
#[allow(unused_imports)]
use nom::{bytes::complete::take_while, combinator::peek, combinator::verify};
/// define a named parser function `fn name<'a>(s: &'a str) -> IResult<&'a str, Out>`.
///
/// forms:
/// - `make_parser!(name -> Out, parser)` — the function just runs `parser`.
/// - `make_parser!(name -> Out, parser => wrap)` — runs `parser`, then
///   passes its output through `wrap` (a closure, function path, or
///   tuple-variant constructor) to build the `Out` value.
///
/// an optional visibility (`pub`, `pub(crate)`, …) may precede the name.
/// the expansion expects `IResult` and a `.parse` method on `parser` to be
/// available at the call site; `Out` may mention the lifetime `'a`.
#[macro_export]
macro_rules! make_parser {
    ($vis:vis $name:ident -> $out:ty, $parser:expr => $wrap:expr) => {
        // `'a` is spelled out so `$out` can refer to it, which trips these
        // lints for outputs that don't.
        #[allow(clippy::needless_lifetimes)]
        #[allow(mismatched_lifetime_syntaxes)]
        $vis fn $name<'a>(s: &'a str) -> IResult<&'a str, $out> {
            let (rem, val) = $parser.parse(s)?;
            Ok((rem, $wrap(val)))
        }
    };
    ($vis:vis $name:ident -> $out:ty, $parser:expr) => {
        #[allow(clippy::needless_lifetimes)]
        #[allow(mismatched_lifetime_syntaxes)]
        $vis fn $name<'a>(s: &'a str) -> IResult<&'a str, $out> {
            $parser.parse(s)
        }
    };
}
/// define a named `fn name(c: char) -> bool` from closure-like syntax:
/// `make_predicate!(name, |c| <body>)`. an optional visibility (`pub`,
/// `pub(crate)`, …) may precede the name. `<body>` becomes the function
/// body verbatim, with `c` as the parameter name.
#[macro_export]
macro_rules! make_predicate {
    ($vis:vis $name:ident, |$c:ident| $($body:tt)*) => {
        $vis fn $name($c: char) -> bool { $($body)* }
    };
}
// chars allowed in an option name: alphanumerics plus '-' and '_'.
make_predicate!(pub is_option_char, |c| c.is_alphanumeric() || matches!(c, '-' | '_'));
// everything up to (not including) the next newline char, as decided by
// nom's `AsChar::is_newline`. may be empty; the newline is not consumed.
make_parser!(pub rest_of_line -> &'a str,
take_till(|c: char| c.is_newline())
);
// end of line — matches either a newline or end of input.
// permissive version used in most line-consuming parsers.
make_parser!(pub eol -> &'a str, alt((line_ending, eof)));
/// visual width of the whitespace in `s`, in columns.
///
/// spaces count 1 and tabs count 8 (typical terminal default). any other
/// char contributes 0 and does not stop the count, so callers pass only
/// the leading whitespace run. the total saturates at `u8::MAX`.
pub fn visual_indent(s: &str) -> u8 {
    let mut columns: u8 = 0;
    for ch in s.chars() {
        let width = match ch {
            ' ' => 1,
            '\t' => 8,
            _ => 0,
        };
        columns = columns.saturating_add(width);
    }
    columns
}
/// build a parser that succeeds only when the input starts with at least
/// `min` visual columns of spaces/tabs (measured by `visual_indent`).
///
/// the whitespace run is only peeked at: on success the parser yields the
/// run but leaves the input untouched, so pair it with `space0` or
/// `take_while` to actually eat the indent.
pub fn at_least_indent<'a>(
    min: u8,
) -> impl Parser<&'a str, Output = &'a str, Error = nom::error::Error<&'a str>> {
    let leading_blank = take_while(|c: char| matches!(c, ' ' | '\t'));
    verify(peek(leading_blank), move |run: &str| {
        visual_indent(run) >= min
    })
}
/// legacy helper: returns (byte index of the first char that is neither a
/// space nor a tab, visual indent of the whitespace before it).
/// used by callers that still need the byte index.
///
/// spaces count 1 column and tabs count 8. the indent saturates at
/// `u8::MAX` instead of overflowing on very deep indentation. for an
/// empty or all-whitespace line the index is `s.len()`, so `&s[idx..]` is
/// always the content after the indent.
pub fn get_indent(s: &str) -> (usize, u8) {
    let content_start = s
        .find(|c: char| c != ' ' && c != '\t')
        .unwrap_or(s.len());
    // the prefix holds only ' ' and '\t', both single-byte, so a byte walk
    // is enough.
    let indent = s[..content_start]
        .bytes()
        .fold(0u8, |cols, b| {
            cols.saturating_add(if b == b'\t' { 8 } else { 1 })
        });
    (content_start, indent)
}

192
src/parsers/help/options.rs Normal file
View file

@ -0,0 +1,192 @@
use crate::make_parser;
use crate::parsers::help::helpers::is_option_char;
use crate::types::*;
use nom::bytes::complete::{take_till, take_till1};
use nom::character::complete::{space0, space1};
use nom::combinator::{map, opt};
use nom::multi::many0;
use nom::sequence::separated_pair;
use nom::{
IResult, Parser,
branch::alt,
bytes::complete::{tag, take_while1},
character::complete::{char, satisfy},
combinator::{value, verify},
sequence::{delimited, preceded},
};
// `-x`: a dash followed by one alphanumeric char; yields that char.
make_parser!(short_switch -> char,
preceded(char('-'), satisfy(|c| c.is_alphanumeric())));
// `--name`: yields the name without the leading dashes.
make_parser!(long_switch -> &'a str,
preceded(tag("--"), take_while1(is_option_char)));
// `--[no-]name`: negatable long switch; yields only `name`.
make_parser!(negatable_long_switch -> &'a str,
preceded(tag("--[no-]"), take_while1(is_option_char)));
// separator between switch spellings: a comma plus any following
// spaces/tabs.
make_parser!(comma -> (),
value((), preceded(char(','), space0)));
// `[=name]`: optional param attached with '='.
make_parser!(eq_optional_param -> Param<'a>,
delimited(tag("[="), take_while1(is_option_char), char(']')) => Param::Optional);
// `[=<name>]`: same, with the name in angle brackets; anything up to the
// closing '>' is taken as the name.
make_parser!(eq_optional_angle_param -> Param<'a>,
delimited(tag("[=<"), take_till1(|c| c == '>'), tag(">]")) => Param::Optional);
// `=name`: mandatory param attached with '='.
make_parser!(eq_mandatory_param -> Param<'a>,
preceded(char('='), take_while1(is_option_char)) => Param::Mandatory);
// take a wide alphanumeric/_/- token then verify the WHOLE thing looks
// like an ALL_CAPS-style param name. taking only uppercase chars would
// match just "N" of " Needs: ..." and leave "eeds:..." as desc, so we
// widen, then reject anything that doesn't pass the all-caps check.
// the token must be preceded by exactly one space, which is consumed.
make_parser!(spaced_uppercase_param -> Param<'a>,
preceded(
char(' '),
verify(
take_while1(|c: char|
c.is_ascii_alphabetic() || c.is_ascii_digit() || c == '_' || c == '-'
),
|s: &str| {
// first char: ASCII uppercase or '_'.
let first = match s.chars().next() { Some(c) => c, None => return false };
if !(first.is_ascii_uppercase() || first == '_') { return false; }
// whole token: uppercase, digits and '_' only — so a token
// containing '-' or a lowercase letter is rejected here.
s.chars().all(|c| c.is_ascii_uppercase() || c.is_ascii_digit() || c == '_')
}
)
) => Param::Mandatory);
// ` <name>`: exactly one space, then an angle-bracketed mandatory param.
make_parser!(spaced_angle_param -> Param<'a>,
preceded(char(' '), delimited(char('<'), take_till1(|c| c == '>'), char('>'))) => Param::Mandatory);
// ` <[name]>`: one space, then an angle-bracketed name that is itself
// wrapped in square brackets; recorded as optional.
make_parser!(spaced_opt_angle_param -> Param<'a>,
preceded(char(' '), delimited(char('<'),
delimited(char('['), take_while1(|c| c != ']'), char(']')),
char('>'))) => Param::Optional);
// like `spaced_angle_param`, but tolerates one or more spaces/tabs before
// the '<'.
make_parser!(spaced_angle_param_after_space -> Param<'a>,
preceded(space1, delimited(char('<'), take_till1(|c| c == '>'), char('>'))) => Param::Mandatory);
// take the full lowercase token then verify it's <=10 chars. a
// take_while_m_n with a 10-char cap would leave a partial match — e.g.
// "--foo nanoseconds" would extract param "nanosecond" and leave "s" as
// the description. a word longer than 10 chars is almost certainly the
// start of the description, not a type annotation.
// the token runs up to the next whitespace and must consist solely of
// ASCII lowercase letters; it must be preceded by exactly one space.
make_parser!(spaced_type_param -> Param<'a>,
preceded(
char(' '),
verify(
take_while1(|c: char| !c.is_whitespace()),
|s: &str| s.len() <= 10 && s.chars().all(|c| c.is_ascii_lowercase())
)
) => Param::Mandatory
);
// parameter following a switch. alternatives are tried in order and the
// first match wins, so the more specific shapes come first (`[=<` before
// `[=`, ` <[` before ` <`).
// NOTE(review): `spaced_angle_param_after_space` (one or more spaces) is
// listed before `spaced_angle_param` (exactly one space) and appears to
// accept everything the latter does, which would leave the latter
// unreachable — confirm before relying on it.
make_parser!(pub param_parser -> Param<'a>, alt((
eq_optional_angle_param,
eq_optional_param,
eq_mandatory_param,
spaced_opt_angle_param,
spaced_angle_param_after_space,
spaced_angle_param,
spaced_uppercase_param,
spaced_type_param,
)));
// define a private parser `$name` for two switch spellings joined by a
// separator: runs `separated_pair($left, $sep, $right)`, binds the two
// outputs to `$a` and `$b`, and evaluates `$body` to build the `Switch`.
macro_rules! switch_pair {
($name:ident, $left:expr, $sep:expr, $right:expr => |$a:ident, $b:ident| $body:expr) => {
fn $name<'a>(s: &'a str) -> IResult<&'a str, Switch<'a>> {
use nom::sequence::separated_pair;
let (rem, ($a, $b)) = separated_pair($left, $sep, $right).parse(s)?;
Ok((rem, $body))
}
};
}
// `-s, --long`
switch_pair!(short_comma_long,
short_switch, comma, long_switch => |s, l| Switch::Both(s, l));
// `-s, --[no-]long`
switch_pair!(short_comma_negatable_long,
short_switch, comma, negatable_long_switch => |s, l| Switch::Both(s, l));
// `-s --long` (single space separator)
switch_pair!(short_space_long,
short_switch, char(' '), long_switch => |s, l| Switch::Both(s, l));
// `-s --[no-]long` (single space separator)
switch_pair!(short_space_negatable_long,
short_switch, char(' '), negatable_long_switch => |s, l| Switch::Both(s, l));
// '/' separator with optional spaces/tabs on either side.
make_parser!(slash_sep -> (),
value((), delimited(space0, char('/'), space0)));
// `--long / -s`: long spelling first; still stored as Both(short, long).
switch_pair!(long_slash_short,
long_switch, slash_sep, short_switch => |l, s| Switch::Both(s, l));
// single-spelling switches lifted into `Switch`.
make_parser!(short_as_switch -> Switch<'a>, short_switch => Switch::Short);
make_parser!(negatable_long_as_switch -> Switch<'a>, negatable_long_switch => Switch::Long);
make_parser!(long_as_switch -> Switch<'a>, long_switch => Switch::Long);
// any supported switch shape. alternatives are tried in order and the
// first match wins: paired spellings come before single ones, and the
// negatable `--[no-]` forms before the plain long forms they sit next to.
make_parser!(pub switch_parser -> Switch<'a>,
alt((
short_comma_negatable_long,
short_space_negatable_long,
short_comma_long,
short_space_long,
long_slash_short,
short_as_switch,
negatable_long_as_switch,
long_as_switch,
))
);
// `{--long | -s}` — manpage SYNOPSIS-line switch pair. nix-env's
// synopsis is the canonical case: `[{--file | -f} path] [{--profile |
// -p} path]`. emits Switch::Both with the long name.
// this parser handles the inner `--long | -s` part; the braces are
// matched by `brace_pipe_switch`.
make_parser!(brace_pipe_long_short -> Switch<'a>,
separated_pair(long_switch, (space0, char('|'), space0), short_switch)
=> |(l, s): (&'a str, char)| Switch::Both(s, l)
);
// same pair with the short spelling first: `-s | --long`.
make_parser!(brace_pipe_short_long -> Switch<'a>,
separated_pair(short_switch, (space0, char('|'), space0), long_switch)
=> |(s, l): (char, &'a str)| Switch::Both(s, l)
);
// the braces around either ordering, with optional spaces/tabs just
// inside them.
make_parser!(brace_pipe_switch -> Switch<'a>,
delimited(
(char('{'), space0),
alt((brace_pipe_long_short, brace_pipe_short_long)),
(space0, char('}'))
)
);
// switch as it may appear on a usage/synopsis line: the braced pair form
// first, then every shape `switch_parser` accepts.
make_parser!(usage_switch_parser -> Switch<'a>,
alt((brace_pipe_switch, switch_parser))
);
// consume any chars except `]`. used to swallow trailing tokens inside a
// flag bracket — e.g. `[--option name value]` keeps switch=Long("option")
// and param=Mandatory("name"), discarding ` value` before the closing `]`.
// may match the empty string.
make_parser!(take_till_bracket -> &'a str, take_till(|c: char| c == ']'));
// `[<switch> [param] <junk>]` inside the SYNOPSIS line.
// yields (switch, optional param); the brackets and the junk are dropped.
make_parser!(flag_in_bracket -> (Switch<'a>, Option<Param<'a>>),
delimited(
(char('['), space0),
(usage_switch_parser, opt(param_parser)),
(take_till_bracket, char(']'))
)
);
// walk the joined SYNOPSIS-line text, collecting every flag-bracketed
// switch + its first param. non-flag tokens (positional brackets,
// command name, ellipses) are skipped one char at a time.
// the skip branch refuses '\n' and '\r', so the walk stops at the end of
// the line.
make_parser!(pub parse_usage_flags -> Vec<(Switch<'a>, Option<Param<'a>>)>,
many0(alt((
map(flag_in_bracket, Some),
// `value(None, ...)` requires `None: Clone` which forces Clone
// on Switch/Param; `map(..., |_| None)` doesn't.
map(satisfy(|c| c != '\n' && c != '\r'), |_| None),
)))
// drop the `None`s produced by skipped chars, keep the flags in order.
=> |v: Vec<Option<(Switch<'a>, Option<Param<'a>>)>>|
v.into_iter().flatten().collect()
);

View file

@ -0,0 +1,400 @@
use crate::parsers::help::helpers::rest_of_line;
use crate::types::Positional;
use crate::{make_parser, make_predicate};
use nom::branch::alt;
use nom::bytes::complete::{tag, tag_no_case, take_till, take_till1, take_while, take_while1};
use nom::character::complete::{char, line_ending, satisfy, space0, space1};
use nom::combinator::{map, not, opt, peek, recognize, value, verify};
use nom::multi::many0;
use nom::sequence::{delimited, preceded, terminated};
use nom::{AsChar, IResult, Parser};
/// one token recognised while walking a usage line in `parse_usage_args`.
/// `Curly`, `Flag` and `Skip` are discarded when the result is folded; the
/// remaining variants carry a positional name and map onto the
/// `optional` / `variadic` fields of `Positional`.
#[derive(Clone)]
enum PositionalParse<'a> {
// `{...}` group.
Curly,
// token starting with '-'.
Flag,
// anything that is not a positional (skipped char or bracket block).
Skip,
// optional: false, variadic: false
Mandatory(&'a str),
// optional: true, variadic: false
Optional(&'a str),
// optional: false, variadic: true
ManVariadic(&'a str),
// optional: true, variadic: true
OptVariadic(&'a str),
}
// chars allowed in a command-name word: alphanumerics plus - _ / .
make_predicate!(is_word_char, |c| c.is_alphanumeric()
|| matches!(c, '-' | '_' | '/' | '.'));
// chars allowed in a bare ALL_CAPS positional: ASCII uppercase, any
// numeric char, '_' and '-'.
make_predicate!(is_pos_char, |c| c.is_ascii_uppercase()
|| c.is_numeric()
|| matches!(c, '_' | '-'));
// case-insensitive match of "options" / "option" / "flags" / "flag" at the
// START of the input. this is a prefix match: trailing input is left
// unconsumed, so e.g. "optional" also matches.
make_parser!(section_label -> (),
value((), alt((
tag_no_case("options"),
tag_no_case("option"),
tag_no_case("flags"),
tag_no_case("flag")
)))
);
// variadic marker: three ASCII dots or the single U+2026 ellipsis char.
make_parser!(ellipses -> (),
value((),
alt((tag("..."), tag("\u{2026}")))
)
);
// `{...}` group with non-empty content up to the first '}'; the content
// is discarded.
make_parser!(braces -> PositionalParse<'a>,
value(PositionalParse::Curly, delimited(char('{'), take_till1(|c| c == '}'), char('}')))
);
// a flag token on the usage line: '-' followed by at least one char, up to
// the next space/tab or ']'. the text itself is discarded.
// FIXME: should this be take_while(is_option_char) instead? it is also
// unclear why ']' is needed as a terminator here.
make_parser!(flag -> PositionalParse<'a>,
value(PositionalParse::Flag, preceded(char('-'), take_till1(|c: char| c.is_space() || c == ']')))
);
fn check_positional(s: &str) -> bool {
let s = s.trim();
if s.is_empty() {
return false;
}
// reject names starting with '-' — these are flag tokens accidentally
// captured by the bracket parser, e.g. "[--at-operation]" in jj's
// synopsis. without this guard every `[--flag]` token would be
// recorded as a positional named "--flag".
if s.starts_with('-') {
return false;
}
if section_label.parse(s).is_ok() {
return false;
}
let upper = s.to_ascii_uppercase();
if matches!(upper.as_str(), "OPTIONS" | "OPTION" | "FLAGS" | "FLAG") {
return false;
}
s.chars()
.all(|c| c.is_alphanumeric() || matches!(c, '-' | '_' | '/' | '.'))
}
// recognize a balanced `[...]` block, tolerating ONE level of nested
// brackets inside. expressed entirely via nom combinators:
//
// `[` + many0(alt((nested_bracket_block, non_bracket_char))) + `]`
//
// nested_bracket_block is `[ chars_until_] ]`, which means we accept a
// single inner `[...]` correctly but not arbitrarily-deep nesting —
// manpages don't go deeper than two levels.
// returns the inner content (everything between the outer brackets).
make_parser!(balanced_bracket_inner -> &'a str,
recognize(delimited(
char('['),
many0(alt((
recognize((char('['), take_till(|c: char| c == ']'), char(']'))),
recognize(satisfy(|c: char| c != ']' && c != '[')),
))),
char(']'),
))
// the recognized text starts with '[' and ends with ']' (one byte
// each), so trimming one byte from both ends is always in bounds.
=> |whole: &'a str| &whole[1..whole.len() - 1]
);
/// extract a positional name from already-trimmed bracket-inner content.
/// returns the name slice and a flag indicating whether the bracket inner
/// carried a trailing `...` (in-bracket variadic marker).
fn parse_bracket_inner_name(inner: &str) -> Option<(&str, bool)> {
let inner = inner.trim();
// strip trailing "..." for in-bracket variadic.
let (rest, has_dots) = if let Some(stripped) = inner.strip_suffix("...") {
(stripped.trim_end(), true)
} else if let Some(stripped) = inner.strip_suffix('\u{2026}') {
(stripped.trim_end(), true)
} else {
(inner, false)
};
if rest.starts_with('[') {
let mut found = None;
let mut remaining = rest;
while let Some(start) = remaining.find('[') {
let after_start = &remaining[start + 1..];
let Some(end) = after_start.find(']') else {
break;
};
let nested = &after_start[..end];
if let Some((nested_name, nested_dots)) = parse_bracket_inner_name(nested)
&& check_positional(nested_name)
{
found = Some((nested_name, has_dots || nested_dots));
}
remaining = &after_start[end + 1..];
}
return found;
}
let name = if let Some(after_lt) = rest.strip_prefix('<') {
// angle-bracket name: take everything up to the matching '>'
let end = after_lt.find('>')?;
let inner = after_lt[..end].trim();
let (inner, inner_dots) = if let Some(stripped) = inner.strip_suffix("...") {
(stripped.trim_end(), true)
} else if let Some(stripped) = inner.strip_suffix('\u{2026}') {
(stripped.trim_end(), true)
} else {
(inner, false)
};
return Some((inner, has_dots || inner_dots));
} else {
// bare name: take leading word
let end = rest
.find(|c: char| c.is_whitespace() || c == '[' || c == ']')
.unwrap_or(rest.len());
if end == 0 {
return None;
}
&rest[..end]
};
Some((name, has_dots))
}
// extract a balanced `[...]` block and decompose its inner content into
// (name, has-inner-`...` flag). `map_opt` turns a `None` from
// `parse_bracket_inner_name` into a nom parse error.
make_parser!(opt_bracket_name -> (&'a str, bool),
nom::combinator::map_opt(balanced_bracket_inner, parse_bracket_inner_name)
);
// optional positional in square brackets. the name must pass
// `check_positional`; the result is `OptVariadic` when a variadic marker
// appears inside the bracket or directly after it, else `Optional`.
make_parser!(
opt_positional -> PositionalParse<'a>,
verify(
// tuple parser: (name + in-bracket variadic, post-bracket ellipsis).
// matches "[name]", "[name...]", "[name ...]", "[name] ...",
// "[<name>]", and one-level nests like "[<program> [<arg>...]]".
(opt_bracket_name, opt(ellipses)),
|((name, _), _): &((&'a str, bool), Option<()>)| check_positional(name)
) => |((name, has_inner_dots), post_dots): ((&'a str, bool), Option<()>)| {
if has_inner_dots || post_dots.is_some() {
PositionalParse::OptVariadic(name)
} else {
PositionalParse::Optional(name)
}
}
);
// mandatory positional in angle brackets: `<name>`, `<name...>` or
// `<name>...` (also with U+2026). the name runs up to the first '.',
// U+2026 or '>' and must pass `check_positional`. a marker inside or
// after the brackets makes it `ManVariadic`, otherwise `Mandatory`.
make_parser!(man_positional -> PositionalParse<'a>,
verify(
(
delimited(
char('<'),
(
take_till1(|c| c == '.' || c == '\u{2026}' || c == '>'),
opt(ellipses)
),
char('>')
),
opt(ellipses)
),
|((ss, _), _)| check_positional(ss)
) => |((p, v), v1): ((&'a str, Option<()>), Option<()>)|
if v.is_some() || v1.is_some() { PositionalParse::ManVariadic(p) }
else { PositionalParse::Mandatory(p) }
);
// bare ALL_CAPS positional such as `FILE` or `FILE...`. the first char
// must be ASCII uppercase (checked via peek, not consumed separately);
// the token is a run of `is_pos_char` chars that must pass
// `check_positional`. a directly attached "..." / U+2026 makes it
// `ManVariadic`, otherwise `Mandatory`.
make_parser!(allcaps_positional -> PositionalParse<'a>,
verify(
(
preceded(
peek(
satisfy(|c: char| c.is_ascii_uppercase())
),
take_while1(is_pos_char)
),
opt(
alt((
tag("..."),
tag("\u{2026}"))
)
)
),
|(ss, _): &(&str, _)| check_positional(ss)
) => |(p, v): (&'a str, Option<&'a str>)|
if v.is_some() { PositionalParse::ManVariadic(p) } else { PositionalParse::Mandatory(p) }
);
/// append `(k, v)` to `acc` unless `acc` already holds a key equal to `k`
/// ignoring ASCII case; in that case the existing entry is kept and `v`
/// is dropped.
fn caseless_push<'a>(k: &'a str, v: Positional, acc: &mut Vec<(&'a str, Positional)>) {
    for (existing, _) in acc.iter() {
        if existing.eq_ignore_ascii_case(k) {
            return;
        }
    }
    acc.push((k, v));
}
// parse_usage_args runs on a single logical usage line. SKIP refuses to
// cross a newline boundary so many0 stops at end-of-line — without this
// the parser would happily wander into the OPTIONS section and treat
// every `--flag <name>` angle-bracket parameter as a positional.
//
// the inner positional terminator uses peek(line_ending) instead of
// consuming the newline, so the trailing `opt(line_ending)` in the
// outer delimited eats it cleanly and we never advance past the usage
// line.
//
// output: (name, Positional) pairs in order of appearance; a name that
// repeats (ignoring ASCII case) is kept only the first time.
make_parser!(pub parse_usage_args -> Vec<(&'a str, Positional)>,
(delimited(
space0,
many0(
alt((
map(
(
terminated(
// token shapes, tried in order. a bracket block that
// is not a valid optional positional is swallowed
// whole as Skip before the later shapes are tried.
alt((
braces,
opt_positional,
value(PositionalParse::Skip, balanced_bracket_inner),
man_positional,
flag,
allcaps_positional,
)),
// a token must be followed by whitespace, or sit right
// before the line end / end of input (not consumed).
alt((
space1,
value("", peek(line_ending)),
value("", peek(nom::combinator::eof)),
))
),
// catch "[section] ..." patterns where the ellipsis is
// on the *next* token, separated by whitespace.
opt(terminated(
alt((tag("..."), tag("\u{2026}"))),
alt((
space1,
value("", peek(line_ending)),
value("", peek(nom::combinator::eof)),
))
))
),
// a detached ellipsis upgrades the preceding positional to its
// variadic counterpart; other tokens are left as they are.
|(positional, trailing): (PositionalParse<'a>, Option<_>)| {
if trailing.is_none() { positional }
else {
match positional {
PositionalParse::Optional(n) => PositionalParse::OptVariadic(n),
PositionalParse::Mandatory(n) => PositionalParse::ManVariadic(n),
other => other,
}
}
}
),
// SKIP must NOT consume a newline. without this, many0 keeps
// iterating past the usage line into OPTIONS-section flag
// syntax and over-extracts positionals.
value(PositionalParse::Skip, satisfy(|c: char| c != '\n' && c != '\r')),
))
),
opt((space0, line_ending))
)) => |p: Vec<PositionalParse<'a>>|
// fold the token stream into the final list: non-positional tokens
// are dropped, the rest become `Positional` values.
p.into_iter().fold(Vec::new(), |mut acc, parse|
{
match parse {
PositionalParse::Curly => (),
PositionalParse::Flag => (),
PositionalParse::Skip => (),
PositionalParse::OptVariadic(arg) => caseless_push(arg, Positional {
optional: true,
variadic: true
}, &mut acc),
PositionalParse::ManVariadic(arg) => caseless_push(arg, Positional {
optional: false,
variadic: true
}, &mut acc),
PositionalParse::Optional(arg) => caseless_push(arg, Positional {
optional: true,
variadic: false,
}, &mut acc),
PositionalParse::Mandatory(arg) => caseless_push(arg, Positional {
optional: false,
variadic: false
}, &mut acc),
}
acc
})
);
// skip the command name (and any subcommand words) at the start of a
// usage line: leading spaces/tabs, then zero or more words that do not
// start with '-', consist of `is_word_char` chars, contain at least one
// ASCII lowercase letter, and are followed by whitespace. always succeeds.
make_parser!(pub skip_command_name -> (),
value((), preceded(space0,
many0(
(
verify(
preceded(not(char('-')), take_while1(is_word_char)),
|ss: &str| ss.chars().any(|c: char| c.is_ascii_lowercase())
),
space1
)
)
))
);
// match the start of a usage line: optional leading spaces/tabs, the word
// "usage" in any case, then one of the accepted followers. only the colon
// is consumed; a following line ending or space/tab is merely peeked.
make_parser!(find_usage_line -> (),
value((), preceded(
space0,
terminated(
tag_no_case("usage"),
// accept any of:
// "Usage:" — inline form with colon
// "Usage args" — inline form, space follows the word
// "USAGE\n cmd args" — clap-style header on its own line
alt(
(
value((), char(':')),
value((), peek(line_ending)),
value((), peek(satisfy(|c: char| c == ' ' || c == '\t'))),
)
)
)
))
);
// find the first usage line in the text and parse its positionals.
// whole lines are skipped until `find_usage_line` matches; then the usage
// marker, an optional line break (header-on-its-own-line form) and the
// command name are consumed before `parse_usage_args` runs. fails when no
// usage line is found.
make_parser!(pub extract_usage_positionals -> Vec<(&'a str, Positional)>,
preceded(
many0(preceded(not(find_usage_line), (rest_of_line, line_ending))),
preceded(
(find_usage_line, space0, opt(line_ending), space0, skip_command_name),
parse_usage_args
)
)
);
// chars allowed in a cli11 positional name: alphanumerics plus '_' and '-'.
make_predicate!(is_cli11_name_char, |c| c.is_alphanumeric()
|| matches!(c, '_' | '-'));
// the cli11 positionals header line: optional indent, "POSITIONALS:" or
// "Positionals:" (exactly these two spellings), then the rest of the line
// and its line ending, all consumed.
make_parser!(cli11_section_header -> (),
value((),
delimited(
space0,
alt((tag("POSITIONALS:"), tag("Positionals:"))),
(rest_of_line, opt(line_ending))
)
)
);
// one line of the cli11 positionals section. the line must be indented
// (non-empty leading spaces/tabs) and start with a name of at least 2
// bytes. after the name an optional run of ASCII uppercase letters is
// skipped (presumably a type annotation — confirm against cli11 output),
// then an optional "..." marks the positional as variadic. the rest of
// the line is discarded. yields (name, variadic).
make_parser!(cli11_pos_line -> (&'a str, bool),
preceded(
verify(space0, |ss: &str| !ss.is_empty()),
terminated(
(
verify(take_while1(is_cli11_name_char), |s: &str| s.len() >= 2),
preceded(
(space0, take_while(|c: char| c.is_ascii_uppercase()), space0),
opt(tag("..."))
)
),
(rest_of_line, opt(line_ending))
)
) => |(name, variadic): (&'a str, Option<_>)| (name, variadic.is_some())
);
// the body of the cli11 positionals section: consecutive positional
// lines. every entry is recorded with `optional: false`; names repeating
// (ignoring ASCII case) are kept only the first time.
make_parser!(parse_cli11_body -> Vec<(&'a str, Positional)>,
many0(cli11_pos_line) => |entries: Vec<(&'a str, bool)>|
entries.into_iter().fold(Vec::new(), |mut acc, (name, variadic)| {
caseless_push(name, Positional { optional: false, variadic }, &mut acc);
acc
})
);
// skip whole lines until the cli11 positionals header, then parse the
// section body. fails when the text has no such header.
make_parser!(pub extract_cli11_positionals -> Vec<(&'a str, Positional)>,
preceded(
many0(preceded(not(cli11_section_header), (rest_of_line, line_ending))),
preceded(cli11_section_header, parse_cli11_body)
)
);

View file

@ -0,0 +1,83 @@
use nom::{
AsChar, IResult, Parser,
branch::alt,
bytes::complete::{tag, take_till, take_while1},
character::complete::{char, space0},
combinator::{not, value, verify},
multi::many0,
sequence::{delimited, preceded, terminated},
};
use crate::make_parser;
use crate::parsers::help::helpers::{eol, is_option_char};
use crate::types::Subcommand;
/// chars allowed inside a bracketed (`<...>` / `[...]`) placeholder:
/// any alphanumeric char, plus `_ - . | ,`.
fn is_placeholder(c: char) -> bool {
    c.is_alphanumeric() || matches!(c, '_' | '-' | '.' | '|' | ',')
}
/// chars allowed inside a bare (unbracketed) placeholder token such as
/// "FILE", "PATTERN..." or "A|B": ASCII uppercase letters, ASCII digits
/// and `_ - . | ,`. lowercase letters are left out on purpose so that
/// mixed-case description words like "NixOS" or "Home-manager" are not
/// swallowed as placeholders.
fn is_bare_placeholder_char(c: char) -> bool {
    c.is_ascii_uppercase() || c.is_ascii_digit() || matches!(c, '_' | '-' | '.' | '|' | ',')
}
// consume zero or more argument placeholders following a subcommand name,
// each introduced by a single space. always succeeds and yields ().
make_parser!(
skip_arg_placeholders -> (),
value(
(),
many0(preceded(
// every placeholder is introduced by exactly one space. the space is
// consumed only together with a matching placeholder: when no branch
// below accepts the token, this iteration fails as a whole and many0
// stops in front of the space. the bare ALL_CAPS branch has to
// verify its token before deciding to consume.
char(' '),
alt((
// <...> bracketed placeholder
delimited(char('<'), take_while1(is_placeholder), char('>')),
// [...] optional bracketed placeholder
delimited(char('['), take_while1(is_placeholder), char(']')),
// bare ALL_CAPS placeholder — first char must be uppercase or
// a digit (allows e.g. "N", "M2"), and the whole token must
// be uppercase-friendly. rejects "NixOS"-style mixed-case so
// descriptions don't get swallowed.
verify(
take_while1(is_bare_placeholder_char),
|s: &str| {
// take_while1 never yields an empty token, so a first char
// always exists.
let first = s.chars().next().unwrap();
first.is_ascii_uppercase() || first.is_ascii_digit()
}
),
)),
)),
)
);
// parse a subcommand entry: leading whitespace, then a name (2+ option
// chars, not starting with '-'), optional argument placeholders, exactly
// two spaces, optional padding, then the description text and eol.
// NOTE(review): the separator literal in `tag(...)` below appears with a
// single space in this view while the comment above says two — confirm
// which one is intended.
make_parser!(pub subcommand_entry -> Subcommand<'a>,
(
preceded(
space0,
verify(
preceded(not(char('-')), take_while1(is_option_char)),
|n: &str| n.len() >= 2,
),
),
skip_arg_placeholders,
tag(" "),
space0,
terminated(take_till(|c: char| c.is_newline()), eol),
) => |(name, _, _, _, desc): (&'a str, _, _, _, &'a str)| {
// some help formats prefix desc with "- " (manpage-style); strip it.
let d = desc.trim_start();
let desc = d.strip_prefix("- ").map(|s| s.trim_start()).unwrap_or(d);
Subcommand { name, desc }
}
);

335
src/parsers/manpage.rs Normal file
View file

@ -0,0 +1,335 @@
//! parse unix manpages (groff/mdoc format) into a structured result.
//!
//! manpages are written in roff/groff markup — a decades-old typesetting language
//! used by man(1). this module strips the formatting and extracts structured data
//! (flags, subcommands, positionals) from the raw groff source.
//!
//! there are two major manpage macro packages:
//! - man (groff) — used by gnu/linux tools. uses macros like .SH, .TP, .IP, .PP
//! - mdoc (bsd) — used by bsd tools. uses .Sh, .Fl, .Ar, .Op, .It, .Bl/.El
//!
//! this module handles both, auto-detecting the format by checking for .Sh macros.
//!
//! for groff manpages, flag extraction uses multiple "strategies" that target
//! different common formatting patterns:
//! - strategy_tp: .TP tagged paragraphs (gnu coreutils, help2man)
//! - strategy_ip: .IP indented paragraphs (curl, hand-written)
//! - strategy_pp_rs: .PP + .RS/.RE blocks (git, docbook)
//! - strategy_nix: nix3-style bullet .IP with .UR/.UE hyperlinks
//! - strategy_deroff: fallback — strip all groff, feed to help text parser
//!
//! the module tries all applicable strategies and picks the one that extracts
//! the most flag entries, on the theory that more results = better match.
mod commands;
mod groff;
mod mdoc;
mod sections;
mod strategies;
use std::io::{self, Read};
use std::path::Path;
use crate::types::{HelpResult, OptionEntry, Param, Positional, Subcommand, Switch};
pub use self::groff::{GroffLine, classify_line, strip_groff_escapes};
pub use self::sections::{extract_subcommand_sections, extract_synopsis_command};
/// owned counterpart of the borrowed `Switch`: the long name is stored as
/// a `String` instead of a borrowed slice.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum OwnedSwitch {
// short spelling only.
Short(char),
// long spelling only.
Long(String),
// both spellings: (short, long).
Both(char, String),
}
/// owned counterpart of the borrowed `Param`; the payload is the
/// parameter's name.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum OwnedParam {
Mandatory(String),
Optional(String),
}
/// one extracted flag: its switch spelling(s), its parameter if any, and
/// its description as a single string.
#[derive(Debug, Clone)]
pub struct ManpageEntry {
pub switch: OwnedSwitch,
pub param: Option<OwnedParam>,
pub desc: String,
}
/// one extracted subcommand: its name and its description.
#[derive(Debug, Clone)]
pub struct ManpageSubcommand {
pub name: String,
pub desc: String,
}
/// owned, structured result of parsing a manpage (or a converted
/// `HelpResult`). `Default` gives an empty result.
#[derive(Debug, Clone, Default)]
pub struct ManpageResult {
// extracted flags.
pub entries: Vec<ManpageEntry>,
// extracted subcommands.
pub subcommands: Vec<ManpageSubcommand>,
// (name, shape) pairs for positional arguments.
pub positionals: Vec<(String, Positional)>,
// description text of the command.
pub description: String,
}
/// copy a borrowed `Switch` into its owned form, allocating a `String`
/// for the long name where there is one.
impl From<&Switch<'_>> for OwnedSwitch {
    fn from(s: &Switch<'_>) -> Self {
        match *s {
            Switch::Short(short) => Self::Short(short),
            Switch::Long(long) => Self::Long(long.to_owned()),
            Switch::Both(short, long) => Self::Both(short, long.to_owned()),
        }
    }
}
/// copy a borrowed `Param` into its owned form, keeping the
/// mandatory/optional distinction.
impl From<&Param<'_>> for OwnedParam {
    fn from(p: &Param<'_>) -> Self {
        match *p {
            Param::Mandatory(name) => Self::Mandatory(name.to_owned()),
            Param::Optional(name) => Self::Optional(name.to_owned()),
        }
    }
}
impl From<&OptionEntry<'_>> for ManpageEntry {
fn from(e: &OptionEntry<'_>) -> Self {
let desc: String = e
.desc
.iter()
.map(|s| s.trim())
.filter(|s| !s.is_empty())
.collect::<Vec<_>>()
.join(" ");
ManpageEntry {
switch: (&e.switch).into(),
param: e.param.as_ref().map(Into::into),
desc,
}
}
}
impl From<&Subcommand<'_>> for ManpageSubcommand {
fn from(sc: &Subcommand<'_>) -> Self {
// lowercase the subcommand name here so (a) file naming is
// consistent (meat_yum.json vs meat_YUM.json) and (b) recursive
// --help probes use the lowercase form, which is what most real
// CLIs accept — even tools like meat that DISPLAY uppercase
// names in their help text dispatch on the lowercased argument.
ManpageSubcommand {
name: sc.name.to_ascii_lowercase(),
desc: sc.desc.to_string(),
}
}
}
impl From<&HelpResult<'_>> for ManpageResult {
fn from(r: &HelpResult<'_>) -> Self {
ManpageResult {
entries: r.entries.iter().map(Into::into).collect(),
subcommands: r.subcommands.iter().map(Into::into).collect(),
// positional names are stored lowercased so output is
// stable across the various places we extract them from
// (synopsis, usage, cli11 sections).
positionals: r
.positionals
.iter()
.map(|(k, v)| (k.to_ascii_lowercase(), v.clone()))
.collect(),
description: r.desc.to_string(),
}
}
}
/// parse a manpage from its already-classified lines.
///
/// mdoc pages are detected first and handed to the mdoc parser. anything
/// else goes through the groff pipeline: flag entries from the OPTIONS
/// section(s), extra flags declared only in the SYNOPSIS, positionals
/// from the SYNOPSIS, and subcommands from the COMMANDS section plus the
/// DESCRIPTION. the returned `description` is always empty here.
pub fn parse_manpage_lines(lines: &[GroffLine]) -> ManpageResult {
    if mdoc::is_mdoc(lines) {
        return mdoc::parse_mdoc_lines(lines);
    }

    let options_section = sections::extract_options_section(lines);
    let mut entries = strategies::extract_entries(&options_section);

    // merge SYNOPSIS-only flags (nix-env's `[{--profile | -p} path]`
    // pattern: declared in the synopsis, never listed in the OPTIONS
    // body). body entries win on duplicate names because they carry the
    // descriptions.
    let synopsis_flags = sections::extract_synopsis_flags(lines);
    if !synopsis_flags.is_empty() {
        // snapshot of the names the body already provides; long names
        // are compared case-insensitively.
        let mut known_long: std::collections::HashSet<String> = std::collections::HashSet::new();
        let mut known_short: std::collections::HashSet<char> = std::collections::HashSet::new();
        for entry in &entries {
            match &entry.switch {
                OwnedSwitch::Short(short) => {
                    known_short.insert(*short);
                }
                OwnedSwitch::Long(long) => {
                    known_long.insert(long.to_ascii_lowercase());
                }
                OwnedSwitch::Both(short, long) => {
                    known_short.insert(*short);
                    known_long.insert(long.to_ascii_lowercase());
                }
            }
        }
        entries.extend(synopsis_flags.into_iter().filter(|flag| {
            let already_known = match &flag.switch {
                OwnedSwitch::Short(short) => known_short.contains(short),
                OwnedSwitch::Long(long) => known_long.contains(&long.to_ascii_lowercase()),
                OwnedSwitch::Both(short, long) => {
                    known_short.contains(short) || known_long.contains(&long.to_ascii_lowercase())
                }
            };
            !already_known
        }));
    }

    let positionals = sections::extract_synopsis_positionals(lines);

    let commands_section = sections::extract_commands_section(lines);
    let mut subcommands = commands::extract_subcommands_from_commands(&commands_section);
    // subcommand-like names found in the DESCRIPTION are appended unless
    // the COMMANDS section already listed them (case-insensitive).
    for candidate in sections::extract_description_positionals(lines) {
        let already_listed = subcommands
            .iter()
            .any(|known| known.name.eq_ignore_ascii_case(&candidate.name));
        if !already_listed {
            subcommands.push(candidate);
        }
    }

    ManpageResult {
        entries,
        subcommands,
        positionals,
        description: String::new(),
    }
}
/// parse a manpage from its raw string contents.
/// splits into lines, parses, then extracts the NAME section description.
pub fn parse_manpage_string(contents: &str) -> ManpageResult {
let lines: Vec<GroffLine> = contents.split('\n').map(classify_line).collect();
let mut result = parse_manpage_lines(&lines);
if let Some(desc) = sections::extract_name_description(&lines) {
result.description = desc;
}
result
}
/// parse a manpage and additionally split out clap-style
/// `.SH SUBCOMMAND` sections as separate per-subcommand results. each
/// such section in a clap-generated manpage is its own command with its
/// own flags; the parent's subcommand list is populated from their names.
///
/// returns `(main_result, sub_results)`, where every sub result is keyed
/// by the section's full command name ("nh os") and carries the section
/// description plus its own flag entries.
pub fn parse_manpage_with_subs(contents: &str) -> (ManpageResult, Vec<(String, ManpageResult)>) {
    let lines: Vec<GroffLine> = contents.split('\n').map(classify_line).collect();

    let mut main = parse_manpage_lines(&lines);
    if let Some(desc) = sections::extract_name_description(&lines) {
        main.description = desc;
    }

    let clap_sections = sections::extract_subcommand_sections(&lines);
    if !clap_sections.is_empty() {
        // the SUBCOMMAND-section names are the authoritative list for
        // clap-generated manpages, so they replace whatever was found.
        let mut listed = Vec::new();
        for (name, desc, _) in &clap_sections {
            listed.push(ManpageSubcommand {
                name: name.to_ascii_lowercase(),
                desc: desc.clone(),
            });
        }
        main.subcommands = listed;
    }

    // clap puts flag definitions directly under the .SH SUBCOMMAND header
    // with no inner .SH wrapping, so parse_manpage_lines (which looks for
    // a child OPTIONS section) would come back empty. run the same
    // strategy-picker used for the top-level OPTIONS section on each body.
    let mut subs: Vec<(String, ManpageResult)> = Vec::new();
    for (name, desc, body) in clap_sections {
        let sub_result = ManpageResult {
            entries: strategies::extract_entries(&body),
            subcommands: Vec::new(),
            positionals: Default::default(),
            description: desc,
        };
        subs.push((name, sub_result));
    }

    (main, subs)
}
/// read a manpage file from disk as text.
///
/// a path whose extension is exactly `gz` is gunzipped (the common case —
/// most installed manpages are gzipped); any other file is taken as-is.
///
/// # Errors
/// returns the underlying I/O error if the file cannot be read or
/// decompressed, and `InvalidData` if a plain file is not valid UTF-8.
pub fn read_manpage_file<P: AsRef<Path>>(path: P) -> io::Result<String> {
    let path = path.as_ref();
    let raw = std::fs::read(path)?;

    let is_gzip = matches!(path.extension().and_then(|ext| ext.to_str()), Some("gz"));
    if !is_gzip {
        return String::from_utf8(raw).map_err(|err| io::Error::new(io::ErrorKind::InvalidData, err));
    }

    let mut text = String::new();
    let mut decoder = flate2::read::GzDecoder::new(&raw[..]);
    decoder.read_to_string(&mut text)?;
    Ok(text)
}
/// read a manpage file (gzipped or plain) and parse it in one step.
///
/// # Errors
/// propagates any error from `read_manpage_file`.
pub fn parse_manpage_file<P: AsRef<Path>>(path: P) -> io::Result<ManpageResult> {
    read_manpage_file(path).map(|contents| parse_manpage_string(&contents))
}
// smoke tests for the manpage pipeline: one end-to-end parse of a small
// synthetic `.TP`-style page, plus spot checks of mdoc detection and
// groff escape stripping.
#[cfg(test)]
mod tests {
    use super::*;
    // minimal groff manpage: NAME, SYNOPSIS and an OPTIONS section with
    // three `.TP` entries (-v/--verbose, -o/--output FILE, -h/--help).
    const TP_MANPAGE: &str = r#".TH FOO 1 "2024" "1.0" "User Commands"
.SH NAME
foo \- a synthetic test command
.SH SYNOPSIS
.B foo
[\fIOPTIONS\fR] <input> [output]
.SH OPTIONS
.TP
\fB\-v\fR, \fB\-\-verbose\fR
increase output verbosity
.TP
\fB\-o\fR \fIFILE\fR, \fB\-\-output\fR=\fIFILE\fR
write to FILE
.TP
\fB\-h\fR, \fB\-\-help\fR
show this help and exit
"#;
    // all three `.TP` entries are found in source order, and the NAME
    // section supplies the description.
    #[test]
    fn tp_strategy_extracts_flags() {
        let r = parse_manpage_string(TP_MANPAGE);
        assert_eq!(
            r.entries.len(),
            3,
            "expected 3 entries, got {:?}",
            r.entries
        );
        assert_eq!(r.description, "a synthetic test command");
        assert!(matches!(
            r.entries[0].switch,
            OwnedSwitch::Both('v', ref l) if l == "verbose"
        ));
        assert!(matches!(
            r.entries[2].switch,
            OwnedSwitch::Both('h', ref l) if l == "help"
        ));
        assert!(r.entries[0].desc.contains("verbosity"));
    }
    // a page using `.Sh` (lowercase h) section macros is recognised as mdoc.
    #[test]
    fn mdoc_format_detected() {
        let src = ".Sh NAME\n.Nm test\n.Nd a test\n.Sh DESCRIPTION\nstuff\n";
        let lines: Vec<GroffLine> = src.split('\n').map(classify_line).collect();
        assert!(mdoc::is_mdoc(&lines));
    }
    // font escapes are dropped and `\-` becomes a plain hyphen.
    #[test]
    fn groff_escapes_stripped() {
        let stripped = groff::strip_groff_escapes("\\fB\\-v\\fR \\fIfile\\fR");
        assert_eq!(stripped.trim(), "-v file");
    }
}

View file

@ -0,0 +1,157 @@
//! COMMANDS section subcommand extraction.
//!
//! some manpages (notably systemctl) have a dedicated COMMANDS section
//! listing subcommands with descriptions. these use .PP + bold name +
//! .RS/.RE blocks:
//! .PP
//! \fBstart\fR \fIUNIT\fR...
//! .RS 4
//! Start (activate) one or more units.
//! .RE
use crate::parsers::manpage::ManpageSubcommand;
use crate::parsers::manpage::groff::{GroffLine, strip_groff_escapes, strip_inline_macro_args};
/// check that an extracted name looks like a subcommand: at least two
/// bytes long, no leading dash, and made up only of ASCII lowercase
/// letters, ASCII digits, `-` and `_`.
fn is_valid_subcmd(name: &str) -> bool {
    if name.len() < 2 || name.starts_with('-') {
        return false;
    }
    name.chars()
        .all(|c| matches!(c, 'a'..='z' | '0'..='9' | '-' | '_'))
}
/// extract a subcommand name from a tag line such as
/// "\fBlist\-units\fR [\fIPATTERN\fR...]" -> "list-units".
///
/// two input shapes are handled:
/// - text that still starts with a `\fB` font escape: the bold run is
///   everything up to the next font escape (`\f…`), with the remaining
///   escapes stripped;
/// - anything else: the first whitespace-delimited word of the
///   escape-stripped text.
///
/// the candidate is then normalized (section suffix and trailing commas
/// removed) and returned only if it passes `is_valid_subcmd`.
fn extract_bold_command_name(text: &str) -> Option<String> {
    let trimmed = text.trim();
    let candidate = match trimmed.strip_prefix("\\fB") {
        Some(after) => {
            // end the bold run at the next *font* escape rather than at
            // the next backslash of any kind: names routinely contain
            // `\-` (escaped hyphen), and cutting there would turn
            // "list\-units" into "list".
            let bold_end = after.find("\\f").unwrap_or(after.len());
            normalize_command_token(strip_groff_escapes(&after[..bold_end]).trim())
        }
        None => {
            let stripped = strip_groff_escapes(trimmed);
            normalize_command_token(stripped.split_whitespace().next().unwrap_or(""))
        }
    };
    if is_valid_subcmd(&candidate) {
        Some(candidate)
    } else {
        None
    }
}
/// tidy a raw command token: trim surrounding whitespace, drop everything
/// from the first `(` onwards (e.g. a "(1)" section suffix), then remove
/// any trailing commas.
fn normalize_command_token(token: &str) -> String {
    let trimmed = token.trim();
    let head = match trimmed.split_once('(') {
        Some((before_paren, _)) => before_paren,
        None => trimmed,
    };
    head.trim_end_matches(',').to_owned()
}
/// try to read a subcommand name off one classified line.
///
/// text lines are passed straight to `extract_bold_command_name`. font
/// macros (.B, .BI, .BR, .I, .IR, .IB, .RB, .RI) first have their
/// arguments joined and their escapes stripped. every other kind of line
/// yields `None`.
fn extract_command_name_from_line(line: &GroffLine) -> Option<String> {
    const FONT_MACROS: [&str; 8] = ["B", "BI", "BR", "I", "IR", "IB", "RB", "RI"];
    match line {
        GroffLine::Text(tag) => extract_bold_command_name(tag),
        GroffLine::Macro { name, args } if FONT_MACROS.contains(&name.as_str()) => {
            let joined = strip_inline_macro_args(args);
            extract_bold_command_name(&strip_groff_escapes(&joined))
        }
        _ => None,
    }
}
/// scan the lines of a COMMANDS section for `.PP` + tag line +
/// description blocks and return one subcommand per recognised block.
///
/// the line directly after each `.PP` is the candidate tag. if a name can
/// be read from it, the following description is collected and cut down
/// to its first sentence; otherwise the candidate line is skipped and
/// scanning continues after it.
pub fn extract_subcommands_from_commands(lines: &[GroffLine]) -> Vec<ManpageSubcommand> {
    let mut found = Vec::new();
    let mut cursor = 0;
    while cursor < lines.len() {
        let at_paragraph = matches!(&lines[cursor], GroffLine::Macro { name, .. } if name == "PP");
        cursor += 1;
        if !at_paragraph {
            continue;
        }
        // `cursor` now points at the line following the .PP.
        let tag_name = match lines.get(cursor) {
            Some(tag_line) => extract_command_name_from_line(tag_line),
            None => break,
        };
        match tag_name {
            Some(name) => {
                let (desc, resume_at) = collect_subcmd_desc(lines, cursor + 1);
                found.push(ManpageSubcommand {
                    name: name.to_ascii_lowercase(),
                    desc: first_sentence(&desc),
                });
                cursor = resume_at;
            }
            None => cursor += 1,
        }
    }
    found
}
/// gather the description text that follows a subcommand tag line,
/// starting at `start`. returns the text pieces joined with single spaces
/// and the index at which the caller should resume scanning.
///
/// before any `.RS`, text lines are collected and the first line that is
/// neither text nor `.RS` ends the description (it is not consumed).
/// after `.RS`, text lines are collected until the matching `.RE` (which
/// is consumed) or a `.PP`/`.SH`/`.SS` boundary (which is not); any other
/// line inside the block is skipped.
fn collect_subcmd_desc(lines: &[GroffLine], start: usize) -> (String, usize) {
    let mut pieces: Vec<String> = Vec::new();
    let mut idx = start;
    let mut indented = false;
    while let Some(line) = lines.get(idx) {
        match line {
            GroffLine::Text(text) => {
                pieces.push(text.clone());
                idx += 1;
            }
            GroffLine::Macro { name, .. } if !indented && name == "RS" => {
                indented = true;
                idx += 1;
            }
            GroffLine::Macro { name, .. } if indented && name == "RE" => {
                idx += 1;
                break;
            }
            GroffLine::Macro { name, .. }
                if indented && matches!(name.as_str(), "PP" | "SH" | "SS") =>
            {
                break;
            }
            _ if indented => idx += 1,
            _ => break,
        }
    }
    (pieces.join(" "), idx)
}
/// take the first sentence of `s` as a short description.
///
/// a sentence ends at the first period that is followed by whitespace or
/// by the end of the text, so periods inside file names and version
/// numbers ("foo.conf", "v1.2") do not cut the description short. a
/// period at the very start of the text never ends a sentence. the
/// terminating period — and any run of periods directly before it, as in
/// an ellipsis — is not included. when no sentence end is found the whole
/// trimmed text is returned.
///
/// abbreviations followed by a space ("e.g. ") still count as a sentence
/// end; telling those apart is out of scope here.
fn first_sentence(s: &str) -> String {
    let s = s.trim();
    let sentence_end = s.char_indices().find(|&(idx, c)| {
        c == '.'
            && idx > 0
            && s[idx + 1..]
                .chars()
                .next()
                .map_or(true, char::is_whitespace)
    });
    match sentence_end {
        Some((idx, _)) => s[..idx].trim_end_matches('.').trim().to_string(),
        None => s.to_string(),
    }
}

View file

@ -0,0 +1,385 @@
//! groff escape/formatting stripping and line classification.
//!
//! groff escapes start with backslash and use various continuation syntaxes.
//! we strip them, replacing named characters (like \(aq for apostrophe) with
//! their text equivalents and discarding formatting directives.
//!
//! also exports `make_macro_walker!`, the manpage-side analogue of the
//! help parser's `make_parser!`. all of our strategy_* functions are
//! "scan lines, on each .MACRO_NAME run a handler, advance, accumulate"
//! — this macro factors out the loop scaffolding so each strategy reduces
//! to its specific extraction logic.
/// walk a `&[GroffLine]` slice, and on each macro whose name matches
/// `$mname`, invoke the body with `(lines, i, args)` where:
/// - `lines` is the full slice (for slicing further bodies)
/// - `i` is the current index of the matched macro
/// - `args` is the macro's argument string (by reference)
///
/// the body returns `Option<(T, usize)>`. `Some((value, new_i))` pushes
/// `value` and advances the cursor to `new_i` (typically computed as
/// `lines.len() - rest.len()` after `collect_text_lines`). `None`
/// advances by one line and keeps scanning.
///
/// the cursor is set to `new_i` as-is, so a handler must return an index
/// greater than `i`; returning `i` (or less) would revisit the same macro
/// forever.
///
/// only the `pub` form shown below is accepted; it expands to
/// `pub fn $name(lines_input: &[GroffLine]) -> Vec<$t>`.
///
/// matches the help-parser pattern `make_parser!(name -> T, parser => wrap)`:
/// the macro hides the loop scaffolding, the handler expresses the actual
/// extraction logic.
#[macro_export]
macro_rules! make_macro_walker {
    (pub $name:ident -> Vec<$t:ty>, on macro $mname:expr =>
        |$lines:ident, $i:ident, $args:ident| $body:expr) => {
        pub fn $name(lines_input: &[$crate::parsers::manpage::GroffLine]) -> Vec<$t> {
            let mut out = Vec::new();
            let mut cursor = 0;
            // rebind the input under the caller-chosen identifier so the
            // handler body can refer to it by that name.
            let $lines: &[$crate::parsers::manpage::GroffLine] = lines_input;
            while cursor < $lines.len() {
                if let $crate::parsers::manpage::GroffLine::Macro {
                    name: macro_name,
                    args: $args,
                } = &$lines[cursor]
                {
                    if macro_name == $mname {
                        let $i = cursor;
                        // wrap the handler body in an IIFE so an early
                        // `return None` inside the handler returns from the
                        // closure, not from the surrounding strategy function.
                        #[allow(clippy::redundant_closure_call)]
                        let result: Option<($t, usize)> = (|| $body)();
                        if let Some((value, new_i)) = result {
                            out.push(value);
                            cursor = new_i;
                            continue;
                        }
                    }
                }
                // non-matching line, or the handler declined: step past it.
                cursor += 1;
            }
            out
        }
    };
}
/// every line in a manpage is classified as one of four types.
/// this classification drives all subsequent parsing — strategies
/// pattern-match on sequences of classified lines.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum GroffLine {
    /// macro name + args, e.g. ("SH", "OPTIONS") or ("TP", "").
    /// `name` excludes the leading control character; `args` is the
    /// trimmed remainder of the line (escapes are NOT stripped here).
    Macro { name: String, args: String },
    /// plain text after groff stripping; never empty when produced by
    /// `classify_line` (empty results become `Blank`).
    Text(String),
    /// empty line, or a text line that is empty once escapes are stripped
    Blank,
    /// groff comment: .backslash-quote or backslash-quote
    Comment,
}
/// translate a groff named character escape to its text equivalent.
/// groff uses two-letter codes like "aq" for apostrophe, "lq"/"rq" for
/// left/right quotes, "em"/"en" for dashes. names not listed here yield
/// `None` (the caller drops them). every character returned is ASCII.
fn named_char_of(name: &str) -> Option<char> {
    match name {
        "aq" => Some('\''),
        "lq" | "Lq" | "rq" | "Rq" => Some('"'),
        "em" | "en" => Some('-'),
        _ => None,
    }
}

/// true for ASCII letters and digits.
fn is_alnum(c: u8) -> bool {
    c.is_ascii_alphanumeric()
}

/// strip groff escape sequences, replacing named characters with text
/// equivalents and discarding formatting directives.
///
/// the input is scanned byte-wise. escapes are skipped with fixed byte
/// widths, which can leave the cursor in the middle of a multi-byte UTF-8
/// character (e.g. a backslash directly followed by a non-ASCII
/// character). after every escape the cursor is therefore moved forward
/// to the next character boundary, so such a character is dropped as part
/// of the escape instead of causing a slicing panic.
pub fn strip_groff_escapes(source: &str) -> String {
    let bytes = source.as_bytes();
    let len = bytes.len();
    let mut buffer = String::with_capacity(len);
    let mut pos = 0;
    // last byte emitted (0 when it was non-ASCII or nothing was emitted);
    // only used to decide whether an italic switch needs a separating space.
    let mut prev_char: u8 = 0;
    while pos < len {
        if bytes[pos] == b'\\' && pos + 1 < len {
            let next = bytes[pos + 1];
            match next {
                b'f' => {
                    // font escape: \fB, \fI, \fP, \fR, \f(XX, \f[...]
                    if pos + 2 < len {
                        let font_char = bytes[pos + 2];
                        // insert space before italic font to preserve word boundaries
                        // e.g. \fB--max-results\fR\fIcount\fR -> "--max-results count"
                        if font_char == b'I' && is_alnum(prev_char) {
                            buffer.push(' ');
                            prev_char = b' ';
                        }
                        if font_char == b'(' {
                            pos += 5; // \f(XX — two-character font name
                        } else if font_char == b'[' {
                            pos += 3;
                            skip_to_byte(bytes, len, &mut pos, b']');
                            if pos < len {
                                pos += 1;
                            }
                        } else {
                            pos += 3; // \fX — single-character font selector
                        }
                    } else {
                        pos += 2;
                    }
                }
                b'-' => {
                    // escaped hyphen-minus — emit a plain hyphen
                    buffer.push('-');
                    prev_char = b'-';
                    pos += 2;
                }
                b'&' | b'/' | b',' => {
                    // zero-width characters — discard without output
                    pos += 2;
                }
                b'(' => {
                    // two-char named character: \(aq, \(lq, \(rq, etc.
                    if pos + 3 < len {
                        // checked slice: a multi-byte character inside the
                        // two name bytes would put the slice end off a
                        // character boundary.
                        if let Some(c) = source.get(pos + 2..pos + 4).and_then(named_char_of) {
                            buffer.push(c);
                            prev_char = c as u8;
                        }
                        pos += 4;
                    } else {
                        pos += 2;
                    }
                }
                b'[' => {
                    // bracketed named character: \[aq], \[lq], etc.
                    pos += 2;
                    let start = pos;
                    skip_to_byte(bytes, len, &mut pos, b']');
                    if pos < len {
                        // both ends sit next to ASCII bytes, so this
                        // slice is always on character boundaries.
                        let name = &source[start..pos];
                        if let Some(c) = named_char_of(name) {
                            buffer.push(c);
                            prev_char = c as u8;
                        }
                        pos += 1;
                    }
                }
                b's' => {
                    // size escape: \sN, \s+N, \s-N — skip the numeric argument
                    pos += 2;
                    if pos < len && (bytes[pos] == b'+' || bytes[pos] == b'-') {
                        pos += 1;
                    }
                    if pos < len && bytes[pos].is_ascii_digit() {
                        pos += 1;
                    }
                    if pos < len && bytes[pos].is_ascii_digit() {
                        pos += 1;
                    }
                }
                b'm' => {
                    // color escape: \m[...] — skip the bracketed color name
                    pos += 2;
                    if pos < len && bytes[pos] == b'[' {
                        pos += 1;
                        skip_to_byte(bytes, len, &mut pos, b']');
                        if pos < len {
                            pos += 1;
                        }
                    }
                }
                b'X' => {
                    // device control: \X'...' — skip the single-quoted payload
                    pos += 2;
                    if pos < len && bytes[pos] == b'\'' {
                        pos += 1;
                        skip_to_byte(bytes, len, &mut pos, b'\'');
                        if pos < len {
                            pos += 1;
                        }
                    }
                }
                b'*' => {
                    // string variable: \*X or \*(XX or \*[...] — skip the reference
                    pos += 2;
                    skip_groff_reference(bytes, len, &mut pos);
                }
                b'n' => {
                    // number register: \nX or \n(XX or \n[...] — skip the reference
                    pos += 2;
                    skip_groff_reference(bytes, len, &mut pos);
                }
                b'e' => {
                    // escaped backslash literal
                    buffer.push('\\');
                    prev_char = b'\\';
                    pos += 2;
                }
                b'\\' => {
                    // double backslash — emit one
                    buffer.push('\\');
                    prev_char = b'\\';
                    pos += 2;
                }
                b' ' | b'~' => {
                    // escaped/non-breaking space — emit a regular space
                    buffer.push(' ');
                    prev_char = b' ';
                    pos += 2;
                }
                _ => {
                    // unknown escape — skip the two-character sequence
                    pos += 2;
                }
            }
            // the fixed-width skips above count bytes, not characters:
            // realign to the next character boundary so the plain-text
            // branch below never slices inside a multi-byte character.
            while pos < len && !source.is_char_boundary(pos) {
                pos += 1;
            }
        } else {
            // copy a full utf-8 char from source to buffer
            let c = source[pos..]
                .chars()
                .next()
                .expect("pos is below len and on a char boundary");
            buffer.push(c);
            prev_char = if c.is_ascii() { c as u8 } else { 0 };
            pos += c.len_utf8();
        }
    }
    buffer
}

/// advance `pos` until it reaches `len` or points at the byte `delim`.
/// the delimiter itself is not consumed.
fn skip_to_byte(bytes: &[u8], len: usize, pos: &mut usize, delim: u8) {
    while *pos < len && bytes[*pos] != delim {
        *pos += 1;
    }
}

/// skip a groff reference that uses one of three sub-forms:
///   single char  — e.g. \*X or \nX
///   ( + 2 chars  — e.g. \*(XX or \n(XX
///   [ to ]       — e.g. \*[name] or \n[name]
///
/// widths are counted in bytes, so `pos` may end up past `len` or inside
/// a multi-byte character; the caller is responsible for realigning it.
fn skip_groff_reference(bytes: &[u8], len: usize, pos: &mut usize) {
    if *pos < len {
        if bytes[*pos] == b'(' {
            *pos += 3; // skip past '(' + two-character name
        } else if bytes[*pos] == b'[' {
            *pos += 1;
            skip_to_byte(bytes, len, pos, b']');
            if *pos < len {
                *pos += 1;
            }
        } else {
            *pos += 1;
        }
    }
}
/// join the arguments of an alternating-font macro (.BI, .BR, .IR, ...)
/// into the text they render as.
///
/// double quotes delimit an argument and are removed; everything between
/// a pair of quotes is kept verbatim, including spaces. spaces and tabs
/// outside quotes only separate arguments and are dropped, so the
/// arguments end up concatenated:
///   .BI "--output " "FILE"   ->  "--output FILE"
///   .BI -o FILE              ->  "-oFILE"
/// an unterminated quote keeps the rest of the line verbatim.
pub fn strip_inline_macro_args(text: &str) -> String {
    let mut joined = String::with_capacity(text.len());
    let mut in_quotes = false;
    for ch in text.chars() {
        match ch {
            '"' => in_quotes = !in_quotes,
            ' ' | '\t' if !in_quotes => {}
            _ => joined.push(ch),
        }
    }
    joined
}
/// render the arguments of a same-font macro (.B/.I), where arguments
/// stay separated by their spaces. double quotes only group arguments in
/// roff source, so they are removed before escapes are stripped; the
/// result is trimmed.
pub fn strip_space_macro_args(text: &str) -> String {
    let unquoted: String = text.chars().filter(|&c| c != '"').collect();
    let rendered = strip_groff_escapes(&unquoted);
    rendered.trim().to_owned()
}
/// strip groff escapes from `line`, then trim surrounding whitespace.
pub fn strip_groff(line: &str) -> String {
    let rendered = strip_groff_escapes(line);
    String::from(rendered.trim())
}
/// true when the line begins with one of the two groff comment
/// introducers: dot-backslash-quote or backslash-quote. the match is on
/// the very first bytes, so an indented comment does not count.
fn is_comment_line(line: &str) -> bool {
    [".\\\"", "\\\""]
        .iter()
        .any(|introducer| line.starts_with(introducer))
}
/// classify a single line of manpage source.
///
/// - comment introducers, and a line that is exactly `.\`, are comments.
/// - an empty line is blank.
/// - a line starting with `.` or `'` (groff's alternate control
///   character) is a macro: the name runs up to the first space or tab,
///   the trimmed remainder is the argument string. if that remainder is a
///   single double-quoted string with no inner quotes, the quotes are
///   removed.
/// - anything else is text, with escapes stripped and whitespace trimmed;
///   if nothing is left the line counts as blank.
pub fn classify_line(line: &str) -> GroffLine {
    // the bare `.\` form is not covered by is_comment_line, which needs
    // the quote character to be present.
    if is_comment_line(line) || line == ".\\" {
        return GroffLine::Comment;
    }
    if line.is_empty() {
        return GroffLine::Blank;
    }
    if !line.starts_with(['.', '\'']) {
        let stripped = strip_groff(line);
        return if stripped.is_empty() {
            GroffLine::Blank
        } else {
            GroffLine::Text(stripped)
        };
    }

    // macro line — the control character is one ASCII byte.
    let rest = line[1..].trim();
    match rest.split_once([' ', '\t']) {
        None => GroffLine::Macro {
            name: rest.to_string(),
            args: String::new(),
        },
        Some((name, tail)) => {
            let tail = tail.trim();
            // unwrap `"..."` only when the quotes enclose the whole
            // argument string and nothing inside is quoted again.
            let args = match tail.strip_prefix('"').and_then(|t| t.strip_suffix('"')) {
                Some(inner) if !inner.contains('"') => inner,
                _ => tail,
            };
            GroffLine::Macro {
                name: name.to_string(),
                args: args.to_string(),
            }
        }
    }
}

237
src/parsers/manpage/mdoc.rs Normal file
View file

@ -0,0 +1,237 @@
//! BSD mdoc format support.
//!
//! mdoc is the bsd manpage macro package. it uses semantic macros rather than
//! presentation macros:
//! .Fl v -> flag: -v
//! .Ar file -> argument: file
//! .Op ... -> optional: [...]
//! .Bl/.It/.El -> list begin/item/end
//! .Sh -> section header (note lowercase 'h', vs groff's .SH)
use crate::parsers::manpage::groff::{GroffLine, strip_groff_escapes};
use crate::parsers::manpage::{ManpageEntry, ManpageResult, OwnedParam, OwnedSwitch};
use crate::types::Positional;
/// report whether the page is in mdoc format, i.e. whether any line is a
/// `.Sh` macro (capital S, lowercase h).
pub fn is_mdoc(lines: &[GroffLine]) -> bool {
    for line in lines {
        if let GroffLine::Macro { name, .. } = line {
            if name == "Sh" {
                return true;
            }
        }
    }
    false
}
/// pull the renderable text out of one mdoc line.
///
/// text lines are returned with escapes stripped (untrimmed). macros
/// contribute their escape-stripped, trimmed arguments, unless the macro
/// is one of the structural ones listed below or its arguments come out
/// empty. blank and comment lines yield `None`.
fn mdoc_text_of(line: &GroffLine) -> Option<String> {
    const STRUCTURAL: [&str; 11] = [
        "Pp", "Bl", "El", "Sh", "Ss", "Os", "Dd", "Dt", "Oo", "Oc", "Op",
    ];
    let (name, args) = match line {
        GroffLine::Text(text) => return Some(strip_groff_escapes(text)),
        GroffLine::Macro { name, args } => (name, args),
        _ => return None,
    };
    if STRUCTURAL.contains(&name.as_str()) {
        return None;
    }
    let rendered = strip_groff_escapes(args);
    let rendered = rendered.trim();
    if rendered.is_empty() {
        None
    } else {
        Some(rendered.to_string())
    }
}
/// parse the arguments of an mdoc `.It` list item that declares a flag,
/// e.g. "Fl v Ar file" (Fl = flag, Ar = argument).
///
/// words are split on single spaces; empty words and the `Ns` (no-space)
/// macro are ignored. the item must start with `Fl`:
/// - `Fl x`, with `x` a single ASCII letter or digit, is a short flag;
/// - `Fl -name` is a long flag named `name`.
/// if the third word is `Ar`, the fourth word is recorded as a mandatory
/// parameter. the description is left empty for the caller to fill in.
fn parse_mdoc_it(args: &str) -> Option<ManpageEntry> {
    let words: Vec<&str> = args
        .split(' ')
        .filter(|word| !word.is_empty() && *word != "Ns")
        .collect();

    let param = if words.len() >= 4 && words[2] == "Ar" {
        Some(OwnedParam::Mandatory(words[3].to_string()))
    } else {
        None
    };

    if words.first() != Some(&"Fl") {
        return None;
    }
    let flag = *words.get(1)?;
    let switch = if flag.len() == 1 {
        // one byte long, hence a single ASCII character
        let ch = flag.chars().next()?;
        if !ch.is_ascii_alphanumeric() {
            return None;
        }
        OwnedSwitch::Short(ch)
    } else if flag.starts_with('-') {
        // words are never empty, so this one is at least two bytes long
        OwnedSwitch::Long(flag[1..].to_string())
    } else {
        return None;
    };

    Some(ManpageEntry {
        switch,
        param,
        desc: String::new(),
    })
}
/// read a positional argument from the arguments of an mdoc `.Ar` macro
/// (or from what follows `Ar` inside `.Op`).
///
/// returns the first space-separated word, ASCII-lowercased, together
/// with a flag telling whether any word is the literal "..." (variadic).
/// yields `None` when there is no word or the first word is shorter than
/// two bytes.
fn positional_of_mdoc_line(args: &str) -> Option<(String, bool)> {
    let mut tokens = args.split(' ').filter(|tok| !tok.is_empty()).peekable();
    let head = *tokens.peek()?;
    if head.len() < 2 {
        return None;
    }
    // `peek` did not consume the head, so it takes part in this scan too.
    let variadic = tokens.any(|tok| tok == "...");
    Some((head.to_ascii_lowercase(), variadic))
}
/// parse an entire mdoc-format manpage.
/// walks through all classified lines looking for:
///   1. .Bl/.It/.El list blocks containing flag definitions
///   2. .Sh SYNOPSIS sections containing positional arguments (.Ar, .Op Ar)
///
/// the result carries flag entries and de-duplicated positionals;
/// `subcommands` and `description` are always left empty here.
pub fn parse_mdoc_lines(lines: &[GroffLine]) -> ManpageResult {
    // collect description for an entry — until next structural macro.
    // returns the joined, trimmed text and the index of the line that
    // stopped the scan (.It/.El/.Sh/.Ss) or lines.len().
    fn desc_of(lines: &[GroffLine], start: usize) -> (String, usize) {
        let mut acc: Vec<String> = Vec::new();
        let mut i = start;
        while i < lines.len() {
            if let GroffLine::Macro { name, .. } = &lines[i]
                && matches!(name.as_str(), "It" | "El" | "Sh" | "Ss")
            {
                break;
            }
            if let Some(t) = mdoc_text_of(&lines[i]) {
                acc.push(t);
            }
            i += 1;
        }
        (acc.join(" ").trim().to_string(), i)
    }
    // index just past the next .El at or after `start`, or lines.len()
    // when the list is never closed.
    fn skip_to_el(lines: &[GroffLine], start: usize) -> usize {
        let mut i = start;
        while i < lines.len() {
            if let GroffLine::Macro { name, .. } = &lines[i]
                && name == "El"
            {
                return i + 1;
            }
            i += 1;
        }
        i
    }
    /// parse a single .It entry: extract flag, collect description.
    /// the description lines are consumed even when `args` does not parse
    /// as a flag (the entry is then dropped). returns the index at which
    /// scanning should resume.
    fn parse_it(
        args: &str,
        lines: &[GroffLine],
        start: usize,
        entries: &mut Vec<ManpageEntry>,
    ) -> usize {
        let (desc, new_start) = desc_of(lines, start);
        if let Some(mut entry) = parse_mdoc_it(args) {
            entry.desc = desc;
            entries.push(entry);
        }
        new_start
    }
    /// parse all .It entries within a .Bl/.El option list.
    /// returns the index just past the closing .El, or lines.len().
    fn parse_option_list(
        entries: &mut Vec<ManpageEntry>,
        lines: &[GroffLine],
        start: usize,
    ) -> usize {
        let mut i = start;
        while i < lines.len() {
            match &lines[i] {
                GroffLine::Macro { name, .. } if name == "El" => return i + 1,
                GroffLine::Macro { name, args } if name == "It" => {
                    i = parse_it(args, lines, i + 1, entries);
                }
                _ => i += 1,
            }
        }
        i
    }
    // scan the SYNOPSIS body for positionals, recording
    // (name, optional, variadic). stops at the next .Sh and returns its
    // index (not consumed), or lines.len().
    fn parse_synopsis(
        positionals: &mut Vec<(String, bool, bool)>,
        lines: &[GroffLine],
        start: usize,
    ) -> usize {
        let mut i = start;
        while i < lines.len() {
            match &lines[i] {
                GroffLine::Macro { name, .. } if name == "Sh" => return i,
                // bare .Ar — a required positional
                GroffLine::Macro { name, args } if name == "Ar" => {
                    if let Some((n, v)) = positional_of_mdoc_line(args) {
                        positionals.push((n, false, v));
                    }
                    i += 1;
                }
                // .Op Ar ... — an optional positional; other .Op forms
                // are ignored
                GroffLine::Macro { name, args } if name == "Op" => {
                    let words: Vec<&str> = args.split(' ').filter(|w| !w.is_empty()).collect();
                    if matches!(words.first(), Some(&"Ar")) {
                        // drop the leading "Ar " (3 bytes) to get the
                        // argument words; assumes args has no leading
                        // whitespace, as produced by classify_line
                        let rest = if args.len() > 3 { &args[3..] } else { "" };
                        if let Some((n, v)) = positional_of_mdoc_line(rest) {
                            positionals.push((n, true, v));
                        }
                    }
                    i += 1;
                }
                _ => i += 1,
            }
        }
        i
    }
    let mut entries: Vec<ManpageEntry> = Vec::new();
    // (name, optional, variadic), in order of appearance
    let mut positionals: Vec<(String, bool, bool)> = Vec::new();
    let mut i = 0;
    while i < lines.len() {
        // .Bl + .It header sequence — peek at first .It to decide if this is a flag list
        if let GroffLine::Macro { name: n1, .. } = &lines[i]
            && n1 == "Bl"
        {
            let j = i + 1;
            if j < lines.len()
                && let GroffLine::Macro {
                    name: n2,
                    args: it_args,
                } = &lines[j]
                && n2 == "It"
            {
                let words: Vec<&str> = it_args.split(' ').filter(|w| !w.is_empty()).collect();
                if matches!(words.first(), Some(&"Fl")) {
                    // flag list: parse the peeked item, then the rest
                    let k = parse_it(it_args, lines, j + 1, &mut entries);
                    i = parse_option_list(&mut entries, lines, k);
                    continue;
                } else {
                    // first item is not a flag: skip the whole list
                    i = skip_to_el(lines, j + 1);
                    continue;
                }
            }
            // .Bl not directly followed by .It: skip the whole list
            i = skip_to_el(lines, j);
            continue;
        }
        if let GroffLine::Macro { name, args } = &lines[i]
            && name == "Sh"
            && args.trim().eq_ignore_ascii_case("SYNOPSIS")
        {
            i = parse_synopsis(&mut positionals, lines, i + 1);
            continue;
        }
        i += 1;
    }
    // deduplicate positionals by name, preserving first-seen order
    // (the first occurrence's optional/variadic flags win)
    let mut seen: Vec<String> = Vec::new();
    let mut deduped: Vec<(String, Positional)> = Vec::new();
    for (name, optional, variadic) in positionals {
        if !seen.contains(&name) {
            seen.push(name.clone());
            deduped.push((name, Positional { optional, variadic }));
        }
    }
    ManpageResult {
        entries,
        subcommands: Vec::new(),
        positionals: deduped,
        description: String::new(),
    }
}

View file

@ -0,0 +1,851 @@
//! section extraction from manpages.
//!
//! manpages are divided into sections by .SH macros. we extract OPTIONS,
//! NAME, SYNOPSIS, and COMMANDS sections for their specific content.
use nom::{Parser, sequence::preceded};
use crate::parsers::help::{parse_usage_args, parse_usage_flags, skip_command_name};
use crate::parsers::manpage::groff::{
GroffLine, strip_groff_escapes, strip_inline_macro_args, strip_space_macro_args,
};
use crate::parsers::manpage::{ManpageEntry, ManpageSubcommand, OwnedParam, OwnedSwitch};
use crate::types::{Param, Positional, Switch};
/// true when a section heading mentions options: the heading contains
/// "OPTION" in any letter case ("OPTIONS", "Common Options", ...).
fn is_options_section(name: &str) -> bool {
    name.trim().to_ascii_uppercase().contains("OPTION")
}
/// extract the lines from the OPTIONS section(s). collects from all
/// option-like .SH sections and concatenates them (handles the nix pattern
/// of "Options" and "Common Options" being separate sections).
/// falls back to DESCRIPTION if no OPTIONS section exists.
pub fn extract_options_section(lines: &[GroffLine]) -> Vec<GroffLine> {
let mut acc: Vec<GroffLine> = Vec::new();
let mut i = 0;
while i < lines.len() {
if let GroffLine::Macro { name, args } = &lines[i]
&& name == "SH"
&& is_options_section(args)
{
i += 1;
// synthetic separator between concatenated sections so that
// collect_desc_text (which stops on SH/SS) does not let descriptions
// bleed between sections.
if !acc.is_empty() {
acc.push(GroffLine::Macro {
name: "SH".to_string(),
args: String::new(),
});
}
while i < lines.len() {
if let GroffLine::Macro { name, .. } = &lines[i]
&& name == "SH"
{
break;
}
acc.push(lines[i].clone());
i += 1;
}
} else {
i += 1;
}
}
if !acc.is_empty() {
return acc;
}
// fallback: DESCRIPTION section
let mut i = 0;
while i < lines.len() {
if let GroffLine::Macro { name, args } = &lines[i]
&& name == "SH"
&& args.trim().eq_ignore_ascii_case("DESCRIPTION")
{
i += 1;
let mut desc_acc: Vec<GroffLine> = Vec::new();
while i < lines.len() {
if let GroffLine::Macro { name, .. } = &lines[i]
&& name == "SH"
{
break;
}
desc_acc.push(lines[i].clone());
i += 1;
}
return desc_acc;
}
i += 1;
}
Vec::new()
}
/// return the body of the first `.SH` section whose heading equals
/// `section_name` (ASCII case-insensitive, surrounding whitespace
/// ignored). the body runs up to, but not including, the next `.SH`.
/// returns an empty vector when no such section exists.
fn extract_named_section(lines: &[GroffLine], section_name: &str) -> Vec<GroffLine> {
    fn is_section_header(line: &GroffLine) -> bool {
        matches!(line, GroffLine::Macro { name, .. } if name == "SH")
    }

    let header = lines.iter().position(|line| {
        matches!(
            line,
            GroffLine::Macro { name, args }
                if name == "SH" && args.trim().eq_ignore_ascii_case(section_name)
        )
    });
    match header {
        Some(at) => lines[at + 1..]
            .iter()
            .take_while(|line| !is_section_header(line))
            .cloned()
            .collect(),
        None => Vec::new(),
    }
}
/// extract the one-line description from the NAME section, which follows
/// the convention "command \- short description".
///
/// only the first `.SH NAME` section is considered. its text lines and
/// the rendered arguments of font macros (.B/.BI/.BR/.I/.IR) and of the
/// mdoc macros `.Nm`/`.Nd` are joined with spaces; `.Nd` text is prefixed
/// with the "\-" separator. the part after the separator ("\-" or " - ")
/// is returned. `None` when there is no NAME section, no separator, or
/// nothing after the separator.
pub fn extract_name_description(lines: &[GroffLine]) -> Option<String> {
    fn is_section_header(line: &GroffLine) -> bool {
        matches!(line, GroffLine::Macro { name, .. } if name == "SH")
    }

    let header = lines.iter().position(|line| {
        matches!(
            line,
            GroffLine::Macro { name, args }
                if name == "SH" && args.trim().eq_ignore_ascii_case("NAME")
        )
    })?;

    let mut pieces: Vec<String> = Vec::new();
    for line in lines[header + 1..]
        .iter()
        .take_while(|line| !is_section_header(line))
    {
        match line {
            GroffLine::Text(text) => pieces.push(text.clone()),
            GroffLine::Macro { name, args } => {
                let rendered = match name.as_str() {
                    "B" | "BI" | "BR" | "I" | "IR" => {
                        strip_groff_escapes(&strip_inline_macro_args(args))
                    }
                    "Nm" | "Nd" => strip_groff_escapes(args),
                    _ => continue,
                };
                let rendered = rendered.trim();
                if rendered.is_empty() {
                    continue;
                }
                if name == "Nd" {
                    // mdoc keeps the description in its own macro; add
                    // the separator the splitter looks for.
                    pieces.push(format!("\\- {rendered}"));
                } else {
                    pieces.push(rendered.to_string());
                }
            }
            _ => {}
        }
    }

    let full = pieces.join(" ").trim().to_string();
    split_name_separator(&full)
}
/// split a NAME line at its separator and return the trimmed text after
/// it. two separators are recognised — "\-" (groff) and " - " (plain) —
/// and whichever occurs first in the line wins. `None` when neither is
/// present or nothing but whitespace follows the separator.
fn split_name_separator(full: &str) -> Option<String> {
    const SEPARATORS: [&str; 2] = ["\\-", " - "];
    let (at, width) = SEPARATORS
        .iter()
        .filter_map(|sep| find_padded(full, sep).map(|at| (at, sep.len())))
        .min_by_key(|&(at, _)| at)?;
    let desc = full[at + width..].trim();
    if desc.is_empty() {
        None
    } else {
        Some(desc.to_string())
    }
}

/// locate `needle` in `s`. this is a plain substring search: whatever
/// spacing the separator needs must be part of `needle` itself, and no
/// additional padding is checked.
fn find_padded(s: &str, needle: &str) -> Option<usize> {
    s.find(needle)
}
/// extract the command name from the SYNOPSIS section.
///
/// the SYNOPSIS section shows how to invoke the command:
/// .SH SYNOPSIS
/// .B git add
/// [\fIOPTIONS\fR] [\fB\-\-\fR] [\fI<pathspec>\fR...]
///
/// we extract the command name by taking consecutive "word" tokens until
/// we hit something that looks like an argument (starts with [, <, -, etc.).
///
/// only the first SYNOPSIS heading is examined: the first line in it that
/// yields a command candidate wins, and `None` is returned if the section
/// ends (next .SH, or .SS when the heading itself was .SS) or the input
/// runs out before any candidate is found.
pub fn extract_synopsis_command(contents: &str) -> Option<String> {
// pre-replace italic text (\fI...\fR) with angle-bracketed placeholders
// before classification strips the font info. italic in groff indicates
// a parameter/placeholder (e.g. \fIoperation\fR), not a command word.
// the angle brackets cause extract_cmd to stop at these tokens since
// '<' is in its stop set.
let preprocessed: Vec<String> = contents
.split('\n')
.map(replace_italic_with_angles)
.collect();
let classified: Vec<GroffLine> = preprocessed
.iter()
.map(|line| crate::parsers::manpage::groff::classify_line(line))
.collect();
let mut i = 0;
while i < classified.len() {
// synopsis_heading_at tells us whether an .SS should end the section
// and where the section's content starts.
if let Some((stop_on_ss, content_start)) = synopsis_heading_at(&classified, i) {
i = content_start;
while i < classified.len() {
match &classified[i] {
// next section heading: the synopsis held no usable command.
GroffLine::Macro { name, .. }
if name == "SH" || (stop_on_ss && name == "SS") =>
{
return None;
}
// plain text: long runs of words without invocation markers are
// rejected as prose (second argument `true`).
GroffLine::Text(text) => {
let trimmed = text.trim();
if let Some(cmd) = synopsis_command_candidate(trimmed, true) {
return Some(cmd);
}
i += 1;
}
// .SY carries its text as macro arguments.
GroffLine::Macro { name, args } if name == "SY" => {
let text = strip_groff_escapes(args);
if let Some(cmd) = synopsis_command_candidate(text.trim(), false) {
return Some(cmd);
}
i += 1;
}
// bold-led font macros are rendered to plain text first.
GroffLine::Macro { name, args }
if matches!(name.as_str(), "B" | "BI" | "BR") =>
{
let text = render_synopsis_command_macro(name, args);
if let Some(cmd) = synopsis_command_candidate(text.trim(), false) {
return Some(cmd);
}
i += 1;
}
_ => i += 1,
}
}
// ran off the end of the input inside the synopsis section.
return None;
}
i += 1;
}
None
}
/// decide whether `lines[i]` opens a SYNOPSIS section.
///
/// returns `(stop_on_ss, content_start)`: `stop_on_ss` is true when the
/// heading is an `.SS`, and `content_start` is the index of the first line
/// after the heading title. the title may be given as the macro argument or,
/// for a bare `.SH`/`.SS`, as the next text line (blank and comment lines in
/// between are skipped). `i` must be a valid index into `lines`.
fn synopsis_heading_at(lines: &[GroffLine], i: usize) -> Option<(bool, usize)> {
    let (name, args) = match &lines[i] {
        GroffLine::Macro { name, args } if matches!(name.as_str(), "SH" | "SS") => (name, args),
        _ => return None,
    };
    let is_subsection = name == "SS";
    let title = args.trim();
    if title.eq_ignore_ascii_case("SYNOPSIS") {
        return Some((is_subsection, i + 1));
    }
    if !title.is_empty() {
        return None;
    }
    // bare heading macro: the title is expected on a following text line
    for (offset, line) in lines[i + 1..].iter().enumerate() {
        match line {
            GroffLine::Text(text) if text.trim().eq_ignore_ascii_case("SYNOPSIS") => {
                return Some((is_subsection, i + offset + 2));
            }
            GroffLine::Blank | GroffLine::Comment => {}
            _ => return None,
        }
    }
    None
}
/// render a font macro's arguments to plain text for command detection.
/// `.B` and `.I` go through `strip_space_macro_args`; every other macro name
/// goes through `strip_inline_macro_args`, then has groff escapes stripped
/// and is trimmed.
fn render_synopsis_command_macro(name: &str, args: &str) -> String {
    if matches!(name, "B" | "I") {
        return strip_space_macro_args(args);
    }
    let joined = strip_inline_macro_args(args);
    strip_groff_escapes(&joined).trim().to_string()
}
/// try to read a command name off one synopsis line.
/// rejects empty lines, lines ending in ':', commands starting with '.',
/// and anything `looks_like_synopsis_prose` classifies as prose.
fn synopsis_command_candidate(line: &str, reject_long_unmarked: bool) -> Option<String> {
    let line = line.trim();
    if line.is_empty() || line.ends_with(':') {
        return None;
    }
    extract_cmd(line)
        .filter(|cmd| !cmd.starts_with('.'))
        .filter(|cmd| !looks_like_synopsis_prose(line, cmd, reject_long_unmarked))
}
/// heuristic: does this synopsis line read like a sentence rather than an
/// invocation?
///
/// `cmd` is the command candidate already extracted from `line`. a line is
/// prose when the candidate is empty or starts with a common article or
/// conjunction. otherwise, a line carrying an invocation marker (a word
/// starting with `[`, `<`, `-` or `{`, or any `|`) is never prose. an
/// unmarked line is prose when it ends with '.', when
/// `reject_long_unmarked` is set and the candidate has more than three
/// words, or when it has several words and opens with a Capitalised word.
fn looks_like_synopsis_prose(line: &str, cmd: &str, reject_long_unmarked: bool) -> bool {
    const PROSE_OPENERS: [&str; 7] = ["a", "an", "and", "or", "the", "this", "these"];
    let first = match cmd.split_whitespace().next() {
        Some(word) => word,
        None => return true,
    };
    if PROSE_OPENERS.contains(&first.to_ascii_lowercase().as_str()) {
        return true;
    }
    let marked = line.contains('|')
        || line
            .split_whitespace()
            .any(|word| word.starts_with(&['[', '<', '-', '{'][..]));
    // every remaining prose test only applies to unmarked lines
    if marked {
        return false;
    }
    if line.ends_with('.') {
        return true;
    }
    if reject_long_unmarked && cmd.split_whitespace().count() > 3 {
        return true;
    }
    let mut letters = first.chars();
    let capitalised = letters.next().is_some_and(|c| c.is_ascii_uppercase())
        && letters.all(|c| c.is_ascii_lowercase());
    capitalised && line.split_whitespace().count() > 1
}
/// replace \fI...\f[RP] sequences with <...> so italic params are seen as
/// non-word tokens by extract_cmd.
///
/// exception: some manpages put the command name itself in italics (e.g.
/// git-am.1's synopsis reads `\fIgit am\fR ...`). when the first italic
/// block on the line appears at the very start (preceded only by
/// whitespace) and its content looks like a command word, we strip the
/// font markers but leave the content bare so extract_cmd treats it as
/// the command name rather than a placeholder.
///
/// the italic run may itself contain non-font escapes such as `\-`
/// (`\fIfile\-name\fR`); those are kept as part of the content. an italic
/// run that is followed by some other font change (e.g. `\fB`) or that is
/// never closed is left untouched.
fn replace_italic_with_angles(line: &str) -> String {
    let bytes = line.as_bytes();
    let len = bytes.len();
    let mut out = String::with_capacity(len);
    let mut i = 0;
    let mut command_consumed = false;
    while i < len {
        // byte-compare to avoid panicking on non-ASCII char boundaries
        if bytes[i..].starts_with(b"\\fI") {
            let inner_start = i + 3;
            if let Some(close) = find_italic_close(bytes, inner_start) {
                // `close` points at an ASCII backslash, so both slice ends
                // are char boundaries
                let inner = &line[inner_start..close];
                let at_line_start =
                    !command_consumed && line[..i].chars().all(char::is_whitespace);
                if at_line_start && italic_looks_like_command(inner) {
                    out.push_str(inner);
                    command_consumed = true;
                } else {
                    out.push('<');
                    out.push_str(inner);
                    out.push('>');
                }
                i = close + 3;
                continue;
            }
        }
        // `i` only ever advances by whole chars or past ASCII markers
        let Some(c) = line[i..].chars().next() else {
            break;
        };
        out.push(c);
        i += c.len_utf8();
    }
    out
}
/// locate the `\fR` / `\fP` that closes an italic run starting at `from`.
/// returns the byte index of the closing backslash. other escapes inside
/// the run (`\-`, `\ `, `\\`, ...) are skipped; any other font escape, or
/// reaching the end of the line, means the run is not a simple italic span.
fn find_italic_close(bytes: &[u8], from: usize) -> Option<usize> {
    let mut j = from;
    while j < bytes.len() {
        if bytes[j] != b'\\' {
            j += 1;
            continue;
        }
        match bytes.get(j + 1).copied() {
            Some(b'f') => {
                let closes = matches!(bytes.get(j + 2).copied(), Some(b'R' | b'P'));
                return closes.then_some(j);
            }
            // a non-font escape belongs to the italic text; step over the
            // backslash and the byte it escapes
            _ => j += 2,
        }
    }
    None
}
/// does the italic content read like a command name rather than a
/// placeholder? after groff escapes (like `\-`) are resolved and the text
/// is trimmed, it must be non-empty and consist only of lowercase ascii
/// letters, digits, hyphens, underscores, dots and spaces.
fn italic_looks_like_command(inner: &str) -> bool {
    let resolved = strip_groff_escapes(inner);
    let candidate = resolved.trim();
    if candidate.is_empty() {
        return false;
    }
    candidate
        .chars()
        .all(|c| matches!(c, 'a'..='z' | '0'..='9' | '-' | '_' | '.' | ' '))
}
/// extract the command name from a synopsis line by taking leading word
/// tokens. words are separated by spaces only; collection stops at the
/// first word that starts with '-' or contains anything other than ascii
/// alphanumerics, '-', '_' or '.' (which covers `[`, `<`, `(` and `{`).
/// returns `None` when no leading word qualifies.
fn extract_cmd(line: &str) -> Option<String> {
    let is_cmd_char = |c: char| c.is_ascii_alphanumeric() || matches!(c, '-' | '_' | '.');
    let taken: Vec<&str> = line
        .split(' ')
        .filter(|word| !word.is_empty())
        .take_while(|word| !word.starts_with('-') && word.chars().all(is_cmd_char))
        .collect();
    if taken.is_empty() {
        return None;
    }
    Some(taken.join(" "))
}
/// extract the lines that form the SYNOPSIS section.
/// thin wrapper: delegates to `extract_named_section` with the section
/// name fixed to "SYNOPSIS" and returns its result unchanged.
fn extract_synopsis_section(lines: &[GroffLine]) -> Vec<GroffLine> {
extract_named_section(lines, "SYNOPSIS")
}
/// extract positional arguments from the SYNOPSIS section.
/// the section is flattened with `join_synopsis_text`, the command name
/// prefix is skipped, and `parse_usage_args` runs on the remainder.
/// names are lowercased in the result; an empty synopsis or a parse failure
/// yields an empty vector.
pub fn extract_synopsis_positionals(lines: &[GroffLine]) -> Vec<(String, Positional)> {
    let synopsis = join_synopsis_text(lines);
    if synopsis.is_empty() {
        return Vec::new();
    }
    let parsed: nom::IResult<&str, Vec<(&str, Positional)>> =
        preceded(skip_command_name, parse_usage_args).parse(&synopsis);
    let Ok((_, positionals)) = parsed else {
        return Vec::new();
    };
    positionals
        .into_iter()
        .map(|(name, positional)| (name.to_ascii_lowercase(), positional))
        .collect()
}
/// join the SYNOPSIS section into a single line of plain text, stripping
/// groff escapes and inline font macros. shared by both the positional
/// and flag extractors so they see identical input.
///
/// scanning stops at the first `.SS` or `.br`. `.I` text is wrapped in
/// angle brackets, `.IR` is rendered by `render_leading_italic_arg`, and
/// empty pieces are dropped before joining with single spaces.
fn join_synopsis_text(lines: &[GroffLine]) -> String {
    let mut pieces: Vec<String> = Vec::new();
    for line in extract_synopsis_section(lines) {
        let piece = match line {
            GroffLine::Text(t) => strip_groff_escapes(&t).trim().to_string(),
            GroffLine::Macro { name, args } => match name.as_str() {
                "SS" | "br" => break,
                "SY" => strip_groff_escapes(&args).trim().to_string(),
                "I" => {
                    let text = strip_groff_escapes(&args).trim().to_string();
                    if text.is_empty() {
                        text
                    } else {
                        format!("<{text}>")
                    }
                }
                "IR" => render_leading_italic_arg(&args),
                "B" => strip_space_macro_args(&args),
                "BI" | "BR" | "IB" | "RB" | "RI" => {
                    strip_groff_escapes(&strip_inline_macro_args(&args))
                        .trim()
                        .to_string()
                }
                _ => continue,
            },
            _ => continue,
        };
        if !piece.is_empty() {
            pieces.push(piece);
        }
    }
    pieces.join(" ").trim().to_string()
}
/// render `.IR` arguments with the first (italic) argument as a
/// placeholder: the first whitespace-delimited token becomes `<token>` and
/// the remaining arguments, rendered to plain text, follow after a space.
/// returns an empty string when there is no usable first token.
fn render_leading_italic_arg(args: &str) -> String {
    let args = args.trim();
    if args.is_empty() {
        return String::new();
    }
    let boundary = args.find(char::is_whitespace).unwrap_or(args.len());
    let (head, tail) = args.split_at(boundary);
    let head = strip_groff_escapes(head).trim().to_string();
    if head.is_empty() {
        return String::new();
    }
    let tail = strip_groff_escapes(&strip_inline_macro_args(tail.trim()));
    match tail.trim() {
        "" => format!("<{head}>"),
        tail => format!("<{head}> {tail}"),
    }
}
/// convert a borrowed `Switch` into its owned counterpart, copying the
/// long-name text into a `String` and keeping the short value as is.
fn to_owned_switch(s: Switch<'_>) -> OwnedSwitch {
match s {
Switch::Short(c) => OwnedSwitch::Short(c),
Switch::Long(l) => OwnedSwitch::Long(l.to_string()),
Switch::Both(c, l) => OwnedSwitch::Both(c, l.to_string()),
}
}
/// convert a borrowed `Param` into its owned counterpart, preserving the
/// mandatory/optional distinction and copying the text into a `String`.
fn to_owned_param(p: Param<'_>) -> OwnedParam {
match p {
Param::Mandatory(s) => OwnedParam::Mandatory(s.to_string()),
Param::Optional(s) => OwnedParam::Optional(s.to_string()),
}
}
/// extract flag-tagged entries from the SYNOPSIS line. some manpages
/// (notably nix-env, sed) declare flags only in the synopsis and never
/// repeat them as entries in the OPTIONS body, so the body-only pass
/// misses them. the synopsis text is joined the same way the positional
/// extractor does it, then `parse_usage_flags` runs after the command name
/// has been skipped. every entry comes back with an empty description;
/// callers merge with body entries, and body wins on duplicate flag names
/// since body descriptions are richer.
pub fn extract_synopsis_flags(lines: &[GroffLine]) -> Vec<ManpageEntry> {
    let synopsis = join_synopsis_text(lines);
    if synopsis.is_empty() {
        return Vec::new();
    }
    let parsed: nom::IResult<&str, Vec<(Switch<'_>, Option<Param<'_>>)>> =
        preceded(skip_command_name, parse_usage_flags).parse(&synopsis);
    let Ok((_, flags)) = parsed else {
        return Vec::new();
    };
    let mut entries = Vec::with_capacity(flags.len());
    for (switch, param) in flags {
        entries.push(ManpageEntry {
            switch: to_owned_switch(switch),
            param: param.map(to_owned_param),
            desc: String::new(),
        });
    }
    entries
}
/// extract first-positional choices from prose lists in DESCRIPTION.
///
/// getent(1) is the motivating shape: the synopsis has a `database`
/// positional, while the actual database names are documented as a tagged
/// list under DESCRIPTION rather than as subcommands or options. The
/// completion model currently has no separate "positional choices" channel,
/// so these are represented as subcommand-like candidates for completion.
///
/// returns an empty vector when there is no DESCRIPTION section or when
/// `description_mentions_listed_database` rejects it. each choice name is
/// reported once, in first-seen order.
pub fn extract_description_positionals(lines: &[GroffLine]) -> Vec<ManpageSubcommand> {
let description = extract_named_section(lines, "DESCRIPTION");
if description.is_empty() || !description_mentions_listed_database(&description) {
return Vec::new();
}
let mut out = Vec::new();
let mut seen = std::collections::HashSet::new();
let mut i = 0;
// .TP entries are only harvested after the introductory phrase was seen.
let mut in_database_list = false;
while i < description.len() {
match &description[i] {
GroffLine::Text(text)
if text.to_ascii_lowercase().contains("listed below")
|| text.to_ascii_lowercase().contains("may be any of") =>
{
in_database_list = true;
i += 1;
}
GroffLine::Macro { name, .. } if name == "TP" && in_database_list => {
// a trailing .TP has no tag line to read.
if i + 1 >= description.len() {
break;
}
// the line right after .TP is the tag; on rejection only the .TP
// itself is skipped, so the tag line is re-examined by the loop.
let Some(name) = description_tag_name(&description[i + 1]) else {
i += 1;
continue;
};
if !is_description_choice_name(&name) {
i += 1;
continue;
}
let (desc, new_i) = collect_description_choice_desc(&description, i + 2);
// duplicates are dropped, but the cursor still moves past their body.
if seen.insert(name.clone()) {
out.push(ManpageSubcommand { name, desc });
}
i = new_i;
}
_ => {
i += 1;
}
}
}
out
}
/// gate for the DESCRIPTION choice extractor: true only when the section's
/// text and font-macro lines mention "database" somewhere and also contain
/// "listed below" or "may be any of" (all matched case-insensitively, not
/// necessarily on the same line).
fn description_mentions_listed_database(lines: &[GroffLine]) -> bool {
    let mut saw_database = false;
    let mut saw_list = false;
    for line in lines {
        let lower = match line {
            GroffLine::Text(text) => text.to_ascii_lowercase(),
            GroffLine::Macro { name, args }
                if matches!(name.as_str(), "B" | "BI" | "BR" | "I" | "IR" | "RI") =>
            {
                strip_groff_escapes(&strip_inline_macro_args(args)).to_ascii_lowercase()
            }
            _ => continue,
        };
        if lower.contains("database") {
            saw_database = true;
        }
        if lower.contains("listed below") || lower.contains("may be any of") {
            saw_list = true;
        }
    }
    saw_database && saw_list
}
/// read the tag of a `.TP` entry from the line that follows it.
/// text lines are used as is; .B/.BI/.BR/.I/.IR lines are rendered to plain
/// text. the result is trimmed. any other line yields `None`.
fn description_tag_name(line: &GroffLine) -> Option<String> {
    let raw = match line {
        GroffLine::Text(text) => text.clone(),
        GroffLine::Macro { name, args }
            if matches!(name.as_str(), "B" | "BI" | "BR" | "I" | "IR") =>
        {
            strip_groff_escapes(&strip_inline_macro_args(args))
        }
        _ => return None,
    };
    Some(raw.trim().to_string())
}
/// is `name` acceptable as a positional choice? it must be 1..=32 bytes
/// long, must not start with '-', and may contain only lowercase ascii
/// letters, digits, '-' and '_'.
fn is_description_choice_name(name: &str) -> bool {
    if name.is_empty() || name.len() > 32 || name.starts_with('-') {
        return false;
    }
    // any non-ascii char has bytes >= 0x80 and therefore fails this test too
    name.bytes()
        .all(|b| matches!(b, b'a'..=b'z' | b'0'..=b'9' | b'-' | b'_'))
}
/// gather the description of one tagged choice, starting at `start`.
/// text lines and rendered font-macro lines are collected until the next
/// `.TP`, `.SH` or `.SS` (or the end of input); everything else is skipped.
/// returns the first sentence of the joined text and the index at which
/// scanning stopped.
fn collect_description_choice_desc(lines: &[GroffLine], start: usize) -> (String, usize) {
    let mut parts: Vec<String> = Vec::new();
    let mut i = start;
    while let Some(line) = lines.get(i) {
        match line {
            GroffLine::Text(text) => parts.push(text.clone()),
            GroffLine::Macro { name, args } => match name.as_str() {
                "TP" | "SH" | "SS" => break,
                "B" | "BI" | "BR" | "I" | "IR" | "RI" => {
                    let text = strip_groff_escapes(&strip_inline_macro_args(args));
                    let text = text.trim();
                    if !text.is_empty() {
                        parts.push(text.to_string());
                    }
                }
                _ => {}
            },
            GroffLine::Blank | GroffLine::Comment => {}
        }
        i += 1;
    }
    (first_sentence(&parts.join(" ")), i)
}
/// return the first sentence of `text` with runs of whitespace collapsed
/// to single spaces.
///
/// a sentence ends at the first ". "; if there is none, at the first ".) ".
/// the terminating punctuation is kept in full — including the closing
/// parenthesis of ".)" so a parenthesised sentence stays balanced — and the
/// trailing space is dropped. text without either marker is returned whole.
fn first_sentence(text: &str) -> String {
    let text = text.split_whitespace().collect::<Vec<_>>().join(" ");
    for marker in [". ", ".) "] {
        if let Some(idx) = text.find(marker) {
            // keep the whole marker except its trailing space
            return text[..idx + marker.len() - 1].to_string();
        }
    }
    text
}
/// does this section heading introduce a list of commands?
///
/// a trailing parenthetical group is ignored, so git.1's
/// "HIGH-LEVEL COMMANDS (PORCELAIN)" is treated as "HIGH-LEVEL COMMANDS".
/// matching is case-insensitive and accepts "COMMAND", "COMMANDS", and any
/// heading ending in " COMMANDS" ("GIT COMMANDS", "MAIN COMMANDS", ...).
/// the required leading space keeps "COMMAND LINE OPTIONS" and similar out.
fn is_commands_section(name: &str) -> bool {
    let heading = name.trim();
    let core = if heading.ends_with(')') {
        heading
            .rfind('(')
            .map_or(heading, |open| heading[..open].trim())
    } else {
        heading
    };
    let upper = core.to_ascii_uppercase();
    matches!(upper.as_str(), "COMMAND" | "COMMANDS") || upper.ends_with(" COMMANDS")
}
/// collect the body lines of every `.SH` section whose heading satisfies
/// `is_commands_section`, concatenated in document order. the heading
/// lines themselves are not included.
pub fn extract_commands_section(lines: &[GroffLine]) -> Vec<GroffLine> {
    let mut collected: Vec<GroffLine> = Vec::new();
    let mut inside = false;
    for line in lines {
        if let GroffLine::Macro { name, args } = line {
            if name == "SH" {
                // every .SH either opens a commands section or closes one
                inside = is_commands_section(args);
                continue;
            }
        }
        if inside {
            collected.push(line.clone());
        }
    }
    collected
}
/// extract SUBCOMMAND-style sections (clap-generated manpages put each
/// subcommand under its own .SH SUBCOMMAND header with a Usage: line).
/// returns triples of (name, description, lines) so the caller can re-parse
/// each section as its own help_result.
///
/// sections without a recognisable "Usage: NAME" line are dropped. the
/// description is built from the text lines seen before the usage line,
/// with groff escapes and backtick quoting removed.
pub fn extract_subcommand_sections(lines: &[GroffLine]) -> Vec<(String, String, Vec<GroffLine>)> {
// split into sections at .SH boundaries, keeping only SUBCOMMAND(S) ones
let mut sections: Vec<Vec<GroffLine>> = Vec::new();
// `Some` while inside a SUBCOMMAND(S) section; only its presence is used.
let mut current_name: Option<String> = None;
let mut current: Vec<GroffLine> = Vec::new();
for line in lines {
if let GroffLine::Macro { name, args } = line
&& name == "SH"
{
// any .SH closes the section being collected, if there is one.
if current_name.is_some() {
sections.push(std::mem::take(&mut current));
}
let n = args.trim().to_ascii_uppercase();
if n == "SUBCOMMAND" || n == "SUBCOMMANDS" {
current_name = Some(n);
} else {
current_name = None;
}
continue;
}
if current_name.is_some() {
current.push(line.clone());
}
}
// flush a section that runs to the end of the input.
if current_name.is_some() {
sections.push(current);
}
let mut out = Vec::new();
for section in sections {
// scan section lines for the Usage: line to get the subcommand name
let mut subcmd_name: Option<String> = None;
let mut desc_lines: Vec<String> = Vec::new();
for line in &section {
// stop at the first usage line; later text is not part of the summary.
if subcmd_name.is_some() {
break;
}
match line {
GroffLine::Text(t) => match find_usage_name(t) {
Some(name) => subcmd_name = Some(name),
None => desc_lines.push(t.clone()),
},
// the usage line may also be carried in a macro's arguments.
GroffLine::Macro { name, args }
if matches!(name.as_str(), "TP" | "B" | "BI" | "BR") =>
{
let text = strip_groff_escapes(&strip_inline_macro_args(args));
let text = text.trim();
subcmd_name = find_usage_name(text);
}
_ => (),
}
}
if let Some(name) = subcmd_name {
let desc_raw = desc_lines.join(" ");
let desc = strip_groff_escapes(&desc_raw).trim().to_string();
let desc = strip_backtick_words(&desc);
out.push((name, desc, section));
}
}
out
}
/// look for "Usage: NAME" and return NAME if found.
/// NAME is the longest run of ascii alphanumerics, underscores and dashes
/// directly after the first "Usage: " marker; an empty run yields `None`.
fn find_usage_name(text: &str) -> Option<String> {
    const MARKER: &str = "Usage: ";
    let (_, after) = text.split_once(MARKER)?;
    let name: String = after
        .chars()
        .take_while(|c| c.is_ascii_alphanumeric() || matches!(c, '_' | '-'))
        .collect();
    if name.is_empty() {
        None
    } else {
        Some(name)
    }
}
/// strip backtick quoting: `word` -> word.
/// each pair of backticks is removed and the text between them kept; a
/// backtick with no later partner is left in place.
fn strip_backtick_words(s: &str) -> String {
    let mut out = String::with_capacity(s.len());
    let mut rest = s;
    while let Some(open) = rest.find('`') {
        out.push_str(&rest[..open]);
        let after = &rest[open + 1..];
        match after.find('`') {
            Some(close) => {
                out.push_str(&after[..close]);
                rest = &after[close + 1..];
            }
            None => {
                // unmatched backtick: keep it literally
                out.push('`');
                rest = after;
            }
        }
    }
    out.push_str(rest);
    out
}

View file

@ -0,0 +1,456 @@
//! strategy-based entry extraction.
//!
//! rather than a single monolithic parser, we use multiple "strategies" that
//! each target a specific groff formatting pattern. this is necessary because
//! manpage authors use very different macro combinations for the same purpose.
use nom::{Parser, combinator::opt};
use crate::make_macro_walker;
use crate::parsers::help::{help_parser, param_parser, switch_parser};
use crate::parsers::manpage::groff::{
GroffLine, strip_groff_escapes, strip_inline_macro_args, strip_space_macro_args,
};
use crate::parsers::manpage::{ManpageEntry, OwnedParam, OwnedSwitch};
use crate::types::{Param, Switch};
/// collect the leading run of text lines, joining them with spaces.
/// returns (collected, remaining), where `remaining` starts at the first
/// line that is not plain text.
fn collect_text_lines(lines: &[GroffLine]) -> (String, &[GroffLine]) {
    let run = lines
        .iter()
        .take_while(|line| matches!(line, GroffLine::Text(_)))
        .count();
    let (texts, rest) = lines.split_at(run);
    let joined = texts
        .iter()
        .filter_map(|line| match line {
            GroffLine::Text(t) => Some(t.as_str()),
            _ => None,
        })
        .collect::<Vec<&str>>()
        .join(" ");
    (joined, rest)
}
/// gather an entry's description starting at `start`.
/// text lines and rendered font-macro lines are joined with spaces until a
/// paragraph or section boundary (.TP, .TQ, .IP, .PP, .SH, .SS) or the end
/// of input; other macros, blanks and comments are skipped.
/// returns the description and the index at which scanning stopped.
fn collect_description_lines(lines: &[GroffLine], start: usize) -> (String, usize) {
    let mut pieces: Vec<String> = Vec::new();
    let mut i = start;
    while let Some(line) = lines.get(i) {
        match line {
            GroffLine::Text(t) => pieces.push(t.clone()),
            GroffLine::Macro { name, args } => match name.as_str() {
                "TP" | "TQ" | "IP" | "PP" | "SH" | "SS" => break,
                "B" | "BI" | "BR" | "I" | "IR" | "IB" | "RB" | "RI" => {
                    let text = tag_of_macro(name, args);
                    if !text.is_empty() {
                        pieces.push(text);
                    }
                }
                _ => {}
            },
            GroffLine::Blank | GroffLine::Comment => {}
        }
        i += 1;
    }
    (pieces.join(" "), i)
}
/// convert a borrowed `Switch` into its owned counterpart, copying the
/// long-name text into a `String` and keeping the short value as is.
fn to_owned_switch(s: Switch<'_>) -> OwnedSwitch {
match s {
Switch::Short(c) => OwnedSwitch::Short(c),
Switch::Long(l) => OwnedSwitch::Long(l.to_string()),
Switch::Both(c, l) => OwnedSwitch::Both(c, l.to_string()),
}
}
/// convert a borrowed `Param` into its owned counterpart, preserving the
/// mandatory/optional distinction and copying the text into a `String`.
fn to_owned_param(p: Param<'_>) -> OwnedParam {
match p {
Param::Mandatory(s) => OwnedParam::Mandatory(s.to_string()),
Param::Optional(s) => OwnedParam::Optional(s.to_string()),
}
}
/// attempt to parse a tag string (e.g. "-v, --verbose FILE") into an entry.
/// groff escapes are stripped and the tag trimmed, then the nom
/// `switch_parser` followed by an optional `param_parser` (both from the
/// help module) is applied. `desc` is attached to the entry unchanged.
/// returns `None` if the tag doesn't parse as a flag definition.
pub fn parse_tag_to_entry(tag: &str, desc: String) -> Option<ManpageEntry> {
    let cleaned = strip_groff_escapes(tag);
    let parsed: nom::IResult<&str, (Switch<'_>, Option<Param<'_>>)> =
        (switch_parser, opt(param_parser)).parse(cleaned.trim());
    let (_, (switch, param)) = parsed.ok()?;
    Some(ManpageEntry {
        switch: to_owned_switch(switch),
        param: param.map(to_owned_param),
        desc,
    })
}
/// extract tag text from a macro line.
/// .B and .I take a single argument whose spaces are preserved; every
/// other macro name (.BI, .BR, .IR, ...) is treated as alternating fonts:
/// its arguments are concatenated, escapes stripped, and the result trimmed.
pub fn tag_of_macro(name: &str, args: &str) -> String {
    if matches!(name, "B" | "I") {
        return strip_space_macro_args(args);
    }
    let joined = strip_inline_macro_args(args);
    strip_groff_escapes(&joined).trim().to_string()
}
// strategy a: .TP style (most common — gnu coreutils, help2man).
// .TP introduces a tagged paragraph: the next line is the "tag" (flag name)
// and subsequent text lines are the description. the tag can be plain text
// or wrapped in a formatting macro (.B, .BI, etc.). alternate tags chained
// with .TQ share one description.
pub fn strategy_tp(lines: &[GroffLine]) -> Vec<ManpageEntry> {
    let mut entries = Vec::new();
    let mut cursor = 0;
    while let Some(line) = lines.get(cursor) {
        let is_tp = matches!(line, GroffLine::Macro { name, .. } if name == "TP");
        if !is_tp {
            cursor += 1;
            continue;
        }
        let (tags, body_start) = collect_tp_tags(lines, cursor + 1);
        if tags.is_empty() {
            // a .TP without a usable tag line: step over it and keep scanning
            cursor += 1;
            continue;
        }
        let (desc, next) = collect_description_lines(lines, body_start);
        entries.extend(entries_from_tag_alternates(&tags, desc));
        cursor = next;
    }
    entries
}
fn collect_tp_tags(lines: &[GroffLine], start: usize) -> (Vec<String>, usize) {
let mut tags = Vec::new();
let mut i = start;
loop {
if i >= lines.len() {
break;
}
let Some(tag) = tag_from_line(&lines[i]) else {
break;
};
tags.push(tag);
i += 1;
if i < lines.len() && matches!(&lines[i], GroffLine::Macro { name, .. } if name == "TQ") {
i += 1;
continue;
}
break;
}
(tags, i)
}
/// interpret a line as a tag: plain text is used verbatim and font macros
/// are rendered with `tag_of_macro`. any other line is not a tag.
fn tag_from_line(line: &GroffLine) -> Option<String> {
    const FONT_MACROS: [&str; 8] = ["B", "I", "BI", "BR", "IR", "IB", "RB", "RI"];
    match line {
        GroffLine::Text(tag) => Some(tag.clone()),
        GroffLine::Macro { name, args } if FONT_MACROS.contains(&name.as_str()) => {
            Some(tag_of_macro(name, args))
        }
        _ => None,
    }
}
/// turn a group of alternate tags sharing one description into entries.
/// tags that don't parse as flags are dropped. when exactly two entries
/// remain and they are a short/long pair, they are merged into one entry.
fn entries_from_tag_alternates(tags: &[String], desc: String) -> Vec<ManpageEntry> {
    let mut entries: Vec<ManpageEntry> = Vec::with_capacity(tags.len());
    for tag in tags {
        if let Some(entry) = parse_tag_to_entry(tag, desc.clone()) {
            entries.push(entry);
        }
    }
    let combined = match entries.as_slice() {
        [first, second] => combine_short_long_alternates(first, second),
        _ => None,
    };
    match combined {
        Some(entry) => vec![entry],
        None => entries,
    }
}
/// merge a short-only and a long-only entry (in either order) into a single
/// `Both` entry. the long entry's parameter is preferred, falling back to
/// the short entry's; the description is taken from `left`.
/// returns `None` for any other combination of switches.
fn combine_short_long_alternates(
    left: &ManpageEntry,
    right: &ManpageEntry,
) -> Option<ManpageEntry> {
    let (short, long, param) = match (&left.switch, &right.switch) {
        (OwnedSwitch::Long(l), OwnedSwitch::Short(c)) => {
            (*c, l, left.param.clone().or_else(|| right.param.clone()))
        }
        (OwnedSwitch::Short(c), OwnedSwitch::Long(l)) => {
            (*c, l, right.param.clone().or_else(|| left.param.clone()))
        }
        _ => return None,
    };
    Some(ManpageEntry {
        switch: OwnedSwitch::Both(short, long.clone()),
        param,
        desc: left.desc.clone(),
    })
}
// strategy b: .IP style (curl, hand-written manpages).
// .IP takes an inline tag argument: .IP "-v, --verbose"
// the description follows as text lines.
// the closure returns the parsed entry together with the index just past
// the consumed description lines, or None when the tag is not a flag.
// NOTE(review): how the walker uses that index is defined by
// make_macro_walker, which is not visible here — confirm.
make_macro_walker!(pub strategy_ip -> Vec<ManpageEntry>, on macro "IP" =>
|lines, i, args| {
let tag = strip_groff_escapes(args);
let (desc, rest) = collect_text_lines(&lines[i + 1..]);
// `rest` is a suffix of `lines`, so this is the absolute index where it starts.
let new_i = lines.len() - rest.len();
parse_tag_to_entry(&tag, desc).map(|e| (e, new_i))
}
);
// strategy c: .PP + .RS/.RE style (git, docbook-generated manpages).
// flag entries are introduced by .PP (paragraph), with the flag name as
// plain text, followed by a .RS (indent) block containing the description,
// closed by .RE (de-indent).
// the closure yields None when .PP is the last line, when the next line is
// not plain text, or when that text does not parse as a flag.
make_macro_walker!(pub strategy_pp_rs -> Vec<ManpageEntry>, on macro "PP" =>
|lines, i, _args| {
if i + 1 >= lines.len() { return None; }
if let GroffLine::Text(tag) = &lines[i + 1] {
let (desc, new_i) = collect_pp_rs_desc(lines, i + 2);
parse_tag_to_entry(tag, desc).map(|e| (e, new_i))
} else {
None
}
}
);
/// collect the description that follows a .PP tag line, starting at `start`.
/// text before the .RS block is included; inside the block only text lines
/// are kept. returns the space-joined description and the index at which
/// the caller should resume.
fn collect_pp_rs_desc(lines: &[GroffLine], start: usize) -> (String, usize) {
let mut acc: Vec<String> = Vec::new();
let mut i = start;
// outer: look for .RS marker or text
while i < lines.len() {
match &lines[i] {
GroffLine::Macro { name, .. } if name == "RS" => {
i += 1;
// inside .RS — collect until .RE or boundary macro
while i < lines.len() {
match &lines[i] {
// the closing .RE is consumed.
GroffLine::Macro { name, .. } if name == "RE" => {
return (acc.join(" "), i + 1);
}
GroffLine::Text(t) => {
acc.push(t.clone());
i += 1;
}
// an unclosed block ends at the next .PP/.SH, which is left for the caller.
GroffLine::Macro { name, .. } if name == "PP" || name == "SH" => {
return (acc.join(" "), i);
}
_ => i += 1,
}
}
return (acc.join(" "), i);
}
GroffLine::Text(t) => {
acc.push(t.clone());
i += 1;
}
// anything else before an .RS ends the description without being consumed.
_ => return (acc.join(" "), i),
}
}
(acc.join(" "), i)
}
/// strategy d: deroff fallback — strip all groff markup, then feed the
/// resulting plain text through the help parser.
/// text lines and rendered .BI/.BR/.IR/.B/.I lines each become one line of
/// input, blank lines are kept as empty lines, and everything else is
/// dropped. a parse failure yields no entries.
pub fn strategy_deroff(lines: &[GroffLine]) -> Vec<ManpageEntry> {
    let mut plain = String::with_capacity(256);
    for line in lines {
        match line {
            GroffLine::Text(text) => plain.push_str(text),
            GroffLine::Macro { name, args }
                if matches!(name.as_str(), "BI" | "BR" | "IR" | "B" | "I") =>
            {
                plain.push_str(&strip_groff_escapes(&strip_inline_macro_args(args)));
            }
            GroffLine::Blank => {}
            _ => continue,
        }
        plain.push('\n');
    }
    let Ok((_, help)) = help_parser(&plain) else {
        return Vec::new();
    };
    help.entries
        .into_iter()
        .map(|entry| ManpageEntry {
            switch: to_owned_switch(entry.switch),
            param: entry.param.map(to_owned_param),
            desc: entry.desc.join(" "),
        })
        .collect()
}
/// an `.IP` counts as a bullet marker when its argument text contains at
/// least one non-whitespace character.
fn is_bullet_ip(args: &str) -> bool {
    args.chars().any(|c| !c.is_whitespace())
}
// strategy e: nix3-style bullet .IP with .UR/.UE hyperlinks.
// nix's manpages use .IP with bullet markers for flag entries, interleaved
// with .UR/.UE hyperlink macros. the flag tag is in text lines after the
// bullet .IP, and the description follows a non-bullet .IP marker.
// the closure yields None for non-bullet .IP lines and for tags that do
// not parse as flags.
make_macro_walker!(pub strategy_nix -> Vec<ManpageEntry>, on macro "IP" =>
|lines, i, args| {
if !is_bullet_ip(args) { return None; }
// collect tag: skip .UR/.UE macros, gather Text lines
let mut tag_idx = i + 1;
let mut tag_parts: Vec<String> = Vec::new();
while tag_idx < lines.len() {
match &lines[tag_idx] {
GroffLine::Macro { name, .. } if name == "UR" || name == "UE" => {
tag_idx += 1;
}
GroffLine::Text(t) => {
tag_parts.push(t.clone());
tag_idx += 1;
}
// any other line ends the tag; the description is looked for from here.
_ => break,
}
}
let tag = tag_parts.join(" ");
let (desc, new_i) = collect_nix_desc(lines, tag_idx);
parse_tag_to_entry(&tag, desc).map(|e| (e, new_i))
}
);
/// collect the description of a nix-style entry, starting at `start`.
/// the description must be introduced by an argument-less .IP; otherwise
/// an empty description is returned and nothing is consumed. text lines are
/// joined with spaces, .RS/.RE blocks are skipped whole, and collection
/// ends at the next .IP with arguments, at .SS/.SH, or at the end of input.
/// returns the description and the index at which scanning stopped.
fn collect_nix_desc(lines: &[GroffLine], start: usize) -> (String, usize) {
if start >= lines.len() {
return (String::new(), start);
}
let mut i = start;
// require non-bullet .IP marker for description
if let GroffLine::Macro { name, args } = &lines[i]
&& name == "IP"
&& args.trim().is_empty()
{
i += 1;
} else {
return (String::new(), start);
}
let mut parts: Vec<String> = Vec::new();
while i < lines.len() {
match &lines[i] {
GroffLine::Text(t) => {
parts.push(t.clone());
i += 1;
}
GroffLine::Macro { name, args } if name == "IP" => {
if !args.trim().is_empty() {
// next bullet entry — stop
return (parts.join(" "), i);
}
// non-bullet .IP = continuation paragraph
i += 1;
}
// section boundaries end the description and are left for the caller.
GroffLine::Macro { name, .. } if name == "SS" || name == "SH" => {
return (parts.join(" "), i);
}
// indented blocks are skipped entirely, including nested ones.
GroffLine::Macro { name, .. } if name == "RS" => {
i = skip_rs(lines, i + 1, 1);
}
GroffLine::Macro { .. } => {
i += 1;
}
GroffLine::Blank | GroffLine::Comment => {
i += 1;
}
}
}
(parts.join(" "), i)
}
/// skip to the end of an .RS block, honouring nesting.
/// `start` is the index just after an opening .RS and `depth` the number of
/// blocks currently open (callers pass at least 1). returns the index just
/// past the .RE that closes the outermost block, or the end of input when
/// the block is never closed.
fn skip_rs(lines: &[GroffLine], start: usize, mut depth: usize) -> usize {
    for (idx, line) in lines.iter().enumerate().skip(start) {
        let GroffLine::Macro { name, .. } = line else {
            continue;
        };
        match name.as_str() {
            "RE" => {
                depth -= 1;
                if depth == 0 {
                    return idx + 1;
                }
            }
            "RS" => depth += 1,
            _ => {}
        }
    }
    lines.len().max(start)
}
/// count how many lines in the section are the macro called `name`.
fn count_macro(name: &str, lines: &[GroffLine]) -> usize {
    let mut hits = 0;
    for line in lines {
        if let GroffLine::Macro { name: found, .. } = line {
            if found == name {
                hits += 1;
            }
        }
    }
    hits
}
/// auto-detect and try strategies, return the one with most entries.
/// macro counts decide which strategies are applicable: TP needs .TP, IP
/// needs .IP, PP+RS needs both .PP and .RS, nix needs both .UR and .IP.
/// all applicable strategies run, in that order, and the result with the
/// most entries wins; on a tie the strategy that ran later is kept.
/// if no specialized strategy produces results, falls back to deroff.
pub fn extract_entries(lines: &[GroffLine]) -> Vec<ManpageEntry> {
    let has = |macro_name: &str| count_macro(macro_name, lines) > 0;
    let (tp, ip, pp, rs, ur) = (has("TP"), has("IP"), has("PP"), has("RS"), has("UR"));

    let mut results: Vec<Vec<ManpageEntry>> = Vec::new();
    if tp {
        results.push(strategy_tp(lines));
    }
    if ip {
        results.push(strategy_ip(lines));
    }
    if pp && rs {
        results.push(strategy_pp_rs(lines));
    }
    if ur && ip {
        results.push(strategy_nix(lines));
    }
    results.retain(|entries| !entries.is_empty());
    if results.is_empty() {
        results.push(strategy_deroff(lines));
    }
    // max_by_key returns the last of several equal maxima, which keeps the
    // "later strategy wins ties" rule
    results
        .into_iter()
        .max_by_key(Vec::len)
        .unwrap_or_default()
}

3
src/parsers/mod.rs Normal file
View file

@ -0,0 +1,3 @@
pub mod help;
pub mod manpage;
pub mod nushell;

475
src/parsers/nushell.rs Normal file
View file

@ -0,0 +1,475 @@
//! generate nushell `extern` definitions from parsed help data.
//!
//! this module is the code generation backend. it takes a [`ManpageResult`]
//! (from the help or manpage parsers) and produces nushell source that defines
//! `extern` declarations — nushell's mechanism for teaching the shell about
//! external commands' flags and subcommands so it can offer completions.
//!
//! key responsibilities:
//! - deduplicating flag entries (same flag from multiple help sources)
//! - mapping parameter names to nushell types (path, int, string)
//! - formatting flags in nushell syntax: --flag(-f): type # description
//! - handling positional arguments with nushell's ordering constraints
//! - escaping special characters for nushell string literals
use std::borrow::Cow;
use std::collections::{HashMap, HashSet};
use std::sync::OnceLock;
use crate::parsers::manpage::{
ManpageEntry, ManpageResult, ManpageSubcommand, OwnedParam, OwnedSwitch,
};
use crate::types::Positional;
/// nushell built-in commands and keywords — we must never generate `extern`
/// definitions for these because it would shadow nushell's own implementations.
/// maintained manually and should be updated with new nushell releases.
/// entries are listed alphabetically; `builtin_set` turns this slice into a
/// hash set for lookups.
pub const NUSHELL_BUILTINS: &[&str] = &[
"alias",
"all",
"ansi",
"any",
"append",
"ast",
"attr",
"bits",
"break",
"bytes",
"cal",
"cd",
"char",
"chunk-by",
"chunks",
"clear",
"collect",
"columns",
"commandline",
"compact",
"complete",
"config",
"const",
"continue",
"cp",
"date",
"debug",
"decode",
"def",
"default",
"describe",
"detect",
"do",
"drop",
"du",
"each",
"echo",
"encode",
"enumerate",
"error",
"every",
"exec",
"exit",
"explain",
"explore",
"export",
"export-env",
"extern",
"fill",
"filter",
"find",
"first",
"flatten",
"for",
"format",
"from",
"generate",
"get",
"glob",
"grid",
"group-by",
"hash",
"headers",
"help",
"hide",
"hide-env",
"histogram",
"history",
"http",
"if",
"ignore",
"input",
"insert",
"inspect",
"interleave",
"into",
"is-admin",
"is-empty",
"is-not-empty",
"is-terminal",
"items",
"job",
"join",
"keybindings",
"kill",
"last",
"length",
"let",
"let-env",
"lines",
"load-env",
"loop",
"ls",
"match",
"math",
"merge",
"metadata",
"mkdir",
"mktemp",
"module",
"move",
"mut",
"mv",
"nu-check",
"nu-highlight",
"open",
"overlay",
"panic",
"par-each",
"parse",
"path",
"plugin",
"port",
"prepend",
"print",
"ps",
"query",
"random",
"reduce",
"reject",
"rename",
"return",
"reverse",
"rm",
"roll",
"rotate",
"run-external",
"save",
"schema",
"scope",
"select",
"seq",
"shuffle",
"skip",
"sleep",
"slice",
"sort",
"sort-by",
"source",
"source-env",
"split",
"start",
"stor",
"str",
"sys",
"table",
"take",
"tee",
"term",
"timeit",
"to",
"touch",
"transpose",
"try",
"tutor",
"ulimit",
"umask",
"uname",
"uniq",
"uniq-by",
"unlet",
"update",
"upsert",
"url",
"use",
"values",
"version",
"view",
"watch",
"where",
"which",
"while",
"whoami",
"window",
"with-env",
"wrap",
"zip",
];
fn builtin_set() -> &'static HashSet<&'static str> {
static SET: OnceLock<HashSet<&'static str>> = OnceLock::new();
SET.get_or_init(|| NUSHELL_BUILTINS.iter().copied().collect())
}
/// report whether `cmd` is the name of a nushell built-in command or keyword.
/// the comparison is exact (case-sensitive) against the built-in table.
pub fn is_nushell_builtin(cmd: &str) -> bool {
    let known = builtin_set();
    known.get(cmd).is_some()
}
/// map parameter names to nushell types.
/// nushell's `extern` declarations use typed parameters, so we infer the type
/// from the parameter name. file/path-related names become "path" (enables
/// path completion), numeric names become "int", everything else is "string".
///
/// matching ignores ASCII case: help text spells placeholders as `FILE`,
/// `file` or `File` interchangeably, and all of them should map to the same
/// type. names that are not in either table (including the empty string)
/// fall back to "string".
pub fn nushell_type_of_param(name: &str) -> &'static str {
    // placeholder names that denote something on the filesystem
    const PATH_NAMES: &[&str] = &[
        "FILE",
        "PATH",
        "DIR",
        "DIRECTORY",
        "FILENAME",
        "PATTERNFILE",
    ];
    // placeholder names that denote a whole number
    const INT_NAMES: &[&str] = &[
        "NUM", "N", "COUNT", "NUMBER", "INT", "COLS", "WIDTH", "LINES", "DEPTH",
    ];
    let matches_any =
        |names: &[&str]| names.iter().any(|candidate| candidate.eq_ignore_ascii_case(name));
    if matches_any(PATH_NAMES) {
        "path"
    } else if matches_any(INT_NAMES) {
        "int"
    } else {
        "string"
    }
}
/// escape `s` so it can be placed inside a nushell double-quoted string.
/// a backslash is inserted before every double quote and every backslash;
/// all other characters pass through. input that needs no escaping is
/// returned borrowed, without allocating.
pub fn escape_nu(s: &str) -> Cow<'_, str> {
    let needs_escape = |c: char| c == '"' || c == '\\';
    if !s.chars().any(needs_escape) {
        return Cow::Borrowed(s);
    }
    let mut escaped = String::with_capacity(s.len() + 4);
    for ch in s.chars() {
        if needs_escape(ch) {
            escaped.push('\\');
        }
        escaped.push(ch);
    }
    Cow::Owned(escaped)
}
/// dedup key for a flag entry: "--<long>" whenever a long name exists
/// (with or without a short alias), otherwise "-<short>".
fn entry_key(e: &ManpageEntry) -> String {
    match &e.switch {
        OwnedSwitch::Both(_, long) | OwnedSwitch::Long(long) => {
            let mut key = String::from("--");
            key.push_str(long);
            key
        }
        OwnedSwitch::Short(short) => {
            let mut key = String::from("-");
            key.push(*short);
            key
        }
    }
}
/// rank how informative a flag entry is, used to pick a winner among
/// duplicates: +10 when both short and long forms are known, +5 when the
/// flag takes a parameter, and +1 per 10 bytes of description (capped at +5).
fn entry_score(e: &ManpageEntry) -> i32 {
    let mut score: i32 = 0;
    if matches!(&e.switch, OwnedSwitch::Both(..)) {
        score += 10;
    }
    if e.param.is_some() {
        score += 5;
    }
    score += (e.desc.len() / 10).min(5) as i32;
    score
}
/// collapse flag entries that describe the same flag into one.
///
/// entries are grouped by [`entry_key`]; within a group the entry with the
/// highest [`entry_score`] wins, and on a tie the earliest one is kept.
///
/// a standalone short flag is dropped when its letter is already the short
/// alias of a winning short+long entry, so we never emit both "-v" and
/// "--verbose(-v)" (nushell would reject that as a duplicate).
///
/// the output follows the order in which each key first appears in `entries`.
pub fn dedup_entries(entries: &[ManpageEntry]) -> Vec<ManpageEntry> {
    // pass 1: pick the winning entry for every key
    let mut winners: HashMap<String, &ManpageEntry> = HashMap::new();
    for candidate in entries {
        winners
            .entry(entry_key(candidate))
            .and_modify(|current| {
                if entry_score(*current) < entry_score(candidate) {
                    *current = candidate;
                }
            })
            .or_insert(candidate);
    }

    // short letters that already ride along with a winning long flag
    let aliased: HashSet<char> = winners
        .values()
        .filter_map(|winner| match &winner.switch {
            OwnedSwitch::Both(short, _) => Some(*short),
            _ => None,
        })
        .collect();

    // pass 2: emit winners in first-seen order
    let mut emitted: HashSet<String> = HashSet::new();
    let mut result: Vec<ManpageEntry> = Vec::new();
    for candidate in entries {
        let shadowed =
            matches!(&candidate.switch, OwnedSwitch::Short(short) if aliased.contains(short));
        let key = entry_key(candidate);
        if shadowed || emitted.contains(&key) {
            continue;
        }
        // every key was inserted in pass 1, so the lookup cannot miss
        let winner: &ManpageEntry = winners.get(&key).unwrap();
        result.push(winner.clone());
        emitted.insert(key);
    }
    result
}
/// render one flag entry as a line of a nushell `extern` parameter list.
///
/// the line is: indent, the flag name ("--long(-s)", "--long" or "-s"), an
/// optional ": type" when the flag takes a parameter, and — when a
/// description exists — padding up to column 40 (at least one space)
/// followed by "# " and the description.
pub fn format_flag(entry: &ManpageEntry) -> String {
    let name = match &entry.switch {
        OwnedSwitch::Short(c) => format!("-{c}"),
        OwnedSwitch::Long(l) => format!("--{l}"),
        OwnedSwitch::Both(c, l) => format!("--{l}(-{c})"),
    };
    let typed = entry.param.as_ref().map_or_else(String::new, |param| {
        // mandatory and optional parameters are typed the same way
        let param_name = match param {
            OwnedParam::Mandatory(p) | OwnedParam::Optional(p) => p,
        };
        format!(": {}", nushell_type_of_param(param_name))
    });
    let flag = format!(" {name}{typed}");
    if entry.desc.is_empty() {
        return flag;
    }
    let width = flag.len();
    let gap = if width < 40 { 40 - width } else { 1 };
    let mut line = flag;
    line.push_str(&" ".repeat(gap));
    line.push_str("# ");
    line.push_str(&entry.desc);
    line
}
/// render one positional argument as a line of a nushell `extern`
/// parameter list: "...name: type" when variadic, "name?: type" when
/// optional, "name: type" otherwise. hyphens in the name become
/// underscores; the type is inferred from the upper-cased name.
pub fn format_positional(name: &str, p: &Positional) -> String {
    let ident = name.replace('-', "_");
    let typ = nushell_type_of_param(&name.to_ascii_uppercase());
    // a variadic parameter never carries the "?" marker
    let (prefix, suffix) = match (p.variadic, p.optional) {
        (true, _) => ("...", ""),
        (false, true) => ("", "?"),
        (false, false) => ("", ""),
    };
    format!(" {prefix}{ident}{suffix}: {typ}")
}
/// rewrite a positional list so it satisfies nushell's ordering rules:
/// 1. once an optional (or variadic) positional has been seen, every later
///    non-variadic positional is forced to optional
/// 2. only the first variadic ("rest") positional is kept; later ones are
///    dropped
///
/// relative order of the surviving positionals is unchanged.
pub fn fixup_positionals(positionals: Vec<(String, Positional)>) -> Vec<(String, Positional)> {
    let mut optional_from_here = false;
    let mut have_rest = false;
    positionals
        .into_iter()
        .filter_map(|(name, mut p)| {
            if p.variadic {
                if have_rest {
                    return None;
                }
                have_rest = true;
                optional_from_here = true;
            } else if optional_from_here {
                p.optional = true;
            } else {
                optional_from_here = p.optional;
            }
            Some((name, p))
        })
        .collect()
}
/// build the nushell `module` name for a command: ASCII letters, digits,
/// hyphens and underscores are kept, every other character becomes a
/// hyphen, and "-completions" is appended.
pub fn module_name_of(cmd_name: &str) -> String {
    const SUFFIX: &str = "-completions";
    let mut module = String::with_capacity(cmd_name.len() + SUFFIX.len());
    for ch in cmd_name.chars() {
        let keep = ch.is_ascii_alphanumeric() || matches!(ch, '-' | '_');
        module.push(if keep { ch } else { '-' });
    }
    module.push_str(SUFFIX);
    module
}
/// generate the full nushell `extern` block for a command.
///
/// produces output like:
/// export extern "git add" [
/// ...pathspec?: path
/// --verbose(-v) # be verbose
/// --dry-run(-n) # dry run
/// ]
///
/// subcommands that weren't resolved into their own full definitions get
/// stub `extern` blocks with just a comment containing their description:
/// export extern "git stash" [ # stash changes
/// ]
pub fn generate_extern(cmd_name: &str, result: &ManpageResult) -> String {
let entries = dedup_entries(&result.entries);
let escaped_name = escape_nu(cmd_name);
let positionals = fixup_positionals(result.positionals.clone());
let mut out = String::new();
out.push_str(&format!("export extern \"{escaped_name}\" [\n"));
for (name, p) in &positionals {
out.push_str(&format_positional(name, p));
out.push('\n');
}
for entry in &entries {
out.push_str(&format_flag(entry));
out.push('\n');
}
out.push_str("]\n");
for sc in &result.subcommands {
out.push_str(&format!(
"\nexport extern \"{} {}\" [ # {}\n]\n",
escaped_name,
escape_nu(&sc.name),
escape_nu(&sc.desc)
));
}
out
}
/// wrap the generated `extern` source in a nushell `module` and import it.
/// the module is named by [`module_name_of`]; the trailing `use <module> *`
/// brings the definitions into scope as soon as the text is evaluated.
pub fn generate_module(cmd_name: &str, result: &ManpageResult) -> String {
    let mod_name = module_name_of(cmd_name);
    let body = generate_extern(cmd_name, result);
    format!("module {mod_name} {{\n{body}}}\n\nuse {mod_name} *\n")
}
/// shortcut for commands that only have flags: builds a result with the
/// given entries and no subcommands, positionals or description, then
/// delegates to [`generate_extern`].
pub fn generate_extern_from_entries(cmd_name: &str, entries: Vec<ManpageEntry>) -> String {
    let flags_only = ManpageResult {
        description: String::new(),
        positionals: Vec::new(),
        subcommands: Vec::new(),
        entries,
    };
    generate_extern(cmd_name, &flags_only)
}
/// build an owned [`ManpageSubcommand`] from a borrowed name and description.
pub fn manpage_subcommand_from(name: &str, desc: &str) -> ManpageSubcommand {
    let (name, desc) = (String::from(name), String::from(desc));
    ManpageSubcommand { name, desc }
}

233
src/pool.rs Normal file
View file

@ -0,0 +1,233 @@
//! BFS-queue worker pool for parallel subprocess scraping.
//!
//! workers pull jobs from a shared queue and call a user-supplied
//! handler; the handler gets a `Submitter` to push newly-discovered
//! child jobs back onto the same queue. when the in-flight count
//! reaches zero the pool shuts down and `wait` returns.
//!
//! the queue-back design is deliberate: command-help trees are uneven
//! (one binary has 30 subs, another has 1). queue-back keeps every
//! worker fed; spawn-in-place would leave cores idle on lopsided trees.
//!
//! synchronization: `parking_lot::Condvar` parks workers when the queue is
//! empty. the queue, in-flight count, and close state live under one mutex so
//! the condvar predicate cannot miss a wakeup.
//! parking_lot gives no-poison locks (no `Result` noise on every
//! `lock()`) and a single-syscall fast path in the uncontended case.
use std::collections::VecDeque;
use std::sync::Arc;
use std::thread::{self, JoinHandle};
use parking_lot::{Condvar, Mutex};
/// mutable bookkeeping for the pool. an instance lives inside the mutex
/// owned by `Inner`, so all three fields are read and written under that
/// one lock.
struct State<J> {
/// jobs waiting for a worker, in submission order (pushed at the back,
/// popped from the front).
queue: VecDeque<J>,
/// jobs created but not yet completed. counts both queued and
/// in-progress jobs. workers can exit once wait() has closed the pool
/// and this reaches 0.
in_flight: usize,
/// set by wait(), which is also the point where top-level submission is
/// done. workers must not exit on transient empty periods before this.
closed: bool,
}
/// shared state held behind an `Arc` by every worker and by the
/// submitter handles handed to the per-job handler.
struct Inner<J> {
/// queue, in-flight counter and close flag, all behind one lock.
state: Mutex<State<J>>,
/// parks workers that found the queue empty; signalled when a job is
/// submitted and when the pool may shut down.
notify: Condvar,
}
impl<J> Inner<J> {
    /// enqueue `job`, count it as in flight, and wake one parked worker.
    fn submit(&self, job: J) {
        let mut guard = self.state.lock();
        guard.queue.push_back(job);
        guard.in_flight += 1;
        self.notify.notify_one();
    }

    /// hand out the next queued job, parking the caller while the queue is
    /// empty. returns `None` only once the pool has been closed and no job
    /// is queued or still running.
    fn next(&self) -> Option<J> {
        let mut guard = self.state.lock();
        while guard.queue.is_empty() {
            if guard.closed && guard.in_flight == 0 {
                return None;
            }
            self.notify.wait(&mut guard);
        }
        guard.queue.pop_front()
    }

    /// record that one job has finished. if that was the last outstanding
    /// job of a closed pool, wake every parked worker so they can exit.
    fn complete(&self) {
        let mut guard = self.state.lock();
        guard.in_flight -= 1;
        let drained = guard.closed && guard.in_flight == 0;
        if drained {
            self.notify.notify_all();
        }
    }
}
/// cheap-to-clone handle that lets a job handler enqueue further jobs.
/// passed by reference to the handler closure.
pub struct Submitter<J> {
/// the pool state this handle submits into; cloning the handle only
/// clones this `Arc`.
inner: Arc<Inner<J>>,
}
impl<J> Clone for Submitter<J> {
fn clone(&self) -> Self {
Submitter {
inner: self.inner.clone(),
}
}
}
impl<J> Submitter<J> {
pub fn submit(&self, job: J) {
self.inner.submit(job);
}
}
/// BFS-queue worker pool. each worker pulls a job, calls the handler
/// (which may submit further jobs via the passed `Submitter`), then marks
/// the job complete. when in-flight reaches zero the pool shuts down and
/// `wait` returns.
pub struct ScrapePool<J> {
/// queue and counters shared with every worker thread.
inner: Arc<Inner<J>>,
/// join handles of the spawned workers; joined (and consumed) by `wait`.
workers: Vec<JoinHandle<()>>,
}
impl<J: Send + 'static> ScrapePool<J> {
/// spawn `num_workers` threads that run `handler` on each job pulled
/// from the queue. the handler receives the job by value and a
/// `&Submitter` for enqueuing children.
pub fn new<F>(num_workers: usize, handler: F) -> Self
where
F: Fn(J, &Submitter<J>) + Send + Sync + 'static,
{
let inner = Arc::new(Inner {
state: Mutex::new(State {
queue: VecDeque::new(),
in_flight: 0,
closed: false,
}),
notify: Condvar::new(),
});
let handler = Arc::new(handler);
let workers = (0..num_workers.max(1))
.map(|_| {
let inner = inner.clone();
let handler = handler.clone();
thread::spawn(move || {
let submitter = Submitter {
inner: inner.clone(),
};
while let Some(job) = inner.next() {
handler(job, &submitter);
inner.complete();
}
})
})
.collect();
ScrapePool { inner, workers }
}
/// submit a top-level job. typically called by the orchestrating
/// thread before `wait`; handlers should use `Submitter::submit`.
pub fn submit(&self, job: J) {
self.inner.submit(job);
}
/// block until all jobs (initial + transitively discovered) have
/// completed, then join every worker thread.
pub fn wait(self) {
{
let mut state = self.inner.state.lock();
state.closed = true;
// Wake workers so they can either drain queued work or exit if
// the pool was empty. The close flag is guarded by this same lock,
// so this cannot race with a worker entering the condvar wait.
self.inner.notify.notify_all();
}
for w in self.workers {
let _ = w.join();
}
}
}
// unit tests for the pool's scheduling and shutdown behaviour.
#[cfg(test)]
mod tests {
use super::*;
use std::sync::atomic::{AtomicUsize, Ordering};
use std::time::Duration;
// 100 independent jobs on four workers: every job must reach the handler
// exactly once.
#[test]
fn flat_jobs_processed_once_each() {
let collected: Arc<Mutex<Vec<u32>>> = Arc::new(Mutex::new(Vec::new()));
let pool = ScrapePool::new(4, {
let collected = collected.clone();
move |n: u32, _: &Submitter<u32>| {
collected.lock().push(n);
}
});
for i in 0..100u32 {
pool.submit(i);
}
pool.wait();
// completion order is not deterministic, so compare sorted
let mut got = collected.lock().clone();
got.sort();
assert_eq!(got, (0..100).collect::<Vec<_>>());
}
// jobs submitted by the handler itself must also run before wait() returns.
#[test]
fn discovered_children_processed_to_completion() {
// BFS expansion: every odd number under 10 spawns its successor.
let collected: Arc<Mutex<Vec<u32>>> = Arc::new(Mutex::new(Vec::new()));
let pool = ScrapePool::new(2, {
let collected = collected.clone();
move |n: u32, sub: &Submitter<u32>| {
collected.lock().push(n);
if n < 10 && n % 2 == 1 {
sub.submit(n + 1);
}
}
});
for i in [1u32, 3, 5, 7, 9] {
pool.submit(i);
}
pool.wait();
let mut got = collected.lock().clone();
got.sort();
assert_eq!(got, vec![1, 2, 3, 4, 5, 6, 7, 8, 9, 10]);
}
// the queue running dry before wait() is called must not make the worker
// exit: a job submitted afterwards still has to be processed.
#[test]
fn transient_empty_queue_before_wait_does_not_stop_workers() {
let processed = Arc::new(AtomicUsize::new(0));
let pool = ScrapePool::new(1, {
let processed = processed.clone();
move |_: u32, _: &Submitter<u32>| {
processed.fetch_add(1, Ordering::SeqCst);
}
});
pool.submit(1);
// spin until the first job has been handled
while processed.load(Ordering::SeqCst) == 0 {
thread::yield_now();
}
// give the worker time to go back to an empty queue
thread::sleep(Duration::from_millis(10));
pool.submit(2);
pool.wait();
assert_eq!(processed.load(Ordering::SeqCst), 2);
}
// closing a pool that never received a job must not hang.
#[test]
fn wait_with_no_jobs_returns_immediately() {
let pool: ScrapePool<()> = ScrapePool::new(2, |_, _| {});
pool.wait();
}
}

657
src/store.rs Normal file
View file

@ -0,0 +1,657 @@
//! filesystem store for parsed completion data.
//!
//! write side: serialize ManpageResult to JSON, derive sanitised
//! filenames from command names ("git add" → git_add.json).
//!
//! read side: look up a command by name across the user cache + system
//! dirs, deserialize JSON or parse a .nu extern blob back into a result.
use std::collections::HashMap;
use std::fs;
use std::io;
use std::path::{Path, PathBuf};
use serde_json::Value;
use crate::parsers::manpage::{
ManpageEntry, ManpageResult, ManpageSubcommand, OwnedParam, OwnedSwitch,
};
use crate::types::Positional;
/// resolve the default cache directory.
/// uses `$XDG_CACHE_HOME/inshellah` when that variable is set and non-empty,
/// otherwise `$HOME/.cache/inshellah`, and as a last resort the relative
/// path `.cache/inshellah`.
pub fn default_store_path() -> PathBuf {
    const APP_DIR: &str = "inshellah";
    let xdg_cache = std::env::var("XDG_CACHE_HOME")
        .ok()
        .filter(|dir| !dir.is_empty());
    if let Some(dir) = xdg_cache {
        return PathBuf::from(dir).join(APP_DIR);
    }
    match std::env::var("HOME") {
        Ok(home) => PathBuf::from(home).join(".cache/inshellah"),
        Err(_) => PathBuf::from(".cache/inshellah"),
    }
}
/// make sure `dir` exists, creating it and any missing parents.
pub fn ensure_dir(dir: &Path) -> io::Result<()> {
    fs::create_dir_all(dir)?;
    Ok(())
}
/// turn a command name into a filesystem-safe file stem.
/// ASCII letters, digits, '-', '_' and '.' are kept; every other character,
/// including the space between a command and its subcommand, becomes '_'
/// ("git add" → "git_add").
pub fn filename_of_command(cmd: &str) -> String {
    let mut safe = String::with_capacity(cmd.len());
    for ch in cmd.chars() {
        let keep = ch.is_ascii_alphanumeric() || matches!(ch, '-' | '_' | '.');
        safe.push(if keep { ch } else { '_' });
    }
    safe
}
/// map a file stem back to a command name: "git_add" → "git add".
/// every underscore becomes a space, so a name that really contained an
/// underscore does not round-trip; that is acceptable since the read
/// side is only used for display.
pub fn command_of_filename(base: &str) -> String {
    base.chars()
        .map(|ch| if ch == '_' { ' ' } else { ch })
        .collect()
}
/// escape `s` for use inside a JSON string literal (without the surrounding
/// quotes). quote, backslash and the common control characters get their
/// two-character escapes; any other character below U+0020 is written as
/// `\u00XX`; everything else is copied unchanged.
fn escape_json(s: &str) -> String {
    let mut out = String::with_capacity(s.len() + 2);
    for ch in s.chars() {
        let short_escape = match ch {
            '"' => Some("\\\""),
            '\\' => Some("\\\\"),
            '\n' => Some("\\n"),
            '\r' => Some("\\r"),
            '\t' => Some("\\t"),
            '\x08' => Some("\\b"),
            '\x0c' => Some("\\f"),
            _ => None,
        };
        match short_escape {
            Some(escaped) => out.push_str(escaped),
            None if u32::from(ch) < 0x20 => {
                out.push_str(&format!("\\u{:04x}", u32::from(ch)));
            }
            None => out.push(ch),
        }
    }
    out
}
/// render `s` as a complete JSON string literal: escaped and wrapped in
/// double quotes.
fn json_string(s: &str) -> String {
    let body = escape_json(s);
    let mut quoted = String::with_capacity(body.len() + 2);
    quoted.push('"');
    quoted.push_str(&body);
    quoted.push('"');
    quoted
}
/// serialize a switch as a JSON object tagged by "type" ("short", "long"
/// or "both"), carrying "char" and/or "name" as applicable.
fn json_switch(s: &OwnedSwitch) -> String {
    let short_json = |c: &char| json_string(&c.to_string());
    match s {
        OwnedSwitch::Both(c, l) => {
            let (ch, name) = (short_json(c), json_string(l));
            format!(r#"{{"type":"both","char":{ch},"name":{name}}}"#)
        }
        OwnedSwitch::Long(l) => {
            let name = json_string(l);
            format!(r#"{{"type":"long","name":{name}}}"#)
        }
        OwnedSwitch::Short(c) => {
            let ch = short_json(c);
            format!(r#"{{"type":"short","char":{ch}}}"#)
        }
    }
}
/// serialize an optional flag parameter: `null` when absent, otherwise an
/// object with "kind" ("mandatory" or "optional") and "name".
fn json_param(p: &Option<OwnedParam>) -> String {
    let (kind, name) = match p {
        Some(OwnedParam::Mandatory(n)) => ("mandatory", n),
        Some(OwnedParam::Optional(n)) => ("optional", n),
        None => return "null".to_string(),
    };
    format!(r#"{{"kind":"{kind}","name":{}}}"#, json_string(name))
}
/// serialize one flag entry as `{"switch":…,"param":…,"desc":…}`.
fn json_entry(e: &ManpageEntry) -> String {
    let switch = json_switch(&e.switch);
    let param = json_param(&e.param);
    let desc = json_string(&e.desc);
    format!(r#"{{"switch":{switch},"param":{param},"desc":{desc}}}"#)
}
/// serialize one subcommand as `{"name":…,"desc":…}`.
fn json_subcommand(sc: &ManpageSubcommand) -> String {
    let name = json_string(&sc.name);
    let desc = json_string(&sc.desc);
    format!(r#"{{"name":{name},"desc":{desc}}}"#)
}
/// serialize one positional as `{"name":…,"optional":…,"variadic":…}`,
/// with the two flags written as JSON booleans.
fn json_positional(name: &str, p: &Positional) -> String {
    let quoted = json_string(name);
    let (optional, variadic) = (p.optional, p.variadic);
    format!(r#"{{"name":{quoted},"optional":{optional},"variadic":{variadic}}}"#)
}
/// render `items` as a JSON array, using `f` to serialize each element.
/// elements are separated by a bare comma; an empty slice yields "[]".
fn json_list<T, F: Fn(&T) -> String>(items: &[T], f: F) -> String {
    let mut out = String::from("[");
    for (index, item) in items.iter().enumerate() {
        if index > 0 {
            out.push(',');
        }
        out.push_str(&f(item));
    }
    out.push(']');
    out
}
/// serialize a ManpageResult to JSON:
/// {"source":..., "description":..., "entries":[...],
/// "subcommands":[...], "positionals":[...]}
pub fn json_of_result(source: &str, result: &ManpageResult) -> String {
let entries = json_list(&result.entries, json_entry);
let subcommands = json_list(&result.subcommands, json_subcommand);
let positionals_parts: Vec<String> = result
.positionals
.iter()
.map(|(name, p)| json_positional(name, p))
.collect();
let positionals = format!("[{}]", positionals_parts.join(","));
format!(
r#"{{"source":{},"description":{},"entries":{},"subcommands":{},"positionals":{}}}"#,
json_string(source),
json_string(&result.description),
entries,
subcommands,
positionals,
)
}
/// write `contents` to `path`, first creating the parent directory chain
/// when the path has a parent component. an existing file is replaced.
pub fn write_file(path: &Path, contents: &str) -> io::Result<()> {
    path.parent()
        .map_or(Ok(()), |parent| fs::create_dir_all(parent))?;
    fs::write(path, contents)
}
/// store the parsed result for `command` as `<dir>/<sanitised name>.json`,
/// recording `source` inside the JSON.
pub fn write_result(
    dir: &Path,
    command: &str,
    source: &str,
    result: &ManpageResult,
) -> io::Result<()> {
    let file_name = format!("{}.json", filename_of_command(command));
    let target = dir.join(file_name);
    let json = json_of_result(source, result);
    write_file(&target, &json)
}
/// store a native nushell completion blob (one the binary supplied itself)
/// verbatim as `<dir>/<sanitised name>.nu`.
pub fn write_native(dir: &Path, command: &str, data: &str) -> io::Result<()> {
    let file_name = format!("{}.nu", filename_of_command(command));
    let target = dir.join(file_name);
    write_file(&target, data)
}
// --- read side ---
/// read a whole file as UTF-8 text; any failure (missing file, permission,
/// invalid UTF-8) is reported as `None`.
fn read_file(path: &Path) -> Option<String> {
    match fs::read_to_string(path) {
        Ok(text) => Some(text),
        Err(_) => None,
    }
}
fn read_json_result(path: &Path) -> Option<(String, ManpageResult)> {
let data = read_file(path)?;
let v = serde_json::from_str::<Value>(&data).ok()?;
let source = v
.get("source")
.and_then(|x| x.as_str())
.unwrap_or("json")
.to_string();
Some((source, result_from_json(&v)))
}
/// decode a switch object written by the serializer. the "type" tag
/// selects the variant; "char" contributes its first character and "name"
/// the long name. any missing field, wrong JSON type or unknown tag
/// yields `None`.
fn switch_from_json(v: &Value) -> Option<OwnedSwitch> {
    fn text_field<'v>(v: &'v Value, key: &str) -> Option<&'v str> {
        v.get(key)?.as_str()
    }
    let short = || text_field(v, "char").and_then(|s| s.chars().next());
    let long = || text_field(v, "name").map(str::to_string);

    match text_field(v, "type")? {
        "short" => short().map(OwnedSwitch::Short),
        "long" => long().map(OwnedSwitch::Long),
        "both" => {
            let c = short()?;
            let n = long()?;
            Some(OwnedSwitch::Both(c, n))
        }
        _ => None,
    }
}
/// decode a flag parameter. JSON `null` means "no parameter"; otherwise
/// "kind" must be "mandatory" or "optional" and "name" must be a string,
/// or the result is `None`.
fn param_from_json(v: &Value) -> Option<OwnedParam> {
    if v.is_null() {
        return None;
    }
    let make: fn(String) -> OwnedParam = match v.get("kind")?.as_str()? {
        "mandatory" => OwnedParam::Mandatory,
        "optional" => OwnedParam::Optional,
        _ => return None,
    };
    let name = v.get("name")?.as_str()?;
    Some(make(name.to_string()))
}
fn entry_from_json(v: &Value) -> Option<ManpageEntry> {
let switch = switch_from_json(v.get("switch")?)?;
let param = v.get("param").and_then(param_from_json);
let desc = v
.get("desc")
.and_then(|d| d.as_str())
.unwrap_or("")
.to_string();
Some(ManpageEntry {
switch,
param,
desc,
})
}
fn subcommand_from_json(v: &Value) -> Option<ManpageSubcommand> {
let name = v.get("name")?.as_str()?.to_string();
let desc = v
.get("desc")
.and_then(|d| d.as_str())
.unwrap_or("")
.to_string();
Some(ManpageSubcommand { name, desc })
}
fn positional_from_json(v: &Value) -> Option<(String, Positional)> {
let name = v.get("name")?.as_str()?.to_string();
let optional = v.get("optional").and_then(|x| x.as_bool()).unwrap_or(false);
let variadic = v.get("variadic").and_then(|x| x.as_bool()).unwrap_or(false);
Some((name, Positional { optional, variadic }))
}
/// deserialize a JSON cache entry into ManpageResult.
pub fn result_from_json(v: &Value) -> ManpageResult {
let description = v
.get("description")
.and_then(|d| d.as_str())
.unwrap_or("")
.to_string();
let entries = v
.get("entries")
.and_then(|x| x.as_array())
.map(|arr| arr.iter().filter_map(entry_from_json).collect())
.unwrap_or_default();
let subcommands = v
.get("subcommands")
.and_then(|x| x.as_array())
.map(|arr| arr.iter().filter_map(subcommand_from_json).collect())
.unwrap_or_default();
let positionals = v
.get("positionals")
.and_then(|x| x.as_array())
.map(|arr| arr.iter().filter_map(positional_from_json).collect())
.unwrap_or_default();
ManpageResult {
entries,
subcommands,
positionals,
description,
}
}
/// parse nushell `export extern` blocks out of a .nu source file.
///
/// returns the result for the block whose name equals `target_cmd`: its
/// flags, positionals and description. every other block named
/// "`target_cmd` <word>" (exactly one extra word) is folded into the
/// subcommands list. when no block matches, the default (empty) result is
/// returned.
///
/// a block's description is taken from a "# " comment line directly above
/// its header; any other line in between discards the pending comment.
pub fn parse_nu_completions(target_cmd: &str, contents: &str) -> ManpageResult {
    let mut blocks: Vec<NuBlock> = Vec::new();
    let mut pending_desc = String::new();
    // Some(..) while we are between an extern header and its closing `]`
    let mut open: Option<NuBlock> = None;

    for raw in contents.split('\n') {
        let line = raw.trim();

        if let Some(mut block) = open.take() {
            if line.starts_with(']') {
                blocks.push(block);
            } else {
                let (param_part, desc) = match line.split_once('#') {
                    Some((before, after)) => (before.trim(), after.trim()),
                    None => (line, ""),
                };
                parse_nu_param_line_into(param_part, desc, &mut block);
                open = Some(block);
            }
            continue;
        }

        if let Some(comment) = line.strip_prefix("# ") {
            pending_desc = comment.trim().to_string();
            continue;
        }
        let header = if line.contains("export extern") {
            extract_extern_name(line)
        } else {
            None
        };
        match header {
            Some(cmd) => {
                open = Some(NuBlock {
                    cmd,
                    description: std::mem::take(&mut pending_desc),
                    ..Default::default()
                });
            }
            None => pending_desc.clear(),
        }
    }
    // keep a block that was still open at end of input
    blocks.extend(open);

    // find the block matching target_cmd
    let Some(matched) = blocks.iter().find(|b| b.cmd == target_cmd) else {
        return ManpageResult::default();
    };

    // collect immediate subcommands from other blocks ("target sub" pattern)
    let prefix = format!("{target_cmd} ");
    let subcommands: Vec<ManpageSubcommand> = blocks
        .iter()
        .filter_map(|b| {
            let suffix = b.cmd.strip_prefix(prefix.as_str())?;
            if suffix.is_empty() || suffix.contains(' ') {
                return None;
            }
            Some(ManpageSubcommand {
                name: suffix.to_string(),
                desc: b.description.clone(),
            })
        })
        .collect();

    ManpageResult {
        entries: matched.entries.clone(),
        subcommands,
        positionals: matched.positionals.clone(),
        description: matched.description.clone(),
    }
}
/// pull the command name out of an `export extern` header line.
/// a double-quoted name is returned without the quotes (`None` if the
/// closing quote is missing); a bare name is the leading run of ASCII
/// letters, digits, '_' and '-' (`None` if that run is empty). lines
/// without the keyword yield `None`.
fn extract_extern_name(line: &str) -> Option<String> {
    const KEYWORD: &str = "export extern";
    let (_, tail) = line.split_once(KEYWORD)?;
    let tail = tail.trim_start();
    match tail.strip_prefix('"') {
        Some(quoted) => quoted
            .split_once('"')
            .map(|(name, _)| name.to_string()),
        None => {
            let is_name_char = |c: char| c.is_ascii_alphanumeric() || c == '_' || c == '-';
            let end = tail
                .find(|c: char| !is_name_char(c))
                .unwrap_or(tail.len());
            (end > 0).then(|| tail[..end].to_string())
        }
    }
}
/// parse one line from inside an `extern` block and append what it
/// declares to `block`.
///
/// `param_part` is the line with any trailing `# comment` removed and
/// `desc` is that comment's text. recognised forms:
/// - long flag: `--name`, `--name: type`, `--name(-c)`, `--name(-c): type`
/// - short flag: `-c` or `-c: type`
/// - positional: `name: type`, `name?: type`, `...name: type`
///
/// lines shorter than two bytes and lines that fit none of the forms are
/// ignored. a positional whose name (compared ASCII case-insensitively)
/// is already present in the block is not added again.
fn parse_nu_param_line_into(param_part: &str, desc: &str, block: &mut NuBlock) {
    if param_part.len() < 2 {
        return;
    }
    if let Some(after) = param_part.strip_prefix("--") {
        // long flag: --name(-c): type or --name: type or --name
        let (name, mut rest) = split_at_non_name_char(after);
        if name.is_empty() {
            return;
        }
        let mut short: Option<char> = None;
        if let Some(after_open) = rest.strip_prefix("(-") {
            // accept exactly one character followed by ')'
            let mut chars = after_open.chars();
            if let (Some(c), Some(')')) = (chars.next(), chars.next()) {
                short = Some(c);
                rest = chars.as_str();
            }
        }
        let switch = match short {
            Some(c) => OwnedSwitch::Both(c, name.to_string()),
            None => OwnedSwitch::Long(name.to_string()),
        };
        block.entries.push(ManpageEntry {
            switch,
            param: parse_type_suffix(rest),
            desc: desc.to_string(),
        });
    } else if let Some(after_dash) = param_part.strip_prefix('-') {
        // short flag: -c or -c: type
        let mut chars = after_dash.chars();
        if let Some(c) = chars.next().filter(char::is_ascii_alphanumeric) {
            block.entries.push(ManpageEntry {
                switch: OwnedSwitch::Short(c),
                // keep the declared type so a typed short flag is not
                // read back as a plain switch
                param: parse_type_suffix(chars.as_str()),
                desc: desc.to_string(),
            });
        }
    } else {
        // positional: name: type or name?: type or ...name: type
        let variadic = param_part.starts_with("...");
        let after_prefix = if variadic {
            &param_part[3..]
        } else {
            param_part
        };
        let optional = after_prefix.contains('?');
        let name_end = after_prefix.find([':', '?']).unwrap_or(after_prefix.len());
        // nushell identifiers cannot contain hyphens
        let name = after_prefix[..name_end].trim().replace('-', "_");
        if name.is_empty() {
            return;
        }
        let duplicate = block
            .positionals
            .iter()
            .any(|(existing, _)| existing.eq_ignore_ascii_case(&name));
        if !duplicate {
            block.positionals.push((
                name,
                Positional {
                    // a rest parameter may always be omitted
                    optional: optional || variadic,
                    variadic,
                },
            ));
        }
    }
}
/// split `s` into its leading flag name and the remainder.
/// the name is the longest prefix made of ASCII letters, digits, '-' and
/// '_'; the remainder starts at the first other character (or is empty).
/// underscores count as name characters so that a flag such as
/// `--no_color` is not cut short at the underscore.
fn split_at_non_name_char(s: &str) -> (&str, &str) {
    let is_name_char = |c: char| c.is_ascii_alphanumeric() || c == '-' || c == '_';
    let end = s.find(|c: char| !is_name_char(c)).unwrap_or(s.len());
    s.split_at(end)
}
/// read a `: type` suffix and wrap the type name in an OwnedParam.
/// leading whitespace is allowed before and after the colon; the type name
/// is the run of ASCII letters that follows. returns `None` when there is
/// no colon or no letters after it. the result is always `Mandatory`
/// because the extern syntax has no separate optional-with-default form.
fn parse_type_suffix(s: &str) -> Option<OwnedParam> {
    let after_colon = s.trim_start().strip_prefix(':')?.trim_start();
    let type_name: String = after_colon
        .chars()
        .take_while(char::is_ascii_alphabetic)
        .collect();
    if type_name.is_empty() {
        None
    } else {
        Some(OwnedParam::Mandatory(type_name))
    }
}
/// one `export extern` block as read from a .nu file.
#[derive(Default)]
struct NuBlock {
/// command name from the block header, e.g. "git add".
cmd: String,
/// flags declared inside the block.
entries: Vec<ManpageEntry>,
/// positional parameters declared inside the block, in source order.
positionals: Vec<(String, Positional)>,
/// text of the "# " comment line directly above the header, if any.
description: String,
}
/// look up a command's parsed result across the store directories.
///
/// sources are tried in this order, each over all of `dirs` in turn:
/// 1. native nushell: the command's own .nu file, or — for "parent sub"
///    commands — the parent's .nu file when it yields a non-empty result
///    (clap-generated .nu files keep every extern block in one file)
/// 2. a JSON file whose "source" is anything other than "help"
/// 3. any JSON file
pub fn lookup(dirs: &[PathBuf], command: &str) -> Option<ManpageResult> {
    let base_name = filename_of_command(command);
    let parent_base = command
        .split_once(' ')
        .map(|(parent, _)| filename_of_command(parent));

    for directory in dirs {
        if let Some(data) = read_file(&directory.join(format!("{base_name}.nu"))) {
            return Some(parse_nu_completions(command, &data));
        }
        let parent_data = parent_base
            .as_ref()
            .and_then(|pb| read_file(&directory.join(format!("{pb}.nu"))));
        if let Some(data) = parent_data {
            let r = parse_nu_completions(command, &data);
            let nothing_found =
                r.entries.is_empty() && r.subcommands.is_empty() && r.positionals.is_empty();
            if !nothing_found {
                return Some(r);
            }
        }
    }

    let json_path = |dir: &PathBuf| dir.join(format!("{base_name}.json"));
    dirs.iter()
        .find_map(|dir| match read_json_result(&json_path(dir)) {
            Some((source, result)) if source != "help" => Some(result),
            _ => None,
        })
        .or_else(|| {
            dirs.iter()
                .find_map(|dir| read_json_result(&json_path(dir)).map(|(_, result)| result))
        })
}
/// return the stored file contents for a command without parsing them.
/// all directories are searched for a .nu file first; only if none is
/// readable are they searched again for a .json file.
pub fn lookup_raw(dirs: &[PathBuf], command: &str) -> Option<String> {
    let base_name = filename_of_command(command);
    ["nu", "json"].iter().find_map(|ext| {
        let file_name = format!("{base_name}.{ext}");
        dirs.iter()
            .find_map(|directory| read_file(&directory.join(&file_name)))
    })
}
/// strip a trailing ".json" or ".nu" from a file name (".json" is tried
/// first). returns `None` for any other name.
fn chop_extension(filename: &str) -> Option<&str> {
    for ext in [".json", ".nu"] {
        if let Some(stem) = filename.strip_suffix(ext) {
            return Some(stem);
        }
    }
    None
}
/// list all indexed commands across all store directories.
/// returns a sorted, deduplicated list of command names.
pub fn all_commands(dirs: &[PathBuf]) -> Vec<String> {
let mut out: std::collections::BTreeSet<String> = std::collections::BTreeSet::new();
for directory in dirs {
let Ok(entries) = fs::read_dir(directory) else {
continue;
};
for entry in entries.flatten() {
if let Some(name) = entry.file_name().to_str()
&& let Some(base) = chop_extension(name)
{
out.insert(command_of_filename(base));
}
}
}
out.into_iter().collect()
}
/// discover subcommands of a command by scanning filenames in the store
/// (e.g. for "git", finds "git_add.json", "git_log.json").
pub fn subcommands_of(dirs: &[PathBuf], command: &str) -> Vec<ManpageSubcommand> {
let prefix = format!("{}_", filename_of_command(command));
let mut seen: HashMap<String, ManpageSubcommand> = HashMap::new();
for directory in dirs {
let Ok(entries) = fs::read_dir(directory) else {
continue;
};
for entry in entries.flatten() {
let Some(filename) = entry.file_name().to_str().map(|s| s.to_string()) else {
continue;
};
if !filename.starts_with(&prefix) {
continue;
}
let is_json = filename.ends_with(".json");
let Some(base) = chop_extension(&filename) else {
continue;
};
let rest = &base[prefix.len()..];
if rest.is_empty() || rest.contains('_') {
continue;
}
if seen.contains_key(rest) {
continue;
}
let desc = if is_json {
read_file(&entry.path())
.and_then(|d| serde_json::from_str::<Value>(&d).ok())
.and_then(|v| {
v.get("description")
.and_then(|x| x.as_str())
.map(|s| s.to_string())
})
.unwrap_or_default()
} else {
String::new()
};
seen.insert(
rest.to_string(),
ManpageSubcommand {
name: rest.to_string(),
desc,
},
);
}
}
let mut out: Vec<ManpageSubcommand> = seen.into_values().collect();
out.sort_by(|a, b| a.name.cmp(&b.name));
out
}
/// determine how a command was indexed: "help", "manpage", "native", etc.
/// for JSON files, returns the "source" field. for .nu files, returns "native".
pub fn file_type_of(dirs: &[PathBuf], command: &str) -> Option<String> {
let base = filename_of_command(command);
for directory in dirs {
let nu_path = directory.join(format!("{base}.nu"));
if nu_path.exists() {
return Some("native".to_string());
}
}
for directory in dirs {
let json_path = directory.join(format!("{base}.json"));
if json_path.exists() {
return Some(
read_file(&json_path)
.and_then(|d| serde_json::from_str::<Value>(&d).ok())
.and_then(|v| v.get("source").and_then(|x| x.as_str()).map(String::from))
.unwrap_or_else(|| "json".to_string()),
);
}
}
None
}

34
src/types.rs Normal file
View file

@ -0,0 +1,34 @@
/// the name part of a flag entry; string data is borrowed with lifetime
/// `'a` (presumably from the help text being parsed — confirm against the
/// parser).
pub enum Switch<'a> {
/// only a single-character form is known.
Short(char),
/// only a long form is known.
Long(&'a str),
/// both a single-character form and a long form are known.
Both(char, &'a str),
}
/// the parameter attached to a flag, carrying the parameter's name as a
/// borrowed string.
pub enum Param<'a> {
/// the flag's parameter is required.
Mandatory(&'a str),
/// the flag's parameter may be omitted.
Optional(&'a str),
}
/// one parsed flag entry: its switch, optional parameter and description.
pub struct OptionEntry<'a> {
/// the flag name(s).
pub switch: Switch<'a>,
/// the flag's parameter, if it takes one.
pub param: Option<Param<'a>>,
/// description text, one element per non-blank line.
pub desc: Vec<&'a str>,
}
/// a subcommand name paired with its description, both borrowed.
pub struct Subcommand<'a> {
/// the subcommand's name.
pub name: &'a str,
/// the subcommand's description text.
pub desc: &'a str,
}
/// properties of a positional argument; the argument's name is stored
/// alongside this value by the containers that hold it.
#[derive(Debug, Clone)]
pub struct Positional {
/// the argument may be omitted.
pub optional: bool,
/// the argument accepts any number of values (a "rest" parameter).
pub variadic: bool,
}
/// borrowed collection of everything extracted for one command: flags,
/// subcommands, positionals and a description.
pub struct HelpResult<'a> {
/// flag entries.
pub entries: Vec<OptionEntry<'a>>,
/// subcommands.
pub subcommands: Vec<Subcommand<'a>>,
/// positional arguments as (name, properties) pairs.
pub positionals: Vec<(&'a str, Positional)>,
/// the command's description text.
pub desc: &'a str,
}