#!/usr/bin/perl -w

# Generate a markdown help page from --help and --version output.

# Copyright (C) 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2009,
# 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2020, 2021 Free Software
# Foundation, Inc.
# Copyright (C) 2021 SIL Global

# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3, or (at your option)
# any later version.

# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.

# You should have received a copy of the GNU General Public License
# along with this program; if not, see <http://www.gnu.org/licenses/>.

# Based on help2man

use 5.008;
use strict;
use Getopt::Long;
use Text::ParseWords qw(shellwords);
use Text::Tabs qw(expand);
use POSIX qw(strftime setlocale LC_ALL);
use Locale::gettext qw(gettext);
use Encode qw(decode encode);
use I18N::Langinfo qw(langinfo CODESET);

my $this_program = 'help2md';
my $this_version = '1.48.4';
my $encoding;

{
    # Message catalogue holding this program's own translations.
    my $gettext = Locale::gettext->domain($this_program);

    # Look up the translation of a message.
    sub _
    {
        my ($msgid) = @_;
        return $gettext->get($msgid);
    }

    # Locale of the invoking user: the first of these environment
    # variables that is set and non-empty, falling back to 'C'.
    my $user_locale = 'C';
    for my $name (qw(LANGUAGE LC_ALL LC_MESSAGES LANG))
    {
        my $value = $ENV{$name};
        next unless defined $value and length $value;
        $user_locale = $value;
        last;
    }

    # Character set reported by langinfo when the script starts.
    my $user_encoding = langinfo(CODESET);

    # Set localisation of date and executable's output.  Takes the locale
    # name (default 'C'), exports it as LC_ALL and records the resulting
    # character set in $encoding.
    sub configure_locale
    {
        my $locale = shift || 'C';

        delete @ENV{qw(LANGUAGE LC_MESSAGES LANG)};
        $ENV{LC_ALL} = $locale;
        setlocale(LC_ALL, $locale);
        $encoding = langinfo(CODESET);
    }

    # Decode a byte string from $encoding; pass-through when unset.
    sub dec
    {
        return $_[0] unless $encoding;
        return decode($encoding, $_[0]);
    }

    # Encode a character string to $encoding; pass-through when unset.
    sub enc
    {
        return $_[0] unless $encoding;
        return encode($encoding, $_[0]);
    }

    # Encode a character string for the invoking user's character set.
    sub enc_user
    {
        return encode($user_encoding, $_[0]);
    }

    # Die with a message formatted in the invoking user's locale.  The
    # first argument is a message format (translated here), the rest are
    # its sprintf arguments.
    sub kark
    {
        my ($msgid, @args) = @_;

        setlocale(LC_ALL, $user_locale);
        my $format = $gettext->get($msgid);
        die enc_user(sprintf($format, @args)), "\n";
    }
}

# Return the argument unchanged.  Used to wrap message formats that are
# handed to kark, which performs the catalogue lookup itself.
sub N_
{
    my ($msgid) = @_;
    return $msgid;
}

# Forward declarations: both subs are defined later in the file.
sub program_basename;
sub get_option_value;

# Text printed for --version, translated and encoded for the user.
my $version_info = enc_user(sprintf(_(<<'EOT'), $this_program, $this_version));
%s %s

Copyright (C) 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2009,
2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2020, 2021 Free Software
Foundation, Inc.
Copyright (C) 2021 SIL Global

This is free software; see the source for copying conditions.  There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
EOT

# Text printed for --help (and on a usage error), translated and encoded
# for the user.
my $help_info = enc_user(sprintf(_(<<'EOT'), $this_program, $this_program));
`%s' generates a markdown help page out of `--help' output.

Usage: %s [OPTION]... EXECUTABLE

 -n, --name=STRING       description for the Summary paragraph
 -L, --locale=STRING     select locale (default "C")
 -i, --include=FILE      include material from `FILE'
 -I, --opt-include=FILE  include material from `FILE' if it exists
 -o, --output=FILE       send output to `FILE'
     --help              print this help, then exit
     --version           print version number, then exit

EXECUTABLE should accept `--help' option and produce output on
stdout although alternatives may be specified using:

 -h, --help-option=STRING     help option string
 --no-discard-stderr          include stderr when parsing option output

EOT

# Settings with their defaults; the command line (or option lines in an
# include file) may override them.
my $help_option = '--help';
my $discard_stderr = 1;
my ($opt_name, @opt_include, $opt_output);

# Getopt::Long specification.  Callbacks receive the option value as
# their last argument.  Each include is queued as [ file, required ].
my %opt_def = (
    'n|name=s'         => \$opt_name,
    'L|locale=s'       => sub { configure_locale($_[-1]) },
    'i|include=s'      => sub { push @opt_include, [ $_[-1], 1 ] },
    'I|opt-include=s'  => sub { push @opt_include, [ $_[-1], 0 ] },
    'o|output=s'       => \$opt_output,
    'help'             => sub { print $help_info; exit; },
    'version'          => sub { print $version_info; exit; },
    'h|help-option=s'  => \$help_option,
    'discard-stderr!'  => \$discard_stderr,
);

# Parse options; exactly one EXECUTABLE argument must remain.
Getopt::Long::config('bundling');
die $help_info if !GetOptions(%opt_def) || @ARGV != 1;

# Fall back to the default locale when --locale was not given.
configure_locale() unless $encoding;

my %include = ();
my %replace = ();
my %append = ();
my %append_match = ();
my @sections = ();  # retain order of include file or in-line *section*s

# Process include file (if given).  Format is:
#
#   Optional initial text, ignored.  May include lines starting with `-'
#   which are processed as options.
#
#   [section]
#   Verbatim text to be included in the named section.  By default at
#   the start, but in the case of `summary' and `syntax' the content
#   will replace the autogenerated contents.
#
#   [<section]
#   Verbatim text to be inserted at the start of the named section.
#
#   [=section]
#   Verbatim text to replace the named section.
#
#   [>section]
#   Verbatim text to be appended to the end of the named section.
#
#   /pattern/
#   Verbatim text for inclusion below a paragraph matching `pattern'.
#

while (@opt_include)
{
    my ($inc, $required) = @{shift @opt_include};

    # Optional includes (-I) are skipped silently when the file is absent.
    next unless -f $inc or $required;

    # Three-argument open with a lexical handle: the file name is taken
    # literally, so characters such as `|', `<' or `>' in it can never be
    # interpreted as an open mode or as a command to run.
    open my $inc_fh, '<', $inc
        or kark(N_("%s: can't open `%s' (%s)"), $this_program, $inc, $!);

    my $key;   # current section name, or compiled /pattern/
    my $hash;  # hash that receives the text for $key

    while (<$inc_fh>)
    {
        # Convert input to internal Perl format, so that multibyte
        # sequences are treated as single characters.
        $_ = dec $_;

        # [section]
        if (/^\[([^]]+)\]\s*$/)
        {
            $key = $1;
            $key =~ s/^\s+//;
            $key =~ s/\s+$//;

            # Split off an explicit [<section], [=section] or [>section]
            # modifier first, so that the case normalisation below always
            # sees the section name itself, even when whitespace follows
            # the modifier.
            my $modifier = $key =~ s/^([<>=])\s*// ? $1 : '';

            # Normalise case: first character upper, the rest lower.
            $key =~ s/(.)(.+)/\u$1\L$2/;

            if    ($modifier eq '>') { $hash = \%append; }
            elsif ($modifier eq '=') { $hash = \%replace; }
            elsif ($modifier eq '<') { $hash = \%include; }
            # Summary/Syntax replace by default
            elsif ($key eq _('Summary') or $key eq _('Syntax'))
            {
                $hash = \%replace;
            }
            else
            {
                $hash = \%include;
            }

            push @sections, $key;
            next;
        }

        # /pattern/
        if (m!^/(.*)/([ims]*)\s*$!)
        {
            my $pat = $2 ? "(?$2)$1" : $1;

            # Check pattern.
            eval { $key = qr($pat) };
            if ($@)
            {
                # Drop perl's own " at FILE line N" suffix and report the
                # include file position instead.
                $@ =~ s/ at .*? line \d.*//;
                die "$inc:$.:$@";
            }

            $hash = \%append_match;
            next;
        }

        # Check for options before the first section--anything else is
        # silently ignored, allowing the first for comments and
        # revision info.
        unless ($key)
        {
            # handle options
            if (/^-/)
            {
                local @ARGV = shellwords $_;
                GetOptions %opt_def;
            }

            next;
        }

        # Entries such as "name(1)" in the See also section become
        # markdown list links; every entry after the first is preceded
        # by a line break.
        if ($key eq _('See also'))
        {
            s/^(.+)\(\d\)/- [`$1`]($1)/;
            s/^/ <br>\n/ if ($hash->{$key});
        }

        $hash->{$key} .= $_;
    }

    close $inc_fh;

    kark(N_("%s: no valid information found in `%s'"), $this_program, $inc)
        unless $key;
}

# Collapse any run of newlines at the end of each collected text into a
# single newline.
for my $hash (\%include, \%replace, \%append, \%append_match)
{
    s/\n+$/\n/ for values %$hash;
}

# Run the executable with the help option and keep what it prints.
my $help_text = get_option_value($ARGV[0], $help_option);

# The page title defaults to the executable's basename.
my $program = program_basename($ARGV[0]);
my ($package, $version) = ($program);

# Redirect STDOUT to the file given with --output, replacing any
# existing file of that name.
if ($opt_output)
{
    unlink $opt_output or kark N_("%s: can't unlink %s (%s)"),
        $this_program, $opt_output, $! if -e $opt_output;

    # Three-argument open: the file name is used literally, so leading
    # `>' or `&' characters and surrounding whitespace in it are not
    # interpreted as part of the open mode.
    open STDOUT, '>', $opt_output
        or kark N_("%s: can't create %s (%s)"), $this_program, $opt_output, $!;
}

# --name overrides --include contents.
$replace{_('Summary')} = "$opt_name\n" if $opt_name;

# When a Summary of the form "name[, name...] - description" is
# available, use the first name given as $program.
{
    my $summary = $replace{_('Summary')} || ($include{_('Summary')} ||= '');

    if ($summary and $summary =~ /^([^\s,]+)(?:,?\s*[^\s,\\-]+)*\s+\\?-/)
    {
        $program = $1;
    }
}

# Man pages traditionally have the page title in caps.
my $PROGRAM = uc $program;

# Process text, initial section is Description.
my $sect = _('Description');

# Extract usage clause(s) [if any] for Syntax.
my $PAT_USAGE = _('Usage');
my $PAT_USAGE_CONT = _('or');

# A usage clause is "Usage: PROG ARGS", optionally followed by one
# indented continuation line of ARGS, then by any number of alternative
# forms ("or: PROG ARGS", or PROG repeated after six spaces).
#   $1 program name        $2 rest of the first line
#   $4 continuation line   $5 all alternative lines
# The look-ahead keeps the continuation group from swallowing a line
# that is really an alternative form.
if ($help_text =~ s/^(?i:$PAT_USAGE): +(\S+)(.*)((?:\n +)(?!\1(?!\S)|(?:$PAT_USAGE_CONT): )(\S.*))?((?:\n(?: {6}\1| *($PAT_USAGE_CONT): +\S).*)*)//om)
{
    my ($usage_prog, $usage_args, $usage_cont, $usage_alts) = ($1, $2, $4, $5);

    # First form: program name plus arguments, with the continuation
    # line (if any) folded onto the same line.
    my @syn = $usage_prog . $usage_args;
    $syn[0] .= " $usage_cont" if defined $usage_cont;

    # Alternative forms, one per line, with any leading "or:" removed.
    if (defined $usage_alts and length $usage_alts)
    {
        $usage_alts =~ s/^\n//;
        for my $alt (split /\n/, $usage_alts)
        {
            $alt =~ s/^ *(?:(?:$PAT_USAGE_CONT): +)?//o;
            push @syn, $alt;
        }
    }

    # All forms share one fenced code block, one form per line, so that
    # the single closing fence added below (or by the paragraph loop
    # when more Syntax text follows) always balances it.
    my $syntax = "```bash\n";
    for my $usage (@syn)
    {
        $usage =~ s!^\S*/!!;    # drop any directory part of the program
        $usage =~ s/\s+$//;

        # Command and arguments are always separated by a single space.
        my ($command, $arguments) = split ' ', $usage, 2;
        next unless defined $command;
        $syntax .= defined $arguments ? "$command $arguments\n" : "$command\n";
    }

    if ($help_text =~ /^\n\n/o)
    {
        # A blank line follows the usage clause: the code block is
        # complete and the following text belongs to Description.
        $sect = _('Description');
        $syntax .= "```\n\n";
    }
    else
    {
        # More text follows immediately: it is collected into Syntax and
        # the fence is closed once that paragraph ends.
        $sect = _('Syntax');
    }

    $include{_('Syntax')} .= $syntax;
}

# The remaining help text is processed in $_ from here on.
$_ = "$help_text\n\n";
my $args = '';

# Normalise paragraph breaks: no leading blank lines, a single trailing
# newline, and exactly one blank line between paragraphs.
s/^\n+//;
s/\n*$/\n/;
s/\n{2,}/\n\n/g;

# Join hyphenated lines.
s/([A-Za-z])-\n *([A-Za-z])/$1$2/g;

# Replace < and > with `
tr/<>/``/;

# Heading texts recognised in the help output, looked up through
# gettext.  Each one is interpolated into a regular expression below, so
# a translation may separate alternatives with "|".
my $PAT_OPTIONS         = _('Options');
my $PAT_ENVIRONMENT     = _('Environment');
my $PAT_FILES           = _('Files');
my $PAT_POSITIONAL_ARGS = _('positional arguments:');
my $PAT_OPTIONAL_ARGS   = _('optional arguments:');
# Walk the help text in $_ one paragraph (or option entry) at a time.
# Each pass removes the text it recognises from the front of $_ and
# appends the generated output to $include{$sect}; the loop ends once $_
# is empty.
while (length)
{
    # Convert some standard paragraph names.  A recognised heading only
    # switches the current section; its text is discarded.
    if (s/^($PAT_OPTIONS): *\n+//o)
    {
        $sect = _('OPTIONS');
        next;
    }
    if (s/^($PAT_ENVIRONMENT): *\n+//o)
    {
        $sect = _('ENVIRONMENT');
        next;
    }
    if (s/^($PAT_FILES): *\n+//o)
    {
        $sect = _('FILES');
        next;
    }

    # Custom section indicated by a line containing "*Section Name*".
    # The name is upper-cased and remembered in @sections so that the
    # output keeps the order in which sections were seen.
    if (s/^\*(\w(.*\w)?)\* *\n+//)
    {
        $sect = uc $1;
        $sect =~ tr/*/ /;  # also accept *Section*Name*
        push @sections, $sect;
        next;
    }

    # Arguments: these headings switch to the Parameters section and
    # record in $args which kind of argument follows.  There is no
    # `next' here, so the text after the heading is handled in this
    # same pass.
    if (s/^($PAT_POSITIONAL_ARGS)\n//o)
    {
        $sect = _('Parameters');
        $args = 'pos';
    }
    elsif (s/^($PAT_OPTIONAL_ARGS)\n//o)
    {
        $sect = _('Parameters');
        $args = 'opt';
    }

    # Raw text consumed in this pass; only collected when /pattern/
    # includes exist, for the match check at the end of the loop.
    my $matched = '';

    # Sub-sections have a trailing colon and the second line indented.
    # The heading is emitted as a level-4 markdown heading; the single
    # space put back keeps the following line indented.
    if (s/^(\S.*:) *\n / /)
    {
        $matched .= $& if %append_match;
        $include{$sect} .= qq(#### $1\n);
    }

    # $indent is the column at which continuation lines of the current
    # entry are expected; 0 means continuation lines start at column 0.
    my $indent = 0;
    my $content = '';

    # Parameter entry (Parameters section only): one or two names, each
    # optionally followed by an argument, then optional help text.
    #   $1 names plus padding   $2 first name   $5 second name
    #   $8 help text on the same line
    if ($sect eq _('Parameters') && s/^(\s*(-*[a-zA-Z0-9]+( [^, ]+)?)(, (-+[a-zA-Z0-9]+( \S+)?))?([ \t]*))(.+)?\n//)
    {
        my $firstParam = $2;
        my $secondParam = $5;
        my $helpText = $8 || "";

        $matched .= $& if %append_match;
        # Help text continues at the column where it started; when the
        # names stand alone on their line, column 24 is assumed.
        $indent = length($1);
        $indent = 24 unless $8;

        $content .= "`$firstParam`";
        $content .= ", `$secondParam`" if ($secondParam);
        $content .= " <br>\n";
        $content .= "__optional__ <br>\n" if ($args eq 'opt');
        $content .= "$helpText\n" if $helpText;
    }

    # Option with description.
    # NOTE(review): `.TP', `.HP' and the \x84 byte used in the branches
    # below look like roff markup; nothing in this loop rewrites them
    # before $content is stored, so they appear to reach the markdown
    # output unchanged -- confirm this is intended.
    elsif (s/^( {1,10}([+-]\S.*?))(?:(  +(?!-))|\n( {20,}))(\S.*)\n//)
    {
        $matched .= $& if %append_match;
        $indent = length ($4 || "$1$3");
        $content = ".TP\n\x84$2\n\x84$5\n";
        unless ($4)
        {
            # Indent may be different on second line.
            $indent = length $& if /^ {20,}/;
        }
    }

    # Option without description.
    elsif (s/^ {1,10}([+-]\S.*)\n//)
    {
        $matched .= $& if %append_match;
        $content = ".HP\n\x84$1\n";
        $indent = 80; # not continued
    }

    # Indented paragraph with tag.
    elsif (s/^( +(\S.*?))(?:(  +)|\n( {20,}))(\S.*)\n//)
    {
        $matched .= $& if %append_match;
        $indent = length ($4 || "$1$3");
        $content = ".TP\n\x84$2\n\x84$5\n";
    }

    # Indented paragraph: emitted with a four-space indent.
    elsif (s/^( +)(\S.*)\n//)
    {
        $matched .= $& if %append_match;
        $indent = length $1;
        $content = "    $2\n";
    }

    # Left justified paragraph; separated from earlier text in the
    # section by a blank line.
    else
    {
        s/(.*)\n//;
        $matched .= $& if %append_match;
        $content = "\n" if $include{$sect};
        $content .= "$1\n";
    }

    # Both branches collect continuation lines in the same way; the
    # Parameters branch additionally ends the entry with a blank line.
    if ($sect eq _('Parameters'))
    {
        # Append continuations.
        while ($indent ? s/^ {$indent}(\S.*)\n// : s/^(\S.*)\n//)
        {
            $matched .= $& if %append_match;
            $content .= "$1\n";
        }
        $content .= "\n";
    }
    else
    {
        # Append continuations.
        while ($indent ? s/^ {$indent}(\S.*)\n// : s/^(\S.*)\n//)
        {
            $matched .= $& if %append_match;
            $content .= "$1\n";
        }
    }

    # In the Syntax section a blank line ends the usage text: append
    # this paragraph, close the code fence and continue in Description.
    # Note that this path skips the filename and /pattern/ handling
    # below.
    if (/^\n/o && $sect eq _('Syntax'))
    {
        $content =~ s/\n$//;
        $include{$sect} .= "$content\n```\n\n";
        $sect = _('Description') ;
        next;
    }

    # Move to next paragraph.
    s/^\n+//;

    for ($content)
    {
        # Put filenames in backticks: /a/b, $VAR/c/d, ~/e/f
        s!
            (^|[ (])           # space/punctuation before
            (
                (?:\$\w+|~)?   # leading variable, or tilde
                (?:/[\w.-]+)+  # path components
            )
            ($|[ ,;.)])        # space/punctuation after
        !$1`$2`$3!xmg
    }

    # Check if matched paragraph contains /pat/.  The keys of
    # %append_match are the patterns from the include files; the text
    # stored for every pattern that matches is appended, after a blank
    # line unless that text starts with a dot.
    if (%append_match)
    {
        for my $pat (keys %append_match)
        {
            if ($matched =~ $pat)
            {
                $content .= "\n\n" unless $append_match{$pat} =~ /^\./;
                $content .= $append_match{$pat};
            }
        }
    }

    $include{$sect} .= $content;
}

# Append additional text ([>section] entries) to the collected sections.
$include{$_} .= $append{$_} for keys %append;

# Replace sections: [=section] entries, Summary/Syntax entries and
# --name take the place of whatever was collected.
@include{keys %replace} = values %replace;

# Output header: front matter carrying the title, then a marker comment
# naming the generator.
print <<"HEADER";
---
title: $program
---
<!-- DO NOT MODIFY THIS FILE!  It was generated by $this_program $this_version. -->
HEADER

# Section ordering: @pre sections come first and @post sections last;
# everything else keeps the order recorded in @sections.
my @pre = (_('Summary'), _('Syntax'), _('Parameters'), _('Description'), _('OPTIONS'));
my @post = (_('ENVIRONMENT'), _('FILES'), _('SEE ALSO'));
my %filter;
$filter{$_} = 1 for @pre, @post;

# Output content.
my %done;
my @ordered = (@pre, (grep { !$filter{$_} } @sections), @post);
for my $sect (@ordered)
{
    next if $done{$sect}++;  # ignore duplicates
    next unless $include{$sect};

    # Parameters is printed one heading level below the other sections.
    my $marker = $sect eq _('Parameters') ? '###' : '##';
    print enc("$marker $sect\n\n");
    print enc($include{$sect});
}

# Errors from buffered writes only surface when the handle is closed.
unless (close STDOUT)
{
    kark(N_("%s: error writing to %s (%s)"), $this_program,
        $opt_output || 'stdout', $!);
}

exit;

# Return the given path with everything up to and including the last
# slash removed.
sub program_basename
{
    my ($path) = @_;
    $path =~ s{.*/}{};
    return $path;
}

# Run PROG with the option string OPT through the shell and return what
# it prints, decoded, with trailing spaces stripped from every line and
# tabs expanded.  Standard error is discarded or merged into the result
# depending on $discard_stderr.  Dies via kark when nothing usable is
# printed.
sub get_option_value
{
    my ($prog, $opt) = @_;
    my $stderr = $discard_stderr ? '/dev/null' : '&1';

    my @cleaned;
    for my $raw (`$prog $opt 2>$stderr`)
    {
        my $line = dec($raw);
        $line =~ s/ +$//;
        push @cleaned, expand($line);
    }
    my $value = join '', @cleaned;

    unless ($value)
    {
        my $extra = '';
        $extra = "\n" . N_("Try `--no-discard-stderr' if option outputs to stderr")
            if $discard_stderr;

        kark(N_("%s: can't get `%s' info from %s%s"),
            $this_program, $opt, $prog, $extra);
    }

    return $value;
}
