#!/usr/bin/env perl
=head
    Converts PO files to SGML files
    Copyright (C) 2008 Red Hat, Inc.

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

=cut

use strict;
use warnings;

{

    package SGML::Translate;

    use Carp;
    use strict;
    use warnings;

    use Parse::LexEvent;

    require Exporter;

    our @ISA = qw(Exporter);

    our %EXPORT_TAGS = (
        'all' => [
            qw(
                tokens
                po_format
                po_comment_format
                normalize
                )
        ]
    );

    our @EXPORT_OK = ( @{ $EXPORT_TAGS{'all'} } );

    our @EXPORT = qw(
        tokens
        normalize
        po_format
        po_comment_format
    );

    our $Debugging = 0;

    our @tokens = (
        new Parse::Token::Delimited(
            Name    => 'seealso',
            Handler => 'tag',
            Start   => '<seealso>',
            End     => '</seealso>',
        ),
        new Parse::Token::Delimited(
            Name    => 'term',
            Handler => 'tag',
            Start   => '<term>',
            End     => '</term>',
        ),
        new Parse::Token::Delimited(
            Name    => 'title',
            Handler => 'tag',
            Start   => '<title>',
            End     => '</title>'
        ),
        new Parse::Token::Delimited(
            Name    => 'subtitle',
            Handler => 'tag',
            Start   => '<subtitle>',
            End     => '</subtitle>'
        ),
        new Parse::Token::Delimited(
            Name    => 'comment',
            Handler => 'ignore',
            Start   => '<!--',
            End     => '-->'
        ),
        new Parse::Token::Delimited(
            Name    => 'para',
            Handler => 'tag',
            Start   => '<para>',
            End     => '</para>'
        ),
        new Parse::Token::Delimited(
            Name    => 'glossterm',
            Handler => 'tag',
            Start   => '<glossterm>',
            End     => '</glossterm>'
        ),
        new Parse::Token::Delimited(
            Name    => 'preface',
            Handler => 'tag',
            Start   => '<preface>',
            End     => '</preface>'
        ),
        new Parse::Token::Delimited(
            Name    => 'primary',
            Handler => 'tag',
            Start   => '<primary[^>]*>',    # pattern for jp sortas
            End     => '</primary>'
        ),
        new Parse::Token::Delimited(
            Name    => 'see',
            Handler => 'tag',
            Start   => '<see>',
            End     => '</see>'
        ),
        new Parse::Token::Delimited(
            Name    => 'citetitle',
            Handler => 'tag',
            Start   => '<citetitle>',
            End     => '</citetitle>'
        ),
        new Parse::Token::Delimited(
            Name    => 'secondary',
            Handler => 'tag',
            Start   => '<secondary[^>]*>',    # pattern for jp sortas
            End     => '</secondary>'
        ),
        new Parse::Token::Delimited(
            Name    => 'tertiary',
            Handler => 'tag',
            Start   => '<tertiary[^>]*>',
            End     => '</tertiary>'
        ),
        new Parse::Token::Delimited(
            Name    => 'entry',
            Handler => 'tag',
            Start   => '<entry>',
            End     => '</entry>'
        ),
        new Parse::Token::Delimited(
            Name    => 'entity',
            Handler => 'tag',
            Start   => '<!ENTITY\s+\S+\s+"',
            End     => '"[^<>]*?>',
        ),

# Stuff we ignore is broken into two sections
# Section 1: Tags who do not have sub-tags and that we do not translate.
#            See http://intranet.corp.redhat.com/docs/docs-trans/translating.html
        new Parse::Token::Delimited(
            Name    => 'computeroutput',
            Handler => 'ignore',
            Start   => '<computeroutput>',
            End     => '</computeroutput>',
        ),
        new Parse::Token::Delimited(
            Name    => 'command',
            Handler => 'ignore',
            Start   => '<command>',
            End     => '</command>',
        ),
        new Parse::Token::Delimited(
            Name    => 'filename',
            Handler => 'ignore',
            Start   => '<filename>',
            End     => '</filename>',
        ),
        new Parse::Token::Delimited(
            Name    => 'prompt',
            Handler => 'ignore',
            Start   => '<prompt>',
            End     => '</prompt>',
        ),

        # The following appear inside paras so do not need special treatment
        # Text inside <xref> tags.
        # Text inside <ulink> tags.
        # Text inside <email> tags.

        # Section 2: Tags that have sub-tags that are likely to have content
        #            we translate, so we just skip over the open/close pair
        new Parse::Token::Simple(
            Name    => 'indexterm_open',
            Handler => 'ignore',
            Regex   => '<indexterm>'
        ),
        new Parse::Token::Simple(
            Name    => 'indexterm_close',
            Handler => 'ignore',
            Regex   => '</indexterm>'
        ),
        new Parse::Token::Simple(
            Name    => 'listitem_open',
            Handler => 'ignore',
            Regex   => '<listitem>'
        ),
        new Parse::Token::Simple(
            Name    => 'listitem_close',
            Handler => 'ignore',
            Regex   => '</listitem>'
        ),
        new Parse::Token::Simple(
            Name    => 'option_open',
            Handler => 'ignore',
            Regex   => '<!\s*\[\s*%\S+\s*\['
        ),
        new Parse::Token::Simple(
            Name    => 'option_close',
            Handler => 'ignore',
            Regex   => ']]\s*>'
        ),
        new Parse::Token::Simple(
            Name    => 'note_open',
            Handler => 'ignore',
            Regex   => '<note>'
        ),
        new Parse::Token::Simple(
            Name    => 'note_close',
            Handler => 'ignore',
            Regex   => '</note>'
        ),
        new Parse::Token::Simple(
            Name     => 'newline',
            Handler  => 'ignore',
            Regex    => '\n',
            ReadMore => 1
        ),
        new Parse::Token::Simple(
            Name    => 'space',
            Handler => 'ignore',
            Regex   => '\s*?'
        ),
        new Parse::Token::Simple(
            Name  => 'ignore',
            Regex => '.*?'
        ),
    );

    our $VERSION            = '0.37';
    our $NESTED_DEPTH_LIMIT = 5;
    our $DEFAULT_PO_CHARSET = 'CHARSET';

    sub new {
        my $proto = shift;
        my $class = ref($proto) || $proto;
        my $self  = {};
        $self->{LEXER} = undef;
        bless( $self, $class );
        return $self;
    }

    sub lexer {
        my $self = shift;
        if (@_) { $self->{LEXER} = shift }
        return $self->{LEXER};
    }

    sub parse {
        my $self    = shift;
        my $content = shift;

        my $lexer = $self->lexer();
        $lexer->trace
            if ( $Debugging || $self->{"_DEBUG"} )
            and ( $Debugging > 1 || $self->{_DEBUG} > 1 );
        $lexer->from($content);
        $lexer->parse();
    }

    sub debug {
        my $self = shift;
        confess "usage: thing->debug(level)" unless @_ == 1;
        my $level = shift;
        if ( ref($self) ) {
            $self->{"_DEBUG"} = $level;    # just myself
        }
        else {
            $Debugging = $level;           # whole class
        }
    }

    sub info {
        my $self = shift;
        if ( $Debugging || $self->{"_DEBUG"} ) {
            print STDERR @_;
        }
    }

    # Class method to ensure we have a closed set of tags
    sub _tag {
        my $self  = shift;
        my $name  = shift;
        my $data  = shift;
        my $lexer = $self->lexer();

        my ( $open_tags, $close_tags, $nested_depth );
        $nested_depth = 0;
        while ( $open_tags = () = $data =~ /<$name>/sg
            and $close_tags = () = $data =~ /<\/$name>/sg
            and $open_tags != $close_tags
            and $nested_depth++ < $NESTED_DEPTH_LIMIT )
        {

        #$self->info("tag count is: open [$open_tags] close [$close_tags]\n");
        #$self->info("data is: [$data]\n");
        #$self->info("pos is: [", $lexer->pos, "]\n");
            my $index
                = index( $lexer->buffer, "</$name>", $lexer->pos )
                - $lexer->pos 
                + length("</$name>");

            #$self->info("index is: [$index]\n");
            $data .= substr( $lexer->buffer, $lexer->pos, $index );

            # need to manually reset the buffer now
            my $new_buffer
                = substr( $lexer->buffer, ( $lexer->pos + $index ) );
            $lexer->pos(0);
            $lexer->buffer($new_buffer);
            $lexer->length( length($new_buffer) );

            #$self->info("new data is: [$data]\n");
        }
        die
            "ERROR: nested tag depth of $nested_depth exceeded limit $NESTED_DEPTH_LIMIT\n$data\n"
            if $nested_depth >= $NESTED_DEPTH_LIMIT;
        return $data;
    }

    sub normalize {
        my $norm = shift;
        my $name = shift;
        $norm =~ s/\n/ /g;     # CR
        $norm =~ s/^\s*//g;    # space at start of line
        $norm =~ s/\s*$//g;    # space at end of line
        $norm =~ s/\s+/ /g;    # colapse spacing
        return $norm;
    }

    sub po_comment_format {
        my $string = shift;
        $string =~ s/\n/\n#/sg;
        return $string;
    }

    sub po_format {
        my $string = shift;
        my $name   = shift;
        $string =~ s/^<$name>\s*//s;     # remove start tag to reduce polution
        $string =~ s/\s*<\/$name>$//s;   # remove close tag to reduce polution
        $string =~ s/\\/\\\\/g;  # \ seen as control sequence by msg* programs
        $string =~ s/\"/\\"/g;   # " seen as special char by msg* programs
        return $string;
    }

    sub po_unformat {
        my $string = shift;

        $string =~ s/^\"//msg;    # strip sol quotes added by msguniq etc
        $string =~ s/\"$//msg;    # strip sol quotes added by msguniq etc
        $string =~ s/\n//msg;     # strip eol quotes added by msguniq etc
        $string =~ s/^\s*//msg
            ;    # strip the leading spaces left from the msgid "" line
        $string =~ s/\\"/\"/msg;    # unescape quotes added by po_format
        $string =~ s/\\\\/\\/g;     # unescape backslash added by po_format
        return $string;
    }

    sub header {
        return q [
# SOME DESCRIPTIVE TITLE.
# Copyright (C) YEAR Free Software Foundation, Inc.
# FIRST AUTHOR <EMAIL@ADDRESS>, YEAR.
#
msgid ""
msgstr ""
"Project-Id-Version: PACKAGE VERSION\n"
"POT-Creation-Date: 2001-01-22 16:42+1000\n"
"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
"Language-Team: LANGUAGE <LL@li.org>\n"
"MIME-Version: 1.0\n"
"Content-Type: text/plain; charset=CHARSET\n"
"Content-Transfer-Encoding: 8bit\n"

];

    }

    sub msgid {
        my $file       = shift;
        my $line       = shift;
        my $tag        = shift;
        my $segment    = shift;
        my $substitute = shift;
        my $status     = shift || '';

        # #: src/hello.c:12
        # msgid "Hello World\n"
        # msgstr ""

        my $msgid  = po_format( $segment,    $tag );
        my $msgstr = po_format( $substitute, $tag );
        my $comment = po_comment_format($segment);

        return if length($msgid) < 1;    # skip null strings

        print << "EOR"
# $comment
#: $file: $line
#, no-c-format, $status
msgid "$msgid"
msgstr "$msgstr"

EOR
    }

}

# Loosely based on the concept of msgmerge for sgml files.
# Paul Gampe <pgampe@redhat.com> 2002-2003
# Changed reg ex in line 100 to "#, fuzzy" to cover fuzzy entries
# both in RHEL and Fedora related po-files
# Timo Trinks <ttrinks@redhat.com>
# vim:tw=78:ts=4

{

    # same package exists in sgml-merge by the way

    package SGML::Translate::Merge;
    use base qw(SGML::Translate);

    use utf8;
    use strict;
    use IO::File;
    use Parse::LexEvent;

    ###############################################################################
    # VARIABLES
    ###############################################################################
    our %cnt;
    use constant DEBUG => 0;
    our $DEFAULT_FORMAT = 'SGML';
    our $VERSION = ( split( ' ', '$Revision: 831 $' ) )[1];

    ###############################################################################
    # METHODS
    ###############################################################################

    sub new {
        my $that  = shift;
        my $class = ref($that) || $that;
        my $self  = bless $that->SUPER::new(), $class;
        $self->{LEXER} = Parse::LexEvent->new(@SGML::Translate::tokens);
        $self->{LEXER}->skip('');
        $self->{FILE}             = '';
        $self->{PO_INDEX}         = {};
        $self->{WORDS_TRANSLATED} = 0;
        $self->{WORDS_FUZZY}      = 0;
        $self->{WORDS_TOTAL}      = 0;
        while ( my $token = pop @SGML::Translate::tokens ) {
            $token->set( Self => $self )
                ;    # make sure the tokens can reference self
        }
        return $self;
    }

    sub fuzzy {
        my $self = shift;
        if (@_) { $self->{FUZZY} = shift }
        return $self->{FUZZY};
    }

    sub report_mode {
        my $self = shift;
        if (@_) { $self->{REPORT_MODE} = shift }
        return $self->{REPORT_MODE};
    }

    sub po_index {
        my $self = shift;
        if (@_) { $self->{PO_INDEX} = shift }
        return $self->{PO_INDEX};
    }

    sub load_po {
        my $self = shift;
        my $file = shift;
        local $/ = "\n\n";

        my $po_idx = new IO::File "< $file";
        while (<$po_idx>) {
            next if $. == 1;    # skip po file header
                                # # <para>duplicate paragraph</para>
                                # # 900duplicate.sgml: 0
                                # #, no-c-format
                                # msgid "duplicate paragraph"
                                # msgstr "duplicate paragraph"
            next
                unless my ( $ref, $msgid, $msgstr )
                    = /^#(.*)msgid (\".*?\")\nmsgstr (\".*\")/s;
            $msgid = SGML::Translate::po_unformat($msgid);

            my $count = () = $msgid =~ /\w+/g;
            $self->{WORDS_TOTAL} += $count;

            next if $msgstr =~ /^\"\"$/;

            # TODO: is this right?
            if ( $ref =~ /#, fuzzy/ ) {
                $self->{WORDS_FUZZY} += $count if $self->fuzzy();
            }
            else {
                $self->{WORDS_TRANSLATED} += $count;
            }

            $msgstr = SGML::Translate::po_unformat($msgstr);
            $self->po_index->{$msgid} = $msgstr;

            $self->info("REF: $ref\n");
            $self->info("MSGID: $msgid\n");
            $self->info("MSGSTR: $msgstr\n");
        }
        my $po_cnt = () = keys %{ $self->po_index() };
        $self->info("PO TOTAL: $po_cnt\n");
    }

    sub tag {
        my $token = $_[0];
        my $name  = $token->name;
        my $data  = $_[1];
        my $id    = $cnt{ uc($name) }++;

        my $self = $token->get('Self');

        $data = $self->SUPER::_tag( $name, $data );

        my $norm = SGML::Translate::normalize( $data, $name );
        $data = $norm;

        # this is part of po_format we need to copy to match properly
        $norm =~ s/^<$name[^>]*?>\s*//sm;      # remove start tag
        $norm =~ s/\s*<\/$name[^>]*?>$//sm;    # remove close tag

        # see if a msgstr for this msgid exists
        my $count = () = $norm =~ /\w+/g;
        if ( exists $self->po_index->{$norm} ) {
            my $msgstr = $self->po_index->{$norm};
            $self->info("MSGSTR [$msgstr]\n");
            $self->info("NORM [$norm]\n");

   # now insert the msgstr text, as in the po_format command took off the tags
            $data =~ s/\Q$norm\E/$msgstr/;
            $self->info("DATA [$data]\n");
        }
        else {
            $self->info("NO MATCH:\nDATA\t[$data]\n");
        }
        $self->info("INSERTING [$data]\n");

        # local hack for CJK jadetex limitation of utf-8 code mapping
        $data
            =~ s/<trademark\s+class=\Sregistered\S>(.*?)<\/trademark>/<![ %HTMLONLY  [<trademark class='registered'>$1<\/trademark>]]><![ %PRINTONLY [$1&reg;]]>/g;
        $data
            =~ s/<trademark\s+class=\Scopyright\S>(.*?)<\/trademark>/<![ %HTMLONLY  [<trademark class='copyright'>$1<\/trademark>]]><![ %PRINTONLY [$1&copy;]]>/g;
        print $data;
    }

    sub stats {
        my $self = shift;
        my $rc   = "";

        if ( $self->{WORDS_TOTAL} == 0 ) {
            return "warning: no words found\n";
        }

        if ( $self->{WORDS_FUZZY} > 1 ) {
            $rc
                .= sprintf
                "$self->{WORDS_FUZZY}/$self->{WORDS_TOTAL} fuzzy words (%2.2f%%), ",
                $self->{WORDS_FUZZY} / $self->{WORDS_TOTAL} * 100;
        }
        $rc
            .= sprintf
            "$self->{WORDS_TRANSLATED}/$self->{WORDS_TOTAL} translated words (%2.2f%%)",
            $self->{WORDS_TRANSLATED} / $self->{WORDS_TOTAL} * 100;
        return "$rc\n";
    }

    sub ignore {
        my $token = $_[0];
        my $self  = $token->get('Self');
        $self->info("IGNORING\t[$_[1]]\n");
        print "$_[1]";
    }

}    # PACKAGE


###############################################################################
# USAGE
###############################################################################

=head1 NAME

po2sgml - Convert gettext po into DocBook SGML

=head1 SYNOPSIS

po2sgml [options] [sgml_file po_file | --report po_file]

=head1 OPTIONS

=over 8

=item B<--help>

Print a brief help message and exits.

=item B<--man>
 
Prints the manual page and exits.

=cut

###############################################################################
# VARIABLES
###############################################################################

our $VERSION = 0.01;

our $verbose = 0;    # display diagnostics
our $man     = 0;    # display man page
our $help    = 0;    # display help
our $do_report;
our $fuzzy = 1;
our $po_file;
our $sgml_file;
our $rate        = 250;
our $frate       = 250 * 2;
our $totals_only = 0;
our $complete    = 0;

###############################################################################
# MODULES
###############################################################################

use Pod::Usage;
use Getopt::Long;
GetOptions(
    "verbose|d"  => \$verbose,
    'help|?'     => \$help,
    'man'        => \$man,
    "report"     => \$do_report,
    "fuzzy!"     => \$fuzzy,
    "sgml=s"     => \$sgml_file,
    "po=s"       => \$po_file,
    "rate=i"     => \$rate,
    "frate=i"    => \$frate,
    "totals"     => \$totals_only,
    "complete|c" => \$complete,
) || pod2usage( -verbose => 0 );


###############################################################################
# METHODS
###############################################################################

sub do_report {

    my $total_words = 0;
    my $total_fuzzy = 0;
    my $total_trans = 0;

    while ( $po_file = shift ) {
        my $merger = SGML::Translate::Merge->new();
        $merger->debug($verbose);
        $merger->report_mode(1) if defined $do_report;
        $merger->fuzzy($fuzzy);
        $merger->load_po($po_file);
        print STDERR "$po_file: ", $merger->stats() unless $totals_only;
        $total_fuzzy += $merger->{WORDS_FUZZY};
        $total_trans += $merger->{WORDS_TRANSLATED};
        $total_words += $merger->{WORDS_TOTAL};
    }

    if ($complete) {
        print( STDOUT int( $total_trans / $total_words * 100 ) );
        exit( int( $total_trans / $total_words * 100 ) );
    }

    # Crunch totals
    my $rc = "total: ";

    if ( $total_words == 0 ) {
        print STDERR "warning: no words found\n";
        return;
    }

    if ( $total_fuzzy > 1 ) {
        $rc .= sprintf "%d/%d fuzzy words (%2.2f%%),  ", $total_fuzzy,
            $total_words, $total_fuzzy / $total_words * 100;
    }
    $rc .= sprintf "$total_trans/$total_words translated words (%2.2f%%)",
        $total_trans / $total_words * 100;
    print STDERR "$rc\n";

    $rc = "total: ";
    if ( $total_fuzzy > 1 ) {
        $rc .= sprintf "%2.2f fuzzy hours, ", $total_fuzzy / $frate;
    }

    $rc .= sprintf "%2.2f untranslated hours, ",
        ( $total_words - $total_trans - $total_fuzzy ) / $rate;
    $rc .= sprintf "%2.2f remaining hours ",
        ( $total_fuzzy / $frate )
        + ( ( $total_words - $total_trans - $total_fuzzy ) / $rate );

    print STDERR "$rc\n";
}

=pod

=item B<--verbose>

Display system calls and other messages to the screen as they are executed.

=item B<--report>

Display a report of the status of gettext .po file, does not require an sgml file

=item B<--po filename>

Provide the filename of po file to use

=item B<--sgml filename>

Provide the filename of sgml file to use

=item B<--fuzzy|--nofuzzy>

Treat fuzzy messages as fuzzy, or if no than as untranslated

=item B<--rate=X>

Set translation rate to X words per hour, default is $rate

=item B<--frate=X>

Set translation rate to X words per hour for fuzzy, default is $frate

=item B<--totals>

Display only the total figures when reporting

=cut

###############################################################################
# MAIN
###############################################################################

$| = 1;    # no buffering

pod2usage( -verbose => 1 ) if ($help);
pod2usage( -verbose => 2 ) if ($man);

if ($do_report) {
    if ($complete) { $totals_only = 1; }
    pod2usage( -verbose => 0, -message => "$0: po_file required.\n" )
        unless defined $ARGV[0];
    do_report(@ARGV);
    exit 0;
}

$po_file   = defined $po_file   ? $po_file   : $ARGV[1];
$sgml_file = defined $sgml_file ? $sgml_file : $ARGV[0];

pod2usage( -verbose => 0, -message => "$0: po_file required.\n" )
    unless defined $po_file;
pod2usage( -verbose => 0, -message => "$0: sgml_file required.\n" )
    unless defined $sgml_file;

my $merger = SGML::Translate::Merge->new();
$merger->debug($verbose);
$merger->fuzzy($fuzzy);
$merger->load_po($po_file);

my $content = '';
open my $fh, $sgml_file or die $!;
{
    local $/;
    $content = <$fh>;
}
close $fh;

$merger->parse($content);

print STDERR "$sgml_file: ", $merger->stats();

exit 0;
