#!/usr/bin/perl -w
#
# Usage:
#	msgxmerge [options] <po-file> <pot-file>
#
# Description:
#	msgxmerge merges a po-file into a pot-file
#	ignoring differences within tags and entities
#
# Author:
#	Bernd Groh <bgroh@redhat.com>
#
=head1 COPYRIGHT
    Copyright (C) 2008 Red Hat, Inc.

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License along
    with this program; if not, write to the Free Software Foundation, Inc.,
    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

=cut


use strict;
use warnings;
use utf8;
use bytes;
use Getopt::Long;

# Command-line switches.  Every switch is a boolean flag except
# -o/--output, which takes the name of the file to write the result to.
my $extras_defined = 0;	# skip the "[name]" <-> "&name;" entity mapping
my $loud = 0;		# report every tag/entity replacement on STDERR
my $louder = 0;		# report the outcome of every entry on STDERR
my $quiet = 0;		# suppress diagnostics (summary still shown with --status)
my $status = 0;		# print the summary lines even when --quiet is given
my $output = '';	# output file name; empty means write to STDOUT
my $failed = 0;		# report entries for which no match was found
my $no_correction = 0;	# compare msgids verbatim, without whitespace normalisation

GetOptions(
	'n|no-correction' => \$no_correction,
	's|status' => \$status,
	'q|quiet' => \$quiet,
	'l|loud' => \$loud,
	'll|louder' => \$louder,
	'f|failed' => \$failed,
	'e|d|extras|defined|extras-defined|defined-extras' => \$extras_defined,
	# "=s" makes the option consume a file name.  Without it the option is
	# a plain flag, $output becomes 1, and the result is written to a file
	# literally called "1".
	'o|output|output-file=s' => \$output
);

# Exactly two positional arguments must be left once the options are parsed.
unless (@ARGV == 2) {
	print STDERR "Usage: msgxmerge [options] <po-file> <pot-file>\n";
	exit 0;
}

my ($pofile, $potfile) = @ARGV;

# Encoding declared by the PO header; 'CHARSET' is the template placeholder.
my $encoding = 'CHARSET';

# Parser state shared by the PO and POT readers below.
my ($msgid, $msgstr) = ("", "");	# entry currently being assembled
my $collect = 0;			# > 0: inside a msgid, < 0: inside a msgstr
my $fuzzytag = 0;			# current entry carries a fuzzy flag

# Parallel arrays describing the translated entries of the PO file.
my (@msgids, @msgstrs, @fuzzy);

# Text of the PO header, echoed at the top of the output.
my $header = '';

# Check the PO header and get its encoding.
#
# $checkheader counts down through the parts of the header:
#   4 - waiting for the header's `msgid ""` line
#   3 - the line right after it must be `msgstr ""`
#   2 - copying header fields until the Content-Type charset is found
#   1 - copying the remaining header lines up to the first blank line
#   0 - header complete
# Everything read is accumulated in $header, except that the Content-Type
# line is rewritten to declare UTF-8.

# Three-argument open with a lexical handle: the file name is taken
# literally, and the failure reason ($!) is reported.
open(my $header_fh, '<', $pofile) or die "$0: cannot open $pofile: $!\n";
my $checkheader = 4;
while (my $line = <$header_fh>) {
	if ($checkheader == 4) {
		$header .= $line;
		$checkheader-- if $line =~ /^msgid \"\"$/;
	} elsif ($checkheader == 3) {
		$header .= $line;
		if ($line !~ /^msgstr \"\"$/) {
			print STDERR "Not a valid po-header!\n";
			exit 1;
		}
		$checkheader--;
	} elsif ($checkheader == 2) {
		if ($line =~ /^\"Content-Type: text\/plain; charset=(.+)\\n\"$/) {
			$encoding = $1;
			$checkheader--;
			$header .= "\"Content-Type: text\/plain; charset=UTF-8\\n\"\n";
		} elsif ($line =~ /^$/) {
			# The header ended without declaring a charset.
			print STDERR "Not a valid encoding!\n";
			exit 1;
		} else {
			$header .= $line;
		}
	} elsif ($checkheader == 1) {
		$header .= $line;
		$checkheader-- if $line =~ /^$/;
	} else {
		last;
	}
}
close($header_fh);
if ($checkheader != 0) {
	print STDERR "Not a valid po-format!\n";
	exit 1;
}

# An unfilled template placeholder is treated as UTF-8.
$encoding = 'UTF-8' if $encoding eq 'CHARSET';

# If the encoding is not UTF-8, convert the file to UTF-8.
# The converted text replaces <po-file>; the original is kept as
# <po-file>.<encoding>.

if ($encoding !~ /utf-?8/i) {
	if (!$quiet) {
		print STDERR "iconv -f $encoding -t UTF-8 $pofile\n";
	}
	my $utf8_copy = "$pofile.utf8";

	# List-form system() bypasses the shell, so odd characters in the file
	# name or charset cannot be interpreted as shell syntax.  Success is
	# judged by the exit status: with -o, iconv prints nothing on stdout,
	# so testing its captured output would report every run as a success.
	my $iconv_status = system('iconv', '-f', $encoding, '-t', 'UTF-8',
		'-o', $utf8_copy, $pofile);

	if ($iconv_status != 0) {
		unlink($utf8_copy);
		print STDERR "Failed to convert $pofile from $encoding to UTF-8!\n";
		exit 1;
	}

	rename($pofile, "$pofile.$encoding")
		or die "$0: cannot rename $pofile to $pofile.$encoding: $!\n";
	rename($utf8_copy, $pofile)
		or die "$0: cannot rename $utf8_copy to $pofile: $!\n";
}

# Read msgids and msgstrs into memory.
#
# Only entries whose msgid and msgstr both contain a non-blank character
# are kept; this also drops the header entry (empty msgid).  For every kept
# entry @msgids, @msgstrs and @fuzzy receive one element each.

open(my $po_fh, '<', $pofile) or die "$0: cannot open $pofile: $!\n";
($msgid, $msgstr, $collect, $fuzzytag) = ("", "", 0, 0);
while (my $line = <$po_fh>) {
	if ($line =~ /^#,.*\bfuzzy\b/) {
		# The fuzzy flag may appear anywhere in the flag list.
		$fuzzytag = 1;
	} elsif ($line =~ /^msgid \"(.*)\"$/) {
		# Continuation lines may follow whether or not the first line
		# is empty, so always switch to "collecting msgid".
		$msgid = $1;
		$collect = 1;
	} elsif ($line =~ /^msgstr \"(.*)\"$/) {
		$msgstr = $1;
		$collect = -1;
	} elsif ($line =~ /^\"(.*)\"$/) {
		my $entry = $1;
		if ($collect > 0) {
			$msgid .= $entry;
		} elsif ($collect < 0) {
			$msgstr .= $entry;
		}
	} elsif ($line =~ /^$/) {
		# A blank line terminates the current entry.
		if (($msgid =~ /\S/) && ($msgstr =~ /\S/)) {
			push @msgids, $msgid;
			push @msgstrs, $msgstr;
			push @fuzzy, $fuzzytag;
		}
		($msgid, $msgstr, $collect, $fuzzytag) = ("", "", 0, 0);
	}
}
# The last entry is not necessarily followed by a blank line.
if (($msgid =~ /\S/) && ($msgstr =~ /\S/)) {
	push @msgids, $msgid;
	push @msgstrs, $msgstr;
	push @fuzzy, $fuzzytag;
}
# Leave no half-parsed state behind for the code that reuses these variables.
($msgid, $msgstr, $collect, $fuzzytag) = ("", "", 0, 0);
close($po_fh);

# Read every non-blank msgid of the POT file into @potids.
my @potids = ();

open(my $pot_fh, '<', $potfile) or die "$0: cannot open $potfile: $!\n";
# Start from a clean state: $msgid and $collect are shared with the PO
# reader and may still hold the tail of its last entry.
$msgid = "";
$collect = 0;
while (my $line = <$pot_fh>) {
	if ($line =~ /^msgid \"(.*)\"$/) {
		# Continuation lines may follow an empty or a non-empty first
		# line, so always switch to "collecting msgid".
		$msgid = $1;
		$collect = 1;
	} elsif ($line =~ /^msgstr/) {
		# msgstr text (and its continuation lines) is ignored.
		$collect = 0;
	} elsif ($line =~ /^\"(.*)\"$/) {
		$msgid .= $1 if $collect > 0;
	} elsif ($line =~ /^$/) {
		# A blank line terminates the current entry.
		push @potids, $msgid if $msgid =~ /\S/;
		$msgid = "";
		$collect = 0;
	}
}
# The last entry is not necessarily followed by a blank line.
if ($msgid =~ /\S/) {
	push @potids, $msgid;
}
close($pot_fh);

# Open the output file when one was requested and start the output with the
# PO header.  $handle stays undefined when the result goes to STDOUT.
my $handle = undef;
if ($output) {
	# Three-argument open: the name is used literally, so leading or
	# trailing whitespace and mode characters in it cannot change how the
	# file is opened.
	open($handle, '>', $output) or die "$0: Cannot write to $output: $!\n";
	print $handle $header;
} else {
	print $header;
}

# Announce the first pass when running with --louder.
print STDERR "\n* Looking at $pofile (PO)\n\n" if $louder;

# Counters reported in the pass summary.
my ($exact, $fuzzy, $untranslated) = (0, 0, 0);
# First pass: for every entry of the PO file, find the POT msgid it belongs
# to and write the entry out under that msgid.
#
#   IGNORED         - the PO entry is fuzzy; it is copied through unchanged
#   EXACT MATCH     - a POT msgid is identical (after optional whitespace
#                     normalisation)
#   ADJUSTED MATCH  - a POT msgid is identical once tags are ignored (and,
#                     unless --extras-defined, "[name]" is read as "&name;");
#                     the tags/entities of msgid and msgstr are then adjusted
#                     to those of the POT msgid
#   NO MATCH        - nothing matched; the PO entry is copied through
#
# On a match $msgids[$i] is overwritten with the POT msgid.

# Destination of the merged entries.
my $po_out = $output ? $handle : \*STDOUT;

for (my $i = 0; $i < @msgids; $i++) {
	if ($fuzzy[$i]) {
		if ($louder) {
			print STDERR '[' . ($i + 1) . '] IGNORED' . "\n";
			print STDERR "msgid: " . $msgids[$i] . "\n\n";
		}
		print {$po_out} "# IGNORED\n";
		print {$po_out} "#, fuzzy, no-c-format\n";
		print {$po_out} "msgid \"" . $msgids[$i] . "\"\n";
		print {$po_out} "msgstr \"" . $msgstrs[$i] . "\"\n\n";
		next;
	}

	# Step 1: exact match.
	my $j;
	for ($j = 0; $j < @potids; $j++) {
		$msgid = $msgids[$i];
		my $potid = $potids[$j];

		if (($msgid ne $potid) && !$no_correction) {
			# Normalise whitespace on both sides before comparing.
			for ($msgid, $potid) {
				s/(\W)\s+/$1/g;
				s/\s+(\W)/$1/g;
				s/\s+/ /g;
			}
		}

		if ($msgid eq $potid) {
			if ($louder) {
				print STDERR '[' . ($i + 1) . '] EXACT MATCH' . "\n";
				print STDERR "msgid: " . $potids[$j] . "\n\n";
			}
			print {$po_out} "# EXACT MATCH\n";
			print {$po_out} "#, no-c-format\n";
			print {$po_out} "msgid \"" . $potids[$j] . "\"\n";
			print {$po_out} "msgstr \"" . $msgstrs[$i] . "\"\n\n";
			$msgids[$i] = $potids[$j];

			$exact++;
			last;
		}
	}

	# Step 2: match ignoring tags and entities.
	if ($j == @potids) {
		for ($j = 0; $j < @potids; $j++) {
			$msgid = $msgids[$i];
			$msgstr = $msgstrs[$i];
			my $msgcmp = $msgids[$i];
			my $potcmp = $potids[$j];
			$msgcmp =~ s/<[^<>]+>//g;
			$potcmp =~ s/<[^<>]+>//g;
			if (!$extras_defined) {
				$potcmp =~ s/\[(\w+)\s*\]/&$1;/g;
			}

			if (($msgcmp ne $potcmp) && !$no_correction) {
				for ($msgcmp, $potcmp) {
					s/(\W)\s+/$1/g;
					s/\s+(\W)/$1/g;
					s/\s+/ /g;
				}
			}

			# Not a candidate: try the next POT msgid.
			next if $msgcmp ne $potcmp;

			if ($louder) {
				print STDERR '[' . ($i + 1) . '] FUZZY MATCH' . "\n";
				print STDERR "msgid: " . $msgids[$i] . "\n";
				print STDERR "msgid: " . $potids[$j] . "\n\n";
			}

			# cover some known cases

			if ($msgids[$i] =~ /^[^<>]+$/) {
				# The PO msgid has no tags: wrap msgid and msgstr in
				# the leading and trailing tags of the POT msgid.
				# The translation is kept even when it unexpectedly
				# contains tags itself (it used to be replaced by
				# undef, which dropped the translated text).
				if ($msgstrs[$i] !~ /^[^<>]+$/) {
					warn ($msgid . ' <=> ' . $msgstrs[$i]);
				}

				my $pre_tags = '';
				my $post_tags = '';
				my $pot_rest = $potids[$j];
				while ($pot_rest =~ s/^\s*(<[^<>]+>)//) {
					$pre_tags .= $1;
				}
				while ($pot_rest =~ s/(<[^<>]+>)\s*$//) {
					$post_tags = $1 . $post_tags;
				}

				$msgid = $pre_tags . $msgid . $post_tags;
				$msgstr = $pre_tags . $msgstr . $post_tags;
			} elsif ($potids[$j] =~ /^[^<>]+$/) {
				# The POT msgid has no tags: drop them from the PO entry.
				$msgid =~ s/<[^<>]+>//g;
				$msgstr =~ s/<[^<>]+>//g;
			} else {
				# make tag adjustments

				my @pottags = ();
				my @msgtags = ();
				my $po_text = $msgids[$i];
				my $pot_rest = $potids[$j];

				# Collect the POT tags; ulink and xref are left alone.
				while ($pot_rest =~ s/(<[^<>]+>)//) {
					my $tag = $1;
					if (($tag !~ /<ulink/) && ($tag !~ /<xref/)) {
						push @pottags, $tag;
					}
				}

				# Pair every POT tag with a tag of the PO msgid: the
				# identical tag if present, otherwise the first tag with
				# the same element name.  Tags are compared literally;
				# interpolating them as patterns would misread the \"
				# of attributes and any regex metacharacter.
				for my $pottag (@pottags) {
					if (index($po_text, $pottag) >= 0) {
						push @msgtags, $pottag;
						next;
					}
					(my $name = $pottag) =~ s/^<(\w+).*$/$1/;
					if ($po_text =~ /(<\Q$name\E[^>]*>)/) {
						push @msgtags, $1;
					} else {
						if (!$quiet) {
							print STDERR "$name: no matching tag\n";
						}
						push @msgtags, $pottag;
					}
				}

				# Replace differing PO tags by their POT counterpart.
				for my $t (0 .. $#pottags) {
					next if $msgtags[$t] eq $pottags[$t];
					my $from = $msgtags[$t];
					my $to = $pottags[$t];

					if ($loud) {
						print STDERR "$from => $to\n";
					}

					$msgid =~ s/\Q$from\E/$to/g;
					$msgstr =~ s/\Q$from\E/$to/g;
				}
			}

			# Rewrite hyphenated linkend ids: lower-case them, turn a
			# leading "s<digit>-" into "sec.", turn the remaining hyphens
			# into dots, and make the tag self-closing.
			for ($msgid, $msgstr) {
				while (/linkend=\\"([\w\.]+-[\w\.-]+)\\"\s*>/s) {
					my $id = $1;
					my $newid = lc($id);
					$newid =~ s/^s\d-/sec\./;
					$newid =~ s/-/\./g;
					s/linkend=\\"\Q$id\E\\"\s*>/linkend=\\"$newid\\"\/>/sg;
				}
			}

			if (!$extras_defined) {
				# Map "&name;" in the PO entry to the "[name]" form used
				# by the POT msgid.
				my @pottags = ();
				my @msgtags = ();
				my $pot_rest = $potids[$j];

				while ($pot_rest =~ s/(\[\w+\s*\])//) {
					push @pottags, $1;
				}

				for my $pottag (@pottags) {
					# Already present verbatim?  (Compared literally; the
					# subject string must not be escaped.)
					if (index($msgid, $pottag) >= 0) {
						push @msgtags, $pottag;
						next;
					}
					(my $name = $pottag) =~ s/^\[(\w+)\s*\]$/$1/;
					if ($msgid =~ /(&$name;)/) {
						push @msgtags, $1;
					} else {
						if (!$quiet) {
							print STDERR "Error: $name\n";
						}
						push @msgtags, $pottag;
					}
				}

				for my $t (0 .. $#pottags) {
					next if $msgtags[$t] eq $pottags[$t];
					my $from = $msgtags[$t];
					my $to = $pottags[$t];

					if ($loud) {
						print STDERR "$from => $to\n";
					}

					$msgid =~ s/\Q$from\E/$to/g;
					$msgstr =~ s/\Q$from\E/$to/g;
				}
			}

			# cover some more known cases

			$potcmp = $potids[$j];
			$msgcmp = $msgid;	# kept for the diagnostic below

			if (($msgid ne $potcmp) && !$no_correction) {
				for ($msgid, $potcmp) {
					s/(\W)\s+/$1/g;
					s/\s+(\W)/$1/g;
					s/\s+/ /g;
				}
			}

			if ($msgid ne $potcmp) {
				# One side may be wrapped in an outer tag pair that the
				# other side lacks.
				my ($msgl, $msgr, $potl, $potr);
				my $msgclr = $msgid;
				if ($msgclr =~ /^\s*(<[^<>]+>)(.*)(<[^<>]+>)\s*$/) {
					($msgl, $msgclr, $msgr) = ($1, $2, $3);
				}
				my $potclr = $potcmp;
				if ($potclr =~ /^\s*(<[^<>]+>)(.*)(<[^<>]+>)\s*$/) {
					($potl, $potclr, $potr) = ($1, $2, $3);
				}

				if ($potl && $potr && ($msgid eq $potclr)) {
					$msgid = $potl . $msgid . $potr;
					$msgstr = $potl . $msgstr . $potr;
				} elsif ($msgl && $msgr && ($potcmp eq $msgclr)) {
					$msgid =~ s/^\s*<[^<>]+>//;
					$msgid =~ s/<[^<>]+>\s*$//;
					$msgstr =~ s/^\s*<[^<>]+>//;
					$msgstr =~ s/<[^<>]+>\s*$//;
				}
			}

			if ($msgid eq $potcmp) {
				print {$po_out} "# ADJUSTED MATCH\n";
				print {$po_out} "#, no-c-format\n";
				print {$po_out} "msgid \"" . $potids[$j] . "\"\n";
				print {$po_out} "msgstr \"" . $msgstr . "\"\n\n";
				$msgids[$i] = $potids[$j];

				$fuzzy++;
				last;
			} else {
				if (!$quiet) {
					print STDERR ("\n" . $msgcmp . "\n\tNE\n" . $potids[$j] . "\n\n");
				}
				# NOTE(review): a bare marker is written although the
				# adjustment failed and no entry follows it; kept as is
				# because it may be relied on as a "was attempted" hint.
				print {$po_out} "# ADJUSTED MATCH\n";
			}
		}
	}

	# Step 3: nothing matched; copy the PO entry through unchanged.
	if ($j == @potids) {
		if ($failed || $louder) {
			print STDERR '[' . ($i + 1) . '] NO MATCH' . "\n";
			print STDERR "msgid: " . $msgids[$i] . "\n\n";
		}
		print {$po_out} "# NO MATCH\n";
		print {$po_out} "#, no-c-format\n";
		print {$po_out} "msgid \"" . $msgids[$i] . "\"\n";
		print {$po_out} "msgstr \"" . $msgstrs[$i] . "\"\n\n";
		$untranslated++;
	}
}

# Summary of the PO pass; --status forces it even under --quiet.
print STDERR "PO: $exact exact, $fuzzy adjusted, and $untranslated failed matches.\n"
	if !$quiet || $status;

# Announce the second pass when running with --louder.
print STDERR "\n* Looking at $potfile (POT)\n\n" if $louder;

# Restart the counters for the POT pass.
($exact, $fuzzy, $untranslated) = (0, 0, 0);

# Second pass: for every POT msgid, check whether the (possibly rewritten)
# PO msgids now contain it.  A POT msgid that is still missing but is
# wrapped in outer tag pairs is compared, with those tags stripped, against
# the non-fuzzy PO msgids stripped the same way; on success the translation
# is written out wrapped in the POT tags as "ADJUSTED MATCH (POT)".

# Destination of the additional entries.
my $pot_out = $output ? $handle : \*STDOUT;

for (my $i = 0; $i < @potids; $i++) {
	# Already present among the PO msgids?
	my $j = 0;
	while ($j < @msgids) {
		if ($msgids[$j] eq $potids[$i]) {
			$exact++;
			last;
		}
		$j++;
	}

	if ($j == @msgids) {
		# Peel the outer tag pairs off the POT msgid.  This depends only
		# on the POT msgid, so it is done once rather than per PO entry.
		my $potid = $potids[$i];
		my $pre_tags = '';
		my $post_tags = '';
		while ($potid =~ /^\s*(<[^<>]+>)\s*(.*\S)\s*(<[^<>]+>)\s*$/) {
			$pre_tags .= $1;
			$post_tags = $3 . $post_tags;
			$potid = $2;
		}

		# Only worth trying when something was actually stripped;
		# otherwise $j stays at @msgids, i.e. "no match".
		if ($potid ne $potids[$i]) {
			for ($j = 0; $j < @msgids; $j++) {
				next if $fuzzy[$j];

				$msgid = $msgids[$j];
				$msgstr = $msgstrs[$j];
				# Strip outer tag pairs from the PO entry as well.
				for ($msgid, $msgstr) {
					while (/^\s*<[^<>]+>\s*(.*\S)\s*<[^<>]+>\s*$/) {
						$_ = $1;
					}
				}

				my $potcmp = $potid;
				if (($msgid ne $potcmp) && !$no_correction) {
					for ($msgid, $potcmp) {
						s/(\W)\s+/$1/g;
						s/\s+(\W)/$1/g;
						s/\s+/ /g;
					}
				}

				next if $msgid ne $potcmp;

				$msgstr = $pre_tags . $msgstr . $post_tags;

				# Count first, so the reported index is 1-based and in
				# step with the one printed for "NO MATCH (POT)".
				$fuzzy++;

				if ($louder) {
					print STDERR '[' . ($fuzzy + $untranslated) . '] ADJUSTED MATCH (POT)' . "\n";
					print STDERR "msgid: " . $potids[$i] . "\n\n";
				}

				print {$pot_out} "# ADJUSTED MATCH (POT)\n";
				print {$pot_out} "#, no-c-format\n";
				print {$pot_out} "msgid \"" . $potids[$i] . "\"\n";
				print {$pot_out} "msgstr \"" . $msgstr . "\"\n\n";

				last;
			}
		}
	}

	if ($j == @msgids) {
		$untranslated++;
		if ($failed || $louder) {
			print STDERR '[' . ($fuzzy + $untranslated) . '] NO MATCH (POT)' . "\n";
			print STDERR "msgid: " . $potids[$i] . "\n\n";
		}
	}
}

# Summary of the POT pass; --status forces it even under --quiet.
if ((!$quiet) || ($status)) {
	print STDERR "POT: $exact exact, $fuzzy adjusted, and " . $untranslated . " failed matches.\n";
}

# Buffered write errors (e.g. a full disk) only surface when the handle is
# closed, so close the output file explicitly and report a failure.
if ($output) {
	close($handle) or die "$0: Error writing to $output: $!\n";
}
