#!/usr/bin/perl

# redzone - reduce a zone file
# Copyright 1999 by Leopold Toetsch <lt@toetsch.at>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2, or (at your option)
# any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.

use strict;
use Getopt::Std;

# some global options vars
my ($inf, $outf, $rc,$verbose, $normalize, $newinf, $oldinf, $keep_files, $LEN);
my ($opt_only);
# statistics
my (@red, $redt, $tot, $rem);

my $LINK = 127; # maxzone

&getargs;
$| = 1;
&go;
1;

#======= subs
# get commandline args
#
sub getargs {
	my(%opt);
	push(@ARGV,'-V');
	$normalize = 0;	# defaultvalues to keep the compiler happy
	$keep_files = 0;
	$rc = 4;
	$verbose = 2;
	$LEN = 4;
	getopt('z:r:v:nl:ko', \%opt);
	$inf = $opt{'z'};
	$outf = $opt{'r'};
	$verbose = $opt{'v'} if($opt{'v'} ne '');
	$normalize=1 if($opt{'n'});
	$LEN = $opt{'l'} if ($opt{'l'});
	$keep_files=1 if(defined $opt{'k'});
	$opt_only=1 if(defined $opt{'o'});
	$rc = $LEN-1;
	&usage unless($inf && $outf);
}

# don't remember, what this sub is for
sub usage {
	print "$0 -z infile -r outfile [ -v verboselevel ] [ -n ] [ -l len ] [ -k ]\n";
	print "\tDefaults:\n";
	print "\tverboselevel = 2\n\tnormalize = NO (aussume is already)\n\tlen = 4\n\tkeep inetermediate files = NO\n";
	exit 1;
}

# main routine
sub go {
#
# first check, we can read and write
#
	my ($i);
	for ($i=0; $i<20; $i++) {
	    $red[$i] = 0;
	}
	open(IN, "$inf") or die("Can't read $inf");
	open(OUT, ">$outf") or die("Can't write $outf");
#
#	if data are not normalized i.e. sorted ascending and from < to
#	then we do it here
#
	if ($normalize) {
		print "Normalizing ...\n" if ($verbose);
		&normalize;
	}
#
#	data are prepared now, let's do the real work
#
    if($opt_only) {
		&optimize;
    }
    else {
		&reduce;
		&optimize;
    }
	&clean_up unless($keep_files);
	my $perc = $tot?$redt/$tot*100:0;
	if ($verbose) {
		print "Finito:\t$redt of $tot data where eliminated\n";
		printf "\tThis is a reduction of %4.1f %%\n", $perc;
		if ($verbose > 1) {
			my ($ab, $r);
			print "\nDetails\n";
			printf "Total records\t%6d\n", $tot;
			foreach $ab ('b','a') {
				for ($i=1; $i<=$rc; $i++) {
					$r = $red[$i + 10*($ab eq 'a')];
					printf "Pass %s-%d\t%6d\n", $ab, $i, 0-$r if($r);
				}
			}
			printf "Remaing recs\t%6d\n", $tot-$redt;
		}
	}
}

sub clean_up {
    system("rm $inf.{a,b,n}* t1~ t2~ 2>/dev/null");
}

## sort data correctly
#
sub normalize {
	my($from, $to, $z, $i);
	$i=0;
	while (<IN>) {
		chomp;
		($from, $to, $z) = split(/\s+/);
		$from .= 'X' x ($LEN-length($from));
		$to .= 'X' x ($LEN-length($to));
		($to, $from) = ($from, $to) if ($from gt $to);
		print OUT "$from $to $z\n";
		print STDERR "$i\r" if ($verbose >= 2 && $i%1000==0);
		$i++;
	}
	$tot=$i;
	sort_them('n0');
}

## open a new infile, outfile
# also deletes previous infile
#
sub open_new {
	my $f = shift;
	close(IN);
	close(OUT);
	unlink($oldinf) if ($oldinf && ! $keep_files);
	$oldinf = $newinf;
	$newinf = $f;
	unlink($newinf);
	rename($outf, $newinf) || die ("Can't rename $outf => $newinf");
	open(IN, "$newinf") || die("Can't read $newinf");
	open(OUT, ">$outf") or die("Can't write $outf");
	print "mv $outf $newinf\t open(r,$newinf)\t open(w,$outf)\n" if($verbose>2);
}


# reducing looks like this
# a    b    z
# 1234 2345 1
# 1234 2346 1
# 1234 2347 2
#
# 1234 2347 2
# 1234 234X 1

sub sort_them {
	my($x) = $_[0];
	open_new("$inf.$x.sort");
	close(IN);
	close(OUT);
	my($pass)=substr($x,1);
	print "Sorting ...\n" if ($verbose);
	if ($pass eq $rc) {
	    system(qq(export TMPDIR=.;sort < $newinf | sed -e"s/X\\+//g" > $outf));
	}
	else {
	    if ($x eq 'n0') {
		    system(qq(export TMPDIR=.;sort < $newinf | uniq > $outf));
		}
		else {
		    system(qq(export TMPDIR=.;sort < $newinf > $outf));
		}
	}
	open_new("$inf.$x.sorted");
}

sub reduce {
	my ($pass);
	for ($pass = 1; $pass <= $rc; $pass++) {
		&reduce_2($pass);
		sort_them('b'.$pass);
	}
}
sub reduce_2 {
	my ($pass) = $_[0];
	my($from, $to, $z, $i, $old, $olda, $red1, $j, $red, $k);
	my (@from, @to, @z, %zc, $redc, $eof, $line, $oldto);
	my ($which) = $LEN-$pass;
	print "Starting Pass b-$pass ...\n" if ($verbose);
	$old = $olda = '';
	my $XXX = 'X' x $pass;
	$red = $rem = 0;
	$i=0;
	while (1) {
		print STDERR "$i $red $rem\r" if ($verbose == 2 && $i%1000==0);
		$i++;
		if (!$eof) {
			$eof = 1 unless defined ($line = <IN>);
		}
		if (!$eof) {
			chomp($line);
			($from, $to, $z) = split(/\s+/, $line);
			print "R: '$from' '$to' '$z'\n" if($verbose>=4);
			$to .= 'X' x ($LEN-length($to)) if($pass==1);
			#
			# read one bunch with same <pass> digs at <LEN-pass> for constant <a>
			#
#			if ($pass > 1 && $to !~ /${XXX}$/) { # exception
#				print OUT "$from $to $z\n";
#				$rem++;
#				next;
#			}
			if ((substr($to, $which-1, $pass+1) =~ /^$old$/ || $old eq '') &&
			    ($olda eq $from || $olda eq '') &&
			    ($LEN-$pass-1 <= 0 ||
			         substr($to, 0, $LEN-$pass-1) eq substr($oldto,0, $LEN-$pass-1) ||
				 $oldto eq '')) {
				push(@from, $from);
				push(@to, $to);
				push(@z, $z);
				$old = substr($to, $which-1, 1) . '[X\d]' x $pass;
				$olda = $from;
				$oldto = $to;
				next;
			}
		} # not eof
		#
		# now we have some data, what is the most used zone in them
		#
		my $n = @to;
		last unless $n;
		%zc = ();
		foreach (@z) {
			$zc{$_}++;
		}
		$redc = ((sort {$zc{$b} <=> $zc{$a} } (keys(%zc)))[0]);
		print "Got $n: ($from[0] $to[0] - $to[$#to]) Red $redc Old '$old'\n" if ($verbose >= 3);
                print "There are ",scalar(keys(%zc))," zones\n" if($verbose>=3);

		#if there is a shorter one than this is the default
		if (scalar(keys(%zc)) == 1) { # write shortcut
		    $k=0;
		    substr($to[$k], $which, $pass) = 'X' x $pass;
		    print OUT "$from[$k] $to[$k] $z[$k]\n";
		    $rem++;
		    $redt-=$n,$red-=$n;
	        }
		else {
		my ($l);
		$l = $LEN-$pass+1;
		for ($j=1 ;$j < $n; $j++) {
		    $to[$j] =~ /^\d+/;
		    if (length($&) == $LEN-$pass) {
				$l = length($&);
				$redc = $z[$j];
				print "But '$from[$j] $to[$j]' is shorter Red $redc now\n" if ($verbose >= 3);
		    }
		}
		$k=-1;
		for ($j=0 ;$j < $n; $j++) {
			$to[$j] =~ /^\d+/;
			if ($z[$j] == $redc && length($&) == $l) {
				$red1++,$redt++,$red++;
				$k=$j;
				next;
			}
			print OUT "$from[$j] $to[$j] $z[$j]\n";
			$rem++;
		}
		# now write the default
		if ($k >= 0) {
		    substr($to[$k], $which, $pass) = 'X' x $pass;
		    print OUT "$from[$k] $to[$k] $z[$k]\n";
		    $rem++;
		    $redt--,$red--;
		}
		}
		# clean up & init for next bunch
		@from = @to = @z = ();
		push(@from, $from); # these we have already read
		push(@to, $to);
		push(@z, $z);
		if (length($to) > $which) {
		    $old = substr($to, $which-1, 1) . '[X\d]' x $pass;
		}
		$olda = $from;
		$oldto = $to;
		$red1 = 0;
		# are we ready?
		last if ($eof);
	} # while
	$tot = $i if($pass == 1 && $tot==0);
	$red[$pass] = $red;
	print "Pass b-$pass: $red data killed $rem remaining\n" if ($verbose);
}

sub optimize {
	my ($pass);
	my($OP) = 1;
	for ($pass = 1; $pass <= $OP; $pass++) {
		&optimize_2($pass);
		open_new("$inf.a-${pass}p");
		&sort_opt($pass);
		open_new("$inf.a-${pass}s") if ($pass < $OP);
	}
}
sub sort_opt {
    my($pass) = $_[0];
    my ($from, $to, $z);
    print "Sorting ...\n" if($verbose);
    while (<IN>) {
		chomp;
		($from, $to, $z) = split(/ /);
		$from .= 'X' x ($LEN-length($from)); # sort shorter after others
		$to .= 'X' x ($LEN-length($to));
		$to = "X$to" if ($z eq $LINK); # sort link after others
		print OUT "$from $to $z\n";
    }
    close(IN);
    close(OUT);
    $newinf = "$inf.a-${pass}q";
    rename($outf, $newinf);
    system(qq(sort < $newinf |uniq | sed -e"s/X\\+//g"  > $outf));
	my ($red, $wc, $orem);
	$wc = `wc --lines $outf`;
	$wc =~ /(\d+)\s/;
	$orem = $rem;
	$rem = $1;
	$red = $orem - $rem;
	$redt += $red;
    print "Pass o-$pass: $red data killed $rem remaining\n" if ($verbose);
    $red[$pass + 10] = $red;

}


sub optimize_2 {
    my ($pass) = $_[0];
    my ($from, $to, $z, $i, $old, $oldfr1, $oldfr2, $red1, $j, $red, $k, $jj);
    my (@from, @to, @z, %zc, $redc, $eof, $line, $stopped, $rem);
    my (@fr1, @to1, @z1);
    my (@fr2, @to2, @z2, %used1, %used2, %toprint);
    print "Starting Pass o-$pass ...\n" if ($verbose);
    $old = $oldfr1 = $oldfr2 = '';
    $red = $rem = 0;
    $i = 0;
    while (1) {
		print STDERR "$i $red\r" if ($verbose == 2 && $i%100==0);
		$i++;
		if (!$eof) {
    	    $eof = 1 unless defined ($line = <IN>);
		}
		if (!$eof) {
	    	chomp($line);
		    ($from, $to, $z) = split(/\s+/, $line);
			#
			# read one bunch with same digs at begin and same len
			#
		    if ($old eq '' ||
				(substr($from, 0 ,length($old)) eq $old &&
				length($from)==length($old)+1)) {
				push(@from, $from);
				push(@to, $to);
				push(@z, $z);
				$toprint{"$from $to $z"}=1;
				$old = substr($from,0, length($from)-1);
				next;
	    	}
		} # not eof
		$oldfr1 = $from[0];
		my ($next1, $next2, %udif, %short);
		$stopped = 0;
		$next1 = 0;
		if ($#from < 1) {
		    $rem++,print OUT "$from[0] $to[0] $z[0]\n if(@from)";
		    print OUT "$from $to $z\n";
		    $rem++;
		    last if ($eof);
	    	next;
		}
		push(@from,'end'); # for the loop to finish
		OUTER:
		for ($jj = 0; $jj < @from; $jj++) {
	    	if ($from[$jj] eq $oldfr1) {
				if ($from[$jj] ne 'end') {
		    		push(@fr1, substr($from[$jj],0,length($old)));
				    push(@to1, $to[$jj]);
				    push(@z1, $z[$jj]);
				}
		    }
		    else {
				$next2 = $jj;
				$oldfr2 = $from[$next2];
				print "Now Outer $from[$next1]\n" if($verbose==4);
				goto N1 if ($used2{$from[$next1]}); # this has already a link
				open(T1, ">t1~") || die("cant write t1~");
				for ($k=0; $k < @to1; $k++) {
				    print T1 "$fr1[$k] $to1[$k] $z1[$k]\n";
				}
				close(T1);
		INNER:
				for ($j = $next2; $j < @from; $j++) {
			    	if ($from[$j] eq $oldfr2) {
						if ($from[$j] ne 'end') {
						    push(@fr2, substr($from[$j],0,length($old)));
					    	push(@to2, $to[$j]);
						    push(@z2, $z[$j]);
						}
				    }
				    else {
						print "Now Inner $from[$next2]\n" if($verbose==4);
						print "Diffs $oldfr1 - $oldfr2=\n" if($verbose==3);
						print "Diffs $oldfr1 - $oldfr2=\n1:@to1\n2:@to2\n" if($verbose==4);
		    			$oldfr2 = $from[$j];
						open(T2, ">t2~") || die("cant write t2~");
    					for ($k=0; $k<@to2; $k++) {
						    print T2 "$fr2[$k] $to2[$k] $z2[$k]\n";
						}
						close(T2);
						my(@difls) = `diff -U0 t1~ t2~`;
						print "Are:@difls\n" if($verbose==4);
						my($add);
						$add=1;
						foreach (@difls) {
			    			my($l1, $c1, $l2, $c2, $l, $p);
						    if (/@@ -(\d+)(,(\d+))? \+(\d+)(,(\d+))?/) {
						        ($l1, $c1, $l2, $c2) = ($1,$3,$4,$6);
			    			    $c1 = !defined $c1 ? 1 : $c1;
			        			$c2 = !defined $c2 ? 1 : $c2;
						        foreach ($l=-1; $l<$c1-1; $l++) {
									# now look, what zone is this in @to2
									# and write it
									my ($m, $n, $t, $q, $o);
									$t = $to[$next1+$l1+$l];
#									$add++;
									NL: while(length($t)) {
										for ($m=0; $m<@to2; $m++) {
											if ($to2[$m] eq $t) {
												$q="$from[$next2] $to[$next1+$l1+$l] $z2[$m]";
												$udif{$q}=1 if($z2[$m] ne $z[$next1+$l1+$l]);
												# and all longer that match
												$t = $to[$next1+$l1+$l];
												for ($o = 0; $o < @to2; $o++) {
													if ($t ne $to2[$o] && $to2[$o] =~ /^$t/) {
														$udif{"$from[$next2] $to2[$o] $z2[$o]"}=1;
														print "\tAnd longer2 $from[$next2] $to2[$o] $z2[$o]\n" if($verbose==4);
													}
												}
												last NL;
											}
										} # for m
										$t = substr($t,0,length($t)-1);
									} # while
								    if ($verbose==4) {
				    				    $p="$from[$next1] $to[$next1+$l1+$l] $z[$next1+$l1+$l]";
										print "\t$p = $q\n";
								    }
								}
								if($c2) {
								    foreach ($l=-1; $l<$c2-1; $l++) {
										my ($m, $t);
										$t = $to[$next2+$l2+$l];
										$udif{"$from[$next2] $t $z[$next2+$l2+$l]"}=1;
										if ($verbose==4) {
										    $p="$from[$next2] $t $z[$next2+$l2+$l]";
										    print "\t$p\n";
										}
										for ($m = 0; $m < @to2; $m++) {
											if ($t ne $to2[$m] && $to2[$m] =~ /^$t/) {
												$udif{"$from[$next2] $to2[$m] $z2[$m]"}=1;
												print "\tAnd longer $from[$next2] $to2[$m] $z2[$m]\n" if($verbose==4);
											}
										}
										# also write all longer matching
								    } #foreach
								} #if c2
			    			} #if
						} # foreach difls
						if (keys(%udif)+$add < @fr2) {
					    	print "Used $from[$next1] $from[$next2]\n" if($verbose==4);
						    my %found;
							my $p;
							$red ++;
							if (!$used1{$from[$next1]}) {
							    for ($k=0; $k < @fr1; $k++) {
									$p="$from[$next1+$k] $to1[$k] $z1[$k]";
									print OUT "$p\n";
									$found{$to1[$k]}=1;
									delete $toprint{$p};
							    }
							}
						    $used1{$from[$next1]}=1;
						    $used2{$from[$next2]}=1;
						    for ($k=0; $k < @fr2; $k++) {
							   delete $toprint{"$from[$next2+$k] $to2[$k] $z2[$k]"};
						    }
						    print OUT "$from[$next2] $from[$next1] $LINK\n";
						    foreach $k (keys(%udif)) {
								print OUT "$k\n";
					    	}
						    #goto N1;
						}
						else {
						    print "Too many diffs $from[$next1] $from[$next2]\n" if($verbose>2);
						}
				N2:
	    				%udif = ();
						@fr2 = @to2 = @z2 = ();
						push(@fr2, substr($from[$j],0,length($old)));
						push(@to2, $to[$j]);
						push(@z2, $z[$j]);
						$next2 = $j;
				    }  # else
				} # for $j
			N1:
    			%udif = ();
				foreach $k (keys(%toprint)) {
					print OUT "$k\n";
					$rem++;
				}
				%toprint = ();
				@fr2 = @to2 = @z2 = ();
				@fr1 = @to1 = @z1 = ();
				if ($from[$jj] ne 'end') {
					push(@fr1, substr($from[$jj],0,length($old)));
					push(@to1, $to[$jj]);
					push(@z1, $z[$jj]);
				}
				$oldfr1 = $from[$jj];
				$next1 = $jj;
			} # else

		} # for	jj
		@fr2 = @to2 = @z2 = ();
		@fr1 = @to1 = @z1 = ();
		%used1 = %used2 = %toprint = ();
		$oldfr1 = $oldfr2 = '';
		# clean up & init for next bunch
		@from = @to = @z = ();
		push(@from, $from); # these we have already red
		push(@to, $to);
		push(@z, $z);
		$toprint{"$from $to $z"}=1;
		$old = '';
		last if($eof);
    } # while
    $tot = $i if($pass == 1 && $opt_only);
} # optimize
