Current File : //var/wcp4/hkaw/public_html/file/file/file/private/lib/GT/FileMan/Diff.pm
# ==================================================================
# File manager - enhanced web based file management system
#
#   Website  : http://gossamer-threads.com/
#   Support  : http://gossamer-threads.com/scripts/support/
#   Revision : $Id: Diff.pm,v 1.4 2001/07/20 01:08:18 alex Exp $
# 
# Copyright (c) 2001 Gossamer Threads Inc.  All Rights Reserved.
# Redistribution in part or in whole strictly prohibited. Please
# see LICENSE file for full details.
# ==================================================================

package GT::FileMan::Diff;
# ==================================================================
# This module was taken from Algorthim::Diff in almost it's entirity. 
#

use strict;
use vars qw($VERSION @EXPORT_OK @ISA @EXPORT);
use integer;		# see below in _replaceNextLargerWith() for mod to make
					# if you don't use this
require Exporter;
@ISA = qw(Exporter);
@EXPORT = qw();
@EXPORT_OK = qw(LCS diff traverse_sequences);
$VERSION = sprintf('%d.%02d', (q$Revision: 1.4 $ =~ /\d+/g));


# Create a hash that maps each element of $aCollection to the set of positions
# it occupies in $aCollection, restricted to the elements within the range of
# indexes specified by $start and $end.
# The fourth parameter is a subroutine reference that will be called to
# generate a string to use as a key.
# Additional parameters, if any, will be passed to this subroutine.
#
# my $hashRef = _withPositionsOfInInterval( \@array, $start, $end, $keyGen );

sub _withPositionsOfInInterval
{
	my $aCollection = shift;	# array ref
	my $start = shift;
	my $end = shift;
	my $keyGen = shift;
	my %d;
	my $index;
	for ( $index = $start; $index <= $end; $index++ )
	{
		my $element = $aCollection->[ $index ];
		my $key = &$keyGen( $element, @_ );
		if ( exists( $d{ $key } ) )
		{
			push( @{ $d{ $key } }, $index );
		}
		else
		{
			$d{ $key } = [ $index ];
		}
	}
	return wantarray ? %d: \%d;
}

# Find the place at which aValue would normally be inserted into the array. If
# that place is already occupied by aValue, do nothing, and return undef. If
# the place does not exist (i.e., it is off the end of the array), add it to
# the end, otherwise replace the element at that point with aValue.
# It is assumed that the array's values are numeric.
# This is where the bulk (75%) of the time is spent in this module, so try to
# make it fast!

sub _replaceNextLargerWith
{
	my ( $array, $aValue, $high ) = @_;
	$high ||= $#$array;

	# off the end?
	if ( $high == -1 || $aValue > $array->[ -1 ] )
	{
		push( @$array, $aValue );
		return $high + 1;
	}

	# binary search for insertion point...
	my $low = 0;
	my $index;
	my $found;
	while ( $low <= $high )
	{
		$index = ( $high + $low ) / 2;
#		$index = int(( $high + $low ) / 2);		# without 'use integer'
		$found = $array->[ $index ];

		if ( $aValue == $found )
		{
			return undef;
		}
		elsif ( $aValue > $found )
		{
			$low = $index + 1;
		}
		else
		{
			$high = $index - 1;
		}
	}

	# now insertion point is in $low.
	$array->[ $low ] = $aValue;		# overwrite next larger
	return $low;
}

# This method computes the longest common subsequence in $a and $b.

# Result is array or ref, whose contents is such that
# 	$a->[ $i ] = $b->[ $result[ $i ] ]
# foreach $i in ( 0..scalar( @result ) if $result[ $i ] is defined.

# An additional argument may be passed; this is a hash or key generating
# function that should return a string that uniquely identifies the given
# element.  It should be the case that if the key is the same, the elements
# will compare the same. If this parameter is undef or missing, the key
# will be the element as a string.

# By default, comparisons will use "eq" and elements will be turned into keys
# using the default stringizing operator '""'.

# Additional parameters, if any, will be passed to the key generation routine.

sub _longestCommonSubsequence
{
	my $a = shift;	# array ref
	my $b = shift;	# array ref
	my $keyGen = shift;	# code ref
	my $compare;	# code ref

	# set up code refs
	# Note that these are optimized.
	if ( !defined( $keyGen ) )	# optimize for strings
	{
		$keyGen = sub { $_[0] };
		$compare = sub { my ($a, $b) = @_; $a eq $b };
	}
	else
	{
		$compare = sub {
			my $a = shift; my $b = shift;
			&$keyGen( $a, @_ ) eq &$keyGen( $b, @_ )
		};
	}

	my ($aStart, $aFinish, $bStart, $bFinish, $matchVector) = (0, $#$a, 0, $#$b, []);

	# First we prune off any common elements at the beginning
	while ( $aStart <= $aFinish
		and $bStart <= $bFinish
		and &$compare( $a->[ $aStart ], $b->[ $bStart ], @_ ) )
	{
		$matchVector->[ $aStart++ ] = $bStart++;
	}

	# now the end
	while ( $aStart <= $aFinish
		and $bStart <= $bFinish
		and &$compare( $a->[ $aFinish ], $b->[ $bFinish ], @_ ) )
	{
		$matchVector->[ $aFinish-- ] = $bFinish--;
	}

	# Now compute the equivalence classes of positions of elements
	my $bMatches = _withPositionsOfInInterval( $b, $bStart, $bFinish, $keyGen, @_ );
	my $thresh = [];
	my $links = [];

	my ( $i, $ai, $j, $k );
	for ( $i = $aStart; $i <= $aFinish; $i++ )
	{
		$ai = &$keyGen( $a->[ $i ] );
		if ( exists( $bMatches->{ $ai } ) )
		{
			$k = 0;
			for $j ( reverse( @{ $bMatches->{ $ai } } ) )
			{
				# optimization: most of the time this will be true
				if ( $k
					and $thresh->[ $k ] > $j
					and $thresh->[ $k - 1 ] < $j )
				{
					$thresh->[ $k ] = $j;
				}
				else
				{
					$k = _replaceNextLargerWith( $thresh, $j, $k );
				}

				# oddly, it's faster to always test this (CPU cache?).
				if ( defined( $k ) )
				{
					$links->[ $k ] = 
						[ ( $k ? $links->[ $k - 1 ] : undef ), $i, $j ];
				}
			}
		}
	}

	if ( @$thresh )
	{
		for ( my $link = $links->[ $#$thresh ]; $link; $link = $link->[ 0 ] )
		{
			$matchVector->[ $link->[ 1 ] ] = $link->[ 2 ];
		}
	}

	return wantarray ? @$matchVector : $matchVector;
}

sub traverse_sequences
{
	my $a = shift;	# array ref
	my $b = shift;	# array ref
	my $callbacks = shift || { };
	my $keyGen = shift;
	my $matchCallback = $callbacks->{'MATCH'} || sub { };
	my $discardACallback = $callbacks->{'DISCARD_A'} || sub { };
	my $discardBCallback = $callbacks->{'DISCARD_B'} || sub { };
	my $matchVector = _longestCommonSubsequence( $a, $b, $keyGen, @_ );
	# Process all the lines in match vector
	my $lastA = $#$a;
	my $lastB = $#$b;
	my $bi = 0;
	my $ai;
	for ( $ai = 0; $ai <= $#$matchVector; $ai++ )
	{
		my $bLine = $matchVector->[ $ai ];
		if ( defined( $bLine ) )
		{
			&$discardBCallback( $ai, $bi++, @_ ) while $bi < $bLine;
			&$matchCallback( $ai, $bi++, @_ );
		}
		else
		{
			&$discardACallback( $ai, $bi, @_ );
		}
	}

	&$discardACallback( $ai++, $bi, @_ ) while ( $ai <= $lastA );
	&$discardBCallback( $ai, $bi++, @_ ) while ( $bi <= $lastB );
	return 1;
}

sub LCS
{
	my $a = shift;	# array ref
	my $matchVector = _longestCommonSubsequence( $a, @_ );
	my @retval;
	my $i;
	for ( $i = 0; $i <= $#$matchVector; $i++ )
	{
		if ( defined( $matchVector->[ $i ] ) )
		{
			push( @retval, $a->[ $i ] );
		}
	}
	return wantarray ? @retval : \@retval;
}

sub diff
{
	my $a = shift;	# array ref
	my $b = shift;	# array ref
	my $retval = [];
	my $hunk = [];
	my $discard = sub { push( @$hunk, [ '-', $_[ 0 ], $a->[ $_[ 0 ] ] ] ) };
	my $add = sub { push( @$hunk, [ '+', $_[ 1 ], $b->[ $_[ 1 ] ] ] ) };
	my $match = sub { push( @$retval, $hunk ) if scalar(@$hunk); $hunk = [] };
	traverse_sequences( $a, $b,
		{ MATCH => $match, DISCARD_A => $discard, DISCARD_B => $add },
		@_ );
	&$match();
	return wantarray ? @$retval : $retval;
}

sub main_diff {
#-------------------------------------------------------------------------------
# Main diff function
    my ($f1, $f2) = @_;
    my $File_Length_Difference = 0;
    my $Context_Lines = 0; # lines of context to print. 0 for old-style diff
    my $output;
    # diffing
    my ($char1, $char2); # string to print before file names

    # diff yields lots of pieces, each of which is basically a Block object
    my $diffs = diff($f1, $f2);
    return "Two files look identical." unless ($#$diffs != -1);

    my ($hunk,$oldhunk);
    # Loop over hunks. If a hunk overlaps with the last hunk, join them.
    # Otherwise, print out the old one.
    foreach my $piece (@$diffs) {
        $hunk = new Hunk ($piece, $Context_Lines);
        next unless $oldhunk; # first time through

        # Don't need to check for overlap if blocks have no context lines
        if ($Context_Lines && $hunk->does_overlap($oldhunk)) {
        $hunk->prepend_hunk($oldhunk);
        } else {
        $oldhunk->output_old_diff($f1, $f2);
        }

    } continue {
        $oldhunk = $hunk;
    }

    # print the last hunk
    $oldhunk->output_old_diff($f1, $f2);

    return $output;
    # END MAIN PROGRAM

    ########
    # Package Hunk. A Hunk is a group of Blocks which overlap because of the
    # context surrounding each block. (So if we're not using context, every
    # hunk will contain one block.)
    {
    package Hunk;

    sub new {
    # Arg1 is output from &LCS::diff (which corresponds to one Block)
    # Arg2 is the number of items (lines, e.g.,) of context around each block
    #
    # This subroutine changes $File_Length_Difference
    #
    # Fields in a Hunk:
    # blocks      - a list of Block objects
    # start       - index in file 1 where first block of the hunk starts
    # end         - index in file 1 where last block of the hunk ends
    #
    # Variables:
    # before_diff - how much longer file 2 is than file 1 due to all hunks
    #               until but NOT including this one
    # after_diff  - difference due to all hunks including this one
        my ($class, $piece, $context_items) = @_;

        my $block = new Block ($piece); # this modifies $FLD!

        my $before_diff = $File_Length_Difference; # BEFORE this hunk
        my $after_diff = $before_diff + $block->{"length_diff"};
        $File_Length_Difference += $block->{"length_diff"};
        my @remove_array = $block->remove;
        my @insert_array = $block->insert;
        my ($a1, $a2, $b1, $b2, $start1, $start2, $end1, $end2);
        $a1 = @remove_array ? $remove_array[0 ]->{"item_no"} : -1;
        $a2 = @remove_array ? $remove_array[-1]->{"item_no"} : -1;
        $b1 = @insert_array ? $insert_array[0 ]->{"item_no"} : -1;
        $b2 = @insert_array ? $insert_array[-1]->{"item_no"} : -1;

        $start1 = $a1 == -1 ? $b1 - $before_diff : $a1;
        $end1   = $a2 == -1 ? $b2 - $after_diff  : $a2;
        $start2 = $b1 == -1 ? $a1 + $before_diff : $b1;
        $end2   = $b2 == -1 ? $a2 + $after_diff  : $b2;

        # At first, a hunk will have just one Block in it
        my $hunk = {
            "start1" => $start1,
            "start2" => $start2,
            "end1" => $end1,
            "end2" => $end2,
            "blocks" => [$block],
                  };
        bless $hunk, $class;

        $hunk->flag_context($context_items);

        return $hunk;
    }

    # Change the "start" and "end" fields to note that context should be added
    # to this hunk
    sub flag_context {
        my ($hunk, $context_items) = @_;
        return unless $context_items; # no context

        # add context before
        my $start1 = $hunk->{"start1"};
        my $num_added = $context_items > $start1 ? $start1 : $context_items;
        $hunk->{"start1"} -= $num_added;
        $hunk->{"start2"} -= $num_added;

        # context after
        my $end1 = $hunk->{"end1"};
        $num_added = ($end1+$context_items > $#$f1) ?
                      $#$f1 - $end1 :
                      $context_items;
        $hunk->{"end1"} += $num_added;
        $hunk->{"end2"} += $num_added;
    }

    # Is there an overlap between hunk arg0 and old hunk arg1?
    # Note: if end of old hunk is one less than beginning of second, they overlap
    sub does_overlap {
        my ($hunk, $oldhunk) = @_;
        return "" unless $oldhunk; # first time through, $oldhunk is empty

        # Do I actually need to test both?
        return ($hunk->{"start1"} - $oldhunk->{"end1"} <= 1 ||
                $hunk->{"start2"} - $oldhunk->{"end2"} <= 1);
    }

    # Prepend hunk arg1 to hunk arg0
    # Note that arg1 isn't updated! Only arg0 is.
    sub prepend_hunk {
        my ($hunk, $oldhunk) = @_;

        $hunk->{"start1"} = $oldhunk->{"start1"};
        $hunk->{"start2"} = $oldhunk->{"start2"};

        unshift (@{$hunk->{"blocks"}}, @{$oldhunk->{"blocks"}});
    }


    # DIFF OUTPUT ROUTINES. THESE ROUTINES CONTAIN DIFF FORMATTING INFO...

    sub output_old_diff {
    # Note that an old diff can't have any context. Therefore, we know that
    # there's only one block in the hunk.
        my ($hunk, $fileref1, $fileref2) = @_;
        my %op_hash = ('+' => 'a', '-' => 'd', '!' => 'c');

        my @blocklist = @{$hunk->{"blocks"}};
        warn ("Expecting one block in an old diff hunk!") if scalar @blocklist != 1;
        my $block = $blocklist[0];
        my $op = $block->op; # +, -, or !

        # Calculate item number range.
        # old diff range is just like a context diff range, except the ranges
        # are on one line with the action between them.
        my $range1 = $hunk->context_range(1);
        my $range2 = $hunk->context_range(2);
        my $action = $op_hash{$op} || warn "unknown op $op";
        $output .= "$range1$action$range2\n";

        # If removing anything, just print out all the remove lines in the hunk
        # which is just all the remove lines in the block
        if ($block->remove) {
            my @outlist = @$fileref1[$hunk->{"start1"}..$hunk->{"end1"}];
            map {$_ = "< $_\n"} @outlist; # all lines will be '< text\n'
            foreach (@outlist){
                $output .= $_;
            }
        }

        $output .= "---\n" if $op eq '!'; # only if inserting and removing
        if ($block->insert) {
            my @outlist = @$fileref2[$hunk->{"start2"}..$hunk->{"end2"}];
            map {$_ = "> $_\n"} @outlist; # all lines will be '> text\n'
            foreach (@outlist){
                $output .= $_;
            }
        }
    }



    sub context_range {
    # Generate a range of item numbers to print. Only print 1 number if the range
    # has only one item in it. Otherwise, it's 'start,end'
    # Flag is the number of the file (1 or 2)
        my ($hunk, $flag) = @_;
        my ($start, $end) = ($hunk->{"start$flag"},$hunk->{"end$flag"});
        $start++; $end++;  # index from 1, not zero
        my $range = ($start < $end) ? "$start,$end" : $end;
        return $range;
    }


    } # end Package Hunk

    ########
    # Package Block. A block is an operation removing, adding, or changing
    # a group of items. Basically, this is just a list of changes, where each
    # change adds or deletes a single item.
    # (Change could be a separate class, but it didn't seem worth it)
    {
    package Block;
    sub new {
    # Input is a chunk from &Algorithm::LCS::diff
    # Fields in a block:
    # length_diff - how much longer file 2 is than file 1 due to this block
    # Each change has:
    # sign        - '+' for insert, '-' for remove
    # item_no     - number of the item in the file (e.g., line number)
    # We don't bother storing the text of the item
    #
        my ($class,$chunk) = @_;
        my @changes = ();

    # This just turns each change into a hash.
        foreach my $item (@$chunk) {
        my ($sign, $item_no, $text) = @$item;
        my $hashref = {"sign" => $sign, "item_no" => $item_no};
        push @changes, $hashref;
        }

        my $block = { "changes" => \@changes };
        bless $block, $class;

        $block->{"length_diff"} = $block->insert - $block->remove;
        return $block;
    }


    # LOW LEVEL FUNCTIONS
    sub op {
    # what kind of block is this?
        my $block = shift;
        my $insert = $block->insert;
        my $remove = $block->remove;

        $remove && $insert and return '!';
        $remove and return '-';
        $insert and return '+';
        warn "unknown block type";
        return '^'; # context block
    }

    # Returns a list of the changes in this block that remove items
    # (or the number of removals if called in scalar context)
    sub remove { return grep {$_->{"sign"} eq '-'} @{shift->{"changes"}}; }

    # Returns a list of the changes in this block that insert items
    sub insert { return grep {$_->{"sign"} eq '+'} @{shift->{"changes"}}; }

    } # end of package Block

}

1;