| Current File : //var/wcp4/hkaw/public_html/file/file/file/private/lib/GT/FileMan/Diff.pm |
# ==================================================================
# File manager - enhanced web based file management system
#
# Website : http://gossamer-threads.com/
# Support : http://gossamer-threads.com/scripts/support/
# Revision : $Id: Diff.pm,v 1.4 2001/07/20 01:08:18 alex Exp $
#
# Copyright (c) 2001 Gossamer Threads Inc. All Rights Reserved.
# Redistribution in part or in whole strictly prohibited. Please
# see LICENSE file for full details.
# ==================================================================
package GT::FileMan::Diff;
# ==================================================================
# This module was taken from Algorthim::Diff in almost it's entirity.
#
use strict;
use vars qw($VERSION @EXPORT_OK @ISA @EXPORT);
use integer; # see below in _replaceNextLargerWith() for mod to make
# if you don't use this
require Exporter;
@ISA = qw(Exporter);
@EXPORT = qw();
@EXPORT_OK = qw(LCS diff traverse_sequences);
$VERSION = sprintf('%d.%02d', (q$Revision: 1.4 $ =~ /\d+/g));
# Create a hash that maps each element of $aCollection to the set of positions
# it occupies in $aCollection, restricted to the elements within the range of
# indexes specified by $start and $end.
# The fourth parameter is a subroutine reference that will be called to
# generate a string to use as a key.
# Additional parameters, if any, will be passed to this subroutine.
#
# my $hashRef = _withPositionsOfInInterval( \@array, $start, $end, $keyGen );
sub _withPositionsOfInInterval
{
my $aCollection = shift; # array ref
my $start = shift;
my $end = shift;
my $keyGen = shift;
my %d;
my $index;
for ( $index = $start; $index <= $end; $index++ )
{
my $element = $aCollection->[ $index ];
my $key = &$keyGen( $element, @_ );
if ( exists( $d{ $key } ) )
{
push( @{ $d{ $key } }, $index );
}
else
{
$d{ $key } = [ $index ];
}
}
return wantarray ? %d: \%d;
}
# Find the place at which aValue would normally be inserted into the array. If
# that place is already occupied by aValue, do nothing, and return undef. If
# the place does not exist (i.e., it is off the end of the array), add it to
# the end, otherwise replace the element at that point with aValue.
# It is assumed that the array's values are numeric.
# This is where the bulk (75%) of the time is spent in this module, so try to
# make it fast!
sub _replaceNextLargerWith
{
my ( $array, $aValue, $high ) = @_;
$high ||= $#$array;
# off the end?
if ( $high == -1 || $aValue > $array->[ -1 ] )
{
push( @$array, $aValue );
return $high + 1;
}
# binary search for insertion point...
my $low = 0;
my $index;
my $found;
while ( $low <= $high )
{
$index = ( $high + $low ) / 2;
# $index = int(( $high + $low ) / 2); # without 'use integer'
$found = $array->[ $index ];
if ( $aValue == $found )
{
return undef;
}
elsif ( $aValue > $found )
{
$low = $index + 1;
}
else
{
$high = $index - 1;
}
}
# now insertion point is in $low.
$array->[ $low ] = $aValue; # overwrite next larger
return $low;
}
# This method computes the longest common subsequence in $a and $b.
# Result is array or ref, whose contents is such that
# $a->[ $i ] = $b->[ $result[ $i ] ]
# foreach $i in ( 0..scalar( @result ) if $result[ $i ] is defined.
# An additional argument may be passed; this is a hash or key generating
# function that should return a string that uniquely identifies the given
# element. It should be the case that if the key is the same, the elements
# will compare the same. If this parameter is undef or missing, the key
# will be the element as a string.
# By default, comparisons will use "eq" and elements will be turned into keys
# using the default stringizing operator '""'.
# Additional parameters, if any, will be passed to the key generation routine.
sub _longestCommonSubsequence
{
my $a = shift; # array ref
my $b = shift; # array ref
my $keyGen = shift; # code ref
my $compare; # code ref
# set up code refs
# Note that these are optimized.
if ( !defined( $keyGen ) ) # optimize for strings
{
$keyGen = sub { $_[0] };
$compare = sub { my ($a, $b) = @_; $a eq $b };
}
else
{
$compare = sub {
my $a = shift; my $b = shift;
&$keyGen( $a, @_ ) eq &$keyGen( $b, @_ )
};
}
my ($aStart, $aFinish, $bStart, $bFinish, $matchVector) = (0, $#$a, 0, $#$b, []);
# First we prune off any common elements at the beginning
while ( $aStart <= $aFinish
and $bStart <= $bFinish
and &$compare( $a->[ $aStart ], $b->[ $bStart ], @_ ) )
{
$matchVector->[ $aStart++ ] = $bStart++;
}
# now the end
while ( $aStart <= $aFinish
and $bStart <= $bFinish
and &$compare( $a->[ $aFinish ], $b->[ $bFinish ], @_ ) )
{
$matchVector->[ $aFinish-- ] = $bFinish--;
}
# Now compute the equivalence classes of positions of elements
my $bMatches = _withPositionsOfInInterval( $b, $bStart, $bFinish, $keyGen, @_ );
my $thresh = [];
my $links = [];
my ( $i, $ai, $j, $k );
for ( $i = $aStart; $i <= $aFinish; $i++ )
{
$ai = &$keyGen( $a->[ $i ] );
if ( exists( $bMatches->{ $ai } ) )
{
$k = 0;
for $j ( reverse( @{ $bMatches->{ $ai } } ) )
{
# optimization: most of the time this will be true
if ( $k
and $thresh->[ $k ] > $j
and $thresh->[ $k - 1 ] < $j )
{
$thresh->[ $k ] = $j;
}
else
{
$k = _replaceNextLargerWith( $thresh, $j, $k );
}
# oddly, it's faster to always test this (CPU cache?).
if ( defined( $k ) )
{
$links->[ $k ] =
[ ( $k ? $links->[ $k - 1 ] : undef ), $i, $j ];
}
}
}
}
if ( @$thresh )
{
for ( my $link = $links->[ $#$thresh ]; $link; $link = $link->[ 0 ] )
{
$matchVector->[ $link->[ 1 ] ] = $link->[ 2 ];
}
}
return wantarray ? @$matchVector : $matchVector;
}
sub traverse_sequences
{
my $a = shift; # array ref
my $b = shift; # array ref
my $callbacks = shift || { };
my $keyGen = shift;
my $matchCallback = $callbacks->{'MATCH'} || sub { };
my $discardACallback = $callbacks->{'DISCARD_A'} || sub { };
my $discardBCallback = $callbacks->{'DISCARD_B'} || sub { };
my $matchVector = _longestCommonSubsequence( $a, $b, $keyGen, @_ );
# Process all the lines in match vector
my $lastA = $#$a;
my $lastB = $#$b;
my $bi = 0;
my $ai;
for ( $ai = 0; $ai <= $#$matchVector; $ai++ )
{
my $bLine = $matchVector->[ $ai ];
if ( defined( $bLine ) )
{
&$discardBCallback( $ai, $bi++, @_ ) while $bi < $bLine;
&$matchCallback( $ai, $bi++, @_ );
}
else
{
&$discardACallback( $ai, $bi, @_ );
}
}
&$discardACallback( $ai++, $bi, @_ ) while ( $ai <= $lastA );
&$discardBCallback( $ai, $bi++, @_ ) while ( $bi <= $lastB );
return 1;
}
sub LCS
{
my $a = shift; # array ref
my $matchVector = _longestCommonSubsequence( $a, @_ );
my @retval;
my $i;
for ( $i = 0; $i <= $#$matchVector; $i++ )
{
if ( defined( $matchVector->[ $i ] ) )
{
push( @retval, $a->[ $i ] );
}
}
return wantarray ? @retval : \@retval;
}
sub diff
{
my $a = shift; # array ref
my $b = shift; # array ref
my $retval = [];
my $hunk = [];
my $discard = sub { push( @$hunk, [ '-', $_[ 0 ], $a->[ $_[ 0 ] ] ] ) };
my $add = sub { push( @$hunk, [ '+', $_[ 1 ], $b->[ $_[ 1 ] ] ] ) };
my $match = sub { push( @$retval, $hunk ) if scalar(@$hunk); $hunk = [] };
traverse_sequences( $a, $b,
{ MATCH => $match, DISCARD_A => $discard, DISCARD_B => $add },
@_ );
&$match();
return wantarray ? @$retval : $retval;
}
sub main_diff {
#-------------------------------------------------------------------------------
# Main diff function
my ($f1, $f2) = @_;
my $File_Length_Difference = 0;
my $Context_Lines = 0; # lines of context to print. 0 for old-style diff
my $output;
# diffing
my ($char1, $char2); # string to print before file names
# diff yields lots of pieces, each of which is basically a Block object
my $diffs = diff($f1, $f2);
return "Two files look identical." unless ($#$diffs != -1);
my ($hunk,$oldhunk);
# Loop over hunks. If a hunk overlaps with the last hunk, join them.
# Otherwise, print out the old one.
foreach my $piece (@$diffs) {
$hunk = new Hunk ($piece, $Context_Lines);
next unless $oldhunk; # first time through
# Don't need to check for overlap if blocks have no context lines
if ($Context_Lines && $hunk->does_overlap($oldhunk)) {
$hunk->prepend_hunk($oldhunk);
} else {
$oldhunk->output_old_diff($f1, $f2);
}
} continue {
$oldhunk = $hunk;
}
# print the last hunk
$oldhunk->output_old_diff($f1, $f2);
return $output;
# END MAIN PROGRAM
########
# Package Hunk. A Hunk is a group of Blocks which overlap because of the
# context surrounding each block. (So if we're not using context, every
# hunk will contain one block.)
{
package Hunk;
sub new {
# Arg1 is output from &LCS::diff (which corresponds to one Block)
# Arg2 is the number of items (lines, e.g.,) of context around each block
#
# This subroutine changes $File_Length_Difference
#
# Fields in a Hunk:
# blocks - a list of Block objects
# start - index in file 1 where first block of the hunk starts
# end - index in file 1 where last block of the hunk ends
#
# Variables:
# before_diff - how much longer file 2 is than file 1 due to all hunks
# until but NOT including this one
# after_diff - difference due to all hunks including this one
my ($class, $piece, $context_items) = @_;
my $block = new Block ($piece); # this modifies $FLD!
my $before_diff = $File_Length_Difference; # BEFORE this hunk
my $after_diff = $before_diff + $block->{"length_diff"};
$File_Length_Difference += $block->{"length_diff"};
my @remove_array = $block->remove;
my @insert_array = $block->insert;
my ($a1, $a2, $b1, $b2, $start1, $start2, $end1, $end2);
$a1 = @remove_array ? $remove_array[0 ]->{"item_no"} : -1;
$a2 = @remove_array ? $remove_array[-1]->{"item_no"} : -1;
$b1 = @insert_array ? $insert_array[0 ]->{"item_no"} : -1;
$b2 = @insert_array ? $insert_array[-1]->{"item_no"} : -1;
$start1 = $a1 == -1 ? $b1 - $before_diff : $a1;
$end1 = $a2 == -1 ? $b2 - $after_diff : $a2;
$start2 = $b1 == -1 ? $a1 + $before_diff : $b1;
$end2 = $b2 == -1 ? $a2 + $after_diff : $b2;
# At first, a hunk will have just one Block in it
my $hunk = {
"start1" => $start1,
"start2" => $start2,
"end1" => $end1,
"end2" => $end2,
"blocks" => [$block],
};
bless $hunk, $class;
$hunk->flag_context($context_items);
return $hunk;
}
# Change the "start" and "end" fields to note that context should be added
# to this hunk
sub flag_context {
my ($hunk, $context_items) = @_;
return unless $context_items; # no context
# add context before
my $start1 = $hunk->{"start1"};
my $num_added = $context_items > $start1 ? $start1 : $context_items;
$hunk->{"start1"} -= $num_added;
$hunk->{"start2"} -= $num_added;
# context after
my $end1 = $hunk->{"end1"};
$num_added = ($end1+$context_items > $#$f1) ?
$#$f1 - $end1 :
$context_items;
$hunk->{"end1"} += $num_added;
$hunk->{"end2"} += $num_added;
}
# Is there an overlap between hunk arg0 and old hunk arg1?
# Note: if end of old hunk is one less than beginning of second, they overlap
sub does_overlap {
my ($hunk, $oldhunk) = @_;
return "" unless $oldhunk; # first time through, $oldhunk is empty
# Do I actually need to test both?
return ($hunk->{"start1"} - $oldhunk->{"end1"} <= 1 ||
$hunk->{"start2"} - $oldhunk->{"end2"} <= 1);
}
# Prepend hunk arg1 to hunk arg0
# Note that arg1 isn't updated! Only arg0 is.
sub prepend_hunk {
my ($hunk, $oldhunk) = @_;
$hunk->{"start1"} = $oldhunk->{"start1"};
$hunk->{"start2"} = $oldhunk->{"start2"};
unshift (@{$hunk->{"blocks"}}, @{$oldhunk->{"blocks"}});
}
# DIFF OUTPUT ROUTINES. THESE ROUTINES CONTAIN DIFF FORMATTING INFO...
sub output_old_diff {
# Note that an old diff can't have any context. Therefore, we know that
# there's only one block in the hunk.
my ($hunk, $fileref1, $fileref2) = @_;
my %op_hash = ('+' => 'a', '-' => 'd', '!' => 'c');
my @blocklist = @{$hunk->{"blocks"}};
warn ("Expecting one block in an old diff hunk!") if scalar @blocklist != 1;
my $block = $blocklist[0];
my $op = $block->op; # +, -, or !
# Calculate item number range.
# old diff range is just like a context diff range, except the ranges
# are on one line with the action between them.
my $range1 = $hunk->context_range(1);
my $range2 = $hunk->context_range(2);
my $action = $op_hash{$op} || warn "unknown op $op";
$output .= "$range1$action$range2\n";
# If removing anything, just print out all the remove lines in the hunk
# which is just all the remove lines in the block
if ($block->remove) {
my @outlist = @$fileref1[$hunk->{"start1"}..$hunk->{"end1"}];
map {$_ = "< $_\n"} @outlist; # all lines will be '< text\n'
foreach (@outlist){
$output .= $_;
}
}
$output .= "---\n" if $op eq '!'; # only if inserting and removing
if ($block->insert) {
my @outlist = @$fileref2[$hunk->{"start2"}..$hunk->{"end2"}];
map {$_ = "> $_\n"} @outlist; # all lines will be '> text\n'
foreach (@outlist){
$output .= $_;
}
}
}
sub context_range {
# Generate a range of item numbers to print. Only print 1 number if the range
# has only one item in it. Otherwise, it's 'start,end'
# Flag is the number of the file (1 or 2)
my ($hunk, $flag) = @_;
my ($start, $end) = ($hunk->{"start$flag"},$hunk->{"end$flag"});
$start++; $end++; # index from 1, not zero
my $range = ($start < $end) ? "$start,$end" : $end;
return $range;
}
} # end Package Hunk
########
# Package Block. A block is an operation removing, adding, or changing
# a group of items. Basically, this is just a list of changes, where each
# change adds or deletes a single item.
# (Change could be a separate class, but it didn't seem worth it)
{
package Block;
sub new {
# Input is a chunk from &Algorithm::LCS::diff
# Fields in a block:
# length_diff - how much longer file 2 is than file 1 due to this block
# Each change has:
# sign - '+' for insert, '-' for remove
# item_no - number of the item in the file (e.g., line number)
# We don't bother storing the text of the item
#
my ($class,$chunk) = @_;
my @changes = ();
# This just turns each change into a hash.
foreach my $item (@$chunk) {
my ($sign, $item_no, $text) = @$item;
my $hashref = {"sign" => $sign, "item_no" => $item_no};
push @changes, $hashref;
}
my $block = { "changes" => \@changes };
bless $block, $class;
$block->{"length_diff"} = $block->insert - $block->remove;
return $block;
}
# LOW LEVEL FUNCTIONS
sub op {
# what kind of block is this?
my $block = shift;
my $insert = $block->insert;
my $remove = $block->remove;
$remove && $insert and return '!';
$remove and return '-';
$insert and return '+';
warn "unknown block type";
return '^'; # context block
}
# Returns a list of the changes in this block that remove items
# (or the number of removals if called in scalar context)
sub remove { return grep {$_->{"sign"} eq '-'} @{shift->{"changes"}}; }
# Returns a list of the changes in this block that insert items
sub insert { return grep {$_->{"sign"} eq '+'} @{shift->{"changes"}}; }
} # end of package Block
}
1;