User:PerfektesChaos/WikidiffLX/coding/DiffEngine.h

No other changes against wikidiff2 (rev 67994 Jun 2010) rather than exchanging WD2_ALLOCATOR by WDLX_ALLOCATOR and in include "wikidiff2" by "WDLX_config.h" to make it obvious that only this allocator is required but not the Wikidiff2 class.
/**
 * GPL blah blah, see below for history
 */

#ifndef DIFFENGINE_H
#define DIFFENGINE_H

//#define USE_JUDY

#include <vector>
#include <map>
#include <set>
#include <utility>
#include <algorithm>
#include <cassert>

#ifdef USE_JUDY
#include "JudyHS.h"
#endif

#include "WDLX_config.h"

/**
 * Diff operation
 *
 * from and to are vectors containing pointers to the objects passed in from_lines and to_lines
 *
 * op is one of the following
 *    copy:    A sequence of lines (in from and to) which are the same in both files.
 *    del:     A sequence of lines (in from) which were in the first file but not the second.
 *    add:     A sequence of lines (in to) which were in the second file but not the first.
 *    change:  A sequence of lines which are different between the two files. Lines from the
 *             first file are in from, lines from the second are in to. The two vectors need
 *             not be the same length.
 */
template<typename T>
class DiffOp
{
	public:
		typedef std::vector<const T*, WDLX_ALLOCATOR<const T*> > PointerVector;
		DiffOp(int op_, const PointerVector & from_, const PointerVector & to_)
			: op(op_), from(from_), to(to_) {}

		enum {copy, del, add, change};
		int op;
		PointerVector from;
		PointerVector to;
};

/**
 * Basic diff template class. After construction, edits will contain a vector of DiffOpTemplate
 * objects representing the diff
 */
template<typename T>
class Diff
{
	public:
		typedef std::vector<T, WDLX_ALLOCATOR<T> > ValueVector;
		typedef std::vector<DiffOp<T>, WDLX_ALLOCATOR<T> > DiffOpVector;

		Diff(const ValueVector & from_lines, const ValueVector & to_lines);

		virtual void add_edit(const DiffOp<T> & edit) {
			edits.push_back(edit);
		}
		unsigned size() { return edits.size(); }
		DiffOp<T> & operator[](int i) {return edits[i];}

		DiffOpVector edits;
};
/**
 * Class used internally by Diff to actually compute the diffs.
 *
 * The algorithm used here is mostly lifted from the perl module
 * Algorithm::Diff (version 1.06) by Ned Konz, which is available at:
 *	 http://www.perl.com/CPAN/authors/id/N/NE/NEDKONZ/Algorithm-Diff-1.06.zip
 *
 * More ideas are taken from:
 *	 http://www.ics.uci.edu/~eppstein/161/960229.html
 *
 * Some ideas are (and a bit of code) are from from analyze.c, from GNU
 * diffutils-2.7, which can be found at:
 *	 ftp://gnudist.gnu.org/pub/gnu/diffutils/diffutils-2.7.tar.gz
 *
 * This implementation is largely due to Geoffrey T. Dairiki, who wrote this
 * diff engine for phpwiki 1-3.3. It was then adopted by MediaWiki.
 *
 * Finally, it was ported to C++ by Tim Starling in February 2006
 *
 * @access private
 */

template<typename T>
class _DiffEngine
{
	public:
		// Vectors
		typedef std::vector<bool> BoolVector; // skip the allocator here to get the specialisation
		typedef std::vector<const T*, WDLX_ALLOCATOR<const T*> > PointerVector;
		typedef std::vector<T, WDLX_ALLOCATOR<T> > ValueVector;
		typedef std::vector<int, WDLX_ALLOCATOR<int> > IntVector;
		typedef std::vector<std::pair<int, int>, WDLX_ALLOCATOR<std::pair<int, int> > > IntPairVector;

		// Maps
#ifdef USE_JUDY
		typedef JudyHS<IntVector> MatchesMap;
#else
		typedef std::map<T, IntVector, std::less<T>, WDLX_ALLOCATOR<IntVector> > MatchesMap;
#endif

		// Sets
		typedef std::set<int, std::less<int>, WDLX_ALLOCATOR<int> > IntSet;
#ifdef USE_JUDY
		typedef JudySet ValueSet;
#else
		typedef std::set<T, std::less<T>, WDLX_ALLOCATOR<T> > ValueSet;
#endif

		_DiffEngine() : done(false) {}
		void clear();
		void diff (const ValueVector & from_lines,
				const ValueVector & to_lines, Diff<T> & diff);
		int _lcs_pos (int ypos);
		void _compareseq (int xoff, int xlim, int yoff, int ylim);
		void _shift_boundaries (const ValueVector & lines, BoolVector & changed,
				const BoolVector & other_changed);
	protected:
		int _diag (int xoff, int xlim, int yoff, int ylim, int nchunks,
				IntPairVector & seps);

		BoolVector xchanged, ychanged;
		PointerVector xv, yv;
		IntVector xind, yind;
		IntVector seq;
		IntSet in_seq;
		int lcs;
		bool done;
		enum {MAX_CHUNKS=8};
};

//-----------------------------------------------------------------------------
// _DiffEngine implementation
//-----------------------------------------------------------------------------
template<typename T>
void _DiffEngine<T>::clear()
{
	xchanged.clear();
	ychanged.clear();
	xv.clear();
	yv.clear();
	xind.clear();
	yind.clear();
	seq.clear();
	in_seq.clear();
	done = false;
}

template<typename T>
void _DiffEngine<T>::diff (const ValueVector & from_lines,
		const ValueVector & to_lines, Diff<T> & diff)
{
	int n_from = (int)from_lines.size();
	int n_to = (int)to_lines.size();

	// If this diff engine has been used before for a diff, clear the member variables
	if (done) {
		clear();
	}
	xchanged.resize(n_from);
	ychanged.resize(n_to);
	seq.resize(std::max(n_from, n_to) + 1);

	// Skip leading common lines.
	int skip, endskip;
	for (skip = 0; skip < n_from && skip < n_to; skip++) {
		if (from_lines[skip] != to_lines[skip])
			break;
		xchanged[skip] = ychanged[skip] = false;
	}
	// Skip trailing common lines.
	int xi = n_from, yi = n_to;
	for (endskip = 0; --xi > skip && --yi > skip; endskip++) {
		if (from_lines[xi] != to_lines[yi])
			break;
		xchanged[xi] = ychanged[yi] = false;
	}

	// Ignore lines which do not exist in both files.
	ValueSet xhash, yhash;
	for (xi = skip; xi < n_from - endskip; xi++) {
		xhash.insert(from_lines[xi]);
	}

	for (yi = skip; yi < n_to - endskip; yi++) {
		const T & line = to_lines[yi];
		if ( (ychanged[yi] = (xhash.find(line) == xhash.end())) )
			continue;
		yhash.insert(line);
		yv.push_back(&line);
		yind.push_back(yi);
	}
	for (xi = skip; xi < n_from - endskip; xi++) {
		const T & line = from_lines[xi];
		if ( (xchanged[xi] = (yhash.find(line) == yhash.end())) )
			continue;
		xv.push_back(&line);
		xind.push_back(xi);
	}

	// Find the LCS.
	_compareseq(0, xv.size(), 0, yv.size());

	// Merge edits when possible
	_shift_boundaries(from_lines, xchanged, ychanged);
	_shift_boundaries(to_lines, ychanged, xchanged);

	// Compute the edit operations.
	xi = yi = 0;
	while (xi < n_from || yi < n_to) {
		assert(yi < n_to || xchanged[xi]);
		assert(xi < n_from || ychanged[yi]);

		// Skip matching "snake".
		PointerVector del;
		PointerVector add;
		PointerVector empty;
		while (xi < n_from && yi < n_to && !xchanged[xi] && !ychanged[yi]) {
			del.push_back(&from_lines[xi]);
			add.push_back(&to_lines[yi]);
			++xi;
			++yi;
		}
		if (del.size()) {
			diff.add_edit(DiffOp<T>(DiffOp<T>::copy, del, add));
			del.clear();
			add.clear();
		}

		// Find deletes & adds.
		while (xi < n_from && xchanged[xi])
			del.push_back(&from_lines[xi++]);

		while (yi < n_to && ychanged[yi])
			add.push_back(&to_lines[yi++]);

		if (del.size() && add.size())
			diff.add_edit(DiffOp<T>(DiffOp<T>::change, del, add));
		else if (del.size())
			diff.add_edit(DiffOp<T>(DiffOp<T>::del, del, empty));
		else if (add.size())
			diff.add_edit(DiffOp<T>(DiffOp<T>::add, empty, add));
	}

	done = true;
}

/* Divide the Largest Common Subsequence (LCS) of the sequences
 * [XOFF, XLIM) and [YOFF, YLIM) into NCHUNKS approximately equally
 * sized segments.
 *
 * Returns (LCS, SEPS). LCS is the length of the LCS. SEPS is an
 * array of NCHUNKS+1 (X, Y) indexes giving the diving points between
 * sub sequences.  The first sub-sequence is contained in [X0, X1),
 * [Y0, Y1), the second in [X1, X2), [Y1, Y2) and so on.  Note
 * that (X0, Y0) == (XOFF, YOFF) and
 * (X[NCHUNKS], Y[NCHUNKS]) == (XLIM, YLIM).
 *
 * This function assumes that the first lines of the specified portions
 * of the two files do not match, and likewise that the last lines do not
 * match.  The caller must trim matching lines from the beginning and end
 * of the portions it is going to specify.
 */
template <typename T>
int _DiffEngine<T>::_diag (int xoff, int xlim, int yoff, int ylim, int nchunks,
		IntPairVector & seps)
{
	using std::swap;
	using std::make_pair;
	using std::copy;
	bool flip = false;
	MatchesMap ymatches;

	if (xlim - xoff > ylim - yoff) {
		// Things seems faster (I'm not sure I understand why)
		// when the shortest sequence in X.
		flip = true;
		swap(xoff, yoff);
		swap(xlim, ylim);
	}

	if (flip)
		for (int i = ylim - 1; i >= yoff; i--)
			ymatches[*xv[i]].push_back(i);
	else
		for (int i = ylim - 1; i >= yoff; i--)
			ymatches[*yv[i]].push_back(i);

	int nlines = ylim - yoff;
	lcs = 0;
	seq[0] = yoff - 1;
	in_seq.clear();

	// 2-d array, line major, chunk minor
	IntVector ymids(nlines * nchunks);

	int numer = xlim - xoff + nchunks - 1;
	int x = xoff, x1, y1;
	for (int chunk = 0; chunk < nchunks; chunk++) {
		if (chunk > 0)
			for (int i = 0; i <= lcs; i++)
				ymids.at(i * nchunks + chunk-1) = seq[i];

		x1 = xoff + (int)((numer + (xlim-xoff)*chunk) / nchunks);
		for ( ; x < x1; x++) {
			const T & line = flip ? *yv[x] : *xv[x];
#ifdef USE_JUDY
			IntVector * pMatches = ymatches.Get(line);
			if (!pMatches)
				continue;
#else
			typename MatchesMap::iterator iter = ymatches.find(line);
			if (iter == ymatches.end())
				continue;
			IntVector * pMatches = &(iter->second);
#endif
			IntVector::iterator y;
			int k = 0;

			for (y = pMatches->begin(); y != pMatches->end(); ++y) {
				if (!in_seq.count(*y)) {
					k = _lcs_pos(*y);
					assert(k > 0);
					copy(ymids.begin() + (k-1) * nchunks, ymids.begin() + k * nchunks,
							ymids.begin() + k * nchunks);
					++y;
					break;
				}
			}
			for ( ; y != pMatches->end(); ++y) {
				if (*y > seq[k-1]) {
					assert(*y < seq[k]);
					// Optimization: this is a common case:
					//	next match is just replacing previous match.
					in_seq.erase(seq[k]);
					seq[k] = *y;
					in_seq.insert(*y);
				} else if (!in_seq.count(*y)) {
					k = _lcs_pos(*y);
					assert(k > 0);
					copy(ymids.begin() + (k-1) * nchunks, ymids.begin() + k * nchunks,
							ymids.begin() + k * nchunks);
				}
			}
		}
	}

	seps.clear();
	seps.resize(nchunks + 1);

	seps[0] = flip ? make_pair(yoff, xoff) : make_pair(xoff, yoff);
	IntVector::iterator ymid = ymids.begin() + lcs * nchunks;
	for (int n = 0; n < nchunks - 1; n++) {
		x1 = xoff + (numer + (xlim - xoff) * n) / nchunks;
		y1 = ymid[n] + 1;
		seps[n+1] = flip ? make_pair(y1, x1) : make_pair(x1, y1);
	}
	seps[nchunks] = flip ? make_pair(ylim, xlim) : make_pair(xlim, ylim);
	return lcs;
}

template <typename T>
int _DiffEngine<T>::_lcs_pos (int ypos) {
	int end = lcs;
	if (end == 0 || ypos > seq[end]) {
		seq[++lcs] = ypos;
		in_seq.insert(ypos);
		return lcs;
	}

	int beg = 1;
	while (beg < end) {
		int mid = (beg + end) / 2;
		if ( ypos > seq[mid] )
			beg = mid + 1;
		else
			end = mid;
	}

	assert(ypos != seq[end]);

	in_seq.erase(seq[end]);
	seq[end] = ypos;
	in_seq.insert(ypos);
	return end;
}

/* Find LCS of two sequences.
 *
 * The results are recorded in the vectors {x,y}changed[], by
 * storing a 1 in the element for each line that is an insertion
 * or deletion (ie. is not in the LCS).
 *
 * The subsequence of file 0 is [XOFF, XLIM) and likewise for file 1.
 *
 * Note that XLIM, YLIM are exclusive bounds.
 * All line numbers are origin-0 and discarded lines are not counted.
 */
template <typename T>
void _DiffEngine<T>::_compareseq (int xoff, int xlim, int yoff, int ylim) {
	using std::pair;

	IntPairVector seps;
	int lcs;

	// Slide down the bottom initial diagonal.
	while (xoff < xlim && yoff < ylim && *xv[xoff] == *yv[yoff]) {
		++xoff;
		++yoff;
	}

	// Slide up the top initial diagonal.
	while (xlim > xoff && ylim > yoff && *xv[xlim - 1] == *yv[ylim - 1]) {
		--xlim;
		--ylim;
	}

	if (xoff == xlim || yoff == ylim)
		lcs = 0;
	else {
		// This is ad hoc but seems to work well.
		//nchunks = sqrt(min(xlim - xoff, ylim - yoff) / 2.5);
		//nchunks = max(2,min(8,(int)nchunks));
		int nchunks = std::min(MAX_CHUNKS-1, std::min(xlim - xoff, ylim - yoff)) + 1;
		lcs = _diag(xoff, xlim, yoff, ylim, nchunks, seps);
	}

	if (lcs == 0) {
		// X and Y sequences have no common subsequence:
		// mark all changed.
		while (yoff < ylim)
			ychanged[yind[yoff++]] = true;
		while (xoff < xlim)
			xchanged[xind[xoff++]] = true;
	} else {
		// Use the partitions to split this problem into subproblems.
		IntPairVector::iterator pt1, pt2;
		pt1 = pt2 = seps.begin();
		while (++pt2 != seps.end()) {
			_compareseq (pt1->first, pt2->first, pt1->second, pt2->second);
			pt1 = pt2;
		}
	}
}

/* Adjust inserts/deletes of identical lines to join changes
 * as much as possible.
 *
 * We do something when a run of changed lines include a
 * line at one end and has an excluded, identical line at the other.
 * We are free to choose which identical line is included.
 * `compareseq' usually chooses the one at the beginning,
 * but usually it is cleaner to consider the following identical line
 * to be the "change".
 *
 * This is extracted verbatim from analyze.c (GNU diffutils-2.7).
 */
template <typename T>
void _DiffEngine<T>::_shift_boundaries (const ValueVector & lines, BoolVector & changed,
		const BoolVector & other_changed)
{
	int i = 0;
	int j = 0;

	int len = (int)lines.size();
	int other_len = (int)other_changed.size();

	while (1) {
		/*
		 * Scan forwards to find beginning of another run of changes.
		 * Also keep track of the corresponding point in the other file.
		 *
		 * Throughout this code, i and j are adjusted together so that
		 * the first i elements of changed and the first j elements
		 * of other_changed both contain the same number of zeros
		 * (unchanged lines).
		 * Furthermore, j is always kept so that j == other_len or
		 * other_changed[j] == false.
		 */
		while (j < other_len && other_changed[j])
			j++;

		while (i < len && ! changed[i]) {
			i++; j++;
			while (j < other_len && other_changed[j])
				j++;
		}

		if (i == len)
			break;

		int start = i, runlength, corresponding;

		// Find the end of this run of changes.
		while (++i < len && changed[i])
			continue;

		do {
			/*
			 * Record the length of this run of changes, so that
			 * we can later determine whether the run has grown.
			 */
			runlength = i - start;

			/*
			 * Move the changed region back, so long as the
			 * previous unchanged line matches the last changed one.
			 * This merges with previous changed regions.
			 */
			while (start > 0 && lines[start - 1] == lines[i - 1]) {
				changed[--start] = true;
				changed[--i] = false;
				while (start > 0 && changed[start - 1])
					start--;
				while (other_changed[--j])
					continue;
			}

			/*
			 * Set CORRESPONDING to the end of the changed run, at the last
			 * point where it corresponds to a changed run in the other file.
			 * CORRESPONDING == LEN means no such point has been found.
			 */
			corresponding = j < other_len ? i : len;

			/*
			 * Move the changed region forward, so long as the
			 * first changed line matches the following unchanged one.
			 * This merges with following changed regions.
			 * Do this second, so that if there are no merges,
			 * the changed region is moved forward as far as possible.
			 */
			while (i < len && lines[start] == lines[i]) {
				changed[start++] = false;
				changed[i++] = true;
				while (i < len && changed[i])
					i++;

				j++;
				if (j < other_len && other_changed[j]) {
					corresponding = i;
					while (j < other_len && other_changed[j])
						j++;
				}
			}
		} while (runlength != i - start);

		/*
		 * If possible, move the fully-merged run of changes
		 * back to a corresponding run in the other file.
		 */
		while (corresponding < i) {
			changed[--start] = 1;
			changed[--i] = 0;
			while (other_changed[--j])
				continue;
		}
	}
}
//-----------------------------------------------------------------------------
// Diff implementation
//-----------------------------------------------------------------------------

template<typename T>
Diff<T>::Diff(const ValueVector & from_lines, const ValueVector & to_lines)
{
	_DiffEngine<T> engine;
	engine.diff(from_lines, to_lines, *this);
}

#endif