| // Copyright 2018 The diff-match-patch Authors. |
| // |
| // Licensed under the Apache License, Version 2.0 (the "License"); |
| // you may not use this file except in compliance with the License. |
| // You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, software |
| // distributed under the License is distributed on an "AS IS" BASIS, |
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| // See the License for the specific language governing permissions and |
| // limitations under the License. |
| |
// This code (especially GetBisectSplitPoints) derives in part from
// diff_match_patch.cpp, used under the Apache 2.0 license with attribution:
| // |
| // Author: fraser@google.com (Neil Fraser) |
| // Author: mikeslemmer@gmail.com (Mike Slemmer) |
| // |
| // Diff Match and Patch |
| // https://github.com/google/diff-match-patch |
| |
// Computes a minimal sequence of edit operations (equals, delete, insert)
// needed to transform one container of tokens into another (cf. `diff -d -e`).
// Requires random-access iterators into the token streams.
| |
| #ifndef EDITSCRIPT_H_ |
| #define EDITSCRIPT_H_ |
| |
| #include <algorithm> |
| #include <cstddef> |
| #include <cstdint> |
| #include <iterator> |
| #include <map> |
| #include <tuple> |
| #include <utility> |
| #include <vector> |
| |
| namespace diff { |
| |
| enum class Operation { EQUALS, DELETE, INSERT }; |
| |
| /** |
| * The data structure of edit operations to transform tokens1 into tokens2. |
| * Indices for EQUALS and DELETE point into tokens1, INSERT into tokens2. |
| * Concatenating EQUALS and DELETE tokens will return tokens1. |
| * Concatenating EQUALS and INSERT tokens will return tokens2. |
 * [start, end) is a half-open interval.
| */ |
| struct Edit { |
| Operation operation; // One of: EQUALS, DELETE, or INSERT. |
| int64_t start; // Start offset into tokens1 (tokens2 for INSERT). |
| int64_t end; // End offset into tokens1 (tokens2 for INSERT). |
| bool operator==(const Edit &rhs) const { |
| return (std::tie(operation, start, end) == |
| std::tie(rhs.operation, rhs.start, rhs.end)); |
| } |
| bool operator<(const Edit &rhs) const { |
| return (std::tie(operation, start, end) < |
| std::tie(rhs.operation, rhs.start, rhs.end)); |
| } |
| }; |
| using Edits = std::vector<Edit>; |
| |
| /** |
| * Finds the differences between two vectors of tokens, returning edits |
| * required to transform tokens1 into tokens2. |
| * Every token in the combined document belongs to exactly one edit. |
| * |
| * For example, for tokens1 = "Hello world." and tokens2 = "Goodbye world.": |
 * {Edit(Operation::DELETE, 0, 5), // tokens1[0, 5) == "Hello"
 *  Edit(Operation::INSERT, 0, 7), // tokens2[0, 7) == "Goodbye"
 *  Edit(Operation::EQUALS, 5, 12)} // tokens1[5, 12) == " world."
| * |
 * @param tokens1_begin Iterator to the beginning of tokens1.
 * @param tokens1_end Iterator past the end of tokens1.
 * @param tokens2_begin Iterator to the beginning of tokens2.
 * @param tokens2_end Iterator past the end of tokens2.
| * @return Cumulative edits to transform tokens1 into tokens2. |
| */ |
| template <typename TokenIter> |
| Edits GetTokenDiffs(TokenIter tokens1_begin, TokenIter tokens1_end, |
| TokenIter tokens2_begin, TokenIter tokens2_end); |
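
// Example usage: an illustrative sketch assuming std::string inputs
// (text1/text2 are hypothetical names; any random-access token range works):
//
//   const std::string text1 = "Hello world.";
//   const std::string text2 = "Goodbye world.";
//   const diff::Edits edits = diff::GetTokenDiffs(
//       text1.begin(), text1.end(), text2.begin(), text2.end());
//
//   // Rebuild text2: EQUALS tokens come from text1, INSERT from text2.
//   std::string rebuilt;
//   for (const diff::Edit &e : edits) {
//     if (e.operation == diff::Operation::EQUALS)
//       rebuilt.append(text1, e.start, e.end - e.start);
//     else if (e.operation == diff::Operation::INSERT)
//       rebuilt.append(text2, e.start, e.end - e.start);
//   }
//   // rebuilt == "Goodbye world."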
| |
| ////////////////////////////////////////////////////////////////////////////// |
| // IMPLEMENTATION |
| ////////////////////////////////////////////////////////////////////////////// |
| |
| namespace diff_impl { |
| /** |
| * Appends an edit operation to the cumulative edits. |
| * Fuses with previous edit if possible. |
| * @param op Edit operation. |
| * @param start Start offset into tokens1 (tokens2 for INSERT). |
| * @param end End offset into tokens1 (tokens2 for INSERT). |
| * @param edits Cumulative vector of edits (inout). |
| */ |
| inline void AppendEdit(Operation op, int64_t start, int64_t end, Edits *edits) { |
| if (!edits->empty() && |
| edits->back().operation == op && |
| edits->back().end == start) { |
| // Merge into previous edit operation. |
| edits->back().end = end; |
| } else { |
| // Add a new edit operation. |
| edits->emplace_back(Edit{op, start, end}); |
| } |
| } |
| |
| /** |
| * Inserts an edit operation into the cumulative edits at given index. |
| * Fuses with neighboring edit if possible. |
| * @param index Index of insertion point. |
| * @param op Edit operation. |
| * @param start Start offset into tokens1 (tokens2 for INSERT). |
| * @param end End offset into tokens1 (tokens2 for INSERT). |
| * @param edits Cumulative vector of edits (inout). |
| */ |
| inline void InsertEditAt(int64_t index, Operation op, int64_t start, |
| int64_t end, Edits *edits) { |
| if (index > 0) { |
| Edit &prev_edit = (*edits)[index - 1]; |
| if (prev_edit.operation == op && |
| prev_edit.end == start) { |
| // Merge into previous edit operation. |
| prev_edit.end = end; |
| return; |
| } |
| } |
| if (index < static_cast<int64_t>(edits->size())) { |
| Edit &next_edit = (*edits)[index]; |
    if (next_edit.operation == op &&
        next_edit.start == end) {
      // Merge into subsequent edit operation.
      next_edit.start = start;
| return; |
| } |
| } |
| |
| // Cannot merge into existing? Insert a new edit operation. |
| edits->emplace(edits->begin() + index, Edit{op, start, end}); |
| } |
| |
| /** |
 * Reverses the direction of an iterator without moving its location.
 * The returned iterator dereferences to the element one to the left of
 * the input iterator.
 * @param iter Original bidirectional or random-access iterator.
| */ |
| template <typename TokenIter> |
| inline std::reverse_iterator<TokenIter> Reverse(TokenIter iter) { |
| return std::reverse_iterator<TokenIter>(iter); |
| } |
| |
| /** |
 * Determines the longest common affix of two token spans.
| * For common prefix, pass normal iterators. |
| * For common suffix, pass reverse iterators. |
| * @param span1_begin Beginning iterator into tokens1. |
| * @param span1_end Ending iterator into tokens1. |
| * @param span2_begin Beginning iterator into tokens2. |
| * @param span2_end Ending iterator into tokens2. |
 * @return The number of tokens common to the start of both spans.
| */ |
| template <typename TokenIter> |
| inline int64_t CommonAffix(TokenIter span1_begin, TokenIter span1_end, |
| TokenIter span2_begin, TokenIter span2_end) { |
  // std::mismatch requires that the first range be no longer than the second.
| if (span1_end - span1_begin > span2_end - span2_begin) { |
| using std::swap; |
| swap(span1_begin, span2_begin); |
| swap(span1_end, span2_end); |
| } |
| |
| auto affix_ends = std::mismatch(span1_begin, span1_end, span2_begin); |
| const int64_t affix_length = affix_ends.first - span1_begin; |
| return affix_length; |
| } |
| |
| /** |
| * Finds the differences between two vectors of tokens. |
| * Invoke using GetTokenDiffs. |
| */ |
| template <typename TokenIter> |
| class Diff { |
| private: |
| friend Edits |
| GetTokenDiffs<>(TokenIter tokens1_begin, TokenIter tokens1_end, |
| TokenIter tokens2_begin, TokenIter tokens2_end); |
| |
| /** |
| * Finds the differences between two vectors of tokens, returning edits |
| * required to transform tokens1 into tokens2. |
| * Every token in the combined document belongs to exactly one edit. |
| * @param tokens1_begin Iterator pointing to start of tokens1. |
| * @param tokens2_begin Iterator pointing to start of tokens2. |
| * @param edits Cumulative edits to transform tokens1 into tokens2 (inout). |
| */ |
| Diff(TokenIter tokens1_begin, TokenIter tokens2_begin) |
| : tokens1_begin_(tokens1_begin), tokens2_begin_(tokens2_begin) {} |
| |
| /** |
| * Find the differences between two vectors of tokens. |
| * @param b1 Beginning index into tokens1. |
| * @param e1 Ending index into tokens1. |
| * @param b2 Beginning index into tokens2. |
| * @param e2 Ending index into tokens2. |
   * @param edits Cumulative edits to transform tokens1 into tokens2 (inout).
| */ |
| void Generate(int64_t b1, int64_t e1, int64_t b2, int64_t e2, |
| Edits *edits) const { |
| // Avoid growing stack through recursion more than necessary. |
| int64_t prefix_size = 0; |
| int64_t suffix_size = 0; |
| int64_t edits_size = 0; |
| { |
| // The diff strategy is to recursively: |
| // - Peel off common prefix and suffix |
| // - Split what's left in two parts |
| // - Generate diffs for each and combine. |
      // The diffs could be computed in parallel (each with its own edits
      // vector) and combined afterwards if a speedup is needed.
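      //
      // For example (illustrative): for "Hello world." vs "Goodbye world.",
      // the common prefix is empty and the common suffix is " world."
      // (7 tokens), leaving "Hello" vs "Goodbye" as the middle block.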
| auto span1_begin = tokens1_begin_ + b1; |
| auto span2_begin = tokens2_begin_ + b2; |
| auto span1_end = tokens1_begin_ + e1; |
| auto span2_end = tokens2_begin_ + e2; |
| |
| // Check for equality (speedup). |
| if ((e1 - b1) == (e2 - b2) && |
| std::equal(span1_begin, span1_end, span2_begin)) { |
| if (span1_begin != span1_end) { |
| AppendEdit(Operation::EQUALS, b1, e1, edits); |
| } |
| return; |
| } |
| |
| // Find longest common prefix and suffix. |
| prefix_size = |
| CommonAffix(span1_begin, span1_end, span2_begin, span2_end); |
| suffix_size = |
| CommonAffix(Reverse(span1_end), Reverse(span1_begin + prefix_size), |
| Reverse(span2_end), Reverse(span2_begin + prefix_size)); |
| |
      // Remember the current location so the common prefix can be
      // inserted here later.
| edits_size = edits->size(); |
| } |
| |
| // Compute the diff on the middle block. |
| Compute(b1 + prefix_size, e1 - suffix_size, |
| b2 + prefix_size, e2 - suffix_size, |
| edits); |
| |
| // Restore the prefix and suffix. |
| if (prefix_size != 0) { |
| InsertEditAt(edits_size, Operation::EQUALS, b1, b1 + prefix_size, edits); |
| } |
| if (suffix_size != 0) { |
| AppendEdit(Operation::EQUALS, e1 - suffix_size, e1, edits); |
| } |
| } |
| |
| /** |
| * Find the differences between two vectors of tokens. |
   * @pre The spans are not equal and share no common prefix or suffix.
| * @param b1 Beginning index into tokens1. |
| * @param e1 Ending index into tokens1. |
| * @param b2 Beginning index into tokens2. |
| * @param e2 Ending index into tokens2. |
| * @param edits Cumulative edits to transform tokens1 into tokens2 (inout). |
| */ |
| void Compute(int64_t b1, int64_t e1, int64_t b2, int64_t e2, |
| Edits *edits) const { |
| // Try various speedups first. |
| // Enclose in a scope to save stack space before recursing. |
| { |
| const int64_t length1 = e1 - b1; |
| const int64_t length2 = e2 - b2; |
| if (length1 == 0 && length2 != 0) { |
| // tokens1 empty, but tokens2 isn't: Just insert tokens2 |
| AppendEdit(Operation::INSERT, b2, e2, edits); |
| return; |
| } |
| if (length2 == 0 && length1 != 0) { |
| // tokens2 empty, but tokens1 isn't: Just delete tokens1 |
| AppendEdit(Operation::DELETE, b1, e1, edits); |
| return; |
| } |
| if (length1 > length2) { |
| const int64_t offset = |
| std::search(tokens1_begin_ + b1, tokens1_begin_ + e1, |
| tokens2_begin_ + b2, tokens2_begin_ + e2) - |
| tokens1_begin_; |
| if (offset != e1) { |
| // tokens2 is a proper substring of tokens1: delete the rest. |
| const int64_t offset_end = offset + length2; |
| AppendEdit(Operation::DELETE, b1, offset, edits); |
| AppendEdit(Operation::EQUALS, offset, offset_end, edits); |
| AppendEdit(Operation::DELETE, offset_end, e1, edits); |
| return; |
| } |
| if (length2 == 1) { |
| // Single-token span. |
| // After the previous speedup, the operation can't be EQUALS. |
| AppendEdit(Operation::DELETE, b1, e1, edits); |
| AppendEdit(Operation::INSERT, b2, e2, edits); |
| return; |
| } |
| } else if (length2 > length1) { |
| const int64_t offset = |
| std::search(tokens2_begin_ + b2, tokens2_begin_ + e2, |
| tokens1_begin_ + b1, tokens1_begin_ + e1) - |
| tokens2_begin_; |
| if (offset != e2) { |
| // tokens1 is a proper substring of tokens2: insert the rest. |
| AppendEdit(Operation::INSERT, b2, offset, edits); |
| AppendEdit(Operation::EQUALS, b1, e1, edits); // Index into tokens1! |
| AppendEdit(Operation::INSERT, offset + length1, e2, edits); |
| return; |
| } |
| if (length1 == 1) { |
| // Single-token span. |
| // After the previous speedup, the operation can't be EQUALS. |
| AppendEdit(Operation::DELETE, b1, e1, edits); |
| AppendEdit(Operation::INSERT, b2, e2, edits); |
| return; |
| } |
| } |
| } |
| |
| // No speedups apply? Bisect and diff each half, then combine results. |
| Bisect(b1, e1, b2, e2, edits); |
| } |
| |
| /** |
| * Find the 'middle snake' of a diff, returning split points. |
| * See Myers 1986 paper: An O(ND) Difference Algorithm and Its Variations. |
| * @param b1 Beginning index into tokens1. |
| * @param e1 Ending index into tokens1. |
| * @param b2 Beginning index into tokens2. |
| * @param e2 Ending index into tokens2. |
| * @return Index of split points in tokens1 and tokens2, respectively. |
| * Returns {-1, -1} if there is no need to bisect. |
| */ |
| std::pair<int64_t, int64_t> GetBisectSplitPoints(int64_t b1, int64_t e1, |
| int64_t b2, |
| int64_t e2) const { |
| std::pair<int64_t, int64_t> splits; |
| int64_t &x1 = splits.first; |
| int64_t &y1 = splits.second; |
| |
| const int64_t length1 = e1 - b1; |
| const int64_t length2 = e2 - b2; |
| const int64_t max_d = (length1 + length2 + 1) / 2; |
| const int64_t v_offset = max_d; |
| const int64_t v_size = 2 * max_d; |
| const int64_t w_size = 2 * v_size; |
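    // Interleaved layout (as implied by the index arithmetic below):
    // w[2*i] holds the forward-path frontier for diagonal index i and
    // w[2*i + 1] holds the reverse-path frontier, so both frontiers for
    // a diagonal sit adjacent in memory (cache friendliness).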
    std::vector<int64_t> w(w_size + 4, -1);
| |
| w[2 * v_offset + 2] = 0; |
| w[2 * v_offset + 3] = 0; |
| const int64_t delta = length1 - length2; |
| |
| // If the total number of tokens is odd, then the front path will |
| // collide with the reverse path. |
| const bool front = (delta % 2 != 0); |
| |
| // Offsets for start and end of k loop. |
| // Prevents mapping of space beyond the grid. |
| int64_t k1start = 0; |
| int64_t k1end = 0; |
| int64_t k2start = 0; |
| int64_t k2end = 0; |
| for (int64_t d = 0; d < max_d; d++) { |
| // Walk the front path one step. |
| for (int64_t k1 = -d + k1start; k1 <= d - k1end; k1 += 2) { |
| const int64_t k1_offset = v_offset + k1; |
| x1 = 0; |
| if (k1 == -d || |
| (k1 != d && w[2 * k1_offset - 2] < w[2 * k1_offset + 2])) { |
| x1 = w[2 * k1_offset + 2]; |
| } else { |
| x1 = w[2 * k1_offset - 2] + 1; |
| } |
| y1 = x1 - k1; |
| while (x1 < length1 && y1 < length2 && |
| *(tokens1_begin_ + (b1 + x1)) == *(tokens2_begin_ + (b2 + y1))) { |
| x1++; |
| y1++; |
| } |
| w[2 * k1_offset] = x1; |
| if (x1 > length1) { |
| // Ran off the right of the graph. |
| k1end += 2; |
| } else if (y1 > length2) { |
| // Ran off the bottom of the graph. |
| k1start += 2; |
| } else if (front) { |
| int64_t k2_offset = v_offset + delta - k1; |
| if (k2_offset >= 0 && k2_offset < v_size && |
| w[2 * k2_offset + 1] != -1) { |
| // Mirror x2 onto top-left coordinate system. |
| int64_t x2 = length1 - w[2 * k2_offset + 1]; |
| if (x1 >= x2) { |
| // Overlap detected. |
| return splits; |
| } |
| } |
| } |
| } |
| |
| // Walk the reverse path one step. |
| for (int64_t k2 = -d + k2start; k2 <= d - k2end; k2 += 2) { |
| const int64_t k2_offset = v_offset + k2; |
| int64_t x2; |
| if (k2 == -d || |
| (k2 != d && w[2 * k2_offset - 1] < w[2 * k2_offset + 3])) { |
| x2 = w[2 * k2_offset + 3]; |
| } else { |
| x2 = w[2 * k2_offset - 1] + 1; |
| } |
| int64_t y2 = x2 - k2; |
| while (x2 < length1 && y2 < length2 && |
| *(tokens1_begin_ + (b1 + length1 - x2 - 1)) == |
| *(tokens2_begin_ + (b2 + length2 - y2 - 1))) { |
| x2++; |
| y2++; |
| } |
| w[2 * k2_offset + 1] = x2; |
| if (x2 > length1) { |
| // Ran off the left of the graph. |
| k2end += 2; |
| } else if (y2 > length2) { |
| // Ran off the top of the graph. |
| k2start += 2; |
| } else if (!front) { |
| int64_t k1_offset = v_offset + delta - k2; |
| if (k1_offset >= 0 && k1_offset < v_size && w[2 * k1_offset] != -1) { |
| x1 = w[2 * k1_offset]; |
| y1 = v_offset + x1 - k1_offset; |
| |
| // Mirror x2 onto top-left coordinate system. |
| x2 = length1 - x2; |
| if (x1 >= x2) { |
| // Overlap detected. |
| return splits; |
| } |
| } |
| } |
| } |
| } |
| x1 = -1; |
| y1 = -1; |
| return splits; |
| } |
| |
| /** |
| * Bisect and recurse if necessary. |
| * @param b1 Beginning index into tokens1. |
| * @param e1 Ending index into tokens1. |
| * @param b2 Beginning index into tokens2. |
| * @param e2 Ending index into tokens2. |
| * @param edits Cumulative edits to transform tokens1 into tokens2 (inout). |
| */ |
| void Bisect(int64_t b1, int64_t e1, int64_t b2, int64_t e2, |
| Edits *edits) const { |
| int64_t x1, x2; |
| std::tie(x1, x2) = GetBisectSplitPoints(b1, e1, b2, e2); |
| if (x1 >= 0) { |
| // Some commonality, so bisect and recurse. |
| Generate(b1, b1 + x1, b2, b2 + x2, edits); |
| Generate(b1 + x1, e1, b2 + x2, e2, edits); |
| } else { |
| // No commonality at all (number of edits equals number of tokens), |
| // so just delete the old and insert the new. |
| AppendEdit(Operation::DELETE, b1, e1, edits); |
| AppendEdit(Operation::INSERT, b2, e2, edits); |
| } |
| } |
| |
| private: |
| TokenIter tokens1_begin_; |
| TokenIter tokens2_begin_; |
| }; // class Diff |
| } // namespace diff_impl |
| |
| template <typename TokenIter> |
| inline Edits GetTokenDiffs(TokenIter tokens1_begin, TokenIter tokens1_end, |
| TokenIter tokens2_begin, TokenIter tokens2_end) { |
| Edits token_edits; |
| diff_impl::Diff<TokenIter>(tokens1_begin, tokens2_begin) |
| .Generate(0, std::distance(tokens1_begin, tokens1_end), |
| 0, std::distance(tokens2_begin, tokens2_end), |
| &token_edits); |
  return token_edits;  // Returned by value; compilers typically apply NRVO.
| } |
| |
| } // namespace diff |
| |
| #endif // EDITSCRIPT_H_ |