From 7381a2bfa5067855a8bc027cb6b2611c2bb94c73 Mon Sep 17 00:00:00 2001 From: Keanu Lee Date: Mon, 7 Apr 2014 16:13:21 -0700 Subject: [PATCH 1/7] Added support for atomic tags; ignore tag attributes; interpret whitespace. --- .gitignore | 1 - README.md | 2 +- js/htmldiff.js | 482 +++++++++++++++++++++++++++++ package.json | 4 +- src/htmldiff.coffee | 125 +++++++- test/diff.spec.coffee | 14 +- test/html_to_tokens.spec.coffee | 37 +++ test/render_operations.spec.coffee | 27 ++ 8 files changed, 672 insertions(+), 20 deletions(-) create mode 100644 js/htmldiff.js diff --git a/.gitignore b/.gitignore index 06f62bf..3c3629e 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1 @@ -*.js node_modules diff --git a/README.md b/README.md index 7da4ee1..fce14f0 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ # htmldiff.js ### HTML Diffing in JavaScript (ok, CoffeeScript actually.) -[![Build Status](https://secure.travis-ci.org/tnwinc/htmldiff.js.png)](http://travis-ci.org/tnwinc/htmldiff.js) +[![Build Status](https://travis-ci.org/keanulee/htmldiff.js.svg?branch=master)](https://travis-ci.org/keanulee/htmldiff.js) `htmldiff.js` is a CoffeeScript port of https://github.com/myobie/htmldiff (This one has a few more tests.) 
diff --git a/js/htmldiff.js b/js/htmldiff.js new file mode 100644 index 0000000..e4c56f0 --- /dev/null +++ b/js/htmldiff.js @@ -0,0 +1,482 @@ +// Generated by CoffeeScript 1.7.1 +(function() { + var Match, calculate_operations, consecutive_where, create_index, diff, find_match, find_matching_blocks, get_key_for_token, html_to_tokens, is_end_of_atomic_tag, is_end_of_tag, is_start_of_atomic_tag, is_start_of_tag, is_tag, is_whitespace, isnt_tag, op_map, recursively_find_matching_blocks, render_operations, wrap; + + is_end_of_tag = function(char) { + return char === '>'; + }; + + is_start_of_tag = function(char) { + return char === '<'; + }; + + is_whitespace = function(char) { + return /^\s+$/.test(char); + }; + + is_tag = function(token) { + return /^\s*<[^>]+>\s*$/.test(token); + }; + + isnt_tag = function(token) { + return !is_tag(token); + }; + + + /* + * Checks if the current word is the beginning of an atomic tag. An atomic tag is one whose + * child nodes should not be compared - the entire tag should be treated as one token. + * + * @param {string} word The characters of the current token read so far. + * + * @return {string|null} The name of the atomic tag if the word will be an atomic tag, + * null otherwise + */ + + is_start_of_atomic_tag = function(word) { + var result; + result = /^<(iframe|object|math|svg)/.exec(word); + if (result) { + result = result[1]; + } + return result; + }; + + + /* + * Checks if the current word is the end of an atomic tag (i.e. it has all the characters, + * except for the end bracket of the closing tag, such as "

') + .eql ['

', '', '

'] + + it 'should identify an object tag as a single token', -> + (expect @cut '

') + .eql ['

', '', '

'] + + it 'should identify a math tag as a single token', -> + (expect @cut '

' + + 'π' + + '' + + 'r2

') + .eql [ + '

', + '' + + 'π' + + '' + + 'r2', + '

'] + + it 'should identify a svg tag as a single token', -> + (expect @cut '

' + + '' + + '

') + .eql [ + '

', + '' + + '' + + '', + '

'] diff --git a/test/render_operations.spec.coffee b/test/render_operations.spec.coffee index a233315..179889a 100644 --- a/test/render_operations.spec.coffee +++ b/test/render_operations.spec.coffee @@ -63,3 +63,30 @@ describe 'render_operations', -> it 'should keep the change inside the

', -> (expect @res).to.equal '

thisI is awesome

' + + describe 'empty tokens', -> + it 'should not be wrapped', -> + before = ['text'] + after = ['text', ' '] + + @res = @cut before, after + + (expect @res).to.equal 'text' + + describe 'tags with attributes', -> + it 'should treat attribute changes as equal and output the after tag', -> + before = ['

', 'this', ' ', 'is', ' ', 'awesome', '

'] + after = ['

', 'this', ' ', 'is', ' ', 'awesome', '

'] + + @res = @cut before, after + + (expect @res).to.equal '

this is awesome

' + + it 'should show changes within tags with different attributes', -> + before = ['

', 'this', ' ', 'is', ' ', 'awesome', '

'] + after = ['

', 'that', ' ', 'is', ' ', 'awesome', '

'] + + @res = @cut before, after + + (expect @res).to.equal \ + '

thisthat is awesome

' From fcf51222893da261f2d22eca88e8d8bfa1c90962 Mon Sep 17 00:00:00 2001 From: Keanu Lee Date: Tue, 8 Apr 2014 12:27:45 -0700 Subject: [PATCH 2/7] PR comments --- src/htmldiff.coffee | 5 +++-- test/html_to_tokens.spec.coffee | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/src/htmldiff.coffee b/src/htmldiff.coffee index ec0475c..d746473 100644 --- a/src/htmldiff.coffee +++ b/src/htmldiff.coffee @@ -6,7 +6,8 @@ isnt_tag = (token)-> not is_tag token ### * Checks if the current word is the beginning of an atomic tag. An atomic tag is one whose - * child nodes should not be compared - the entire tag should be treated as one token. + * child nodes should not be compared - the entire tag should be treated as one token. This + * is useful for tags where it does not make sense to insert and tags. * * @param {string} word The characters of the current token read so far. * @@ -118,7 +119,7 @@ html_to_tokens = (html)-> ### * Creates a key that should be used to match tokens. This is useful, for example, if we want * to consider two open tag tokens as equal, even if they don't have the same attributes. We - * use a key instead of overwriting the token because we may want to render original string + * use a key instead of overwriting the token because we may want to render the original string * without losing the attributes. * * @param {string} token The token to create the key for. diff --git a/test/html_to_tokens.spec.coffee b/test/html_to_tokens.spec.coffee index a770b18..8455d61 100644 --- a/test/html_to_tokens.spec.coffee +++ b/test/html_to_tokens.spec.coffee @@ -55,7 +55,7 @@ describe 'html_to_tokens', -> 'r2', '

'] - it 'should identify a svg tag as a single token', -> + it 'should identify an svg tag as a single token', -> (expect @cut '

' + '' + '

') From 31944823ab90b0d62950d69daff9bb0bbe996a13 Mon Sep 17 00:00:00 2001 From: Keanu Lee Date: Thu, 10 Apr 2014 17:29:46 -0700 Subject: [PATCH 3/7] Wrap void and atomic tags with INS and DEL as well. --- js/htmldiff.js | 37 ++++++++++++++++++++++++++---- src/htmldiff.coffee | 24 +++++++++++++++++-- test/html_to_tokens.spec.coffee | 4 ++++ test/render_operations.spec.coffee | 17 ++++++++++++++ 4 files changed, 75 insertions(+), 7 deletions(-) diff --git a/js/htmldiff.js b/js/htmldiff.js index e4c56f0..76b01f4 100644 --- a/js/htmldiff.js +++ b/js/htmldiff.js @@ -1,6 +1,6 @@ // Generated by CoffeeScript 1.7.1 (function() { - var Match, calculate_operations, consecutive_where, create_index, diff, find_match, find_matching_blocks, get_key_for_token, html_to_tokens, is_end_of_atomic_tag, is_end_of_tag, is_start_of_atomic_tag, is_start_of_tag, is_tag, is_whitespace, isnt_tag, op_map, recursively_find_matching_blocks, render_operations, wrap; + var Match, calculate_operations, consecutive_where, create_index, diff, find_match, find_matching_blocks, get_key_for_token, html_to_tokens, is_end_of_atomic_tag, is_end_of_tag, is_start_of_atomic_tag, is_start_of_tag, is_tag, is_void_tag, is_whitespace, is_wrappable, isnt_tag, op_map, recursively_find_matching_blocks, render_operations, wrap; is_end_of_tag = function(char) { return char === '>'; @@ -25,7 +25,8 @@ /* * Checks if the current word is the beginning of an atomic tag. An atomic tag is one whose - * child nodes should not be compared - the entire tag should be treated as one token. + * child nodes should not be compared - the entire tag should be treated as one token. This + * is useful for tags where it does not make sense to insert and tags. * * @param {string} word The characters of the current token read so far. 
* @@ -35,7 +36,7 @@ is_start_of_atomic_tag = function(word) { var result; - result = /^<(iframe|object|math|svg)/.exec(word); + result = /^<(iframe|object|math|svg|script)/.exec(word); if (result) { result = result[1]; } @@ -58,6 +59,32 @@ return (word.substring(word.length - tag.length - 2)) === ("]+\/>\s*$/.test(token); + }; + + + /* + * Checks if a token can be wrapped inside a tag. + * + * @param {string} token The token to check. + * + * @return {boolean} True if the token can be wrapped inside a tag, false otherwise. + */ + + is_wrappable = function(token) { + return (isnt_tag(token)) || (is_start_of_atomic_tag(token)) || (is_void_tag(token)); + }; + Match = (function() { function Match(start_in_before, start_in_after, length) { this.start_in_before = start_in_before; @@ -176,7 +203,7 @@ /* * Creates a key that should be used to match tokens. This is useful, for example, if we want * to consider two open tag tokens as equal, even if they don't have the same attributes. We - * use a key instead of overwriting the token because we may want to render original string + * use a key instead of overwriting the token because we may want to render the original string * without losing the attributes. * * @param {string} token The token to create the key for. 
@@ -396,7 +423,7 @@ if (position >= length) { break; } - non_tags = consecutive_where(position, content, isnt_tag); + non_tags = consecutive_where(position, content, is_wrappable); position += non_tags.length; if (non_tags.length !== 0) { val = non_tags.join(''); diff --git a/src/htmldiff.coffee b/src/htmldiff.coffee index d746473..37a84a0 100644 --- a/src/htmldiff.coffee +++ b/src/htmldiff.coffee @@ -15,7 +15,7 @@ isnt_tag = (token)-> not is_tag token * null otherwise ### is_start_of_atomic_tag = (word)-> - result = /^<(iframe|object|math|svg)/.exec word + result = /^<(iframe|object|math|svg|script)/.exec word result = result[1] if result return result @@ -32,6 +32,26 @@ is_start_of_atomic_tag = (word)-> is_end_of_atomic_tag = (word, tag)-> (word.substring word.length - tag.length - 2) is " + /^\s*<[^>]+\/>\s*$/.test token + +### + * Checks if a token can be wrapped inside a tag. + * + * @param {string} token The token to check. + * + * @return {boolean} True if the token can be wrapped inside a tag, false otherwise. +### +is_wrappable = (token) -> + (isnt_tag token) or (is_start_of_atomic_tag token) or (is_void_tag token) + class Match constructor: (@start_in_before, @start_in_after, @length)-> @end_in_before = (@start_in_before + @length) - 1 @@ -333,7 +353,7 @@ wrap = (tag, content)-> loop break if position >= length - non_tags = consecutive_where position, content, isnt_tag + non_tags = consecutive_where position, content, is_wrappable position += non_tags.length if non_tags.length isnt 0 val = non_tags.join '' diff --git a/test/html_to_tokens.spec.coffee b/test/html_to_tokens.spec.coffee index 8455d61..15139f5 100644 --- a/test/html_to_tokens.spec.coffee +++ b/test/html_to_tokens.spec.coffee @@ -65,3 +65,7 @@ describe 'html_to_tokens', -> '' + '', '

'] + + it 'should identify a script tag as a single token', -> + (expect @cut '

') + .eql ['

', '', '

'] diff --git a/test/render_operations.spec.coffee b/test/render_operations.spec.coffee index 179889a..c7f0394 100644 --- a/test/render_operations.spec.coffee +++ b/test/render_operations.spec.coffee @@ -90,3 +90,20 @@ describe 'render_operations', -> (expect @res).to.equal \ '

thisthat is awesome

' + + describe 'wrappable tags', -> + it 'should wrap void tags', -> + before = ['old', ' ', 'text'] + after = ['new', '
', ' ', 'text'] + + @res = @cut before, after + + (expect @res).to.equal 'oldnew
text' + + it 'should wrap atomic tags', -> + before = ['old', '', ' ', 'text'] + after = ['new', ' ', 'text'] + + @res = @cut before, after + + (expect @res).to.equal 'oldnew text' From af8e6d7928a6c55c8bc6b5ffaa1901e602e32fac Mon Sep 17 00:00:00 2001 From: Keanu Lee Date: Fri, 11 Apr 2014 14:10:24 -0700 Subject: [PATCH 4/7] Pass in class name to include in wrapper tags. --- js/htmldiff.js | 41 +++++++++++++++++++++++++++-------------- src/htmldiff.coffee | 39 +++++++++++++++++++++++++-------------- test/diff.spec.coffee | 5 +++++ 3 files changed, 57 insertions(+), 28 deletions(-) diff --git a/js/htmldiff.js b/js/htmldiff.js index 76b01f4..1cde3f8 100644 --- a/js/htmldiff.js +++ b/js/htmldiff.js @@ -414,8 +414,8 @@ return []; }; - wrap = function(tag, content) { - var length, non_tags, position, rendering, tags, val; + wrap = function(tag, content, class_name) { + var attrs, length, non_tags, position, rendering, tags, val; rendering = ''; position = 0; length = content.length; @@ -427,8 +427,9 @@ position += non_tags.length; if (non_tags.length !== 0) { val = non_tags.join(''); + attrs = class_name ? 
" class=\"" + class_name + "\"" : ''; if (val.trim()) { - rendering += "<" + tag + ">" + val + ""; + rendering += "<" + tag + attrs + ">" + val + ""; } } if (position >= length) { @@ -442,36 +443,48 @@ }; op_map = { - equal: function(op, before_tokens, after_tokens) { + equal: function(op, before_tokens, after_tokens, class_name) { return after_tokens.slice(op.start_in_after, +op.end_in_after + 1 || 9e9).join(''); }, - insert: function(op, before_tokens, after_tokens) { + insert: function(op, before_tokens, after_tokens, class_name) { var val; val = after_tokens.slice(op.start_in_after, +op.end_in_after + 1 || 9e9); - return wrap('ins', val); + return wrap('ins', val, class_name); }, - "delete": function(op, before_tokens, after_tokens) { + "delete": function(op, before_tokens, after_tokens, class_name) { var val; val = before_tokens.slice(op.start_in_before, +op.end_in_before + 1 || 9e9); - return wrap('del', val); + return wrap('del', val, class_name); } }; - op_map.replace = function(op, before_tokens, after_tokens) { - return (op_map["delete"](op, before_tokens, after_tokens)) + (op_map.insert(op, before_tokens, after_tokens)); + op_map.replace = function(op, before_tokens, after_tokens, class_name) { + return (op_map["delete"](op, before_tokens, after_tokens, class_name)) + (op_map.insert(op, before_tokens, after_tokens, class_name)); }; - render_operations = function(before_tokens, after_tokens, operations) { + render_operations = function(before_tokens, after_tokens, operations, class_name) { var op, rendering, _i, _len; rendering = ''; for (_i = 0, _len = operations.length; _i < _len; _i++) { op = operations[_i]; - rendering += op_map[op.action](op, before_tokens, after_tokens); + rendering += op_map[op.action](op, before_tokens, after_tokens, class_name); } return rendering; }; - diff = function(before, after) { + + /* + * Compares two pieces of HTML content and returns the combined content with differences + * wrapped in and tags. 
+ * + * @param {string} before The HTML content before the changes. + * @param {string} after The HTML content after the changes. + * @param {string} class_name (Optional) The class attribute to include in and tags. + * + * @return {string} The combined HTML content with differences wrapped in and tags. + */ + + diff = function(before, after, class_name) { var ops; if (before === after) { return before; @@ -479,7 +492,7 @@ before = html_to_tokens(before); after = html_to_tokens(after); ops = calculate_operations(before, after); - return render_operations(before, after, ops); + return render_operations(before, after, ops, class_name); }; diff.html_to_tokens = html_to_tokens; diff --git a/src/htmldiff.coffee b/src/htmldiff.coffee index 37a84a0..27e32e6 100644 --- a/src/htmldiff.coffee +++ b/src/htmldiff.coffee @@ -346,7 +346,7 @@ consecutive_where = (start, content, predicate)-> return content[0..last_matching_index] if last_matching_index? return [] -wrap = (tag, content)-> +wrap = (tag, content, class_name)-> rendering = '' position = 0 length = content.length @@ -357,7 +357,8 @@ wrap = (tag, content)-> position += non_tags.length if non_tags.length isnt 0 val = non_tags.join '' - rendering += "<#{tag}>#{val}" if val.trim() + attrs = if class_name then " class=\"#{class_name}\"" else '' + rendering += "<#{tag}#{attrs}>#{val}" if val.trim() break if position >= length tags = consecutive_where position, content, is_tag @@ -367,29 +368,39 @@ wrap = (tag, content)-> return rendering op_map = - equal: (op, before_tokens, after_tokens)-> + equal: (op, before_tokens, after_tokens, class_name)-> after_tokens[op.start_in_after..op.end_in_after].join '' - insert: (op, before_tokens, after_tokens)-> + insert: (op, before_tokens, after_tokens, class_name)-> val = after_tokens[op.start_in_after..op.end_in_after] - wrap 'ins', val + wrap 'ins', val, class_name - delete: (op, before_tokens, after_tokens)-> + delete: (op, before_tokens, after_tokens, class_name)-> val = 
before_tokens[op.start_in_before..op.end_in_before] - wrap 'del', val + wrap 'del', val, class_name -op_map.replace = (op, before_tokens, after_tokens)-> - (op_map.delete op, before_tokens, after_tokens) + - (op_map.insert op, before_tokens, after_tokens) +op_map.replace = (op, before_tokens, after_tokens, class_name)-> + (op_map.delete op, before_tokens, after_tokens, class_name) + + (op_map.insert op, before_tokens, after_tokens, class_name) -render_operations = (before_tokens, after_tokens, operations)-> +render_operations = (before_tokens, after_tokens, operations, class_name)-> rendering = '' for op in operations - rendering += op_map[op.action] op, before_tokens, after_tokens + rendering += op_map[op.action] op, before_tokens, after_tokens, class_name return rendering -diff = (before, after)-> +### + * Compares two pieces of HTML content and returns the combined content with differences + * wrapped in and tags. + * + * @param {string} before The HTML content before the changes. + * @param {string} after The HTML content after the changes. + * @param {string} class_name (Optional) The class attribute to include in and tags. + * + * @return {string} The combined HTML content with differences wrapped in and tags. 
+### +diff = (before, after, class_name)-> return before if before is after before = html_to_tokens before @@ -397,7 +408,7 @@ diff = (before, after)-> ops = calculate_operations before, after - render_operations before, after, ops + render_operations before, after, ops, class_name diff.html_to_tokens = html_to_tokens diff --git a/test/diff.spec.coffee b/test/diff.spec.coffee index c0ad77c..852ff3d 100644 --- a/test/diff.spec.coffee +++ b/test/diff.spec.coffee @@ -25,3 +25,8 @@ describe 'Diff', -> it 'should consider non-breaking spaces and non-adjacent regular spaces as equal', -> (expect @cut 'Hello world', 'Hello world').to.equal 'Hello world' + + describe 'When a class name is specified', -> + it 'should include the class in the wrapper tags', -> + (expect @cut 'input', 'input 2', 'diff-result').to.equal \ + 'input 2' From f2e637531219381f230bf0705b3f9015346da353 Mon Sep 17 00:00:00 2001 From: Keanu Lee Date: Mon, 14 Apr 2014 17:43:53 -0700 Subject: [PATCH 5/7] Fixed compilers arg in mocha options file --- package.json | 2 +- test/mocha.opts | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/package.json b/package.json index 893846a..2322191 100644 --- a/package.json +++ b/package.json @@ -4,7 +4,7 @@ "description": "HTML Diffing in JavaScript (CoffeeScript)", "main": "htmldiff.js", "scripts": { - "test": "mocha -R min --compilers coffee:coffee-script/register", + "test": "mocha -R min", "install": "coffee --output js/ --compile src/" }, "repository": { diff --git a/test/mocha.opts b/test/mocha.opts index b0f5199..ab28531 100644 --- a/test/mocha.opts +++ b/test/mocha.opts @@ -1,4 +1,4 @@ ---compilers coffee:coffee-script +--compilers coffee:coffee-script/register --require test/config.js --ui bdd --reporter spec From e78e2153f035fd63e1990ed206fde26266bba35c Mon Sep 17 00:00:00 2001 From: Keanu Lee Date: Tue, 15 Apr 2014 11:50:49 -0700 Subject: [PATCH 6/7] More code documentation. 
--- js/htmldiff.js | 167 ++++++++++++++++++++++++++++++++++++++++++++ src/htmldiff.coffee | 149 ++++++++++++++++++++++++++++++++++++++- 2 files changed, 315 insertions(+), 1 deletion(-) diff --git a/js/htmldiff.js b/js/htmldiff.js index 1cde3f8..40ca063 100644 --- a/js/htmldiff.js +++ b/js/htmldiff.js @@ -1,4 +1,34 @@ // Generated by CoffeeScript 1.7.1 + +/* + * htmldiff.js is a library that compares HTML content. It creates a diff between two + * HTML documents by combining the two documents and wrapping the differences with + * and tags. Here is a high-level overview of how the diff works. + * + * 1. Tokenize the before and after HTML with html_to_tokens. + * 2. Generate a list of operations that convert the before list of tokens to the after + * list of tokens with calculate_operations, which does the following: + * a. Find all the matching blocks of tokens between the before and after lists of + * tokens with find_matching_blocks. This is done by finding the single longest + * matching block with find_match, then recursively finding the next longest + * matching block that precede and follow the longest matching block with + * recursively_find_matching_blocks. + * b. Determine insertions, deletions, and replacements from the matching blocks. + * This is done in calculate_operations. + * 3. Render the list of operations by wrapping tokens with and tags where + * appropriate with render_operations. + * + * Example usage: + * + * htmldiff = require 'htmldiff.js' + * + * htmldiff '

this is some text

', '

this is some more text

' + * == '

this is some more text

' + * + * htmldiff '

this is some text

', '

this is some more text

', 'diff-class' + * == '

this is some more text

' + */ + (function() { var Match, calculate_operations, consecutive_where, create_index, diff, find_match, find_matching_blocks, get_key_for_token, html_to_tokens, is_end_of_atomic_tag, is_end_of_tag, is_start_of_atomic_tag, is_start_of_tag, is_tag, is_void_tag, is_whitespace, is_wrappable, isnt_tag, op_map, recursively_find_matching_blocks, render_operations, wrap; @@ -85,6 +115,16 @@ return (isnt_tag(token)) || (is_start_of_atomic_tag(token)) || (is_void_tag(token)); }; + + /* + * A Match stores the information of a matching block. A matching block is a list of + * consecutive tokens that appear in both the before and after lists of tokens. + * + * @param {number} start_in_before The index of the first token in the list of before tokens. + * @param {number} start_in_after The index of the first token in the list of after tokens. + * @param {number} length The number of consecutive matching tokens in this block. + */ + Match = (function() { function Match(start_in_before, start_in_after, length) { this.start_in_before = start_in_before; @@ -223,6 +263,23 @@ return token; }; + + /* + * Finds the matching block with the most consecutive tokens within the given range in the + * before and after lists of tokens. + * + * @param {Array.} before_tokens The before list of tokens. + * @param {Array.} after_tokens The after list of tokens. + * @param {Object} index_of_before_locations_in_after_tokens The index that is used to search + * for tokens in the after list. + * @param {number} start_in_before The beginning of the range in the list of before tokens. + * @param {number} end_in_before The end of the range in the list of before tokens. + * @param {number} start_in_after The beginning of the range in the list of after tokens. + * @param {number} end_in_after The end of the range in the list of after tokens. + * + * @return {Match} A Match that describes the best matching block in the given range. 
+ */ + find_match = function(before_tokens, after_tokens, index_of_before_locations_in_after_tokens, start_in_before, end_in_before, start_in_after, end_in_after) { var best_match_in_after, best_match_in_before, best_match_length, index_in_after, index_in_before, locations_in_after, looking_for, match, match_length_at, new_match_length, new_match_length_at, _i, _j, _len; best_match_in_before = start_in_before; @@ -260,6 +317,25 @@ return match; }; + + /* + * Finds all the matching blocks within the given range in the before and after lists of + * tokens. This function is called recursively to find the next best matches that precede + * and follow the first best match. + * + * @param {Array.} before_tokens The before list of tokens. + * @param {Array.} after_tokens The after list of tokens. + * @param {Object} index_of_before_locations_in_after_tokens The index that is used to search + * for tokens in the after list. + * @param {number} start_in_before The beginning of the range in the list of before tokens. + * @param {number} end_in_before The end of the range in the list of before tokens. + * @param {number} start_in_after The beginning of the range in the list of after tokens. + * @param {number} end_in_after The end of the range in the list of after tokens. + * @param {Array.} matching_blocks The list of matching blocks found so far. + * + * @return {Array.} The list of matching blocks in this range. + */ + recursively_find_matching_blocks = function(before_tokens, after_tokens, index_of_before_locations_in_after_tokens, start_in_before, end_in_before, start_in_after, end_in_after, matching_blocks) { var match; match = find_match(before_tokens, after_tokens, index_of_before_locations_in_after_tokens, start_in_before, end_in_before, start_in_after, end_in_after); @@ -314,6 +390,17 @@ return index; }; + + /* + * Finds all the matching blocks in the before and after lists of tokens. 
This function + * is a wrapper for the recursive function recursively_find_matching_blocks. + * + * @param {Array.} before_tokens The before list of tokens. + * @param {Array.} after_tokens The after list of tokens. + * + * @return {Array.} The list of matching blocks. + */ + find_matching_blocks = function(before_tokens, after_tokens) { var index_of_before_locations_in_after_tokens, matching_blocks; matching_blocks = []; @@ -324,6 +411,25 @@ return recursively_find_matching_blocks(before_tokens, after_tokens, index_of_before_locations_in_after_tokens, 0, before_tokens.length, 0, after_tokens.length, matching_blocks); }; + + /* + * Gets a list of operations required to transform the before list of tokens into the + * after list of tokens. An operation describes whether a particular list of consecutive + * tokens are equal, replaced, inserted, or deleted. + * + * @param {Array.} before_tokens The before list of tokens. + * @param {Array.} after_tokens The after list of tokens. + * + * @return {Array.} The list of operations to transform the before list of + * tokens into the after list of tokens, where each operation has the following + * keys: + * - {string} action One of {'replace', 'insert', 'delete', 'equal'}. + * - {number} start_in_before The beginning of the range in the list of before tokens. + * - {number} end_in_before The end of the range in the list of before tokens. + * - {number} start_in_after The beginning of the range in the list of after tokens. + * - {number} end_in_after The end of the range in the list of after tokens. 
+ */ + calculate_operations = function(before_tokens, after_tokens) { var action_map, action_up_to_match_positions, index, is_single_whitespace, last_op, match, match_starts_at_current_position_in_after, match_starts_at_current_position_in_before, matches, op, operations, position_in_after, position_in_before, post_processed, _i, _j, _len, _len1; if (before_tokens == null) { @@ -394,6 +500,18 @@ return post_processed; }; + + /* + * Returns a list of tokens of a particular type starting at a given index. + * + * @param {number} start The index of the first token to test. + * @param {Array.} content The list of tokens. + * @param {function} predicate A function that returns true if a token is of + * a particular type, false otherwise. It should accept the following + * parameters: + * - {string} The token to test. + */ + consecutive_where = function(start, content, predicate) { var answer, index, last_matching_index, token, _i, _len; content = content.slice(start, +content.length + 1 || 9e9); @@ -414,6 +532,16 @@ return []; }; + + /* + * Wraps and concatenates a list of tokens with a tag. Does not wrap tag tokens, + * unless they are wrappable (i.e. void and atomic tags). + * + * @param {string} tag The tag name of the wrapper tags. + * @param {Array.} content The list of tokens to wrap. + * @param {string} class_name (Optional) The class name to include in the wrapper tag. + */ + wrap = function(tag, content, class_name) { var attrs, length, non_tags, position, rendering, tags, val; rendering = ''; @@ -442,6 +570,25 @@ return rendering; }; + + /* + * op_map.equal/insert/delete/replace are functions that render an operation into + * HTML content. + * + * @param {Object} op The operation that applies to a particular list of tokens. Has the + * following keys: + * - {string} action One of {'replace', 'insert', 'delete', 'equal'}. + * - {number} start_in_before The beginning of the range in the list of before tokens. 
+ * - {number} end_in_before The end of the range in the list of before tokens. + * - {number} start_in_after The beginning of the range in the list of after tokens. + * - {number} end_in_after The end of the range in the list of after tokens. + * @param {Array.} before_tokens The before list of tokens. + * @param {Array.} after_tokens The after list of tokens. + * @param {string} class_name (Optional) The class name to include in the wrapper tag. + * + * @return {string} The rendering of that operation. + */ + op_map = { equal: function(op, before_tokens, after_tokens, class_name) { return after_tokens.slice(op.start_in_after, +op.end_in_after + 1 || 9e9).join(''); @@ -462,6 +609,26 @@ return (op_map["delete"](op, before_tokens, after_tokens, class_name)) + (op_map.insert(op, before_tokens, after_tokens, class_name)); }; + + /* + * Renders a list of operations into HTML content. The result is the combined version + * of the before and after tokens with the differences wrapped in tags. + * + * @param {Array.} before_tokens The before list of tokens. + * @param {Array.} after_tokens The after list of tokens. + * @param {Array.} operations The list of operations to transform the before + * list of tokens into the after list of tokens, where each operation has the + * following keys: + * - {string} action One of {'replace', 'insert', 'delete', 'equal'}. + * - {number} start_in_before The beginning of the range in the list of before tokens. + * - {number} end_in_before The end of the range in the list of before tokens. + * - {number} start_in_after The beginning of the range in the list of after tokens. + * - {number} end_in_after The end of the range in the list of after tokens. + * @param {string} class_name (Optional) The class name to include in the wrapper tag. + * + * @return {string} The rendering of the list of operations. 
+ */ + render_operations = function(before_tokens, after_tokens, operations, class_name) { var op, rendering, _i, _len; rendering = ''; diff --git a/src/htmldiff.coffee b/src/htmldiff.coffee index 27e32e6..46cb5a1 100644 --- a/src/htmldiff.coffee +++ b/src/htmldiff.coffee @@ -1,3 +1,32 @@ +### + * htmldiff.js is a library that compares HTML content. It creates a diff between two + * HTML documents by combining the two documents and wrapping the differences with + * <ins> and <del> tags. Here is a high-level overview of how the diff works. + * + * 1. Tokenize the before and after HTML with html_to_tokens. + * 2. Generate a list of operations that convert the before list of tokens to the after + * list of tokens with calculate_operations, which does the following: + * a. Find all the matching blocks of tokens between the before and after lists of + * tokens with find_matching_blocks. This is done by finding the single longest + * matching block with find_match, then recursively finding the next longest + * matching blocks that precede and follow the longest matching block with + * recursively_find_matching_blocks. + * b. Determine insertions, deletions, and replacements from the matching blocks. + * This is done in calculate_operations. + * 3. Render the list of operations by wrapping tokens with <ins> and <del> tags where + * appropriate with render_operations. + * + * Example usage: + * + * htmldiff = require 'htmldiff.js' + * + * htmldiff '<p>this is some text</p>', '<p>this is some more text</p>' + * == '<p>this is some <ins>more </ins>text</p>' + * + * htmldiff '<p>this is some text</p>', '<p>this is some more text</p>', 'diff-class' + * == '<p>this is some <ins class="diff-class">more </ins>text</p>' +### + is_end_of_tag = (char)-> char is '>' is_start_of_tag = (char)-> char is '<' is_whitespace = (char)-> /^\s+$/.test char @@ -52,6 +81,14 @@ is_void_tag = (token) -> is_wrappable = (token) -> (isnt_tag token) or (is_start_of_atomic_tag token) or (is_void_tag token) +### + * A Match stores the information of a matching block. A matching block is a list of + * consecutive tokens that appear in both the before and after lists of tokens. + * + * @param {number} start_in_before The index of the first token in the list of before tokens. + * @param {number} start_in_after The index of the first token in the list of after tokens. + * @param {number} length The number of consecutive matching tokens in this block. +### class Match constructor: (@start_in_before, @start_in_after, @length)-> @end_in_before = (@start_in_before + @length) - 1 @@ -158,6 +195,21 @@ get_key_for_token = (token)-> return token +### + * Finds the matching block with the most consecutive tokens within the given range in the + * before and after lists of tokens. + * + * @param {Array.<string>} before_tokens The before list of tokens. + * @param {Array.<string>} after_tokens The after list of tokens. + * @param {Object} index_of_before_locations_in_after_tokens The index that is used to search + * for tokens in the after list. + * @param {number} start_in_before The beginning of the range in the list of before tokens. + * @param {number} end_in_before The end of the range in the list of before tokens. + * @param {number} start_in_after The beginning of the range in the list of after tokens. + * @param {number} end_in_after The end of the range in the list of after tokens. + * + * @return {Match} A Match that describes the best matching block in the given range. 
+### find_match = (before_tokens, after_tokens, index_of_before_locations_in_after_tokens, start_in_before, end_in_before, @@ -197,6 +249,23 @@ find_match = (before_tokens, after_tokens, return match +### + * Finds all the matching blocks within the given range in the before and after lists of + * tokens. This function is called recursively to find the next best matches that precede + * and follow the first best match. + * + * @param {Array.} before_tokens The before list of tokens. + * @param {Array.} after_tokens The after list of tokens. + * @param {Object} index_of_before_locations_in_after_tokens The index that is used to search + * for tokens in the after list. + * @param {number} start_in_before The beginning of the range in the list of before tokens. + * @param {number} end_in_before The end of the range in the list of before tokens. + * @param {number} start_in_after The beginning of the range in the list of after tokens. + * @param {number} end_in_after The end of the range in the list of after tokens. + * @param {Array.} matching_blocks The list of matching blocks found so far. + * + * @return {Array.} The list of matching blocks in this range. +### recursively_find_matching_blocks = (before_tokens, after_tokens, index_of_before_locations_in_after_tokens, start_in_before, end_in_before, @@ -258,6 +327,15 @@ create_index = (options)-> return index +### + * Finds all the matching blocks in the before and after lists of tokens. This function + * is a wrapper for the recursive function recursively_find_matching_blocks. + * + * @param {Array.} before_tokens The before list of tokens. + * @param {Array.} after_tokens The after list of tokens. + * + * @return {Array.} The list of matching blocks. 
+### find_matching_blocks = (before_tokens, after_tokens)-> matching_blocks = [] index_of_before_locations_in_after_tokens = @@ -271,6 +349,23 @@ find_matching_blocks = (before_tokens, after_tokens)-> 0, after_tokens.length, matching_blocks +### + * Gets a list of operations required to transform the before list of tokens into the + * after list of tokens. An operation describes whether a particular list of consecutive + * tokens are equal, replaced, inserted, or deleted. + * + * @param {Array.} before_tokens The before list of tokens. + * @param {Array.} after_tokens The after list of tokens. + * + * @return {Array.} The list of operations to transform the before list of + * tokens into the after list of tokens, where each operation has the following + * keys: + * - {string} action One of {'replace', 'insert', 'delete', 'equal'}. + * - {number} start_in_before The beginning of the range in the list of before tokens. + * - {number} end_in_before The end of the range in the list of before tokens. + * - {number} start_in_after The beginning of the range in the list of after tokens. + * - {number} end_in_after The end of the range in the list of after tokens. +### calculate_operations = (before_tokens, after_tokens)-> throw new Error 'before_tokens?' unless before_tokens? throw new Error 'after_tokens?' unless after_tokens? @@ -334,6 +429,16 @@ calculate_operations = (before_tokens, after_tokens)-> return post_processed +### + * Returns a list of tokens of a particular type starting at a given index. + * + * @param {number} start The index of first token to test. + * @param {Array.} content The list of tokens. + * @param {function} predicate A function that returns true if a token is of + * a particular type, false otherwise. It should accept the following + * parameters: + * - {string} The token to test. 
+### consecutive_where = (start, content, predicate)-> content = content[start..content.length] last_matching_index = undefined @@ -346,6 +451,14 @@ consecutive_where = (start, content, predicate)-> return content[0..last_matching_index] if last_matching_index? return [] +### + * Wraps and concatenates a list of tokens with a tag. Does not wrap tag tokens, + * unless they are wrappable (i.e. void and atomic tags). + * + * @param {string} tag The tag name of the wrapper tags. + * @param {Array.<string>} content The list of tokens to wrap. + * @param {string} class_name (Optional) The class name to include in the wrapper tag. +### wrap = (tag, content, class_name)-> rendering = '' position = 0 @@ -367,6 +480,23 @@ wrap = (tag, content, class_name)-> return rendering +### + * op_map.equal/insert/delete/replace are functions that render an operation into + * HTML content. + * + * @param {Object} op The operation that applies to a particular list of tokens. Has the + * following keys: + * - {string} action One of {'replace', 'insert', 'delete', 'equal'}. + * - {number} start_in_before The beginning of the range in the list of before tokens. + * - {number} end_in_before The end of the range in the list of before tokens. + * - {number} start_in_after The beginning of the range in the list of after tokens. + * - {number} end_in_after The end of the range in the list of after tokens. + * @param {Array.<string>} before_tokens The before list of tokens. + * @param {Array.<string>} after_tokens The after list of tokens. + * @param {string} class_name (Optional) The class name to include in the wrapper tag. + * + * @return {string} The rendering of that operation. 
+### op_map = equal: (op, before_tokens, after_tokens, class_name)-> after_tokens[op.start_in_after..op.end_in_after].join '' @@ -383,6 +513,24 @@ op_map.replace = (op, before_tokens, after_tokens, class_name)-> (op_map.delete op, before_tokens, after_tokens, class_name) + (op_map.insert op, before_tokens, after_tokens, class_name) +### + * Renders a list of operations into HTML content. The result is the combined version + * of the before and after tokens with the differences wrapped in tags. + * + * @param {Array.} before_tokens The before list of tokens. + * @param {Array.} after_tokens The after list of tokens. + * @param {Array.} operations The list of operations to transform the before + * list of tokens into the after list of tokens, where each operation has the + * following keys: + * - {string} action One of {'replace', 'insert', 'delete', 'equal'}. + * - {number} start_in_before The beginning of the range in the list of before tokens. + * - {number} end_in_before The end of the range in the list of before tokens. + * - {number} start_in_after The beginning of the range in the list of after tokens. + * - {number} end_in_after The end of the range in the list of after tokens. + * @param {string} class_name (Optional) The class name to include in the wrapper tag. + * + * @return {string} The rendering of the list of operations. +### render_operations = (before_tokens, after_tokens, operations, class_name)-> rendering = '' for op in operations @@ -410,7 +558,6 @@ diff = (before, after, class_name)-> render_operations before, after, ops, class_name - diff.html_to_tokens = html_to_tokens diff.find_matching_blocks = find_matching_blocks find_matching_blocks.find_match = find_match From d20e03180d6c342d97bf22b4932acca24d2af878 Mon Sep 17 00:00:00 2001 From: Keanu Lee Date: Mon, 14 Apr 2014 17:19:02 -0700 Subject: [PATCH 7/7] Make fork mergable with upstream tnwinc repo. 
--- .gitignore | 1 + README.md | 2 +- js/htmldiff.js | 689 ------------------------------------------------- package.json | 2 +- 4 files changed, 3 insertions(+), 691 deletions(-) delete mode 100644 js/htmldiff.js diff --git a/.gitignore b/.gitignore index 3c3629e..06f62bf 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,2 @@ +*.js node_modules diff --git a/README.md b/README.md index fce14f0..7da4ee1 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ # htmldiff.js ### HTML Diffing in JavaScript (ok, CoffeeScript actually.) -[![Build Status](https://travis-ci.org/keanulee/htmldiff.js.svg?branch=master)](https://travis-ci.org/keanulee/htmldiff.js) +[![Build Status](https://secure.travis-ci.org/tnwinc/htmldiff.js.png)](http://travis-ci.org/tnwinc/htmldiff.js) `htmldiff.js` is a CoffeeScript port of https://github.com/myobie/htmldiff (This one has a few more tests.) diff --git a/js/htmldiff.js b/js/htmldiff.js deleted file mode 100644 index 40ca063..0000000 --- a/js/htmldiff.js +++ /dev/null @@ -1,689 +0,0 @@ -// Generated by CoffeeScript 1.7.1 - -/* - * htmldiff.js is a library that compares HTML content. It creates a diff between two - * HTML documents by combining the two documents and wrapping the differences with - * and tags. Here is a high-level overview of how the diff works. - * - * 1. Tokenize the before and after HTML with html_to_tokens. - * 2. Generate a list of operations that convert the before list of tokens to the after - * list of tokens with calculate_operations, which does the following: - * a. Find all the matching blocks of tokens between the before and after lists of - * tokens with find_matching_blocks. This is done by finding the single longest - * matching block with find_match, then recursively finding the next longest - * matching block that precede and follow the longest matching block with - * recursively_find_matching_blocks. - * b. Determine insertions, deletions, and replacements from the matching blocks. 
- * This is done in calculate_operations. - * 3. Render the list of operations by wrapping tokens with and tags where - * appropriate with render_operations. - * - * Example usage: - * - * htmldiff = require 'htmldiff.js' - * - * htmldiff '

this is some text

', '

this is some more text

' - * == '

this is some more text

' - * - * htmldiff '

this is some text

', '

this is some more text

', 'diff-class' - * == '

this is some more text

' - */ - -(function() { - var Match, calculate_operations, consecutive_where, create_index, diff, find_match, find_matching_blocks, get_key_for_token, html_to_tokens, is_end_of_atomic_tag, is_end_of_tag, is_start_of_atomic_tag, is_start_of_tag, is_tag, is_void_tag, is_whitespace, is_wrappable, isnt_tag, op_map, recursively_find_matching_blocks, render_operations, wrap; - - is_end_of_tag = function(char) { - return char === '>'; - }; - - is_start_of_tag = function(char) { - return char === '<'; - }; - - is_whitespace = function(char) { - return /^\s+$/.test(char); - }; - - is_tag = function(token) { - return /^\s*<[^>]+>\s*$/.test(token); - }; - - isnt_tag = function(token) { - return !is_tag(token); - }; - - - /* - * Checks if the current word is the beginning of an atomic tag. An atomic tag is one whose - * child nodes should not be compared - the entire tag should be treated as one token. This - * is useful for tags where it does not make sense to insert and tags. - * - * @param {string} word The characters of the current token read so far. - * - * @return {string|null} The name of the atomic tag if the word will be an atomic tag, - * null otherwise - */ - - is_start_of_atomic_tag = function(word) { - var result; - result = /^<(iframe|object|math|svg|script)/.exec(word); - if (result) { - result = result[1]; - } - return result; - }; - - - /* - * Checks if the current word is the end of an atomic tag (i.e. it has all the characters, - * except for the end bracket of the closing tag, such as "