Text
- Item 1
- Item 2
diff --git a/htmldiff.gemspec b/htmldiff.gemspec index 1e61eab..c5a508b 100644 --- a/htmldiff.gemspec +++ b/htmldiff.gemspec @@ -13,6 +13,7 @@ Gem::Specification.new do |spec| spec.license = 'MIT' spec.add_dependency 'diff-lcs' + spec.add_dependency 'nokogiri', '>= 1' spec.files = Dir.glob('lib/**/*') + %w[LICENSE README] spec.require_paths = ['lib'] diff --git a/lib/html_diff.rb b/lib/html_diff.rb index f168ee0..8888bb9 100644 --- a/lib/html_diff.rb +++ b/lib/html_diff.rb @@ -1,7 +1,9 @@ # frozen_string_literal: true require 'html_diff/tokenizer' +require 'html_diff/dom_tokenizer' require 'html_diff/differ' +require 'html_diff/tree_differ' require 'html_diff/html_formatter' require 'html_diff/version' require 'html_diff/diff_builder' # @deprecated @@ -15,17 +17,26 @@ module HTMLDiff # # @param old_string [String] The original string # @param new_string [String] The new string + # @option :html_format [Hash] An optional hash of options to pass to the formatter. + # @option :preserve_dom [Boolean] Whether to preserve DOM structure in the diff + # output, ensuring valid HTML when diffing content with block elements. + # @option :merge_threshold [Object] Maximum string length of unchanged tokens + # to merge into neighboring changes. Value 0 merges only whitespace. + # Negative values disable merging. Default value is 5. # @option :tokenizer [Object] An optional object which responds to `tokenize`, # which is used break the input strings into an Array of LCS-diffable tokens. - # @option :format [Hash] An optional hash of options to pass to the formatter. # @option :formatter [Object] An optional object which responds to `format`, # which renders the LCS-diff output. - # @option :merge_threshold [Object] Maximum string length of unchanged tokens - # to merge into neighboring changes. Value 0 merges only whitespace. - # Negative values disable merging. Default value is 5. # @return [String] Diff of the two strings with additions and deletions marked. - def diff(old_string, new_string, tokenizer: nil, html_format: nil, formatter: nil, merge_threshold: nil) - tokenizer ||= Tokenizer + def diff(old_string, + new_string, + html_format: nil, + preserve_dom: false, + merge_threshold: nil, + tokenizer: nil, + formatter: nil) + + tokenizer ||= preserve_dom ? DomTokenizer : Tokenizer old_tokens = tokenizer.tokenize(old_string) new_tokens = tokenizer.tokenize(new_string) diff --git a/lib/html_diff/dom_tokenizer.rb b/lib/html_diff/dom_tokenizer.rb new file mode 100644 index 0000000..c7c98c6 --- /dev/null +++ b/lib/html_diff/dom_tokenizer.rb @@ -0,0 +1,277 @@ +# frozen_string_literal: true + +require 'nokogiri' + +module HTMLDiff + # Tokenizes HTML while preserving DOM structure + module DomTokenizer + extend self + + # Tokenizes HTML in a DOM-aware way + # + # @param html [String] The HTML string to tokenize + # @return [Array] Nested array structure representing the DOM + def tokenize(html) + return [] if !html || html.empty? + + html = html.encode('UTF-8', invalid: :replace, undef: :replace, replace: ' ') + + # Parse the HTML + doc = Nokogiri::HTML(html) + + result = [] + + # Add DOCTYPE if present + if doc.internal_subset + result << [''] + end + + # Process the root html element + html_element = doc.at_css('html') + if html_element + result << tokenize_element(html_element) + end + + result + end + + private + + # Tokenize an element into [name, attributes_hash, children_array] + def tokenize_element(element) + # Extract attributes + attrs = {} + element.attributes.each do |name, attr| + attrs[name] = attr.value + end + + # Process children + children = [] + element.children.each do |child| + if child.text? + # Tokenize text + tokens = tokenize_text(child.content) + children << tokens unless tokens.empty? + elsif child.element? + # Recursively tokenize element + children << tokenize_element(child) + end + end + + [element.name, attrs, children] + end + + # Tokenize text content + def tokenize_text(text) + return [] if text.strip.empty? + + # TODO: This junk needs to be fixed. + result = [] + words = text.split(/(\s+|\b|(?=[.,;:!?]))/) + .reject(&:empty?) + .map { |w| w =~ /\A\s+\z/ ? ' ' : w } + + words.each do |word| + result << word + end + + Tokenizer.tokenize(result.join('')) + end + end +end + +# frozen_string_literal: true + +require 'nokogiri' + +module HTMLDiff + # Tokenizes HTML while preserving DOM structure + class DomTokenizer + class ParseError < StandardError; end + + def initialize(options = {}) + @preserve_whitespace = options[:preserve_whitespace] || false + end + + # Tokenizes HTML in a DOM-aware way + # + # @param html [String] The HTML string to tokenize + # @return [Array] Nested array structure representing the DOM + def self.tokenize(html, options = {}) + new(options).tokenize(html) + end + + def tokenize(html) + return [] if !html || html.empty? + + begin + html = html.encode('UTF-8', invalid: :replace, undef: :replace, replace: ' ') + + # Simple text case + if !html.include?('<') && !html.include?('>') + return [html] + end + + # Parse the HTML + doc = Nokogiri::HTML(html, nil, 'UTF-8') + + result = [] + + # Add DOCTYPE if present (and if in the original HTML) + if doc.internal_subset && html.include?(''] + end + + # Check for malformed HTML + check_for_malformed_html(html) + + # Special case for a single paragaph of plain text + if doc.at_css('body') && + doc.at_css('body').children.size == 1 && + doc.at_css('body').children.first.text? && + !html.match(/<[^>]+>/) + return [doc.at_css('body').content] + end + + # Process html element or direct children depending on the input + if html.strip.start_with?(' *') + + # If no nodes found in body, try document level + nodes = doc.children.reject { |n| n.name == 'html' } if nodes.empty? + + nodes.each do |node| + token = process_node(node) + result << token if token + end + end + + result + rescue Nokogiri::XML::SyntaxError => e + raise ParseError, "Error parsing HTML: #{e.message}" + rescue => e + raise ParseError, "Error processing HTML: #{e.message}" + end + end + + private + + def check_for_malformed_html(html) + # Simple check for unbalanced tags + opening_tags = html.scan(/<([a-zA-Z0-9]+)(?:\s+[^>]*)?(?!\/)>/i).flatten + closing_tags = html.scan(/<\/([a-zA-Z0-9]+)>/i).flatten + + # Identify self-closing tags + void_elements = ['img', 'br', 'hr', 'meta', 'input', 'link', 'area', 'base', 'col', 'embed', + 'param', 'source', 'track', 'wbr'] + + # Count tags that need matching + tag_counts = Hash.new(0) + + opening_tags.each do |tag| + tag_counts[tag.downcase] += 1 unless void_elements.include?(tag.downcase) + end + + closing_tags.each do |tag| + tag_counts[tag.downcase] -= 1 + end + + # Check for any unbalanced tags + tag_counts.each do |tag, count| + if count != 0 + raise ParseError, "Unbalanced tags: #{tag}" + end + end + end + + def process_node(node) + case node.type + when Nokogiri::XML::Node::TEXT_NODE + process_text_node(node) + when Nokogiri::XML::Node::ELEMENT_NODE + process_element_node(node) + when Nokogiri::XML::Node::COMMENT_NODE + nil # Ignore comments + when Nokogiri::XML::Node::CDATA_SECTION_NODE + node.content # Return CDATA content as is + else + nil + end + end + + def process_text_node(node) + content = node.content + + # Skip empty text nodes unless preserving whitespace + return nil if !@preserve_whitespace && content.strip.empty? + + # Decode HTML entities + content = decode_html_entities(content) + + content + end + + def decode_html_entities(text) + text.gsub(/</, '<') + .gsub(/>/, '>') + .gsub(/&/, '&') + .gsub(/"/, '"') + .gsub(/'/, "'") + .gsub(/ /, ' ') + .gsub(/(\d+);/) { [$1.to_i].pack('U') } + .gsub(/([0-9a-fA-F]+);/) { [$1.to_i(16)].pack('U') } + end + + def process_element_node(node) + # Extract attributes + attrs = {} + node.attributes.each do |name, attr| + attrs[name] = attr.value + end + + # Handle self-closing tags + if is_self_closing?(node) + return [node.name, attrs, nil] + end + + # Process children + children = [] + has_text = false + has_elements = false + + node.children.each do |child| + if child.text? && (!child.content.strip.empty? || @preserve_whitespace) + has_text = true + elsif child.element? + has_elements = true + end + + child_token = process_node(child) + children << child_token if child_token + end + + # Return appropriate structure based on content type + if children.empty? + [node.name, attrs, []] + elsif has_text && has_elements + # Mixed content + [node.name, attrs, children] + elsif children.size == 1 && children.first.is_a?(String) + # Single text node child + [node.name, attrs, children.first] + else + # Multiple children or single element child + [node.name, attrs, children] + end + end + + def is_self_closing?(node) + ['img', 'br', 'hr', 'meta', 'input', 'link', 'area', 'base', 'col', 'embed', + 'param', 'source', 'track', 'wbr'].include?(node.name.downcase) && node.children.empty? + end + end +end diff --git a/lib/html_diff/tree_differ.rb b/lib/html_diff/tree_differ.rb new file mode 100644 index 0000000..bf3ceda --- /dev/null +++ b/lib/html_diff/tree_differ.rb @@ -0,0 +1,313 @@ +# frozen_string_literal: true + +require 'nokogiri' + +module HTMLDiff + # A structure-aware HTML differ that preserves DOM integrity + class TreeDiffer + # Block elements that should be preserved as structural units + BLOCK_ELEMENTS = %w[ + address article aside blockquote canvas dd div dl dt fieldset figcaption figure + footer form h1 h2 h3 h4 h5 h6 header hr li main nav ol p pre section + table tbody tfoot th thead tr ul video + ].freeze + + # Generate an HTML diff between two HTML strings + # + # @param old_html [String] The original HTML + # @param new_html [String] The new HTML + # @param html_format [Hash] Hash of options for formatting the output + # @return [String] HTML string with changes marked + def self.diff(old_html, new_html, html_format = {}) + diff_options = html_format || {} + + # Parse HTML documents + old_doc = parse_html(old_html) + new_doc = parse_html(new_html) + + # Normalize and prepare documents + normalize_nodes(old_doc) + normalize_nodes(new_doc) + + # Generate diff + changes = diff_trees(old_doc, new_doc) + + # Apply post-processing to ensure valid HTML + result_doc = post_process(changes, diff_options) + + # Convert back to HTML string + result_doc.to_html + end + + private + + def self.parse_html(html) + # Parse HTML with fragment to avoid adding html/body tags + Nokogiri::HTML.fragment(html) + end + + def self.normalize_nodes(doc) + # Remove comments + doc.xpath('//comment()').remove + + # Normalize whitespace in text nodes + doc.xpath('//text()').each do |node| + unless node.parent && BLOCK_ELEMENTS.include?(node.parent.name) + node.content = node.content.gsub(/\s+/, ' ') + end + end + end + + def self.diff_trees(old_doc, new_doc) + # Create a working copy we can manipulate + result_doc = old_doc.dup + + # Identify changes at each level + compare_nodes(result_doc, new_doc) + + result_doc + end + + def self.compare_nodes(old_node, new_node) + # If node types differ, replace entire node + if old_node.type != new_node.type + replace_node(old_node, new_node) + return + end + + # Handle text nodes + if old_node.text? && new_node.text? + if old_node.content != new_node.content + old_parent = old_node.parent + + # Replace with delete and insert tags + del_node = Nokogiri::XML::Node.new('del', old_parent.document) + del_node.content = old_node.content + ins_node = Nokogiri::XML::Node.new('ins', old_parent.document) + ins_node.content = new_node.content + + old_node.replace(del_node) + del_node.add_next_sibling(ins_node) + end + return + end + + # For element nodes, compare attributes + if old_node.element? && new_node.element? + # If tag names are different, replace the whole node + if old_node.name != new_node.name + replace_node(old_node, new_node) + return + end + + # Compare attributes + # For simplicity, we're not marking attribute changes in this example + end + + # Compare children + compare_children(old_node, new_node) + end + + def self.compare_children(old_parent, new_parent) + old_children = old_parent.children.to_a + new_children = new_parent.children.to_a + + # Use an LCS-based diff to find matching children + # This is a simplified version that won't handle all cases + i = 0 + j = 0 + + while i < old_children.length && j < new_children.length + old_child = old_children[i] + new_child = new_children[j] + + if nodes_equal?(old_child, new_child) + # Nodes match, recursively compare their children + compare_nodes(old_child, new_child) + i += 1 + j += 1 + elsif j + 1 < new_children.length && nodes_equal?(old_child, new_children[j + 1]) + # Current new node is an insertion + insert_node(old_child, new_child, position: :before) + j += 1 + elsif i + 1 < old_children.length && nodes_equal?(old_children[i + 1], new_child) + # Current old node is a deletion + wrap_in_delete(old_child) + i += 1 + else + # No good match found, treat as replacement + replace_node(old_child, new_child) + i += 1 + j += 1 + end + end + + # Handle remaining old nodes (deletions) + while i < old_children.length + wrap_in_delete(old_children[i]) + i += 1 + end + + # Handle remaining new nodes (insertions) + while j < new_children.length + insert_node(old_parent, new_children[j], position: :append) + j += 1 + end + end + + def self.nodes_equal?(node1, node2) + return false unless node1 && node2 + + if node1.text? && node2.text? + return node1.content.strip == node2.content.strip + elsif node1.element? && node2.element? + return node1.name == node2.name + end + + false + end + + def self.replace_node(old_node, new_node) + old_parent = old_node.parent + return unless old_parent + + # Create delete and insert nodes + del_node = Nokogiri::XML::Node.new('del', old_parent.document) + ins_node = Nokogiri::XML::Node.new('ins', old_parent.document) + + # Clone the old and new nodes to put inside del/ins + del_content = old_node.dup + ins_content = old_parent.document.import(new_node.dup) + + del_node.add_child(del_content) + ins_node.add_child(ins_content) + + # Replace the old node with the del+ins pair + old_node.replace(del_node) + del_node.add_next_sibling(ins_node) + end + + def self.insert_node(reference_node, new_node, position: :after) + parent = reference_node.parent + return unless parent + + # Create the insert node + ins_node = Nokogiri::XML::Node.new('ins', parent.document) + ins_content = parent.document.import(new_node.dup) + ins_node.add_child(ins_content) + + case position + when :before + reference_node.add_previous_sibling(ins_node) + when :after + reference_node.add_next_sibling(ins_node) + when :append + parent.add_child(ins_node) + end + end + + def self.wrap_in_delete(node) + parent = node.parent + return unless parent + + # Create the delete node + del_node = Nokogiri::XML::Node.new('del', parent.document) + + # Remove the node from its parent and add it to the delete node + node_dup = node.dup + del_node.add_child(node_dup) + + # Replace the original node with the delete node + node.replace(del_node) + end + + def self.post_process(doc, options) + fix_block_elements(doc) + fix_table_structure(doc) + fix_list_structure(doc) + doc + end + + def self.fix_block_elements(doc) + # Fix cases where ins/del tags break block element structure + BLOCK_ELEMENTS.each do |tag| + # Look for del/ins tags that contain block elements + doc.css("del #{tag}, ins #{tag}").each do |node| + # For block elements inside ins/del, we may need to restructure + # to maintain valid HTML + parent_change = node.parent + + # Extract this node from the parent ins/del + parent_change.add_previous_sibling(node) + + # Wrap the extracted node in its own ins/del + new_wrapper = Nokogiri::XML::Node.new(parent_change.name, doc) + node.replace(new_wrapper) + new_wrapper.add_child(node) + end + end + end + + def self.fix_table_structure(doc) + # Fix invalid table structures + # This is a simplified version - the real implementation would be more complex + + # Remove ins/del directly inside table, tbody, thead, tfoot, tr + %w[table tbody thead tfoot tr].each do |tag| + doc.css("#{tag} > ins, #{tag} > del").each do |node| + # Move contents up, replacing the ins/del + parent = node.parent + node.children.each do |child| + node.add_previous_sibling(child) + end + node.remove + end + end + + # For td/th elements, move ins/del inside + doc.css('tr > ins > td, tr > del > td, tr > ins > th, tr > del > th').each do |cell| + change_tag = cell.parent + change_type = change_tag.name # ins or del + + # Move cell out of ins/del + change_tag.add_previous_sibling(cell) + change_tag.remove + + # Create a new ins/del inside the cell + new_change = Nokogiri::XML::Node.new(change_type, doc) + cell.children.each do |child| + new_change.add_child(child) + end + cell.add_child(new_change) + end + end + + def self.fix_list_structure(doc) + # Fix invalid list structures (li must be direct children of ul/ol) + doc.css('del > li, ins > li').each do |li| + change_tag = li.parent + change_type = change_tag.name # ins or del + + # Move li out of ins/del + if change_tag.parent && %w[ul ol].include?(change_tag.parent.name) + change_tag.add_previous_sibling(li) + + # Add a class to mark deleted list items + if change_type == 'del' + li['class'] = [li['class'], 'del-li'].compact.join(' ') + end + + # Create a new ins/del inside the li + new_change = Nokogiri::XML::Node.new(change_type, doc) + li.children.each do |child| + new_change.add_child(child.dup) + end + li.children.remove + li.add_child(new_change) + + change_tag.remove if change_tag.children.empty? + end + end + end + end +end diff --git a/spec/html_diff/dom_tokenizer_spec.rb b/spec/html_diff/dom_tokenizer_spec.rb new file mode 100644 index 0000000..8e1d838 --- /dev/null +++ b/spec/html_diff/dom_tokenizer_spec.rb @@ -0,0 +1,253 @@ +# frozen_string_literal: true + +require_relative '../spec_helper' + +RSpec.describe HTMLDiff::DomTokenizer do + describe '.tokenize' do + subject(:tokenizer) { described_class } + + it 'returns an empty array for empty string' do + expect(tokenizer.tokenize('')).to eq([]) + end + + it 'parses a simple text node' do + expect(tokenizer.tokenize('Hello World')).to eq(['Hello World']) + end + + it 'parses a single tag without attributes' do + html = '
Paragraph
' + expected = [ + ['p', {}, 'Paragraph'] + ] + expect(tokenizer.tokenize(html)).to eq(expected) + end + + it 'parses a single tag with attributes' do + html = 'Paragraph
' + expected = [ + ['p', { 'class' => 'intro', 'id' => 'first' }, ['Paragraph']] + ] + expect(tokenizer.tokenize(html)).to eq(expected) + end + + it 'parses nested tags' do + html = 'Paragraph
Paragraph
Text

Description
This is important text
' + expected = [ + ['p', {}, [ + ['This is '], + ['strong', {}, ['important']], + [' ', 'text'] + ]] + ] + expect(tokenizer.tokenize(html)).to eq(expected) + end + + it 'handles malformed HTML by raising an error' do + html = 'Unclosed paragraph tag
Spaced text
\nText
<div> is a block element & "p" is another
' + expected = [ + ['p', {}, ['This is a sample nested HTML document.
+We are a fictional company that specializes in web development.
+You can reach us at:
+ + info@example.comThis is some text
' + new_html = 'This is modified text
' + + result = described_class.diff(old_html, new_html) + + expect(result).to include('Lorem ipsum dolor sit amet. foo
' + new_html = <<~HTML +Lorem ipsum dolor sit amet.
+New paragraph
+And yet another new paragraph
+ HTML + + result = described_class.diff(old_html, new_html) + + # The diff should have valid HTML structure + expect(result).to include('Lorem ipsum dolor sit amet. foo
New paragraph
') + expect(result).to include('And yet another new paragraph
') + + # Validate with Nokogiri to ensure it's well-formed + expect { Nokogiri::HTML.fragment(result) }.not_to raise_error + end + end + + context 'with nested content' do + it 'preserves structure in nested elements' do + old_html = 'Old content
New content
OldNew content
| Cell 1 | Cell 2 |
| Cell 1 Updated | Cell 2 |
| ')
+ expect(result).to include(' New paragraph New paragraph Old paragraph Old paragraph Lorem ipsum dolor sit amet. foo ' + new_html = <<~HTML +Lorem ipsum dolor sit amet. +New paragraph +And yet another new paragraph + HTML + + result = described_class.diff(old_html, new_html) + + # Parse the HTML to ensure it's valid + doc = Nokogiri::HTML.fragment(result) + + # Verify it's not broken like the original example + expect(result).not_to include('') + expect(result).not_to include('') + + # The content should make sense structurally + expect(doc.css('p').size).to eq(3) + expect(doc.css('del').size).to eq(1) + expect(doc.css('ins').size).to eq(2) + end + end + end +end |