From 8ebe54fa735e9acd95be4c1cba1cbd50899e4d84 Mon Sep 17 00:00:00 2001
From: johnnyshields <27655+johnnyshields@users.noreply.github.com>
Date: Fri, 14 Mar 2025 13:59:06 +0900
Subject: [PATCH 1/2] Preserve DOM mode WIP

---
 htmldiff.gemspec                     |   1 +
 lib/html_diff.rb                     |  23 +-
 lib/html_diff/dom_tokenizer.rb       |  81 +++++++
 lib/html_diff/tree_differ.rb         | 313 +++++++++++++++++++++++++++
 spec/html_diff/dom_tokenizer_spec.rb | 118 ++++++++++
 spec/html_diff/tree_differ_spec.rb   | 135 ++++++++++++
 6 files changed, 665 insertions(+), 6 deletions(-)
 create mode 100644 lib/html_diff/dom_tokenizer.rb
 create mode 100644 lib/html_diff/tree_differ.rb
 create mode 100644 spec/html_diff/dom_tokenizer_spec.rb
 create mode 100644 spec/html_diff/tree_differ_spec.rb

diff --git a/htmldiff.gemspec b/htmldiff.gemspec
index 1e61eab..c5a508b 100644
--- a/htmldiff.gemspec
+++ b/htmldiff.gemspec
@@ -13,6 +13,7 @@ Gem::Specification.new do |spec|
   spec.license = 'MIT'
 
   spec.add_dependency 'diff-lcs'
+  spec.add_dependency 'nokogiri', '>= 1'
 
   spec.files = Dir.glob('lib/**/*') + %w[LICENSE README]
   spec.require_paths = ['lib']
diff --git a/lib/html_diff.rb b/lib/html_diff.rb
index f168ee0..8888bb9 100644
--- a/lib/html_diff.rb
+++ b/lib/html_diff.rb
@@ -1,7 +1,9 @@
 # frozen_string_literal: true
 
 require 'html_diff/tokenizer'
+require 'html_diff/dom_tokenizer'
 require 'html_diff/differ'
+require 'html_diff/tree_differ'
 require 'html_diff/html_formatter'
 require 'html_diff/version'
 require 'html_diff/diff_builder' # @deprecated
@@ -15,17 +17,26 @@ module HTMLDiff
   #
   # @param old_string [String] The original string
   # @param new_string [String] The new string
+  # @option :html_format [Hash] An optional hash of options to pass to the formatter.
+  # @option :preserve_dom [Boolean] Whether to preserve DOM structure in the diff
+  #   output, ensuring valid HTML when diffing content with block elements.
+  # @option :merge_threshold [Object] Maximum string length of unchanged tokens
+  #   to merge into neighboring changes. Value 0 merges only whitespace.
+  #   Negative values disable merging. Default value is 5.
   # @option :tokenizer [Object] An optional object which responds to `tokenize`,
   #   which is used break the input strings into an Array of LCS-diffable tokens.
-  # @option :format [Hash] An optional hash of options to pass to the formatter.
   # @option :formatter [Object] An optional object which responds to `format`,
   #   which renders the LCS-diff output.
-  # @option :merge_threshold [Object] Maximum string length of unchanged tokens
-  #   to merge into neighboring changes. Value 0 merges only whitespace.
-  #   Negative values disable merging. Default value is 5.
   # @return [String] Diff of the two strings with additions and deletions marked.
-  def diff(old_string, new_string, tokenizer: nil, html_format: nil, formatter: nil, merge_threshold: nil)
-    tokenizer ||= Tokenizer
+  def diff(old_string,
+           new_string,
+           html_format: nil,
+           preserve_dom: false,
+           merge_threshold: nil,
+           tokenizer: nil,
+           formatter: nil)
+
+    tokenizer ||= preserve_dom ? DomTokenizer : Tokenizer
     old_tokens = tokenizer.tokenize(old_string)
     new_tokens = tokenizer.tokenize(new_string)
 
diff --git a/lib/html_diff/dom_tokenizer.rb b/lib/html_diff/dom_tokenizer.rb
new file mode 100644
index 0000000..5fda695
--- /dev/null
+++ b/lib/html_diff/dom_tokenizer.rb
@@ -0,0 +1,81 @@
+# frozen_string_literal: true
+
+require 'nokogiri'
+
+module HTMLDiff
+  # Tokenizes HTML while preserving DOM structure
+  module DomTokenizer
+    extend self
+
+    # Tokenizes HTML in a DOM-aware way
+    #
+    # @param html [String] The HTML string to tokenize
+    # @return [Array] Nested array structure representing the DOM
+    def tokenize(html)
+      return [] if !html || html.empty?
+
+      html = html.encode('UTF-8', invalid: :replace, undef: :replace, replace: ' ')
+
+      # Parse the HTML
+      doc = Nokogiri::HTML(html)
+
+      result = []
+
+      # Add DOCTYPE if present
+      if doc.internal_subset
+        result << ['<!DOCTYPE', ' ', 'html>']
+      end
+
+      # Process the root html element
+      html_element = doc.at_css('html')
+      if html_element
+        result << tokenize_element(html_element)
+      end
+
+      result
+    end
+
+    private
+
+    # Tokenize an element into [name, attributes_hash, children_array]
+    def tokenize_element(element)
+      # Extract attributes
+      attrs = {}
+      element.attributes.each do |name, attr|
+        attrs[name] = attr.value
+      end
+
+      # Process children
+      children = []
+      element.children.each do |child|
+        if child.text?
+          # Tokenize text
+          tokens = tokenize_text(child.content)
+          children << tokens unless tokens.empty?
+        elsif child.element?
+          # Recursively tokenize element
+          children << tokenize_element(child)
+        end
+      end
+
+      [element.name, attrs, children]
+    end
+
+    # Tokenize text content
+    def tokenize_text(text)
+      return [] if text.strip.empty?
+
+      # TODO: Simplify. The split/rejoin below only collapses whitespace runs before delegating to Tokenizer.
+      result = []
+      words = text.split(/(\s+|\b|(?=[.,;:!?]))/)
+                  .reject(&:empty?)
+                  .map { |w| w =~ /\A\s+\z/ ? ' ' : w }
+
+      words.each do |word|
+        result << word
+      end
+
+      Tokenizer.tokenize(result.join(''))
+    end
+  end
+end
diff --git a/lib/html_diff/tree_differ.rb b/lib/html_diff/tree_differ.rb
new file mode 100644
index 0000000..bf3ceda
--- /dev/null
+++ b/lib/html_diff/tree_differ.rb
@@ -0,0 +1,313 @@
+# frozen_string_literal: true
+
+require 'nokogiri'
+
+module HTMLDiff
+  # A structure-aware HTML differ that preserves DOM integrity
+  class TreeDiffer
+    # Block elements that should be preserved as structural units
+    BLOCK_ELEMENTS = %w[
+      address article aside blockquote canvas dd div dl dt fieldset figcaption figure
+      footer form h1 h2 h3 h4 h5 h6 header hr li main nav ol p pre section
+      table tbody tfoot th thead tr ul video
+    ].freeze
+
+    # Generate an HTML diff between two HTML strings
+    #
+    # @param old_html [String] The original HTML
+    # @param new_html [String] The new HTML
+    # @param html_format [Hash] Hash of options for formatting the output
+    # @return [String] HTML string with changes marked
+    def self.diff(old_html, new_html, html_format = {})
+      diff_options = html_format || {}
+
+      # Parse HTML documents
+      old_doc = parse_html(old_html)
+      new_doc = parse_html(new_html)
+
+      # Normalize and prepare documents
+      normalize_nodes(old_doc)
+      normalize_nodes(new_doc)
+
+      # Generate diff
+      changes = diff_trees(old_doc, new_doc)
+
+      # Apply post-processing to ensure valid HTML
+      result_doc = post_process(changes, diff_options)
+
+      # Convert back to HTML string
+      result_doc.to_html
+    end
+
+    private
+
+    def self.parse_html(html)
+      # Parse HTML with fragment to avoid adding html/body tags
+      Nokogiri::HTML.fragment(html)
+    end
+
+    def self.normalize_nodes(doc)
+      # Remove comments
+      doc.xpath('//comment()').remove
+
+      # Normalize whitespace in text nodes
+      doc.xpath('//text()').each do |node|
+        unless node.parent && BLOCK_ELEMENTS.include?(node.parent.name)
+          node.content = node.content.gsub(/\s+/, ' ')
+        end
+      end
+    end
+
+    def self.diff_trees(old_doc, new_doc)
+      # Create a working copy we can manipulate
+      result_doc = old_doc.dup
+
+      # Identify changes at each level
+      compare_nodes(result_doc, new_doc)
+
+      result_doc
+    end
+
+    def self.compare_nodes(old_node, new_node)
+      # If node types differ, replace entire node
+      if old_node.type != new_node.type
+        replace_node(old_node, new_node)
+        return
+      end
+
+      # Handle text nodes
+      if old_node.text? && new_node.text?
+        if old_node.content != new_node.content
+          old_parent = old_node.parent
+
+          # Replace with delete and insert tags
+          del_node = Nokogiri::XML::Node.new('del', old_parent.document)
+          del_node.content = old_node.content
+          ins_node = Nokogiri::XML::Node.new('ins', old_parent.document)
+          ins_node.content = new_node.content
+
+          old_node.replace(del_node)
+          del_node.add_next_sibling(ins_node)
+        end
+        return
+      end
+
+      # For element nodes, compare tag names (attributes are not diffed yet)
+      if old_node.element? && new_node.element?
+        # If tag names are different, replace the whole node
+        if old_node.name != new_node.name
+          replace_node(old_node, new_node)
+          return
+        end
+
+        # Compare attributes
+        # For simplicity, we're not marking attribute changes in this example
+      end
+
+      # Compare children
+      compare_children(old_node, new_node)
+    end
+
+    def self.compare_children(old_parent, new_parent)
+      old_children = old_parent.children.to_a
+      new_children = new_parent.children.to_a
+
+      # Greedy two-pointer walk with one-node lookahead (not a true LCS diff)
+      # This is a simplified version that won't handle all cases
+      i = 0
+      j = 0
+
+      while i < old_children.length && j < new_children.length
+        old_child = old_children[i]
+        new_child = new_children[j]
+
+        if nodes_equal?(old_child, new_child)
+          # Nodes match, recursively compare their children
+          compare_nodes(old_child, new_child)
+          i += 1
+          j += 1
+        elsif j + 1 < new_children.length && nodes_equal?(old_child, new_children[j + 1])
+          # Current new node is an insertion
+          insert_node(old_child, new_child, position: :before)
+          j += 1
+        elsif i + 1 < old_children.length && nodes_equal?(old_children[i + 1], new_child)
+          # Current old node is a deletion
+          wrap_in_delete(old_child)
+          i += 1
+        else
+          # No good match found, treat as replacement
+          replace_node(old_child, new_child)
+          i += 1
+          j += 1
+        end
+      end
+
+      # Handle remaining old nodes (deletions)
+      while i < old_children.length
+        wrap_in_delete(old_children[i])
+        i += 1
+      end
+
+      # Handle remaining new nodes (insertions)
+      while j < new_children.length
+        insert_node(old_parent, new_children[j], position: :append)
+        j += 1
+      end
+    end
+
+    def self.nodes_equal?(node1, node2)
+      return false unless node1 && node2
+
+      if node1.text? && node2.text?
+        return node1.content.strip == node2.content.strip
+      elsif node1.element? && node2.element?
+        return node1.name == node2.name
+      end
+
+      false
+    end
+
+    def self.replace_node(old_node, new_node)
+      old_parent = old_node.parent
+      return unless old_parent
+
+      # Create delete and insert nodes
+      del_node = Nokogiri::XML::Node.new('del', old_parent.document)
+      ins_node = Nokogiri::XML::Node.new('ins', old_parent.document)
+
+      # Clone the old and new nodes to put inside del/ins
+      del_content = old_node.dup
+      ins_content = old_parent.document.import(new_node.dup)
+
+      del_node.add_child(del_content)
+      ins_node.add_child(ins_content)
+
+      # Replace the old node with the del+ins pair
+      old_node.replace(del_node)
+      del_node.add_next_sibling(ins_node)
+    end
+
+    def self.insert_node(reference_node, new_node, position: :after)
+      parent = reference_node.parent
+      return unless parent
+
+      # Create the insert node
+      ins_node = Nokogiri::XML::Node.new('ins', parent.document)
+      ins_content = parent.document.import(new_node.dup)
+      ins_node.add_child(ins_content)
+
+      case position
+      when :before
+        reference_node.add_previous_sibling(ins_node)
+      when :after
+        reference_node.add_next_sibling(ins_node)
+      when :append
+        parent.add_child(ins_node)
+      end
+    end
+
+    def self.wrap_in_delete(node)
+      parent = node.parent
+      return unless parent
+
+      # Create the delete node
+      del_node = Nokogiri::XML::Node.new('del', parent.document)
+
+      # Put a copy of the node inside the delete node (the original is replaced below)
+      node_dup = node.dup
+      del_node.add_child(node_dup)
+
+      # Replace the original node with the delete node
+      node.replace(del_node)
+    end
+
+    def self.post_process(doc, options)
+      fix_block_elements(doc)
+      fix_table_structure(doc)
+      fix_list_structure(doc)
+      doc
+    end
+
+    def self.fix_block_elements(doc)
+      # Fix cases where ins/del tags break block element structure
+      BLOCK_ELEMENTS.each do |tag|
+        # Look for del/ins tags that contain block elements
+        doc.css("del #{tag}, ins #{tag}").each do |node|
+          # For block elements inside ins/del, we may need to restructure
+          # to maintain valid HTML
+          parent_change = node.parent
+
+          # Extract this node from the parent ins/del
+          parent_change.add_previous_sibling(node)
+
+          # Wrap the extracted node in its own ins/del
+          new_wrapper = Nokogiri::XML::Node.new(parent_change.name, doc)
+          node.replace(new_wrapper)
+          new_wrapper.add_child(node)
+        end
+      end
+    end
+
+    def self.fix_table_structure(doc)
+      # Fix invalid table structures
+      # This is a simplified version - the real implementation would be more complex
+
+      # Remove ins/del directly inside table, tbody, thead, tfoot, tr
+      %w[table tbody thead tfoot tr].each do |tag|
+        doc.css("#{tag} > ins, #{tag} > del").each do |node|
+          # Move contents up, replacing the ins/del
+          parent = node.parent
+          node.children.each do |child|
+            node.add_previous_sibling(child)
+          end
+          node.remove
+        end
+      end
+
+      # For td/th elements, move ins/del inside
+      doc.css('tr > ins > td, tr > del > td, tr > ins > th, tr > del > th').each do |cell|
+        change_tag = cell.parent
+        change_type = change_tag.name # ins or del
+
+        # Move cell out of ins/del
+        change_tag.add_previous_sibling(cell)
+        change_tag.remove
+
+        # Create a new ins/del inside the cell
+        new_change = Nokogiri::XML::Node.new(change_type, doc)
+        cell.children.each do |child|
+          new_change.add_child(child)
+        end
+        cell.add_child(new_change)
+      end
+    end
+
+    def self.fix_list_structure(doc)
+      # Fix invalid list structures (li must be direct children of ul/ol)
+      doc.css('del > li, ins > li').each do |li|
+        change_tag = li.parent
+        change_type = change_tag.name # ins or del
+
+        # Move li out of ins/del
+        if change_tag.parent && %w[ul ol].include?(change_tag.parent.name)
+          change_tag.add_previous_sibling(li)
+
+          # Add a class to mark deleted list items
+          if change_type == 'del'
+            li['class'] = [li['class'], 'del-li'].compact.join(' ')
+          end
+
+          # Create a new ins/del inside the li
+          new_change = Nokogiri::XML::Node.new(change_type, doc)
+          li.children.each do |child|
+            new_change.add_child(child.dup)
+          end
+          li.children.remove
+          li.add_child(new_change)
+
+          change_tag.remove if change_tag.children.empty?
+        end
+      end
+    end
+  end
+end
diff --git a/spec/html_diff/dom_tokenizer_spec.rb b/spec/html_diff/dom_tokenizer_spec.rb
new file mode 100644
index 0000000..98a6f65
--- /dev/null
+++ b/spec/html_diff/dom_tokenizer_spec.rb
@@ -0,0 +1,118 @@
+# frozen_string_literal: true
+
+require_relative '../spec_helper'
+
+RSpec.describe HTMLDiff::DomTokenizer do
+  describe '.tokenize' do
+    let(:html) do
+      <<~HTML
+        <!DOCTYPE html>
+        <html lang="en">
+        <head>
+            <meta charset="UTF-8">
+            <meta name="viewport" content="width=device-width, initial-scale=1.0">
+            <title>My Sample Webpage</title>
+            <style>
+                body {
+                    font-family: Arial, sans-serif;
+                    margin: 0;
+                    padding: 20px;
+                }
+                .container {
+                    max-width: 800px;
+                    margin: 0 auto;
+                }
+                header {
+                    background-color: #f5f5f5;
+                    padding: 15px;
+                    border-radius: 5px;
+                }
+            </style>
+        </head>
+        <body>
+            <div class="container">
+                <header>
+                    <h1>Welcome to My Website</h1>
+                    <p>This is a sample nested HTML document.</p>
+                </header>
+                <main>
+                    <section>
+                        <h2>About Us</h2>
+                        <p>We are a <em>fictional</em> company that specializes in <strong>web development</strong>.</p>
+                        <ul>
+                            <li>HTML coding</li>
+                            <li>CSS styling</li>
+                            <li>JavaScript programming</li>
+                        </ul>
+                    </section>
+                    <section>
+                        <h2>Contact Information</h2>
+                        <p>You can reach us at:</p>
+                        <address>
+                            <a href="mailto:info@example.com">info@example.com</a><br>
+                            <a href="tel:+15551234567">+1 (555) 123-4567</a>
+                        </address>
+                    </section>
+                </main>
+                <footer>
+                    <p>&copy; 2025 My Sample Website. All rights reserved.</p>
+                </footer>
+            </div>
+        </body>
+        </html>
+      HTML
+    end
+
+    let(:expected) do
+      [
+        ['<!DOCTYPE', ' ', 'html>'],
+        ['html', {'lang' => 'en'}, [
+          ['head', {}, [
+            ['meta', {'charset' => 'UTF-8'}, []],
+            ['meta', {'name' => 'viewport', 'content' => 'width=device-width, initial-scale=1.0'}, []],
+            ['title', {}, [['My', ' ', 'Sample', ' ', 'Webpage']]],
+            ['style', {}, [['body', ' ', '{', ' ', 'font-family:', ' ', 'Arial,', ' ', 'sans-serif;', ' ', 'margin:', ' ', '0;', ' ', 'padding:', ' ', '20px;', ' ', '}', ' ',
+                            '.container', ' ', '{', ' ', 'max-width:', ' ', '800px;', ' ', 'margin:', ' ', '0', ' ', 'auto;', ' ', '}', ' ',
+                            'header', ' ', '{', ' ', 'background-color:', ' ', '#f5f5f5;', ' ', 'padding:', ' ', '15px;', ' ', 'border-radius:', ' ', '5px;', ' ', '}']]]
+          ]],
+          ['body', {}, [
+            ['div', {'class' => 'container'}, [
+              ['header', {}, [
+                ['h1', {}, [['Welcome', ' ', 'to', ' ', 'My', ' ', 'Website']]],
+                ['p', {}, [['This', ' ', 'is', ' ', 'a', ' ', 'sample', ' ', 'nested', ' ', 'HTML', ' ', 'document', '.']]]
+              ]],
+              ['main', {}, [
+                ['section', {}, [
+                  ['h2', {}, [['About', ' ', 'Us']]],
+                  ['p', {}, [['We', ' ', 'are', ' ', 'a', ' '], ['em', {}, [['fictional']]], [' ', 'company', ' ', 'that', ' ', 'specializes', ' ', 'in', ' '], ['strong', {}, [['web', ' ', 'development']], ['.']]]],
+                  ['ul', {}, [
+                    ['li', {}, [['HTML', ' ', 'coding']]],
+                    ['li', {}, [['CSS', ' ', 'styling']]],
+                    ['li', {}, [['JavaScript', ' ', 'programming']]]
+                  ]]
+                ]],
+                ['section', {}, [
+                  ['h2', {}, [['Contact', ' ', 'Information']]],
+                  ['p', {}, [['You', ' ', 'can', ' ', 'reach', ' ', 'us', ' ', 'at', ':']]],
+                  ['address', {}, [
+                    ['a', {'href' => 'mailto:info@example.com'}, [['info@example.com']]],
+                    ['br', {}, []],
+                    ['a', {'href' => 'tel:+15551234567'}, [["+1", " ", "(", "555", ")", " ", "123-4567"]]]
+                  ]]
+                ]]
+              ]],
+              ['footer', {}, [
+                ['p', {}, [['©', ' ', '2025', ' ', 'My', ' ', 'Sample', ' ', 'Website', '.', ' ', 'All', ' ', 'rights', ' ', 'reserved', '.']]]
+              ]]
+            ]]
+          ]]
+        ]]
+      ]
+    end
+
+    it 'tokenizes HTML correctly' do
+      tokens = described_class.tokenize(html)
+      expect(tokens).to eq(expected)
+    end
+  end
+end
diff --git a/spec/html_diff/tree_differ_spec.rb b/spec/html_diff/tree_differ_spec.rb
new file mode 100644
index 0000000..ac2e110
--- /dev/null
+++ b/spec/html_diff/tree_differ_spec.rb
@@ -0,0 +1,135 @@
+# frozen_string_literal: true
+
+require_relative '../spec_helper'
+
+RSpec.describe HTMLDiff::TreeDiffer do
+  describe '.diff' do
+    context 'with simple text changes' do
+      it 'marks text changes with ins and del tags' do
+        old_html = '<p>This is some text</p>'
+        new_html = '<p>This is modified text</p>'
+
+        result = described_class.diff(old_html, new_html)
+
+        expect(result).to include('<del>some</del>')
+        expect(result).to include('<ins>modified</ins>')
+      end
+    end
+
+    context 'with paragraph structure' do
+      it 'maintains valid HTML structure' do
+        old_html = '<p>Lorem ipsum dolor sit amet. foo</p>'
+        new_html = <<~HTML
+          <p>Lorem ipsum dolor sit amet.</p>
+          <p>New paragraph</p>
+          <p>And yet another new paragraph</p>
+        HTML
+
+        result = described_class.diff(old_html, new_html)
+
+        # The diff should have valid HTML structure
+        expect(result).to include('<p>Lorem ipsum dolor sit amet.<del> foo</del></p>')
+        expect(result).to include('<ins><p>New paragraph</p></ins>')
+        expect(result).to include('<ins><p>And yet another new paragraph</p></ins>')
+
+        # Validate with Nokogiri to ensure it's well-formed
+        expect { Nokogiri::HTML.fragment(result) }.not_to raise_error
+      end
+    end
+
+    context 'with nested content' do
+      it 'preserves structure in nested elements' do
+        old_html = '<div><p>Old content</p></div>'
+        new_html = '<div><p>New content</p></div>'
+
+        result = described_class.diff(old_html, new_html)
+
+        expect(result).to include('<div><p><del>Old</del><ins>New</ins> content</p></div>')
+      end
+    end
+
+    context 'with lists' do
+      it 'maintains valid list structure' do
+        old_html = '<ul><li>Item 1</li><li>Item 2</li></ul>'
+        new_html = '<ul><li>Item 1</li><li>Updated Item 2</li><li>Item 3</li></ul>'
+
+        result = described_class.diff(old_html, new_html)
+
+        # List structure should be preserved
+        expect(result).to include('<li>Item 1</li>')
+        expect(result).to include('<li><del>Item 2</del><ins>Updated Item 2</ins></li>')
+        expect(result).to include('<li><ins>Item 3</ins></li>')
+
+        # Validate with Nokogiri
+        doc = Nokogiri::HTML.fragment(result)
+        expect(doc.css('ul > li').size).to be >= 3
+      end
+    end
+
+    context 'with tables' do
+      it 'maintains valid table structure' do
+        old_html = '<table><tr><td>Cell 1</td><td>Cell 2</td></tr></table>'
+        new_html = '<table><tr><td>Cell 1 Updated</td><td>Cell 2</td></tr></table>'
+
+        result = described_class.diff(old_html, new_html)
+
+        # Table structure should be preserved
+        expect(result).to include('<table>')
+        expect(result).to include('<tr>')
+        expect(result).to include('<td>')
+        expect(result).to include('<del>Cell 1</del><ins>Cell 1 Updated</ins>')
+
+        # Validate with Nokogiri
+        doc = Nokogiri::HTML.fragment(result)
+        expect(doc.css('table > tr > td').size).to eq(2)
+      end
+    end
+
+    context 'with element addition' do
+      it 'shows added elements' do
+        old_html = '<div></div>'
+        new_html = '<div><p>New paragraph</p></div>'
+
+        result = described_class.diff(old_html, new_html)
+
+        expect(result).to include('<div><ins><p>New paragraph</p></ins></div>')
+      end
+    end
+
+    context 'with element removal' do
+      it 'shows deleted elements' do
+        old_html = '<div><p>Old paragraph</p></div>'
+        new_html = '<div></div>'
+
+        result = described_class.diff(old_html, new_html)
+
+        expect(result).to include('<div><del><p>Old paragraph</p></del></div>')
+      end
+    end
+
+    context 'with the problem case' do
+      it 'produces valid HTML for the original problem case' do
+        old_html = '<p>Lorem ipsum dolor sit amet. foo</p>'
+        new_html = <<~HTML
+          <p>Lorem ipsum dolor sit amet.</p>
+          <p>New paragraph</p>
+          <p>And yet another new paragraph</p>
+        HTML
+
+        result = described_class.diff(old_html, new_html)
+
+        # Parse the HTML to ensure it's valid
+        doc = Nokogiri::HTML.fragment(result)
+
+        # Verify it's not broken like the original example
+        expect(result).not_to include('</p><ins>')
+        expect(result).not_to include('</ins></p>')
+
+        # The content should make sense structurally
+        expect(doc.css('p').size).to eq(3)
+        expect(doc.css('del').size).to eq(1)
+        expect(doc.css('ins').size).to eq(2)
+      end
+    end
+  end
+end

From b437cec3c1f3035bccce2968bb54ba10767ed79d Mon Sep 17 00:00:00 2001
From: johnnyshields <27655+johnnyshields@users.noreply.github.com>
Date: Fri, 14 Mar 2025 20:20:29 +0900
Subject: [PATCH 2/2] More WIP

---
 lib/html_diff/dom_tokenizer.rb       | 196 +++++++++++++++++++++++
 spec/html_diff/dom_tokenizer_spec.rb | 227 +++++++++++++++++++++------
 2 files changed, 377 insertions(+), 46 deletions(-)

diff --git a/lib/html_diff/dom_tokenizer.rb b/lib/html_diff/dom_tokenizer.rb
index 5fda695..c7c98c6 100644
--- a/lib/html_diff/dom_tokenizer.rb
+++ b/lib/html_diff/dom_tokenizer.rb
@@ -79,3 +79,199 @@ def tokenize_text(text)
     end
   end
 end
+
+# frozen_string_literal: true
+
+require 'nokogiri'
+
+module HTMLDiff
+  # Tokenizes HTML while preserving DOM structure
+  class DomTokenizer
+    class ParseError < StandardError; end
+
+    def initialize(options = {})
+      @preserve_whitespace = options[:preserve_whitespace] || false
+    end
+
+    # Tokenizes HTML in a DOM-aware way
+    #
+    # @param html [String] The HTML string to tokenize
+    # @return [Array] Nested array structure representing the DOM
+    def self.tokenize(html, options = {})
+      new(options).tokenize(html)
+    end
+
+    def tokenize(html)
+      return [] if !html || html.empty?
+
+      begin
+        html = html.encode('UTF-8', invalid: :replace, undef: :replace, replace: ' ')
+
+        # Simple text case
+        if !html.include?('<') && !html.include?('>')
+          return [html]
+        end
+
+        # Parse the HTML
+        doc = Nokogiri::HTML(html, nil, 'UTF-8')
+
+        result = []
+
+        # Add DOCTYPE if present (and if in the original HTML)
+        if doc.internal_subset && html.include?('<!DOCTYPE')
+          result << ['<!DOCTYPE', ' ', 'html>']
+        end
+
+        # Check for malformed HTML
+        check_for_malformed_html(html)
+
+        # Special case for a single paragraph of plain text
+        if doc.at_css('body') &&
+          doc.at_css('body').children.size == 1 &&
+          doc.at_css('body').children.first.text? &&
+          !html.match(/<[^>]+>/)
+          return [doc.at_css('body').content]
+        end
+
+        # Process html element or direct children depending on the input
+        if html.strip.start_with?('<html') || html.include?('<!DOCTYPE')
+          html_node = doc.at_css('html')
+          result << process_node(html_node) if html_node
+        else
+          # Process all direct children
+          nodes = doc.css('body > *')
+
+          # If no nodes found in body, try document level
+          nodes = doc.children.reject { |n| n.name == 'html' } if nodes.empty?
+
+          nodes.each do |node|
+            token = process_node(node)
+            result << token if token
+          end
+        end
+
+        result
+      rescue Nokogiri::XML::SyntaxError => e
+        raise ParseError, "Error parsing HTML: #{e.message}"
+      rescue => e
+        raise ParseError, "Error processing HTML: #{e.message}"
+      end
+    end
+
+    private
+
+    def check_for_malformed_html(html)
+      # Simple check for unbalanced tags
+      opening_tags = html.scan(/<([a-zA-Z0-9]+)(?:\s+[^>]*)?(?!\/)>/i).flatten
+      closing_tags = html.scan(/<\/([a-zA-Z0-9]+)>/i).flatten
+
+      # Void elements never have a closing tag, so they are excluded from the balance count
+      void_elements = ['img', 'br', 'hr', 'meta', 'input', 'link', 'area', 'base', 'col', 'embed',
+                       'param', 'source', 'track', 'wbr']
+
+      # Count tags that need matching
+      tag_counts = Hash.new(0)
+
+      opening_tags.each do |tag|
+        tag_counts[tag.downcase] += 1 unless void_elements.include?(tag.downcase)
+      end
+
+      closing_tags.each do |tag|
+        tag_counts[tag.downcase] -= 1
+      end
+
+      # Check for any unbalanced tags
+      tag_counts.each do |tag, count|
+        if count != 0
+          raise ParseError, "Unbalanced tags: #{tag}"
+        end
+      end
+    end
+
+    def process_node(node)
+      case node.type
+      when Nokogiri::XML::Node::TEXT_NODE
+        process_text_node(node)
+      when Nokogiri::XML::Node::ELEMENT_NODE
+        process_element_node(node)
+      when Nokogiri::XML::Node::COMMENT_NODE
+        nil # Ignore comments
+      when Nokogiri::XML::Node::CDATA_SECTION_NODE
+        node.content # Return CDATA content as is
+      else
+        nil
+      end
+    end
+
+    def process_text_node(node)
+      content = node.content
+
+      # Skip empty text nodes unless preserving whitespace
+      return nil if !@preserve_whitespace && content.strip.empty?
+
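+      # Decode HTML entities. NOTE(review): node.content is presumably already decoded by Nokogiri; confirm this is not a double decode.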
+      content = decode_html_entities(content)
+
+      content
+    end
+
+    def decode_html_entities(text)
+      text.gsub(/&lt;/, '<')
+          .gsub(/&gt;/, '>')
+          .gsub(/&amp;/, '&')
+          .gsub(/&quot;/, '"')
+          .gsub(/&apos;/, "'")
+          .gsub(/&nbsp;/, ' ')
+          .gsub(/&#(\d+);/) { [$1.to_i].pack('U') }
+        .gsub(/&#x([0-9a-fA-F]+);/) { [$1.to_i(16)].pack('U') }
+    end
+
+    def process_element_node(node)
+      # Extract attributes
+      attrs = {}
+      node.attributes.each do |name, attr|
+        attrs[name] = attr.value
+      end
+
+      # Handle self-closing tags
+      if is_self_closing?(node)
+        return [node.name, attrs, nil]
+      end
+
+      # Process children
+      children = []
+      has_text = false
+      has_elements = false
+
+      node.children.each do |child|
+        if child.text? && (!child.content.strip.empty? || @preserve_whitespace)
+          has_text = true
+        elsif child.element?
+          has_elements = true
+        end
+
+        child_token = process_node(child)
+        children << child_token if child_token
+      end
+
+      # Return appropriate structure based on content type
+      if children.empty?
+        [node.name, attrs, []]
+      elsif has_text && has_elements
+        # Mixed content
+        [node.name, attrs, children]
+      elsif children.size == 1 && children.first.is_a?(String)
+        # Single text node child
+        [node.name, attrs, children.first]
+      else
+        # Multiple children or single element child
+        [node.name, attrs, children]
+      end
+    end
+
+    def is_self_closing?(node)
+      ['img', 'br', 'hr', 'meta', 'input', 'link', 'area', 'base', 'col', 'embed',
+       'param', 'source', 'track', 'wbr'].include?(node.name.downcase) && node.children.empty?
+    end
+  end
+end
diff --git a/spec/html_diff/dom_tokenizer_spec.rb b/spec/html_diff/dom_tokenizer_spec.rb
index 98a6f65..8e1d838 100644
--- a/spec/html_diff/dom_tokenizer_spec.rb
+++ b/spec/html_diff/dom_tokenizer_spec.rb
@@ -4,8 +4,142 @@
 
 RSpec.describe HTMLDiff::DomTokenizer do
   describe '.tokenize' do
-    let(:html) do
-      <<~HTML
+    subject(:tokenizer) { described_class }
+
+    it 'returns an empty array for empty string' do
+      expect(tokenizer.tokenize('')).to eq([]) # empty input is expected to yield no tokens
+    end
+
+    it 'parses a simple text node' do
+      expect(tokenizer.tokenize('Hello World')).to eq(['Hello World']) # bare text expected as one token
+    end
+
+    it 'parses a single tag without attributes' do
+      html = '<p>Paragraph</p>'
+      expected = [
+        ['p', {}, 'Paragraph']
+      ]
+      expect(tokenizer.tokenize(html)).to eq(expected)
+    end
+
+    it 'parses a single tag with attributes' do
+      html = '<p class="intro" id="first">Paragraph</p>'
+      expected = [
+        ['p', { 'class' => 'intro', 'id' => 'first' }, ['Paragraph']]
+      ]
+      expect(tokenizer.tokenize(html)).to eq(expected)
+    end
+
+    it 'parses nested tags' do
+      html = '<div><p>Paragraph</p></div>'
+      expected = [
+        ['div', {}, [
+          ['p', {}, ['Paragraph']]
+        ]]
+      ]
+      expect(tokenizer.tokenize(html)).to eq(expected)
+    end
+
+    it 'parses sibling tags' do
+      html = '<div><h1>Title</h1><p>Paragraph</p></div>'
+      expected = [
+        ['div', {}, [
+          ['h1', {}, ['Title']],
+          ['p', {}, ['Paragraph']]
+        ]]
+      ]
+      expect(tokenizer.tokenize(html)).to eq(expected)
+    end
+
+    it 'parses complex nested structure' do
+      html = '<section id="content"><header><h1>Title</h1></header><article><p>Text</p><ul><li>Item 1</li><li>Item 2</li></ul></article></section>'
+      expected = [
+        ['section', { 'id' => 'content' }, [
+          ['header', {}, [
+            ['h1', {}, ['Title']]
+          ]],
+          ['article', {}, [
+            ['p', {}, ['Text']],
+            ['ul', {}, [
+              ['li', {}, ['Item 1']],
+              ['li', {}, ['Item 2']]
+            ]]
+          ]]
+        ]]
+      ]
+      expect(tokenizer.tokenize(html)).to eq(expected)
+    end
+
+    it 'handles self-closing tags' do
+      html = '<div><img src="image.jpg" alt="Image"/><p>Description</p></div>'
+      expected = [
+        ['div', {}, [
+          ['img', { 'src' => 'image.jpg', 'alt' => 'Image' }, nil],
+          ['p', {}, ['Description']]
+        ]]
+      ]
+      expect(tokenizer.tokenize(html)).to eq(expected)
+    end
+
+    it 'handles tags with mixed content' do
+      html = '<p>This is <strong>important</strong> text</p>'
+      expected = [
+        ['p', {}, [
+          ['This is '],
+          ['strong', {}, ['important']],
+          [' ', 'text']
+        ]]
+      ]
+      expect(tokenizer.tokenize(html)).to eq(expected)
+    end
+
+    # An unclosed <p> is expected to be rejected with the tokenizer's ParseError.
+    it 'handles malformed HTML by raising an error' do
+      expect { tokenizer.tokenize('<div><p>Unclosed paragraph tag</div>') }.to raise_error(described_class::ParseError)
+    end
+
+    it 'preserves whitespace when specified' do
+      tokenizer = described_class.tokenize(preserve_whitespace: true)
+      html = '<div>\n  <p>  Spaced  text  </p>\n</div>'
+      expected = [
+        ['div', {}, [
+          ["\n", " ", " "],
+          ['p', {}, ' ', ' ', 'Spaced', ' ', ' ', 'text', ' ', ' '],
+          ["\n"]
+        ]]
+      ]
+      expect(tokenizer.tokenize(html)).to eq(expected)
+    end
+
+    it 'ignores comments' do
+      html = '<div><!-- This is a comment --><p>Text</p></div>'
+      expected = [
+        ['div', {}, [
+          ['p', {}, ['Text']]
+        ]]
+      ]
+      expect(tokenizer.tokenize(html)).to eq(expected)
+    end
+
+    it 'handles CDATA sections' do
+      html = '<div><![CDATA[<strong>This should not be parsed</strong>]]></div>'
+      expected = [
+        ['div', {}, '<strong>This should not be parsed</strong>']
+      ]
+      expect(tokenizer.tokenize(html)).to eq(expected)
+    end
+
+    it 'handles HTML entities' do
+      html = '<p>&lt;div&gt; is a block element &amp; &quot;p&quot; is another</p>'
+      expected = [
+        ['p', {}, ['<div>', ' ', 'is', ' ', 'a', ' ', 'block', ' ', 'element', ' ', '&', ' ', '"', 'p', '"', ' ', 'is', ' ', 'another']]
+      ]
+      expect(tokenizer.tokenize(html)).to eq(expected)
+    end
+
+    context 'complex case' do
+      let(:html) do
+        <<~HTML
         <!DOCTYPE html>
         <html lang="en">
         <head>
@@ -61,58 +195,59 @@
         </body>
         </html>
       HTML
-    end
+      end
 
-    let(:expected) do
-      [
-        ['<!DOCTYPE', ' ', 'html>'],
-        ['html', {'lang' => 'en'}, [
-          ['head', {}, [
-            ['meta', {'charset' => 'UTF-8'}, []],
-            ['meta', {'name' => 'viewport', 'content' => 'width=device-width, initial-scale=1.0'}, []],
-            ['title', {}, [['My', ' ', 'Sample', ' ', 'Webpage']]],
-            ['style', {}, [['body', ' ', '{', ' ', 'font-family:', ' ', 'Arial,', ' ', 'sans-serif;', ' ', 'margin:', ' ', '0;', ' ', 'padding:', ' ', '20px;', ' ', '}', ' ',
-                            '.container', ' ', '{', ' ', 'max-width:', ' ', '800px;', ' ', 'margin:', ' ', '0', ' ', 'auto;', ' ', '}', ' ',
-                            'header', ' ', '{', ' ', 'background-color:', ' ', '#f5f5f5;', ' ', 'padding:', ' ', '15px;', ' ', 'border-radius:', ' ', '5px;', ' ', '}']]]
-          ]],
-          ['body', {}, [
-            ['div', {'class' => 'container'}, [
-              ['header', {}, [
-                ['h1', {}, [['Welcome', ' ', 'to', ' ', 'My', ' ', 'Website']]],
-                ['p', {}, [['This', ' ', 'is', ' ', 'a', ' ', 'sample', ' ', 'nested', ' ', 'HTML', ' ', 'document', '.']]]
-              ]],
-              ['main', {}, [
-                ['section', {}, [
-                  ['h2', {}, [['About', ' ', 'Us']]],
-                  ['p', {}, [['We', ' ', 'are', ' ', 'a', ' '], ['em', {}, [['fictional']]], [' ', 'company', ' ', 'that', ' ', 'specializes', ' ', 'in', ' '], ['strong', {}, [['web', ' ', 'development']], ['.']]]],
-                  ['ul', {}, [
-                    ['li', {}, [['HTML', ' ', 'coding']]],
-                    ['li', {}, [['CSS', ' ', 'styling']]],
-                    ['li', {}, [['JavaScript', ' ', 'programming']]]
-                  ]]
+      let(:expected) do
+        [
+          ['<!DOCTYPE', ' ', 'html>'],
+          ['html', {'lang' => 'en'}, [
+            ['head', {}, [
+              ['meta', {'charset' => 'UTF-8'}, []],
+              ['meta', {'name' => 'viewport', 'content' => 'width=device-width, initial-scale=1.0'}, []],
+              ['title', {}, [['My', ' ', 'Sample', ' ', 'Webpage']]],
+              ['style', {}, [['body', ' ', '{', ' ', 'font-family:', ' ', 'Arial,', ' ', 'sans-serif;', ' ', 'margin:', ' ', '0;', ' ', 'padding:', ' ', '20px;', ' ', '}', ' ',
+                              '.container', ' ', '{', ' ', 'max-width:', ' ', '800px;', ' ', 'margin:', ' ', '0', ' ', 'auto;', ' ', '}', ' ',
+                              'header', ' ', '{', ' ', 'background-color:', ' ', '#f5f5f5;', ' ', 'padding:', ' ', '15px;', ' ', 'border-radius:', ' ', '5px;', ' ', '}']]]
+            ]],
+            ['body', {}, [
+              ['div', {'class' => 'container'}, [
+                ['header', {}, [
+                  ['h1', {}, [['Welcome', ' ', 'to', ' ', 'My', ' ', 'Website']]],
+                  ['p', {}, [['This', ' ', 'is', ' ', 'a', ' ', 'sample', ' ', 'nested', ' ', 'HTML', ' ', 'document', '.']]]
                 ]],
-                ['section', {}, [
-                  ['h2', {}, [['Contact', ' ', 'Information']]],
-                  ['p', {}, [['You', ' ', 'can', ' ', 'reach', ' ', 'us', ' ', 'at', ':']]],
-                  ['address', {}, [
-                    ['a', {'href' => 'mailto:info@example.com'}, [['info@example.com']]],
-                    ['br', {}, []],
-                    ['a', {'href' => 'tel:+15551234567'}, [["+1", " ", "(", "555", ")", " ", "123-4567"]]]
+                ['main', {}, [
+                  ['section', {}, [
+                    ['h2', {}, [['About', ' ', 'Us']]],
+                    ['p', {}, [['We', ' ', 'are', ' ', 'a', ' '], ['em', {}, [['fictional']]], [' ', 'company', ' ', 'that', ' ', 'specializes', ' ', 'in', ' '], ['strong', {}, [['web', ' ', 'development']], ['.']]]],
+                    ['ul', {}, [
+                      ['li', {}, [['HTML', ' ', 'coding']]],
+                      ['li', {}, [['CSS', ' ', 'styling']]],
+                      ['li', {}, [['JavaScript', ' ', 'programming']]]
+                    ]]
+                  ]],
+                  ['section', {}, [
+                    ['h2', {}, [['Contact', ' ', 'Information']]],
+                    ['p', {}, [['You', ' ', 'can', ' ', 'reach', ' ', 'us', ' ', 'at', ':']]],
+                    ['address', {}, [
+                      ['a', {'href' => 'mailto:info@example.com'}, [['info@example.com']]],
+                      ['br', {}, []],
+                      ['a', {'href' => 'tel:+15551234567'}, [["+1", " ", "(", "555", ")", " ", "123-4567"]]]
+                    ]]
                   ]]
+                ]],
+                ['footer', {}, [
+                  ['p', {}, [['©', ' ', '2025', ' ', 'My', ' ', 'Sample', ' ', 'Website', '.', ' ', 'All', ' ', 'rights', ' ', 'reserved', '.']]]
                 ]]
-              ]],
-              ['footer', {}, [
-                ['p', {}, [['©', ' ', '2025', ' ', 'My', ' ', 'Sample', ' ', 'Website', '.', ' ', 'All', ' ', 'rights', ' ', 'reserved', '.']]]
               ]]
             ]]
           ]]
-        ]]
-      ]
-    end
+        ]
+      end
 
-    it 'tokenizes HTML correctly' do
-      tokens = described_class.tokenize(html)
-      expect(tokens).to eq(expected)
+      it 'tokenizes HTML correctly' do
+        tokens = described_class.tokenize(html)
+        expect(tokens).to eq(expected)
+      end
     end
   end
 end