Text
- Item 1
- Item 2
From 8ebe54fa735e9acd95be4c1cba1cbd50899e4d84 Mon Sep 17 00:00:00 2001 From: johnnyshields <27655+johnnyshields@users.noreply.github.com> Date: Fri, 14 Mar 2025 13:59:06 +0900 Subject: [PATCH 1/2] Preserve DOM mode WIP --- htmldiff.gemspec | 1 + lib/html_diff.rb | 23 +- lib/html_diff/dom_tokenizer.rb | 81 +++++++ lib/html_diff/tree_differ.rb | 313 +++++++++++++++++++++++++++ spec/html_diff/dom_tokenizer_spec.rb | 118 ++++++++++ spec/html_diff/tree_differ_spec.rb | 135 ++++++++++++ 6 files changed, 665 insertions(+), 6 deletions(-) create mode 100644 lib/html_diff/dom_tokenizer.rb create mode 100644 lib/html_diff/tree_differ.rb create mode 100644 spec/html_diff/dom_tokenizer_spec.rb create mode 100644 spec/html_diff/tree_differ_spec.rb diff --git a/htmldiff.gemspec b/htmldiff.gemspec index 1e61eab..c5a508b 100644 --- a/htmldiff.gemspec +++ b/htmldiff.gemspec @@ -13,6 +13,7 @@ Gem::Specification.new do |spec| spec.license = 'MIT' spec.add_dependency 'diff-lcs' + spec.add_dependency 'nokogiri', '>= 1' spec.files = Dir.glob('lib/**/*') + %w[LICENSE README] spec.require_paths = ['lib'] diff --git a/lib/html_diff.rb b/lib/html_diff.rb index f168ee0..8888bb9 100644 --- a/lib/html_diff.rb +++ b/lib/html_diff.rb @@ -1,7 +1,9 @@ # frozen_string_literal: true require 'html_diff/tokenizer' +require 'html_diff/dom_tokenizer' require 'html_diff/differ' +require 'html_diff/tree_differ' require 'html_diff/html_formatter' require 'html_diff/version' require 'html_diff/diff_builder' # @deprecated @@ -15,17 +17,26 @@ module HTMLDiff # # @param old_string [String] The original string # @param new_string [String] The new string + # @option :html_format [Hash] An optional hash of options to pass to the formatter. + # @option :preserve_dom [Boolean] Whether to preserve DOM structure in the diff + # output, ensuring valid HTML when diffing content with block elements. + # @option :merge_threshold [Object] Maximum string length of unchanged tokens + # to merge into neighboring changes. Value 0 merges only whitespace. + # Negative values disable merging. Default value is 5. # @option :tokenizer [Object] An optional object which responds to `tokenize`, # which is used break the input strings into an Array of LCS-diffable tokens. - # @option :format [Hash] An optional hash of options to pass to the formatter. # @option :formatter [Object] An optional object which responds to `format`, # which renders the LCS-diff output. - # @option :merge_threshold [Object] Maximum string length of unchanged tokens - # to merge into neighboring changes. Value 0 merges only whitespace. - # Negative values disable merging. Default value is 5. # @return [String] Diff of the two strings with additions and deletions marked. - def diff(old_string, new_string, tokenizer: nil, html_format: nil, formatter: nil, merge_threshold: nil) - tokenizer ||= Tokenizer + def diff(old_string, + new_string, + html_format: nil, + preserve_dom: false, + merge_threshold: nil, + tokenizer: nil, + formatter: nil) + + tokenizer ||= preserve_dom ? DomTokenizer : Tokenizer old_tokens = tokenizer.tokenize(old_string) new_tokens = tokenizer.tokenize(new_string) diff --git a/lib/html_diff/dom_tokenizer.rb b/lib/html_diff/dom_tokenizer.rb new file mode 100644 index 0000000..5fda695 --- /dev/null +++ b/lib/html_diff/dom_tokenizer.rb @@ -0,0 +1,81 @@ +# frozen_string_literal: true + +require 'nokogiri' + +module HTMLDiff + # Tokenizes HTML while preserving DOM structure + module DomTokenizer + extend self + + # Tokenizes HTML in a DOM-aware way + # + # @param html [String] The HTML string to tokenize + # @return [Array] Nested array structure representing the DOM + def tokenize(html) + return [] if !html || html.empty? + + html = html.encode('UTF-8', invalid: :replace, undef: :replace, replace: ' ') + + # Parse the HTML + doc = Nokogiri::HTML(html) + + result = [] + + # Add DOCTYPE if present + if doc.internal_subset + result << [''] + end + + # Process the root html element + html_element = doc.at_css('html') + if html_element + result << tokenize_element(html_element) + end + + result + end + + private + + # Tokenize an element into [name, attributes_hash, children_array] + def tokenize_element(element) + # Extract attributes + attrs = {} + element.attributes.each do |name, attr| + attrs[name] = attr.value + end + + # Process children + children = [] + element.children.each do |child| + if child.text? + # Tokenize text + tokens = tokenize_text(child.content) + children << tokens unless tokens.empty? + elsif child.element? + # Recursively tokenize element + children << tokenize_element(child) + end + end + + [element.name, attrs, children] + end + + # Tokenize text content + def tokenize_text(text) + return [] if text.strip.empty? + + # TODO: This junk needs to be fixed. + result = [] + words = text.split(/(\s+|\b|(?=[.,;:!?]))/) + .reject(&:empty?) + .map { |w| w =~ /\A\s+\z/ ? ' ' : w } + + words.each do |word| + result << word + end + + Tokenizer.tokenize(result.join('')) + end + end +end diff --git a/lib/html_diff/tree_differ.rb b/lib/html_diff/tree_differ.rb new file mode 100644 index 0000000..bf3ceda --- /dev/null +++ b/lib/html_diff/tree_differ.rb @@ -0,0 +1,313 @@ +# frozen_string_literal: true + +require 'nokogiri' + +module HTMLDiff + # A structure-aware HTML differ that preserves DOM integrity + class TreeDiffer + # Block elements that should be preserved as structural units + BLOCK_ELEMENTS = %w[ + address article aside blockquote canvas dd div dl dt fieldset figcaption figure + footer form h1 h2 h3 h4 h5 h6 header hr li main nav ol p pre section + table tbody tfoot th thead tr ul video + ].freeze + + # Generate an HTML diff between two HTML strings + # + # @param old_html [String] The original HTML + # @param new_html [String] The new HTML + # @param html_format [Hash] Hash of options for formatting the output + # @return [String] HTML string with changes marked + def self.diff(old_html, new_html, html_format = {}) + diff_options = html_format || {} + + # Parse HTML documents + old_doc = parse_html(old_html) + new_doc = parse_html(new_html) + + # Normalize and prepare documents + normalize_nodes(old_doc) + normalize_nodes(new_doc) + + # Generate diff + changes = diff_trees(old_doc, new_doc) + + # Apply post-processing to ensure valid HTML + result_doc = post_process(changes, diff_options) + + # Convert back to HTML string + result_doc.to_html + end + + private + + def self.parse_html(html) + # Parse HTML with fragment to avoid adding html/body tags + Nokogiri::HTML.fragment(html) + end + + def self.normalize_nodes(doc) + # Remove comments + doc.xpath('//comment()').remove + + # Normalize whitespace in text nodes + doc.xpath('//text()').each do |node| + unless node.parent && BLOCK_ELEMENTS.include?(node.parent.name) + node.content = node.content.gsub(/\s+/, ' ') + end + end + end + + def self.diff_trees(old_doc, new_doc) + # Create a working copy we can manipulate + result_doc = old_doc.dup + + # Identify changes at each level + compare_nodes(result_doc, new_doc) + + result_doc + end + + def self.compare_nodes(old_node, new_node) + # If node types differ, replace entire node + if old_node.type != new_node.type + replace_node(old_node, new_node) + return + end + + # Handle text nodes + if old_node.text? && new_node.text? + if old_node.content != new_node.content + old_parent = old_node.parent + + # Replace with delete and insert tags + del_node = Nokogiri::XML::Node.new('del', old_parent.document) + del_node.content = old_node.content + ins_node = Nokogiri::XML::Node.new('ins', old_parent.document) + ins_node.content = new_node.content + + old_node.replace(del_node) + del_node.add_next_sibling(ins_node) + end + return + end + + # For element nodes, compare attributes + if old_node.element? && new_node.element? + # If tag names are different, replace the whole node + if old_node.name != new_node.name + replace_node(old_node, new_node) + return + end + + # Compare attributes + # For simplicity, we're not marking attribute changes in this example + end + + # Compare children + compare_children(old_node, new_node) + end + + def self.compare_children(old_parent, new_parent) + old_children = old_parent.children.to_a + new_children = new_parent.children.to_a + + # Use an LCS-based diff to find matching children + # This is a simplified version that won't handle all cases + i = 0 + j = 0 + + while i < old_children.length && j < new_children.length + old_child = old_children[i] + new_child = new_children[j] + + if nodes_equal?(old_child, new_child) + # Nodes match, recursively compare their children + compare_nodes(old_child, new_child) + i += 1 + j += 1 + elsif j + 1 < new_children.length && nodes_equal?(old_child, new_children[j + 1]) + # Current new node is an insertion + insert_node(old_child, new_child, position: :before) + j += 1 + elsif i + 1 < old_children.length && nodes_equal?(old_children[i + 1], new_child) + # Current old node is a deletion + wrap_in_delete(old_child) + i += 1 + else + # No good match found, treat as replacement + replace_node(old_child, new_child) + i += 1 + j += 1 + end + end + + # Handle remaining old nodes (deletions) + while i < old_children.length + wrap_in_delete(old_children[i]) + i += 1 + end + + # Handle remaining new nodes (insertions) + while j < new_children.length + insert_node(old_parent, new_children[j], position: :append) + j += 1 + end + end + + def self.nodes_equal?(node1, node2) + return false unless node1 && node2 + + if node1.text? && node2.text? + return node1.content.strip == node2.content.strip + elsif node1.element? && node2.element? + return node1.name == node2.name + end + + false + end + + def self.replace_node(old_node, new_node) + old_parent = old_node.parent + return unless old_parent + + # Create delete and insert nodes + del_node = Nokogiri::XML::Node.new('del', old_parent.document) + ins_node = Nokogiri::XML::Node.new('ins', old_parent.document) + + # Clone the old and new nodes to put inside del/ins + del_content = old_node.dup + ins_content = old_parent.document.import(new_node.dup) + + del_node.add_child(del_content) + ins_node.add_child(ins_content) + + # Replace the old node with the del+ins pair + old_node.replace(del_node) + del_node.add_next_sibling(ins_node) + end + + def self.insert_node(reference_node, new_node, position: :after) + parent = reference_node.parent + return unless parent + + # Create the insert node + ins_node = Nokogiri::XML::Node.new('ins', parent.document) + ins_content = parent.document.import(new_node.dup) + ins_node.add_child(ins_content) + + case position + when :before + reference_node.add_previous_sibling(ins_node) + when :after + reference_node.add_next_sibling(ins_node) + when :append + parent.add_child(ins_node) + end + end + + def self.wrap_in_delete(node) + parent = node.parent + return unless parent + + # Create the delete node + del_node = Nokogiri::XML::Node.new('del', parent.document) + + # Remove the node from its parent and add it to the delete node + node_dup = node.dup + del_node.add_child(node_dup) + + # Replace the original node with the delete node + node.replace(del_node) + end + + def self.post_process(doc, options) + fix_block_elements(doc) + fix_table_structure(doc) + fix_list_structure(doc) + doc + end + + def self.fix_block_elements(doc) + # Fix cases where ins/del tags break block element structure + BLOCK_ELEMENTS.each do |tag| + # Look for del/ins tags that contain block elements + doc.css("del #{tag}, ins #{tag}").each do |node| + # For block elements inside ins/del, we may need to restructure + # to maintain valid HTML + parent_change = node.parent + + # Extract this node from the parent ins/del + parent_change.add_previous_sibling(node) + + # Wrap the extracted node in its own ins/del + new_wrapper = Nokogiri::XML::Node.new(parent_change.name, doc) + node.replace(new_wrapper) + new_wrapper.add_child(node) + end + end + end + + def self.fix_table_structure(doc) + # Fix invalid table structures + # This is a simplified version - the real implementation would be more complex + + # Remove ins/del directly inside table, tbody, thead, tfoot, tr + %w[table tbody thead tfoot tr].each do |tag| + doc.css("#{tag} > ins, #{tag} > del").each do |node| + # Move contents up, replacing the ins/del + parent = node.parent + node.children.each do |child| + node.add_previous_sibling(child) + end + node.remove + end + end + + # For td/th elements, move ins/del inside + doc.css('tr > ins > td, tr > del > td, tr > ins > th, tr > del > th').each do |cell| + change_tag = cell.parent + change_type = change_tag.name # ins or del + + # Move cell out of ins/del + change_tag.add_previous_sibling(cell) + change_tag.remove + + # Create a new ins/del inside the cell + new_change = Nokogiri::XML::Node.new(change_type, doc) + cell.children.each do |child| + new_change.add_child(child) + end + cell.add_child(new_change) + end + end + + def self.fix_list_structure(doc) + # Fix invalid list structures (li must be direct children of ul/ol) + doc.css('del > li, ins > li').each do |li| + change_tag = li.parent + change_type = change_tag.name # ins or del + + # Move li out of ins/del + if change_tag.parent && %w[ul ol].include?(change_tag.parent.name) + change_tag.add_previous_sibling(li) + + # Add a class to mark deleted list items + if change_type == 'del' + li['class'] = [li['class'], 'del-li'].compact.join(' ') + end + + # Create a new ins/del inside the li + new_change = Nokogiri::XML::Node.new(change_type, doc) + li.children.each do |child| + new_change.add_child(child.dup) + end + li.children.remove + li.add_child(new_change) + + change_tag.remove if change_tag.children.empty? + end + end + end + end +end diff --git a/spec/html_diff/dom_tokenizer_spec.rb b/spec/html_diff/dom_tokenizer_spec.rb new file mode 100644 index 0000000..98a6f65 --- /dev/null +++ b/spec/html_diff/dom_tokenizer_spec.rb @@ -0,0 +1,118 @@ +# frozen_string_literal: true + +require_relative '../spec_helper' + +RSpec.describe HTMLDiff::DomTokenizer do + describe '.tokenize' do + let(:html) do + <<~HTML + + +
+ + +This is a sample nested HTML document.
+We are a fictional company that specializes in web development.
+You can reach us at:
+ + info@example.comThis is some text
' + new_html = 'This is modified text
' + + result = described_class.diff(old_html, new_html) + + expect(result).to include('Lorem ipsum dolor sit amet. foo
' + new_html = <<~HTML +Lorem ipsum dolor sit amet.
+New paragraph
+And yet another new paragraph
+ HTML + + result = described_class.diff(old_html, new_html) + + # The diff should have valid HTML structure + expect(result).to include('Lorem ipsum dolor sit amet. foo
New paragraph
') + expect(result).to include('And yet another new paragraph
') + + # Validate with Nokogiri to ensure it's well-formed + expect { Nokogiri::HTML.fragment(result) }.not_to raise_error + end + end + + context 'with nested content' do + it 'preserves structure in nested elements' do + old_html = 'Old content
New content
OldNew content
| Cell 1 | Cell 2 |
| Cell 1 Updated | Cell 2 |
| ')
+ expect(result).to include(' New paragraph New paragraph Old paragraph Old paragraph Lorem ipsum dolor sit amet. foo ' + new_html = <<~HTML +Lorem ipsum dolor sit amet. +New paragraph +And yet another new paragraph + HTML + + result = described_class.diff(old_html, new_html) + + # Parse the HTML to ensure it's valid + doc = Nokogiri::HTML.fragment(result) + + # Verify it's not broken like the original example + expect(result).not_to include('') + expect(result).not_to include('') + + # The content should make sense structurally + expect(doc.css('p').size).to eq(3) + expect(doc.css('del').size).to eq(1) + expect(doc.css('ins').size).to eq(2) + end + end + end +end From b437cec3c1f3035bccce2968bb54ba10767ed79d Mon Sep 17 00:00:00 2001 From: johnnyshields <27655+johnnyshields@users.noreply.github.com> Date: Fri, 14 Mar 2025 20:20:29 +0900 Subject: [PATCH 2/2] More WIP --- lib/html_diff/dom_tokenizer.rb | 196 +++++++++++++++++++++++ spec/html_diff/dom_tokenizer_spec.rb | 227 +++++++++++++++++++++------ 2 files changed, 377 insertions(+), 46 deletions(-) diff --git a/lib/html_diff/dom_tokenizer.rb b/lib/html_diff/dom_tokenizer.rb index 5fda695..c7c98c6 100644 --- a/lib/html_diff/dom_tokenizer.rb +++ b/lib/html_diff/dom_tokenizer.rb @@ -79,3 +79,199 @@ def tokenize_text(text) end end end + +# frozen_string_literal: true + +require 'nokogiri' + +module HTMLDiff + # Tokenizes HTML while preserving DOM structure + class DomTokenizer + class ParseError < StandardError; end + + def initialize(options = {}) + @preserve_whitespace = options[:preserve_whitespace] || false + end + + # Tokenizes HTML in a DOM-aware way + # + # @param html [String] The HTML string to tokenize + # @return [Array] Nested array structure representing the DOM + def self.tokenize(html, options = {}) + new(options).tokenize(html) + end + + def tokenize(html) + return [] if !html || html.empty? + + begin + html = html.encode('UTF-8', invalid: :replace, undef: :replace, replace: ' ') + + # Simple text case + if !html.include?('<') && !html.include?('>') + return [html] + end + + # Parse the HTML + doc = Nokogiri::HTML(html, nil, 'UTF-8') + + result = [] + + # Add DOCTYPE if present (and if in the original HTML) + if doc.internal_subset && html.include?(''] + end + + # Check for malformed HTML + check_for_malformed_html(html) + + # Special case for a single paragaph of plain text + if doc.at_css('body') && + doc.at_css('body').children.size == 1 && + doc.at_css('body').children.first.text? && + !html.match(/<[^>]+>/) + return [doc.at_css('body').content] + end + + # Process html element or direct children depending on the input + if html.strip.start_with?(' *') + + # If no nodes found in body, try document level + nodes = doc.children.reject { |n| n.name == 'html' } if nodes.empty? + + nodes.each do |node| + token = process_node(node) + result << token if token + end + end + + result + rescue Nokogiri::XML::SyntaxError => e + raise ParseError, "Error parsing HTML: #{e.message}" + rescue => e + raise ParseError, "Error processing HTML: #{e.message}" + end + end + + private + + def check_for_malformed_html(html) + # Simple check for unbalanced tags + opening_tags = html.scan(/<([a-zA-Z0-9]+)(?:\s+[^>]*)?(?!\/)>/i).flatten + closing_tags = html.scan(/<\/([a-zA-Z0-9]+)>/i).flatten + + # Identify self-closing tags + void_elements = ['img', 'br', 'hr', 'meta', 'input', 'link', 'area', 'base', 'col', 'embed', + 'param', 'source', 'track', 'wbr'] + + # Count tags that need matching + tag_counts = Hash.new(0) + + opening_tags.each do |tag| + tag_counts[tag.downcase] += 1 unless void_elements.include?(tag.downcase) + end + + closing_tags.each do |tag| + tag_counts[tag.downcase] -= 1 + end + + # Check for any unbalanced tags + tag_counts.each do |tag, count| + if count != 0 + raise ParseError, "Unbalanced tags: #{tag}" + end + end + end + + def process_node(node) + case node.type + when Nokogiri::XML::Node::TEXT_NODE + process_text_node(node) + when Nokogiri::XML::Node::ELEMENT_NODE + process_element_node(node) + when Nokogiri::XML::Node::COMMENT_NODE + nil # Ignore comments + when Nokogiri::XML::Node::CDATA_SECTION_NODE + node.content # Return CDATA content as is + else + nil + end + end + + def process_text_node(node) + content = node.content + + # Skip empty text nodes unless preserving whitespace + return nil if !@preserve_whitespace && content.strip.empty? + + # Decode HTML entities + content = decode_html_entities(content) + + content + end + + def decode_html_entities(text) + text.gsub(/</, '<') + .gsub(/>/, '>') + .gsub(/&/, '&') + .gsub(/"/, '"') + .gsub(/'/, "'") + .gsub(/ /, ' ') + .gsub(/(\d+);/) { [$1.to_i].pack('U') } + .gsub(/([0-9a-fA-F]+);/) { [$1.to_i(16)].pack('U') } + end + + def process_element_node(node) + # Extract attributes + attrs = {} + node.attributes.each do |name, attr| + attrs[name] = attr.value + end + + # Handle self-closing tags + if is_self_closing?(node) + return [node.name, attrs, nil] + end + + # Process children + children = [] + has_text = false + has_elements = false + + node.children.each do |child| + if child.text? && (!child.content.strip.empty? || @preserve_whitespace) + has_text = true + elsif child.element? + has_elements = true + end + + child_token = process_node(child) + children << child_token if child_token + end + + # Return appropriate structure based on content type + if children.empty? + [node.name, attrs, []] + elsif has_text && has_elements + # Mixed content + [node.name, attrs, children] + elsif children.size == 1 && children.first.is_a?(String) + # Single text node child + [node.name, attrs, children.first] + else + # Multiple children or single element child + [node.name, attrs, children] + end + end + + def is_self_closing?(node) + ['img', 'br', 'hr', 'meta', 'input', 'link', 'area', 'base', 'col', 'embed', + 'param', 'source', 'track', 'wbr'].include?(node.name.downcase) && node.children.empty? + end + end +end diff --git a/spec/html_diff/dom_tokenizer_spec.rb b/spec/html_diff/dom_tokenizer_spec.rb index 98a6f65..8e1d838 100644 --- a/spec/html_diff/dom_tokenizer_spec.rb +++ b/spec/html_diff/dom_tokenizer_spec.rb @@ -4,8 +4,142 @@ RSpec.describe HTMLDiff::DomTokenizer do describe '.tokenize' do - let(:html) do - <<~HTML + subject(:tokenizer) { described_class } + + it 'returns an empty array for empty string' do + expect(tokenizer.tokenize('')).to eq([]) + end + + it 'parses a simple text node' do + expect(tokenizer.tokenize('Hello World')).to eq(['Hello World']) + end + + it 'parses a single tag without attributes' do + html = 'Paragraph ' + expected = [ + ['p', {}, 'Paragraph'] + ] + expect(tokenizer.tokenize(html)).to eq(expected) + end + + it 'parses a single tag with attributes' do + html = 'Paragraph ' + expected = [ + ['p', { 'class' => 'intro', 'id' => 'first' }, ['Paragraph']] + ] + expect(tokenizer.tokenize(html)).to eq(expected) + end + + it 'parses nested tags' do + html = 'Paragraph TitleParagraph TitleText
![]() Description This is important text ' + expected = [ + ['p', {}, [ + ['This is '], + ['strong', {}, ['important']], + [' ', 'text'] + ]] + ] + expect(tokenizer.tokenize(html)).to eq(expected) + end + + it 'handles malformed HTML by raising an error' do + html = 'Unclosed paragraph tag \n '
+ expected = [
+ ['div', {}, [
+ ["\n", " ", " "],
+ ['p', {}, ' ', ' ', 'Spaced', ' ', ' ', 'text', ' ', ' '],
+ ["\n"]
+ ]]
+ ]
+ expect(tokenizer.tokenize(html)).to eq(expected)
+ end
+
+ it 'ignores comments' do
+ html = 'Spaced text \nText This should not be parsed]]> '
+ expected = [
+ ['div', {}, 'This should not be parsed']
+ ]
+ expect(tokenizer.tokenize(html)).to eq(expected)
+ end
+
+ it 'handles HTML entities' do
+ html = '<div> is a block element & "p" is another ' + expected = [ + ['p', {}, ['', ' ', 'is', ' ', 'a', ' ', 'block', ' ', 'element', ' ', '&', ' ', '"', 'p', '"', ' ', 'is', ' ', 'another']]
+ ]
+ expect(tokenizer.tokenize(html)).to eq(expected)
+ end
+
+ context 'complex case' do
+ let(:html) do
+ <<~HTML
@@ -61,58 +195,59 @@
HTML
- end
+ end
- let(:expected) do
- [
- [''],
- ['html', {'lang' => 'en'}, [
- ['head', {}, [
- ['meta', {'charset' => 'UTF-8'}, []],
- ['meta', {'name' => 'viewport', 'content' => 'width=device-width, initial-scale=1.0'}, []],
- ['title', {}, [['My', ' ', 'Sample', ' ', 'Webpage']]],
- ['style', {}, [['body', ' ', '{', ' ', 'font-family:', ' ', 'Arial,', ' ', 'sans-serif;', ' ', 'margin:', ' ', '0;', ' ', 'padding:', ' ', '20px;', ' ', '}', ' ',
- '.container', ' ', '{', ' ', 'max-width:', ' ', '800px;', ' ', 'margin:', ' ', '0', ' ', 'auto;', ' ', '}', ' ',
- 'header', ' ', '{', ' ', 'background-color:', ' ', '#f5f5f5;', ' ', 'padding:', ' ', '15px;', ' ', 'border-radius:', ' ', '5px;', ' ', '}']]]
- ]],
- ['body', {}, [
- ['div', {'class' => 'container'}, [
- ['header', {}, [
- ['h1', {}, [['Welcome', ' ', 'to', ' ', 'My', ' ', 'Website']]],
- ['p', {}, [['This', ' ', 'is', ' ', 'a', ' ', 'sample', ' ', 'nested', ' ', 'HTML', ' ', 'document', '.']]]
- ]],
- ['main', {}, [
- ['section', {}, [
- ['h2', {}, [['About', ' ', 'Us']]],
- ['p', {}, [['We', ' ', 'are', ' ', 'a', ' '], ['em', {}, [['fictional']]], [' ', 'company', ' ', 'that', ' ', 'specializes', ' ', 'in', ' '], ['strong', {}, [['web', ' ', 'development']], ['.']]]],
- ['ul', {}, [
- ['li', {}, [['HTML', ' ', 'coding']]],
- ['li', {}, [['CSS', ' ', 'styling']]],
- ['li', {}, [['JavaScript', ' ', 'programming']]]
- ]]
+ let(:expected) do
+ [
+ [''],
+ ['html', {'lang' => 'en'}, [
+ ['head', {}, [
+ ['meta', {'charset' => 'UTF-8'}, []],
+ ['meta', {'name' => 'viewport', 'content' => 'width=device-width, initial-scale=1.0'}, []],
+ ['title', {}, [['My', ' ', 'Sample', ' ', 'Webpage']]],
+ ['style', {}, [['body', ' ', '{', ' ', 'font-family:', ' ', 'Arial,', ' ', 'sans-serif;', ' ', 'margin:', ' ', '0;', ' ', 'padding:', ' ', '20px;', ' ', '}', ' ',
+ '.container', ' ', '{', ' ', 'max-width:', ' ', '800px;', ' ', 'margin:', ' ', '0', ' ', 'auto;', ' ', '}', ' ',
+ 'header', ' ', '{', ' ', 'background-color:', ' ', '#f5f5f5;', ' ', 'padding:', ' ', '15px;', ' ', 'border-radius:', ' ', '5px;', ' ', '}']]]
+ ]],
+ ['body', {}, [
+ ['div', {'class' => 'container'}, [
+ ['header', {}, [
+ ['h1', {}, [['Welcome', ' ', 'to', ' ', 'My', ' ', 'Website']]],
+ ['p', {}, [['This', ' ', 'is', ' ', 'a', ' ', 'sample', ' ', 'nested', ' ', 'HTML', ' ', 'document', '.']]]
]],
- ['section', {}, [
- ['h2', {}, [['Contact', ' ', 'Information']]],
- ['p', {}, [['You', ' ', 'can', ' ', 'reach', ' ', 'us', ' ', 'at', ':']]],
- ['address', {}, [
- ['a', {'href' => 'mailto:info@example.com'}, [['info@example.com']]],
- ['br', {}, []],
- ['a', {'href' => 'tel:+15551234567'}, [["+1", " ", "(", "555", ")", " ", "123-4567"]]]
+ ['main', {}, [
+ ['section', {}, [
+ ['h2', {}, [['About', ' ', 'Us']]],
+ ['p', {}, [['We', ' ', 'are', ' ', 'a', ' '], ['em', {}, [['fictional']]], [' ', 'company', ' ', 'that', ' ', 'specializes', ' ', 'in', ' '], ['strong', {}, [['web', ' ', 'development']], ['.']]]],
+ ['ul', {}, [
+ ['li', {}, [['HTML', ' ', 'coding']]],
+ ['li', {}, [['CSS', ' ', 'styling']]],
+ ['li', {}, [['JavaScript', ' ', 'programming']]]
+ ]]
+ ]],
+ ['section', {}, [
+ ['h2', {}, [['Contact', ' ', 'Information']]],
+ ['p', {}, [['You', ' ', 'can', ' ', 'reach', ' ', 'us', ' ', 'at', ':']]],
+ ['address', {}, [
+ ['a', {'href' => 'mailto:info@example.com'}, [['info@example.com']]],
+ ['br', {}, []],
+ ['a', {'href' => 'tel:+15551234567'}, [["+1", " ", "(", "555", ")", " ", "123-4567"]]]
+ ]]
]]
+ ]],
+ ['footer', {}, [
+ ['p', {}, [['©', ' ', '2025', ' ', 'My', ' ', 'Sample', ' ', 'Website', '.', ' ', 'All', ' ', 'rights', ' ', 'reserved', '.']]]
]]
- ]],
- ['footer', {}, [
- ['p', {}, [['©', ' ', '2025', ' ', 'My', ' ', 'Sample', ' ', 'Website', '.', ' ', 'All', ' ', 'rights', ' ', 'reserved', '.']]]
]]
]]
]]
- ]]
- ]
- end
+ ]
+ end
- it 'tokenizes HTML correctly' do
- tokens = described_class.tokenize(html)
- expect(tokens).to eq(expected)
+ it 'tokenizes HTML correctly' do
+ tokens = described_class.tokenize(html)
+ expect(tokens).to eq(expected)
+ end
end
end
end
|