From e8e77db274e19375e6215dd121dd599cb2795ffe Mon Sep 17 00:00:00 2001 From: Matthew Stadelman Date: Fri, 26 May 2017 05:26:18 -0400 Subject: [PATCH 01/12] Support arbitrary node attributes on paragraphs (#51) * Pass node diretly into Paragraph.new * Pass definition directly into ListParagraph.new * Change argument order for Paragraph.new and ListParagraph.new * Refactor some of the logic in ast_next_paragraph into a prepare_node fucntion This function wil later on ensure all nodes have a stype attribute even if it is an empty string to avoid nil's popping up. * Use node['class'] instead of directly passing the Paragraph style in * Change node['class'] to node['pStyle'] This will work better once I get some of the attribute logic working. * Implement simple parsing of attributes * Implement parsing of nested attributes This also removes the special case of ListParagraph * Remove ListParagraph class * Refactor list testing methods to handle direct usage of pPr to store list props * Fix comment in ast.rb * Refactor pattern to_docx usage * Rename prepare_node to prepare_paragraph * Return properties from prepare_paragraph and pass onto Paragraph initialization * Remove passing of node into Parapgrah, only properities is needed * Rename Node.process_attributes into Node.process_properties It also uses the @properties isntance var instead of @attributes * Fix html converter test to use new instance var * Rewrite transform_attr to work on arrays with nested hashes This replaces the much more fragile RegExp logic in place before. 
* Change numPr property defintion to be a nested list of hashes This was done to match the new interface expected by transform_attr * Rubocop fix, curly bracket spacing * Update converter test to check nested property structure instead of using regexp * Fix syntax error leftover from rebase accidently killed off an `end` tag * Refactor Node.process_properties method * Slight change in default handling of node properties when value is a scalar * slight refactor of Pararaph.ppr_docx and Paragraph.to_docx * Move logic to handle properties from Node into a NodeProperties class * Minor refactor to better handle proper hashes that have keys with nil value * Implement unit tets fot NodeProperties class * Refactor to pass tagname in during instantiation so to_docx call signature is consistent * Split up node properties test method into smaller tests cases * Add paragraph factory method to node properties --- lib/sablon/html/ast.rb | 97 +++++++++++++++++++++--------------- lib/sablon/html/converter.rb | 56 ++++++++++++--------- test/html/converter_test.rb | 94 ++++++++++++++++++++++++++++++++-- 3 files changed, 180 insertions(+), 67 deletions(-) diff --git a/lib/sablon/html/ast.rb b/lib/sablon/html/ast.rb index 6327da91..b558dd35 100644 --- a/lib/sablon/html/ast.rb +++ b/lib/sablon/html/ast.rb @@ -10,6 +10,58 @@ def self.node_name end end + class NodeProperties + def self.paragraph(properties) + NodeProperties.new('w:pPr', properties) + end + + def initialize(tagname, properties) + @tagname = tagname + @properties = properties + end + + def inspect + @properties.map { |k, v| v ? "#{k}=#{v}" : k }.join(';') + end + + def [](key) + @properties[key] + end + + def []=(key, value) + @properties[key] = value + end + + def to_docx + "<#{@tagname}>#{process}" unless @properties.empty? 
+ end + + private + + # processes attributes defined on the node into wordML property syntax + def process + @properties.map { |k, v| transform_attr(k, v) }.join + end + + # properties that have a list as the value get nested in tags and + # each entry in the list is transformed. When a value is a hash the + # keys in the hash are used to explicitly buld the XML tag attributes. + def transform_attr(key, value) + if value.is_a? Array + sub_attrs = value.map do |sub_prop| + sub_prop.map { |k, v| transform_attr(k, v) } + end + "#{sub_attrs.join}" + elsif value.is_a? Hash + props = value.map { |k, v| format('w:%s="%s"', k, v) if v } + "" + else + value = format('w:val="%s" ', value) if value + "" + end + end + end + class Collection < Node attr_reader :nodes def initialize(nodes) @@ -45,23 +97,14 @@ def inspect end class Paragraph < Node - attr_accessor :style, :runs - def initialize(style, runs) - @style, @runs = style, runs + attr_accessor :runs + def initialize(properties, runs) + @properties = NodeProperties.paragraph(properties) + @runs = runs end - PATTERN = <<-XML.gsub("\n", "") - - - -%s - -%s - -XML - def to_docx - PATTERN % [style, ppr_docx, runs.to_docx] + "#{@properties.to_docx}#{runs.to_docx}" end def accept(visitor) @@ -70,31 +113,7 @@ def accept(visitor) end def inspect - "" - end - - private - def ppr_docx - end - end - - class ListParagraph < Paragraph - LIST_STYLE = <<-XML.gsub("\n", "") - - - - -XML - attr_accessor :numid, :ilvl - def initialize(style, runs, numid, ilvl) - super style, runs - @numid = numid - @ilvl = ilvl - end - - private - def ppr_docx - LIST_STYLE % [@ilvl, numid] + "" end end diff --git a/lib/sablon/html/converter.rb b/lib/sablon/html/converter.rb index 05cc48b0..534c1601 100644 --- a/lib/sablon/html/converter.rb +++ b/lib/sablon/html/converter.rb @@ -91,37 +91,47 @@ def initialize @numbering = nil end + # Adds the appropriate style class to the node + def prepare_paragraph(node) + # set default styles based on node name + styles 
= { 'div' => 'Normal', 'p' => 'Paragraph', 'h' => 'Heading', + 'ul' => 'ListBullet', 'ol' => 'ListNumber' } + styles['li'] = @definition.style if @definition + + # set the node class attribute based on the style, num allows h1,h2,.. + tag, num = node.name.match(/([a-z]+)(\d*)/)[1..2] + unless styles[tag] + raise ArgumentError, "Don't know how to handle node: #{node.inspect}" + end + # + properties = {} + properties['pStyle'] = styles[tag] + num + properties + end + def ast_next_paragraph node = @builder.next - if node.name == 'div' - @builder.new_layer - @builder.emit Paragraph.new('Normal', ast_text(node.children)) - elsif node.name == 'p' - @builder.new_layer - @builder.emit Paragraph.new('Paragraph', ast_text(node.children)) - elsif node.name =~ /h(\d+)/ - @builder.new_layer - @builder.emit Paragraph.new("Heading#{$1}", ast_text(node.children)) - elsif node.name == 'ul' - @builder.new_layer ilvl: true - unless @builder.nested? - @definition = @numbering.register('ListBullet') - end - @builder.push_all(node.children) - elsif node.name == 'ol' + return if node.text? + + properties = prepare_paragraph(node) + + # handle special cases + if node.name =~ /ul|ol/ @builder.new_layer ilvl: true unless @builder.nested? - @definition = @numbering.register('ListNumber') + @definition = @numbering.register(properties['pStyle']) end @builder.push_all(node.children) + return elsif node.name == 'li' - @builder.new_layer - @builder.emit ListParagraph.new(@definition.style, ast_text(node.children), @definition.numid, @builder.ilvl) - elsif node.text? - # SKIP? 
- else - raise ArgumentError, "Don't know how to handle node: #{node.inspect}" + properties['numPr'] = [ + { 'ilvl' => @builder.ilvl }, { 'numId' => @definition.numid } + ] end + + # create word_ml node + @builder.new_layer + @builder.emit Paragraph.new(properties, ast_text(node.children)) end def ast_text(nodes, format: TextFormat.default) diff --git a/test/html/converter_test.rb b/test/html/converter_test.rb index d2d619c5..15a6a30d 100644 --- a/test/html/converter_test.rb +++ b/test/html/converter_test.rb @@ -328,6 +328,79 @@ def setup @converter.instance_variable_set(:@numbering, Sablon::Environment.new(nil).numbering) end + def test_empty_node_properties_converison + # test empty properties + props = Sablon::HTMLConverter::NodeProperties.new('w:pPr', {}) + assert props.inspect == '' + assert props.to_docx.nil? + end + + def test_simple_node_property_converison + props = { 'pStyle' => 'Paragraph' } + props = Sablon::HTMLConverter::NodeProperties.new('w:pPr', props) + assert props.inspect == 'pStyle=Paragraph' + assert props.to_docx == '' + end + + def test_node_property_with_nil_value_converison + props = { 'b' => nil } + props = Sablon::HTMLConverter::NodeProperties.new('w:rPr', props) + assert props.inspect == 'b' + assert props.to_docx == '' + end + + def test_node_property_with_hash_value_converison + props = { 'shd' => { color: 'clear', fill: '123456', test: nil } } + props = Sablon::HTMLConverter::NodeProperties.new('w:rPr', props) + assert props.inspect == 'shd={:color=>"clear", :fill=>"123456", :test=>nil}' + assert props.to_docx == '' + end + + def test_node_property_with_array_value_converison + props = { 'numPr' => [{ 'ilvl' => 1 }, { 'numId' => 34 }] } + props = Sablon::HTMLConverter::NodeProperties.new('w:pPr', props) + assert props.inspect == 'numPr=[{"ilvl"=>1}, {"numId"=>34}]' + assert props.to_docx == '' + end + + def test_complex_node_properties_conversion + props = { + 'top1' => 'val1', + 'top2' => [ + { 'mid0' => nil }, + { 'mid1' => [ + { 
'bottom1' => { key1: 'abc' } }, + { 'bottom2' => 'xyz' } + ] }, + { 'mid2' => 'val2' } + ], + 'top3' => { key1: 1, key2: '2', key3: nil, key4: true, key5: false } + } + output = <<-DOCX.gsub(/^\s*/, '').delete("\n") + + + + + + + + + + + + + DOCX + props = Sablon::HTMLConverter::NodeProperties.new('w:pPr', props) + assert props.to_docx == output + end + + def test_node_properties_paragraph_factory + props = { 'pStyle' => 'Paragraph' } + props = Sablon::HTMLConverter::NodeProperties.paragraph(props) + assert props.inspect == 'pStyle=Paragraph' + assert props.to_docx == '' + end + def test_div input = '
Lorem ipsum dolor sit amet
' ast = @converter.processed_ast(input) @@ -408,19 +481,30 @@ def test_ol def test_num_id ast = @converter.processed_ast('
  1. Some
  2. Lorem
  • ipsum
  1. dolor
  2. sit
') - assert_equal [1001, 1001, 1002, 1003, 1003], ast.grep(Sablon::HTMLConverter::ListParagraph).map(&:numid) + assert_equal [1001, 1001, 1002, 1003, 1003], get_numpr_prop_from_ast(ast, 'numId') end def test_nested_lists_have_the_same_numid ast = @converter.processed_ast('
  • Lorem
    • ipsum
      • dolor
') - assert_equal [1001, 1001, 1001], ast.grep(Sablon::HTMLConverter::ListParagraph).map(&:numid) + assert_equal [1001, 1001, 1001], get_numpr_prop_from_ast(ast, 'numId') end def test_keep_nested_list_order input = '
  • 1
    • 1.1
      • 1.1.1
    • 1.2
  • 2
    • 1.3
      • 1.3.1
' ast = @converter.processed_ast(input) - list_p = ast.grep(Sablon::HTMLConverter::ListParagraph) - assert_equal [1001], list_p.map(&:numid).uniq - assert_equal [0, 1, 2, 1, 0, 1, 2], list_p.map(&:ilvl) + assert_equal [1001], get_numpr_prop_from_ast(ast, 'numId').uniq + assert_equal [0, 1, 2, 1, 0, 1, 2], get_numpr_prop_from_ast(ast, 'ilvl') + end + + private + + # returns the numid attribute from paragraphs + def get_numpr_prop_from_ast(ast, key) + values = [] + ast.grep(Sablon::HTMLConverter::Paragraph).each do |para| + numpr = para.instance_variable_get('@properties')['numPr'] + numpr.each { |val| values.push(val[key]) if val[key] } + end + values end end From ae2c1613ae5029642aad4aa1f5e941e8678c00f6 Mon Sep 17 00:00:00 2001 From: Yves Senn Date: Fri, 26 May 2017 11:28:09 +0200 Subject: [PATCH 02/12] refactor, put NodeProperties into separate test case. --- test/html/converter_test.rb | 148 ++++++++++++++++++------------------ 1 file changed, 75 insertions(+), 73 deletions(-) diff --git a/test/html/converter_test.rb b/test/html/converter_test.rb index 15a6a30d..bc68f3dc 100644 --- a/test/html/converter_test.rb +++ b/test/html/converter_test.rb @@ -328,79 +328,6 @@ def setup @converter.instance_variable_set(:@numbering, Sablon::Environment.new(nil).numbering) end - def test_empty_node_properties_converison - # test empty properties - props = Sablon::HTMLConverter::NodeProperties.new('w:pPr', {}) - assert props.inspect == '' - assert props.to_docx.nil? 
- end - - def test_simple_node_property_converison - props = { 'pStyle' => 'Paragraph' } - props = Sablon::HTMLConverter::NodeProperties.new('w:pPr', props) - assert props.inspect == 'pStyle=Paragraph' - assert props.to_docx == '' - end - - def test_node_property_with_nil_value_converison - props = { 'b' => nil } - props = Sablon::HTMLConverter::NodeProperties.new('w:rPr', props) - assert props.inspect == 'b' - assert props.to_docx == '' - end - - def test_node_property_with_hash_value_converison - props = { 'shd' => { color: 'clear', fill: '123456', test: nil } } - props = Sablon::HTMLConverter::NodeProperties.new('w:rPr', props) - assert props.inspect == 'shd={:color=>"clear", :fill=>"123456", :test=>nil}' - assert props.to_docx == '' - end - - def test_node_property_with_array_value_converison - props = { 'numPr' => [{ 'ilvl' => 1 }, { 'numId' => 34 }] } - props = Sablon::HTMLConverter::NodeProperties.new('w:pPr', props) - assert props.inspect == 'numPr=[{"ilvl"=>1}, {"numId"=>34}]' - assert props.to_docx == '' - end - - def test_complex_node_properties_conversion - props = { - 'top1' => 'val1', - 'top2' => [ - { 'mid0' => nil }, - { 'mid1' => [ - { 'bottom1' => { key1: 'abc' } }, - { 'bottom2' => 'xyz' } - ] }, - { 'mid2' => 'val2' } - ], - 'top3' => { key1: 1, key2: '2', key3: nil, key4: true, key5: false } - } - output = <<-DOCX.gsub(/^\s*/, '').delete("\n") - - - - - - - - - - - - - DOCX - props = Sablon::HTMLConverter::NodeProperties.new('w:pPr', props) - assert props.to_docx == output - end - - def test_node_properties_paragraph_factory - props = { 'pStyle' => 'Paragraph' } - props = Sablon::HTMLConverter::NodeProperties.paragraph(props) - assert props.inspect == 'pStyle=Paragraph' - assert props.to_docx == '' - end - def test_div input = '
Lorem ipsum dolor sit amet
' ast = @converter.processed_ast(input) @@ -508,3 +435,78 @@ def get_numpr_prop_from_ast(ast, key) values end end + +class NodePropertiesTest < Sablon::TestCase + def test_empty_node_properties_converison + # test empty properties + props = Sablon::HTMLConverter::NodeProperties.new('w:pPr', {}) + assert props.inspect == '' + assert props.to_docx.nil? + end + + def test_simple_node_property_converison + props = { 'pStyle' => 'Paragraph' } + props = Sablon::HTMLConverter::NodeProperties.new('w:pPr', props) + assert props.inspect == 'pStyle=Paragraph' + assert props.to_docx == '' + end + + def test_node_property_with_nil_value_converison + props = { 'b' => nil } + props = Sablon::HTMLConverter::NodeProperties.new('w:rPr', props) + assert props.inspect == 'b' + assert props.to_docx == '' + end + + def test_node_property_with_hash_value_converison + props = { 'shd' => { color: 'clear', fill: '123456', test: nil } } + props = Sablon::HTMLConverter::NodeProperties.new('w:rPr', props) + assert props.inspect == 'shd={:color=>"clear", :fill=>"123456", :test=>nil}' + assert props.to_docx == '' + end + + def test_node_property_with_array_value_converison + props = { 'numPr' => [{ 'ilvl' => 1 }, { 'numId' => 34 }] } + props = Sablon::HTMLConverter::NodeProperties.new('w:pPr', props) + assert props.inspect == 'numPr=[{"ilvl"=>1}, {"numId"=>34}]' + assert props.to_docx == '' + end + + def test_complex_node_properties_conversion + props = { + 'top1' => 'val1', + 'top2' => [ + { 'mid0' => nil }, + { 'mid1' => [ + { 'bottom1' => { key1: 'abc' } }, + { 'bottom2' => 'xyz' } + ] }, + { 'mid2' => 'val2' } + ], + 'top3' => { key1: 1, key2: '2', key3: nil, key4: true, key5: false } + } + output = <<-DOCX.gsub(/^\s*/, '').delete("\n") + + + + + + + + + + + + + DOCX + props = Sablon::HTMLConverter::NodeProperties.new('w:pPr', props) + assert props.to_docx == output + end + + def test_node_properties_paragraph_factory + props = { 'pStyle' => 'Paragraph' } + props = 
Sablon::HTMLConverter::NodeProperties.paragraph(props) + assert props.inspect == 'pStyle=Paragraph' + assert props.to_docx == '' + end +end From 6ca2454818688c2cf94bd3ec1b302d4b15cbcffc Mon Sep 17 00:00:00 2001 From: Yves Senn Date: Fri, 26 May 2017 11:28:43 +0200 Subject: [PATCH 03/12] refactor, no need to explicitly reference self. --- lib/sablon/html/ast.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/sablon/html/ast.rb b/lib/sablon/html/ast.rb index b558dd35..7556a9e8 100644 --- a/lib/sablon/html/ast.rb +++ b/lib/sablon/html/ast.rb @@ -12,7 +12,7 @@ def self.node_name class NodeProperties def self.paragraph(properties) - NodeProperties.new('w:pPr', properties) + new('w:pPr', properties) end def initialize(tagname, properties) From 3bfba2d9f554877efb3f1af2d708392ce843a896 Mon Sep 17 00:00:00 2001 From: Berin Larson Date: Thu, 6 Jul 2017 01:22:14 +0530 Subject: [PATCH 04/12] =?UTF-8?q?Fixes=20#56.=20Expression=20containing=20?= =?UTF-8?q?the=20string=20=E2=80=98comment=E2=80=99=20will=20no=20longer?= =?UTF-8?q?=20match=20the=20beginning=20of=20comment=20block.=20(#58)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- lib/sablon/processor/document.rb | 2 +- .../xml/comment_block_and_comment_as_key.xml | 31 +++++++++++++++++++ test/processor/document_test.rb | 15 +++++++++ 3 files changed, 47 insertions(+), 1 deletion(-) create mode 100644 test/fixtures/xml/comment_block_and_comment_as_key.xml diff --git a/lib/sablon/processor/document.rb b/lib/sablon/processor/document.rb index cdb4b67b..5aa6bad6 100644 --- a/lib/sablon/processor/document.rb +++ b/lib/sablon/processor/document.rb @@ -171,7 +171,7 @@ def consume(allow_insertion) when /([^ ]+):if/ block = consume_block("#{$1}:endIf") Statement::Condition.new(Expression.parse($1), block) - when /comment/ + when /^comment$/ block = consume_block("endComment") Statement::Comment.new(block) end diff --git 
a/test/fixtures/xml/comment_block_and_comment_as_key.xml b/test/fixtures/xml/comment_block_and_comment_as_key.xml new file mode 100644 index 00000000..0eae8a5d --- /dev/null +++ b/test/fixtures/xml/comment_block_and_comment_as_key.xml @@ -0,0 +1,31 @@ +Before + + + + + «comment» + + + + + + Inside Comment! + + + + + + + «endComment» + + + + + + + + «=comment» + + + +After \ No newline at end of file diff --git a/test/processor/document_test.rb b/test/processor/document_test.rb index ea2d277c..cc56c9f4 100644 --- a/test/processor/document_test.rb +++ b/test/processor/document_test.rb @@ -440,6 +440,21 @@ def test_comment assert_equal "Before After", text(result) end + def test_comment_block_and_comment_as_key + result = process(snippet("comment_block_and_comment_as_key"), {comment: 'Contents of comment key'}) + + assert_xml_equal <<-document, result + Before + After + + + + Contents of comment key + + + document + end + private def process(document, context) From 91b775a1b4d879de944a65c268f5dfe980190791 Mon Sep 17 00:00:00 2001 From: Matthew Stadelman Date: Thu, 20 Jul 2017 15:24:46 -0400 Subject: [PATCH 05/12] Support style attribute (#55) * Pass node diretly into Paragraph.new * Pass definition directly into ListParagraph.new * Change argument order for Paragraph.new and ListParagraph.new * Refactor some of the logic in ast_next_paragraph into a prepare_node fucntion This function wil later on ensure all nodes have a stype attribute even if it is an empty string to avoid nil's popping up. * Use node['class'] instead of directly passing the Paragraph style in * Change node['class'] to node['pStyle'] This will work better once I get some of the attribute logic working. 
* Implement simple parsing of attributes * Implement parsing of nested attributes This also removes the special case of ListParagraph * Remove ListParagraph class * Refactor list testing methods to handle direct usage of pPr to store list props * Fix comment in ast.rb * Refactor pattern to_docx usage * Rename prepare_node to prepare_paragraph * Return properties from prepare_paragraph and pass onto Paragraph initialization * Remove passing of node into Parapgrah, only properities is needed * Rename Node.process_attributes into Node.process_properties It also uses the @properties isntance var instead of @attributes * Fix html converter test to use new instance var * Rewrite transform_attr to work on arrays with nested hashes This replaces the much more fragile RegExp logic in place before. * Change numPr property defintion to be a nested list of hashes This was done to match the new interface expected by transform_attr * Rubocop fix, curly bracket spacing * Update converter test to check nested property structure instead of using regexp * Fix syntax error leftover from rebase accidently killed off an `end` tag * Refactor Node.process_properties method * Slight change in default handling of node properties when value is a scalar * slight refactor of Pararaph.ppr_docx and Paragraph.to_docx * Squashed merge of support-style-attribute to add in changes without rebasing. Rebasing the support-style-attribute branch against the changes to support-arbitrary-node-attributes isn't practical due to it being rebased against 'master'. Merging instead is the cleaner option. The commits were squashed to prevent an odd history being generated since this is a branch of the support-arbitrary-node-attributes branch. 
Squashed commit of the following: commit d11797e7aa9fd19c077a2fc5298ede899c70e41c Author: Matthew Stadelman Date: Mon May 15 10:34:28 2017 -0400 Add method to pass parent attributes onto children, only used for lists currently commit d5be869b86c49a11d29abf1607a0a4dc7428e36f Author: Matthew Stadelman Date: Fri May 12 11:15:12 2017 -0400 Fix error in coverting CSS -> XML when processing a property hash of attributes The ending / was being omitted from the property tag commit 8a308fc780a753fb8cde174fead1243f25cbf8c3 Author: Matthew Stadelman Date: Fri May 12 10:28:53 2017 -0400 Change background-color to map to shd instead of highlight commit f19ee3248d6ccf9f13e60a22d9f7dcb06b8b6917 Author: Matthew Stadelman Date: Fri May 12 08:37:06 2017 -0400 Add support for text-decoration style Only underline and line-through are supported commit 78e13cd70d5c36f0b2e700e160f57cc35a1b9ae2 Author: Matthew Stadelman Date: Fri May 12 08:32:06 2017 -0400 Implement support for a hash of values to directly set XML node attributes in styles commit e8225b08fec31b14dd2d32734225e5c491b17ecd Author: Matthew Stadelman Date: Thu May 11 15:04:10 2017 -0400 Remove empty equals signs when value is nil commit e6f6ac703fe1833957cd5a4f56b01cc45b3a8bad Author: Matthew Stadelman Date: Thu May 11 14:55:29 2017 -0400 Add a separate class for testing each style. I haven't added all of the tests yet, I'll see how senny feels about storing this all in config and if so, what do we want in our default setup. 
Although I feel like full support of common styles should ship default with sablon to ease burden on the users commit 4d0800aa3f0c269483c34f1e474d1b7407a30ac8 Author: Matthew Stadelman Date: Thu May 11 14:45:00 2017 -0400 Fix test now that parapgraph props are being whitelisted commit af110f0d2d58fa24b5692f79ff25196a01745490 Author: Matthew Stadelman Date: Thu May 11 14:42:32 2017 -0400 Improve logic to whitelist properties passed in commit 8d6ef9c4dc50e47fa09a463bc8e4d343fa61b062 Author: Matthew Stadelman Date: Thu May 11 12:33:25 2017 -0400 Remove require 'pry' commit 227058d9c0ce90da0f3c4bcfb42532d847a26fd7 Author: Matthew Stadelman Date: Thu May 11 12:32:52 2017 -0400 Add explicit whitelisting of properties commit ee422cab624df7dcac834f35616aa87a13aff19b Author: Matthew Stadelman Date: Thu May 11 11:05:00 2017 -0400 Update converter_test to remove empty w:val tags commit 2544e196594b34588f521904a3233591ef16b7e5 Author: Matthew Stadelman Date: Thu May 11 11:04:27 2017 -0400 Change value check in transform_attr to check for nil instead of empty string commit 0351b8a7bc74999b30520943be86d06b7a566abb Author: Matthew Stadelman Date: Thu May 11 11:03:56 2017 -0400 Change element based styles from empty string to nil commit 3bb3d5a00ab81b7cf918e94ce381dc11fb323c05 Author: Matthew Stadelman Date: Thu May 11 10:33:21 2017 -0400 Implement tests for new code in converter_test.rb commit 0f416a61fe1265af4dd46ce92267d6903b1645ab Author: Matthew Stadelman Date: Thu May 11 10:32:17 2017 -0400 Minor refactors to remove invalid attributes for a run Currently filtered attributes are pStyle, numPr, jc commit ca4239e5bbbd1ca2333f4990bb57b1a7c82eb885 Author: Matthew Stadelman Date: Thu May 11 10:07:34 2017 -0400 Change code in ast_runs to pass the text to Run instead of the Node commit af7c4cd1677054f9f7270b4114859b2cdbf393b7 Author: Matthew Stadelman Date: Thu May 11 10:05:14 2017 -0400 Refactor Text class into Run class The new class properly handles turning run properties into a 
WordML string commit af8106f4ff8817c0c600527ca360fc6dd6a293d9 Author: Matthew Stadelman Date: Thu May 11 10:00:37 2017 -0400 Remove TextFormat class This is suplanted by the use of Run properties which are leagues more flexible commit cba4ee8d7f7814b7ee61eea4abfa1bbe2c6aa8f4 Author: Matthew Stadelman Date: Thu May 11 10:00:02 2017 -0400 Slight refactor of Paragraph class commit e20804cbce64e61a0dc8978198d319e3c8e3d912 Author: Matthew Stadelman Date: Thu May 11 09:57:09 2017 -0400 Refactor ast_text method and rename it ast_runs commit a35e4dce7aa0b51263184ead424962ec1ab88624 Author: Matthew Stadelman Date: Thu May 11 09:51:08 2017 -0400 Pass Paragraph properties directly onto the runs commit b3004a3f812cfa79b206a12bf85643133d2f07cc Author: Matthew Stadelman Date: Thu May 11 09:37:48 2017 -0400 Add prepare_run method to HTMLConverter commit 9f41139ac01272be4bc0a4b11a24fc70389b893d Author: Matthew Stadelman Date: Thu May 11 09:23:48 2017 -0400 Add style processing to preapre_paragraph commit 2bd9ce7df2416d2e821c6c82cbafba09ed6ede98 Author: Matthew Stadelman Date: Thu May 11 09:14:21 2017 -0400 Add process_style method to HTMLConverter commit e738ac4c35422bd1a9ec6c914f62f9d9f04afd5a Author: Matthew Stadelman Date: Thu May 11 08:53:33 2017 -0400 Update converter test to check nested property structure instead of using regexp commit e22a4e75f1f5b8cf0e54df80940b04974e8ddbaa Author: Matthew Stadelman Date: Thu May 11 08:48:19 2017 -0400 Rubocop fix, curly bracket spacing commit dec51d0bdb65356ccf8e77e20ccb4b99c2c46c87 Author: Matthew Stadelman Date: Thu May 11 08:44:53 2017 -0400 Change numPr property defintion to be a nested list of hashes This was done to match the new interface expected by transform_attr commit aaf28590a6afd9a8eb8347c73857c7a5ce4d9cdd Author: Matthew Stadelman Date: Thu May 11 08:43:45 2017 -0400 Rewrite transform_attr to work on arrays with nested hashes This replaces the much more fragile RegExp logic in place before. 
commit c90bb341810a501a3d3d53d2dbfd591cb62966be Author: Matthew Stadelman Date: Thu May 11 08:16:47 2017 -0400 Fix html converter test to use new instance var commit 3f8c84bbc716e0283d32c5c283a8c60def7ddc8c Author: Matthew Stadelman Date: Thu May 11 08:15:53 2017 -0400 Rename Node.process_attributes into Node.process_properties It also uses the @properties isntance var instead of @attributes commit 0be77816780b442a65fce0b7376dcbd97ee5ffdf Author: Matthew Stadelman Date: Thu May 11 08:11:34 2017 -0400 Remove passing of node into Parapgrah, only properities is needed commit b73a55730e03da02507d9e2f624bc762197f8b61 Author: Matthew Stadelman Date: Thu May 11 08:06:19 2017 -0400 Return properties from prepare_paragraph and pass onto Paragraph initialization commit 03194372302c26183591015629a2cc1897170eab Author: Matthew Stadelman Date: Thu May 11 08:02:24 2017 -0400 Rename prepare_node to prepare_paragraph commit 4cf0536c8ae1dc33da0603de2084e0c924e0aab7 Author: Matthew Stadelman Date: Mon May 8 06:51:45 2017 -0400 Refactor pattern to_docx usage commit dd3da5ddf078643731f64e68778f6373e5b956c0 Author: Matthew Stadelman Date: Mon May 8 06:48:36 2017 -0400 Fix comment in ast.rb commit 0917c7c1173d70aadfacdd647b7fe313d97f9b9c Author: Matthew Stadelman Date: Mon May 8 06:42:28 2017 -0400 Refactor list testing methods to handle direct usage of pPr to store list props commit 087dcf1d6cf3c8b2c13c9b99b199ff6676e3bcc4 Author: Matthew Stadelman Date: Thu May 4 17:49:07 2017 -0400 Remove ListParagraph class commit cadf3f7643919781d51343ac9726d37ce121e490 Author: Matthew Stadelman Date: Thu May 4 17:43:44 2017 -0400 Implement parsing of nested attributes This also removes the special case of ListParagraph commit 5c5c489d510f1a8ab4ac8673c5365b83ea75d607 Author: Matthew Stadelman Date: Thu May 4 17:03:31 2017 -0400 Implement simple parsing of attributes commit 0830ed356f00c594b7b43315b1f77cb10ee06025 Author: Matthew Stadelman Date: Thu May 4 16:37:23 2017 -0400 Change node['class'] to 
node['pStyle'] This will work better once I get some of the attribute logic working. commit 286b2c9ca5c54576b2857c506b223685ea25ce53 Author: Matthew Stadelman Date: Thu May 4 16:13:36 2017 -0400 Use node['class'] instead of directly passing the Paragraph style in commit 30385c23576e1eaa707a430e709aa6db02285e7d Author: Matthew Stadelman Date: Thu May 4 15:59:13 2017 -0400 Refactor some of the logic in ast_next_paragraph into a prepare_node fucntion This function wil later on ensure all nodes have a stype attribute even if it is an empty string to avoid nil's popping up. commit 099898f9cce6cfc278ac14304a85bf2e764c7a89 Author: Matthew Stadelman Date: Thu May 4 14:59:39 2017 -0400 Add blankline after private to satisfy rubocop commit 494349fb4094a4a624e0cc46ea286316c9ffa37e Author: Matthew Stadelman Date: Thu May 4 14:56:22 2017 -0400 Change argument order for Paragraph.new and ListParagraph.new commit efc6f8bb468b0d87b6ec55dfccedfee3ca254c65 Author: Matthew Stadelman Date: Thu May 4 14:54:27 2017 -0400 Pass definition directly into ListParagraph.new commit 963d96d1befc400c6461c9a8aca60497c71f4892 Author: Matthew Stadelman Date: Thu May 4 14:52:43 2017 -0400 Pass node diretly into Paragraph.new * Fix tests to match new handling of attributes witout a value Also added process method to HTMLConverterStyleTest class * Add paragraph style conversion tests * Allow unsupported text-decorations to be passed through as a toggle property This allows styles not-supported by CSS text-decoration but allowed by MS Word to function. examples include dstrike, emboss, imprint, caps, etc. 
* Add tests for CSS style conversion inside runs * Improve CSS style conversion unit tests * Create HTML snippets module * Add html snippets fixtures * Use HTMLSnippets module in html_test.rb * Add proper attribute merging for block level tags inside inline tags * Finish update html-content integration test * Fix comment * Update README * Add tip about regular expression to clean up whitespace in HTML * Support arbitrary node attributes on paragraphs (#51) * Pass node diretly into Paragraph.new * Pass definition directly into ListParagraph.new * Change argument order for Paragraph.new and ListParagraph.new * Refactor some of the logic in ast_next_paragraph into a prepare_node fucntion This function wil later on ensure all nodes have a stype attribute even if it is an empty string to avoid nil's popping up. * Use node['class'] instead of directly passing the Paragraph style in * Change node['class'] to node['pStyle'] This will work better once I get some of the attribute logic working. * Implement simple parsing of attributes * Implement parsing of nested attributes This also removes the special case of ListParagraph * Remove ListParagraph class * Refactor list testing methods to handle direct usage of pPr to store list props * Fix comment in ast.rb * Refactor pattern to_docx usage * Rename prepare_node to prepare_paragraph * Return properties from prepare_paragraph and pass onto Paragraph initialization * Remove passing of node into Parapgrah, only properities is needed * Rename Node.process_attributes into Node.process_properties It also uses the @properties isntance var instead of @attributes * Fix html converter test to use new instance var * Rewrite transform_attr to work on arrays with nested hashes This replaces the much more fragile RegExp logic in place before. 
* Change numPr property defintion to be a nested list of hashes This was done to match the new interface expected by transform_attr * Rubocop fix, curly bracket spacing * Update converter test to check nested property structure instead of using regexp * Fix syntax error leftover from rebase accidently killed off an `end` tag * Refactor Node.process_properties method * Slight change in default handling of node properties when value is a scalar * slight refactor of Pararaph.ppr_docx and Paragraph.to_docx * Move logic to handle properties from Node into a NodeProperties class * Minor refactor to better handle proper hashes that have keys with nil value * Implement unit tets fot NodeProperties class * Refactor to pass tagname in during instantiation so to_docx call signature is consistent * Split up node properties test method into smaller tests cases * Add paragraph factory method to node properties * refactor, put NodeProperties into separate test case. * refactor, no need to explicitly reference self. * Merge changes in upstream into feature Squashed commit of the following: commit 6ca2454818688c2cf94bd3ec1b302d4b15cbcffc Author: Yves Senn Date: Fri May 26 11:28:43 2017 +0200 refactor, no need to explicitly reference self. commit ae2c1613ae5029642aad4aa1f5e941e8678c00f6 Author: Yves Senn Date: Fri May 26 11:28:09 2017 +0200 refactor, put NodeProperties into separate test case. commit e8e77db274e19375e6215dd121dd599cb2795ffe Author: Matthew Stadelman Date: Fri May 26 05:26:18 2017 -0400 Support arbitrary node attributes on paragraphs (#51) * Pass node diretly into Paragraph.new * Pass definition directly into ListParagraph.new * Change argument order for Paragraph.new and ListParagraph.new * Refactor some of the logic in ast_next_paragraph into a prepare_node fucntion This function wil later on ensure all nodes have a stype attribute even if it is an empty string to avoid nil's popping up. 
* Use node['class'] instead of directly passing the Paragraph style in * Change node['class'] to node['pStyle'] This will work better once I get some of the attribute logic working. * Implement simple parsing of attributes * Implement parsing of nested attributes This also removes the special case of ListParagraph * Remove ListParagraph class * Refactor list testing methods to handle direct usage of pPr to store list props * Fix comment in ast.rb * Refactor pattern to_docx usage * Rename prepare_node to prepare_paragraph * Return properties from prepare_paragraph and pass onto Paragraph initialization * Remove passing of node into Parapgrah, only properities is needed * Rename Node.process_attributes into Node.process_properties It also uses the @properties isntance var instead of @attributes * Fix html converter test to use new instance var * Rewrite transform_attr to work on arrays with nested hashes This replaces the much more fragile RegExp logic in place before. * Change numPr property defintion to be a nested list of hashes This was done to match the new interface expected by transform_attr * Rubocop fix, curly bracket spacing * Update converter test to check nested property structure instead of using regexp * Fix syntax error leftover from rebase accidently killed off an `end` tag * Refactor Node.process_properties method * Slight change in default handling of node properties when value is a scalar * slight refactor of Pararaph.ppr_docx and Paragraph.to_docx * Move logic to handle properties from Node into a NodeProperties class * Minor refactor to better handle proper hashes that have keys with nil value * Implement unit tets fot NodeProperties class * Refactor to pass tagname in during instantiation so to_docx call signature is consistent * Split up node properties test method into smaller tests cases * Add paragraph factory method to node properties * Fix convert_test to use assert_equals instead of assert val == val2 * Move style processing to AST 
classes This has the advantage that nodes need to handle CSS properties differently. i.e. vertical-align for Paragraphs and Runs * Update html sample fixture It needed to be updated because the document.xml no longer contained runs with extra properties already set by their parent paragraph * Update unit and integration tests to cover all styles defined * Add more unit tests to better exercise the NodeProperties class * Fix infinite loop issue in prepare_paragraph Using hash[key] would retrigger the block if key didn't exist. This was fixed by using hash.key?(key) instead which doesn't trigger the block * Add a test for unknown tags passed into prepare_paragraph --- README.md | 37 ++- lib/sablon/html/ast.rb | 196 ++++++++---- lib/sablon/html/converter.rb | 95 ++++-- test/fixtures/html/html_test_content.html | 163 ++++++++++ test/fixtures/html_sample.docx | Bin 23568 -> 24070 bytes test/html/converter_test.rb | 351 ++++++++++++++++++++-- test/html_test.rb | 20 +- test/support/html_snippets.rb | 9 + 8 files changed, 738 insertions(+), 133 deletions(-) create mode 100644 test/fixtures/html/html_test_content.html create mode 100644 test/support/html_snippets.rb diff --git a/README.md b/README.md index 2ab629d4..47490db0 100644 --- a/README.md +++ b/README.md @@ -102,14 +102,13 @@ IMPORTANT: This feature is very much *experimental*. Currently, the insertion will replace the containing paragraph. This means that other content in the same paragraph is discarded. -##### HTML [experimental] +##### HTML -Similar to WordProcessingML it's possible to use html as input while processing the -tempalte. You don't need to modify your templates, a simple insertion operation +Similar to WordProcessingML it's possible to use html as input while processing the template. 
You don't need to modify your templates, a simple insertion operation is sufficient: ``` -«=article.body» +«=article» ``` To use HTML insertion prepare the context like so: @@ -118,24 +117,40 @@ To use HTML insertion prepare the context like so: html_body = <<-HTML
This text can contain additional formatting according to the HTML specification.
+

Right aligned +content with a yellow background color

+
Inline styles are possible as well
HTML context = { - article: { html_body: Sablon.content(:html, html_body) } + article: Sablon.content(:html, html_body) } + # alternative method using special key format + # 'html:article' => html_body } template.render_to_file File.expand_path("~/Desktop/output.docx"), context ``` -Currently HTML insertion is very limited and strongly focused on the HTML -generated by [Trix editor](https://github.com/basecamp/trix). +Currently, HTML insertion is somewhat limited. It is recommended that the block level tags such as `p` and `div` are not nested within each other, otherwise the final document may not generate as anticipated. List tags (`ul` and `ol`) and inline tags (`span`, `b`, `em`, etc.) can be nested as deeply as needed. -IMPORTANT: This feature is very much *experimental*. Currently, the insertion - will replace the containing paragraph. This means that other content in the same - paragraph is discarded. +Not all tags are supported. Currently supported tags are defined in [converter.rb](lib/sablon/html/converter.rb) for paragraphs in method `prepare_paragraph` and for text runs in `prepare_run`. + +Basic conversion of CSS inline styles into matching WordML properties is supported through the `style=" ... "` attribute in the HTML markup. Not all possible styles are supported and only a small subset of CSS styles have a direct WordML equivalent. Styles are passed onto nested elements. The currently supported styles are also defined in [converter.rb](lib/sablon/html/converter.rb) in method `process_style`. Simple toggle properties that aren't directly supported can be added using the `text-decoration: ` style attribute with the proper WordML tag name as the value. 
Paragraph and Run property reference can be found at: + * http://officeopenxml.com/WPparagraphProperties.php + * http://officeopenxml.com/WPtextFormatting.php + +If you wish to write out your HTML code in an indented human readable fashion, or you are pulling content from the ERB templating engine in rails the following regular expression can help eliminate extraneous whitespace in the final document. +```ruby +# combine all white space +html_str = html_str.gsub(/\s+/, ' ') +# clear any white space between block level tags and other content +html_str.gsub(%r{\s*<(/?(?:h\d|div|p|br|ul|ol|li).*?)>\s*}, '<\1>') +``` + +IMPORTANT: Currently, the insertion will replace the containing paragraph. This means that other content in the same paragraph is discarded. #### Conditionals -Sablon can render parts of the template conditonally based on the value of a +Sablon can render parts of the template conditionally based on the value of a context variable. Conditional fields are inserted around the content. ``` diff --git a/lib/sablon/html/ast.rb b/lib/sablon/html/ast.rb index 7556a9e8..bf280d8b 100644 --- a/lib/sablon/html/ast.rb +++ b/lib/sablon/html/ast.rb @@ -1,10 +1,70 @@ module Sablon class HTMLConverter class Node + PROPERTIES = [].freeze + # styles shared or common logic across all node types go here. Any + # undefined styles are passed straight through "as is" to the + # properties hash. Keys that are symbols will not get called directly + # when processing the style string and are suitable for internal-only + # usage across different classes. 
+ STYLE_CONVERSION = { + 'background-color' => lambda { |v| + return 'shd', { val: 'clear', fill: v.delete('#') } + }, + border: lambda { |v| + props = { sz: 2, val: 'single', color: '000000' } + vals = v.split + vals[1] = 'single' if vals[1] == 'solid' + # + props[:sz] = (2 * Float(vals[0].gsub(/[^\d.]/, '')).ceil).to_s if vals[0] + props[:val] = vals[1] if vals[1] + props[:color] = vals[2].delete('#') if vals[2] + # + return props + }, + 'text-align' => ->(v) { return 'jc', v } + } + # This proc is used to allow unmapped styles to pass through + STYLE_CONVERSION.default_proc = proc do |hash, key| + ->(v) { return key, v } + end + STYLE_CONVERSION.freeze + def accept(visitor) visitor.visit(self) end + # maps the CSS style property to it's OpenXML equivalent. Not all CSS + # properties have an equivalent, nor share the same behavior when + # defined on different node types (Paragraph, Table and Run). + def self.process_style(style_str) + return {} unless style_str + # + styles = style_str.split(';').map { |pair| pair.split(':') } + # process the styles as a hash and store values + style_attrs = {} + Hash[styles].each do |key, value| + key, value = convert_style_attr(key.strip, value.strip) + style_attrs[key] = value if key + end + style_attrs + end + + # handles conversion of a single attribute allowing recursion through + # super classes + def self.convert_style_attr(key, value) + if self::STYLE_CONVERSION[key] + self::STYLE_CONVERSION[key].call(value) + else + superclass.convert_style_attr(key, value) + end + end + + # Simplifies usage at call sites + def self.transferred_properties(properties) + NodeProperties.transferred_properties(properties, self::PROPERTIES) + end + def self.node_name @node_name ||= name.split('::').last end @@ -12,12 +72,27 @@ def self.node_name class NodeProperties def self.paragraph(properties) - new('w:pPr', properties) + new('w:pPr', properties, Paragraph::PROPERTIES) end - def initialize(tagname, properties) + def self.run(properties) 
+ new('w:rPr', properties, Run::PROPERTIES) + end + + # creates a hash of all properties that aren't consumed by the node + # so they can be propagated to child nodes + def self.transferred_properties(properties, whitelist) + props = properties.map do |key, value| + next if whitelist.include? key + [key, value] + end + # filter out nils and return hash + Hash[props.compact] + end + + def initialize(tagname, properties, whitelist) @tagname = tagname - @properties = properties + @properties = filter(properties, whitelist) end def inspect @@ -38,6 +113,15 @@ def to_docx private + def filter(properties, whitelist) + props = properties.map do |key, value| + next unless whitelist.include? key + [key, value] + end + # filter out nils and return hash + Hash[props.compact] + end + # processes attributes defined on the node into wordML property syntax def process @properties.map { |k, v| transform_attr(k, v) }.join @@ -45,7 +129,7 @@ def process # properties that have a list as the value get nested in tags and # each entry in the list is transformed. When a value is a hash the - # keys in the hash are used to explicitly buld the XML tag attributes. + # keys in the hash are used to explicitly build the XML tag attributes. def transform_attr(key, value) if value.is_a? 
Array sub_attrs = value.map do |sub_prop| @@ -97,7 +181,22 @@ def inspect end class Paragraph < Node + PROPERTIES = %w[framePr ind jc keepLines keepNext numPr + outlineLvl pBdr pStyle rPr sectPr shd spacing + tabs textAlignment].freeze + STYLE_CONVERSION = { + 'border' => lambda { |v| + props = Node::STYLE_CONVERSION[:border].call(v) + # + return 'pBdr', [ + { top: props }, { bottom: props }, + { left: props }, { right: props } + ] + }, + 'vertical-align' => ->(v) { return 'textAlignment', v } + }.freeze attr_accessor :runs + def initialize(properties, runs) @properties = NodeProperties.paragraph(properties) @runs = runs @@ -117,68 +216,55 @@ def inspect end end - class TextFormat - def initialize(bold, italic, underline) - @bold = bold - @italic = italic - @underline = underline - end - - def inspect - parts = [] - parts << 'bold' if @bold - parts << 'italic' if @italic - parts << 'underline' if @underline - parts.join('|') - end - - def to_docx - styles = [] - styles << '' if @bold - styles << '' if @italic - styles << '' if @underline - if styles.any? 
- "#{styles.join}" - else - '' - end - end - - def self.default - @default ||= new(false, false, false) - end - - def with_bold - TextFormat.new(true, @italic, @underline) - end - - def with_italic - TextFormat.new(@bold, true, @underline) - end - - def with_underline - TextFormat.new(@bold, @italic, true) - end - end - - class Text < Node + class Run < Node + PROPERTIES = %w[b i caps color dstrike emboss imprint highlight outline + rStyle shadow shd smallCaps strike sz u vanish + vertAlign].freeze + STYLE_CONVERSION = { + 'color' => ->(v) { return 'color', v.delete('#') }, + 'font-size' => lambda { |v| + return 'sz', (2 * Float(v.gsub(/[^\d.]/, '')).ceil).to_s + }, + 'font-style' => lambda { |v| + return 'b', nil if v =~ /bold/ + return 'i', nil if v =~ /italic/ + }, + 'font-weight' => ->(v) { return 'b', nil if v =~ /bold/ }, + 'text-decoration' => lambda { |v| + supported = %w[line-through underline] + props = v.split + return props[0], 'true' unless supported.include? props[0] + return 'strike', 'true' if props[0] == 'line-through' + return 'u', 'single' if props.length == 1 + return 'u', { val: props[1], color: 'auto' } if props.length == 2 + return 'u', { val: props[1], color: props[2].delete('#') } + }, + 'vertical-align' => lambda { |v| + return 'vertAlign', 'subscript' if v =~ /sub/ + return 'vertAlign', 'superscript' if v =~ /super/ + } + }.freeze attr_reader :string - def initialize(string, format) + + def initialize(properties, string) + @properties = NodeProperties.run(properties) @string = string - @format = format end def to_docx - "#{@format.to_docx}#{normalized_string}" + "#{@properties.to_docx}#{text}" end def inspect - "" + "" end private - def normalized_string - string.tr("\u00A0", ' ') + + + def text + content = @string.tr("\u00A0", ' ') + "#{content}" end end diff --git a/lib/sablon/html/converter.rb b/lib/sablon/html/converter.rb index 534c1601..db4d64c2 100644 --- a/lib/sablon/html/converter.rb +++ b/lib/sablon/html/converter.rb @@ -93,20 
+93,65 @@ def initialize # Adds the appropriate style class to the node def prepare_paragraph(node) - # set default styles based on node name - styles = { 'div' => 'Normal', 'p' => 'Paragraph', 'h' => 'Heading', - 'ul' => 'ListBullet', 'ol' => 'ListNumber' } + # set default styles based on HTML element allowing for h1, h2, etc. + styles = Hash.new do |hash, key| + tag, num = key.match(/([a-z]+)(\d*)/)[1..2] + { 'pStyle' => hash[tag]['pStyle'] + num } if hash.key?(tag) + end + styles.merge!('div' => 'Normal', 'p' => 'Paragraph', 'h' => 'Heading', + 'ul' => 'ListBullet', 'ol' => 'ListNumber') styles['li'] = @definition.style if @definition - - # set the node class attribute based on the style, num allows h1,h2,.. - tag, num = node.name.match(/([a-z]+)(\d*)/)[1..2] - unless styles[tag] + styles.each { |k, v| styles[k] = { 'pStyle' => v } } + unless styles[node.name] raise ArgumentError, "Don't know how to handle node: #{node.inspect}" end # - properties = {} - properties['pStyle'] = styles[tag] + num - properties + merge_node_properties(node, {}, styles[node.name], Paragraph) + end + + # Adds properties to the run, from the parent, the style node attributes + # and finally any element specfic properties. 
A modified properties hash + # is returned + def prepare_run(node, properties) + # HTML element based styles + styles = { + 'span' => {}, 'text' => {}, 'br' => {}, + 'strong' => { 'b' => nil }, 'b' => { 'b' => nil }, + 'em' => { 'i' => nil }, 'i' => { 'i' => nil }, + 'u' => { 'u' => 'single' } + } + + unless styles.key?(node.name) + raise ArgumentError, "Don't know how to handle node: #{node.inspect}" + end + # combine all properties, return the new hash + merge_node_properties(node, properties, styles[node.name], Run) + end + + def merge_node_properties(node, par_props, elm_props, ast_class) + # perform an initial conversion for any leftover CSS props passed + # in from the node's parent + properties = par_props.map do |k, v| + ast_class.convert_style_attr(k, v) + end + properties = Hash[properties] + + # Process any styles, defined on the node + properties.merge!(ast_class.process_style(node['style'])) + + # Set the element specific attributes, overriding any other values + properties.merge(elm_props) + end + + # handles passing all attributes on the parent down to children + # preappending parent attributes so child can overwrite if present + def merge_node_attributes(node, attributes) + node.children.each do |child| + attributes.each do |name, atr| + catr = child[name] ? child[name] : '' + child[name] = atr.value.split(';').concat(catr.split(';')).join('; ') + end + end end def ast_next_paragraph @@ -121,6 +166,7 @@ def ast_next_paragraph unless @builder.nested? 
@definition = @numbering.register(properties['pStyle']) end + merge_node_attributes(node, node.attributes) @builder.push_all(node.children) return elsif node.name == 'li' @@ -131,28 +177,27 @@ def ast_next_paragraph # create word_ml node @builder.new_layer - @builder.emit Paragraph.new(properties, ast_text(node.children)) + trans_props = Paragraph.transferred_properties(properties) + @builder.emit Paragraph.new(properties, ast_runs(node.children, trans_props)) end - def ast_text(nodes, format: TextFormat.default) + def ast_runs(nodes, properties) runs = nodes.flat_map do |node| + begin + local_props = prepare_run(node, properties) + rescue ArgumentError + raise unless %w[ul ol p div].include?(node.name) + merge_node_attributes(node, node.parent.attributes) + @builder.push(node) + next nil + end + # if node.text? - Text.new(node.text, format) + Run.new(local_props, node.text) elsif node.name == 'br' Newline.new - elsif node.name == 'span' - ast_text(node.children).nodes - elsif node.name == 'strong' || node.name == 'b' - ast_text(node.children, format: format.with_bold).nodes - elsif node.name == 'em' || node.name == 'i' - ast_text(node.children, format: format.with_italic).nodes - elsif node.name == 'u' - ast_text(node.children, format: format.with_underline).nodes - elsif ['ul', 'ol', 'p', 'div'].include?(node.name) - @builder.push(node) - nil else - raise ArgumentError, "Don't know how to handle node: #{node.inspect}" + ast_runs(node.children, local_props).nodes end end Collection.new(runs.compact) diff --git a/test/fixtures/html/html_test_content.html b/test/fixtures/html/html_test_content.html new file mode 100644 index 00000000..b9c2f2e1 --- /dev/null +++ b/test/fixtures/html/html_test_content.html @@ -0,0 +1,163 @@ +

Sablon HTML insertion

+ +

Text

+ +
+ Lorem ipsum dolor sit +  ametconsectetur adipiscing elit. +  Suspendisse a tempus turpis. Duis urna justo, + vehicula vitae ultricies vel, congue at sem. Fusce turpis + turpis, aliquet id pulvinar aliquam, iaculis non elit. Nulla feugiat + lectus nulla, in dictum ipsum cursus ac. Quisque at odio neque. + Sed ac tortor iaculis, bibendum leo ut, malesuada velit. Donec iaculis + sed urna eget pharetra. Praesent ornare fermentum turpis, placerat + iaculis urna bibendum vitae. Nunc in quam consequat, tristique tellus in, + commodo turpis. Curabitur ullamcorper odio purus, lobortis egestas magna + laoreet vitae. Nunc fringilla velit ante, eu aliquam nisi cursus vitae. + Suspendisse sit amet dui egestas, volutpat nisi vel, mattis justo. Nullam + pellentesque, ipsum eget blandit pharetra, augue elit aliquam mauris, + vel mollis nisl augue ut ipsum. +
+ +

Lists

+ +
    +
  1. + Vestibulum  +
      +
    1. ante ipsum primis 
    2. +
    +
  2. +
  3. + in faucibus orci luctus  +
      +
    1. et ultrices posuere cubilia Curae;  +
        +
      1. Aliquam vel dolor 
      2. +
      3. sed sem maximus 
      4. +
      +
    2. +
    3. + fermentum in non odio.  +
        +
      1. Fusce hendrerit ornare mollis. 
      2. +
      +
    4. +
    5. Nunc scelerisque nibh nec turpis tempor pulvinar. 
    6. +
    +
  4. +
  5. Donec eros turpis, 
  6. +
  7. + aliquet vel volutpat sit amet,  +
      +
    1. semper eu purus. 
    2. +
    3. + Proin ac erat nec urna efficitur vulputate.  +
        +
      1. Quisque varius convallis ultricies. 
      2. +
      3. Nullam vel fermentum eros. 
      4. +
      +
    4. +
    +
  8. +
+ +
+ Pellentesque nulla leo, auctor ornare erat sed, rhoncus congue diam. + Duis non porttitor nulla, ut eleifend enim. Pellentesque non tempor sem. +
+ +
Mauris auctor egestas arcu, 
+ +
    +
  1. id venenatis nibh dignissim id. 
  2. +
  3. In non placerat metus. 
  4. +
+ +
    +
  • Nunc sed consequat metus. 
  • +
  • Nulla consectetur lorem consequat, 
  • +
  • malesuada dui at, lacinia lectus. 
  • +
+ +
    +
  1. Aliquam efficitur 
  2. +
  3. lorem a mauris feugiat, 
  4. +
  5. at semper eros pellentesque. 
  6. +
+ +
+ Nunc lacus diam, consectetur ut odio sit amet, placerat pharetra erat. + Sed commodo ut sem id congue. Sed eget neque elit. Curabitur at erat tortor. + Maecenas eget sapien vitae est sagittis accumsan et nec orci. Integer + luctus at nisl eget venenatis. Nunc nunc eros, consectetur at tortor et, + tristique ultrices elit. Nulla in turpis nibh. +
+ +
    +
  • + Nam consectetur  +
      +
    • venenatis tempor. 
    • +
    +
  • +
  • + Aenean  +
      +
    • blandit +
        +
      • porttitor massa,  +
          +
        • non efficitur  +
            +
          • metus. 
          • +
          +
        • +
        +
      • +
      +
    • +
    +
  • +
  • Duis faucibus nunc nec venenatis faucibus. 
  • +
  • Aliquam erat volutpat. 
  • +
+
+ Quisque non neque ut lacus eleifend volutpat quis sed lacus. +
Praesent ultrices purus eu quam elementum, sit amet faucibus elit + interdum. In lectus orci,
elementum quis dictum ac, porta ac ante. + Fusce tempus ac mauris id cursus. Phasellus a erat nulla. Mauris dolor orci, + malesuada auctor dignissim non, posuere nec odio. Etiam hendrerit + justo nec diam ullamcorper, nec blandit elit sodales.
+
+ + +
+ Ut eget auctor enim. + Quisque id + neque eu nibh feugiat imperdiet + id ut dui. Ut auctor libero eget + massa tristique pharetra. Cras tincidunt finibus sapien, ut maximus + tortor tempor at. Proin pulvinar + pretium justo vitae malesuada. Suspendisse porta purus eget tortor + tincidunt vestibulum. Maecenas id egestas purus, quis vulputate + lacus. Quisque non + eleifend est. +
+ +
    +
  • Item 1
  • +
  • Item 2
  • +
      +
    • Nested 1
    • +
    • + Nested 2 +
        +
      • Nested 2.1
      • +
      • Nested 2.2
      • +
      • Nested 2.3
      • +
      +
    • +
    +
  • Item 3
  • +
diff --git a/test/fixtures/html_sample.docx b/test/fixtures/html_sample.docx index 2a7b88795af0edc814b704883cce6c303a1e65bd..0ef9361025d6d2225bb2cff17a25f4925a5a979c 100644 GIT binary patch delta 4408 zcmZ8lbx;&s*I#N0rCmB@L4+klYUz?j;vsg)rMp{V>0Y`Ufu%vDR#HGjC6#WJ5Tp@7 z`cP#00;ntD(#KLbyzrmu&E1+`Cqbv zUGXo8#m)YgfPrZCyOrMI)Six;n- zv(u6h+>>5}B6zGp<|t>g+!9n@p6-+rTkVwPV|^elijx}icaPxt^9$=^fz_&_Z)}Zdt6`RaaV4GBqlyp=ImKlZ`I5t*w*&w zh#e#W`$KQ_95S9*A~0vVyj9!p_KBuy+rn(NzuWPPhC1@rNbR%fu%{imp}k~zjDJ9z z$5)KcfftafjemGC$gbY*#2S~(3dV{J(bFSzU#RnY0HPD3@B%LR%LPE!qjPTw*UHX= zTyOvDFCj_*Or>enx~@$lAUryL@?#oy17QbObo7G8MI@U0ceS_^wlpb~>Mt zp25Ly(u0Zj=rUR_BL|yo)7PVOLJ6hd^R=OhqzE-kYz^sx;Yv8kwIH4MWZ)# zgKe|!=fIdGq-s;xA{{s=;{C+9UgwK4>O;ADzsfYFWu$}5vejE-!g60h1REi7n{(8H zp^V`xVH}?}D*kZ0K7pay3EbW0bq*X$cJ*e8&bF@gnbDAmkvC5gv@GD-V)6hGivwQS zzz}sJJ$6c%OWlBUL8F7g&uL{7wbs=f%4U=|@0;=jq^P;jnCMd(>}OdaSi_8!ixmBF zrg{4uqgIgT+E+g)xPJQ%LcT&D7r;DJyfI4gyk8%%^bb)4av8#s0)6DP1T0-fu|ffA z5%4TO%H>S(cehfO<+0>pdmK&sX|~(YoJ%Gx9Ctkh0nCpS$IO09i;SpS`)1yp&|;jshThBURz`Kml#%>)L4Svvt4>%l*?mjvXyzIl=LoP4F`#D(<1@?hF=C-hO7WT^A5_0cS|MQQ9M9P7FlG})Jk}RdSG$o)(`)2N~q6eye@tcb83<`*m zjLW8Ke7%%XePubBrHH(-`fLHzTJ8HUpwBxo<;r}-)wWsw?IH~KDcs|nb3?C~)rpJ5 zzV45s7-bpv$++ebf>Am@vC-Zu8TZuu6-|H_LaWaNo=5y-@>&rh=H`iDjgc!CNDRFk zVzBu(*+F}HKuvDmEKfdK`c&r8s{QsdlI{H~;1j0B8B)_LoP@H(XvC&6E&t@nsp&{4 z6%UJ+J=!0%Bwy@D`@3VZJ!zCB6mVE*sUl@UR;CoYjU|NdsJXju4W2JgyT6`juh%pg z6rB$f6w8X60Dt-^aOi6U$%8uFk8Ig$tGBSU;{@Mrq{23& z#p(81^Ajpl*YoiB?Qw~C5qa4=*IVltLB^4uL~Kl$k-V!BO^4yoeYuBma5gj+?PmJt zoPejHe)H@vO01|R6V|1_H-~tA830o=uv;)W z)+&@&7l`0$KR=|aep3b9?i7XTk%S6M{9+e3yF_iQw<{1XolD8{V3ee^Gi96kOHd{3 zwgOb|Sa*9-&@*qjTlC^o_$Kd+{)kFg51Db5Ji`w0v)S4l;CFA2W(zQwoV~W0;?Up(iz2=F{~#v z8{OQ(nPXAJYej+3%xI~$ zn^S(3<2{>RQ}) zzPPG2d1vVX1Y^FzGaHU5@%vIIqRt(E7Ld!3jT$uxWq7pL!}fi9pUDZ2V~c%%Z=!Z* z2>f8k6@xRQhJUXQNlu2Leg9-k|qHpxW0T1@2Y0ytt9FgoZEs=yb+Eqs*dEAR#WVWzy*zuzq3BK6*j@^Fa zGXMMxZKIoL=vO#$lf!P_I|K=1=8ylRViTr~QbWGSGY%F-76oENNnPQm{b79hYDjfx 
zH|Z#kg!6ZjVKY4^3H!4B2GZ@b6kq2Lj5^yR2G~W}g$}0%UjCl$GiSs~_|Zy(D1MGJ zJjchI8oat=U&!+rLjY5uPcQTz&pQlSjN4(SSWf5uj2`RVGi0tL_)ePAEOv3yk}KBB zT47)=Tm%?K##ISyb}LGJ^9t~A3l2UinOM}uyJ*i;N8fLcgd__(D#!C=(T*|J=F() zK44G_R(!lytZoEQ0ap^!jeq?q`_*5Q$aj-8}IdfYYTC5)0yUCvVAGiP(r?Z%`X=0>#=@YT$OKTN%R^! zx!(-G`?N7+W$;n7cCyD!>^ITm)wkFE^0#q(*>5*g?8tuz{<4JhPHIvoY~CNga&-IF zQk!nlkuSc(hYvD%2Bg;uJ@~!w=2P$t1BeU5VP&vr6LP#Ndel)&eEak&QorX${!5eD z4e5W)l}%PczSwuOV(VfBsVN=+AYlUlQ2gJl=;G~cVeMh>V*7Vk+%s89eX01Md!^!4 z<*~Q&SOR3@-`&5y7b z4^vL|H%AusV(Pn~gzz4w@Z!&Q^s|=sput`{{+M0RQSH{e=k9pYm35%=rIUdFup zt-%-?1i17>kL*G?_FJIu=lxuk=Uji}LEFXGsm$pg zJx!}}ZIF~(o9vQRHr{-tS(-s9kz`~bIF0!vOGlXX!375{xnH6w?>Iy<7UH~UW~Eq| z=f9l5SxTh|4&{W*W3;+#_f`Clk`|96U=S;NZPQIU@vnFdO%2aBdn8$3?kd%qdKZcj5^ zo<|jkzh>R<|3?Nkh}uXjKdH_$&H$=3%-&n<)F_~t&1T7O!Q_?onnyQAAaxX|ApN2T z5Zf+TWxoHw!1=tlN`4FO*-@I&;@N2Ix3hA6;H6CWgQ(l9V*~f=0P&t5sn#T?me~vp z?Fi?O8;xaKkYZOQ?szFQxg^v!2>;o4H{a}Y!NkrtZa^Ns&PwrTIsuWN+-{;9s+YD_ zr2Rctc-;9+R@v&~SZ*GPxKkG7Vti6J5;va4`dp|?i%*Y&7ik~C4`z31_TOm)`~>fq z^gbf+3XelOuwewrj9D-;ZlSB#y?9~waH$D+$&4GgF_Vd5LvN3fbrN0I<0fuAF?)WQ zMim&T@{I6s%~C$Ip7w7q*xB`1yEJQ(-sI5A8=IrKV4O{@T-!Ut7epdZTFaKc=a!WE z;EvXZm7tF9PnzRzZ6b^7bL9wILE_GYr$R1?G~82$$ z-c%c->prC>*F${dM3#}!MWG^is?{(u&tYZV*0FMmCLPG0;X1e9bPEb60j*;aI6ilYzD{E(S9IX0WW%)3$pAtUU*(Rf2nal=(o6L^G~E~#J} z3E#m^(heno*-+*TJ(cAY`F^jd5N~ZjBvR?5;~$ z-ONawyxq1&PoG7Smg_yY>JhJrF6UC)JlSB@v#Ul4--N4^$H?NFymY(`8C< ziP(RglxTT;kt5UqQSscGX(OMxY#66AGDsq#c1GO7b97wMLfLyk+UHF-P5%KO CmJiSX delta 4031 zcmZvfcTkf{+lNE%5TqB84g#Xmdk3kZgc^#pPy_;mB0ZiU2nq%Wy(18MC~^R)f=Uq~ zD!ogU5}F_&BD{)z@AsWE-_GpZzqx0xot@{8efElNCrD}`pfNEZB4z{t0OWu`NJ%|S z4FS>5`n-eS+V8MHSoS-_6KDPo{G=Fx%QZuy5CX){#rppKLdjI*0Kl>s06_n9tNXri z*PE`sE1t=;Q)I$w6O&rc?Em?YE1T~CHf;{(UoaQ-S@DD$|r;KJ>SD5I9r{0M0QBi#H#(> z*2rQiV%=oWF|62x8;{(J_l2XXYOX&Fu-;*sFxUyRv4hH>E%%>&3zK2D{{-2z?EM^P zctZ*{oKVx!SDNP%`$KWmnEUyf1E@6R9KkRUHvibh;Rrc~7F*VP61`>!aBQeHKd1{` zpCXn*NJ-)axkpkbrdH_Z-&2yj)M<;UR&3iAp^;@tOd?})00aK|%;6X~^kA1G#t 
zqp}1oM0gO^OCgQjPm|(;dj)|z*ePzC`<-JjjE(G0gzC(>vr)OCCtTHY;&j&%!Ayd@ z;kv2}u^QSYTMO>?YFD$k{UYDX<(oMHz1VO+U$|8q2n>#ThEq6itKVN9N0=dN1P!p_ z;a!vj4nDz2u+uSZ;acawRGbto$G;)n*#9({Q9!S0&CB)@yH z1y1Jf*N&6*X1kTEp_S7S@v^;$$B*Z1O1@4ZFMJADY{mMAi#==?G?vKJQ zbd8^M*cCRefmI#iea5$TYwrH#%1e1OqK+B{F|c}Rm1SbPp*(53Kl2F)H`vH^%ndai z>%RSfu&~2__F!*|`>JOCr*Mw_+E4BBqfI`p*i!3$Ug6F}U-8nt^EhH9cRK#I8KzRzeF@MMLz5eAt&Q3iN3EL#+9pr9{(AwiPnE zOVY1hA!p>IZ^L>5jlC@Osm;Z7*er>+{4!XGjyQsMG&N|Z?`1N#u-Dm|F3BA9F4fx9 z;4pAX9FkaonyM2A6pleqwXe7mBKO&qnmrcf z7ZX=cC?$OA(wXXz0{2LBZQCLX#wazJk33T2C>yi;YJ>u|ka+!>?xh@*?=5nrp3rHz z_7MH;P;7cQdUUTbwdQ|jguTBkTy1-C}yc& zx^6`wE-R*p-zRF8u=j74NI{KdF+&5PoEtcX3C)Ma@^NX^)D1RRT)GYkoqmDaoeR=P zKCOM4p>xZ6c1X`CGPk^7P84=zUW8Ac!@MdY>foYqiM8B!9H`|ku+Y&)rGr-QL>m+C zb{h0KQux++!VrTN9>Z-WMlq#9mJC-bIfF*cr0Ei;!zZAUGF*x5)2W}mNxv9-C+3ne zoomX_5(h=7m9IokvE36D_1>!BV5$9*@QCmzD2_MARpd2!*k*cfPWne+gNctFkQ||1 zsF2%ti?37lF{L}1`685-l#!VCx^n32)bXTc`c9R%n+U`wetVUNISkN3b4j^SNg+t1 z+Ovk9w=!*ITK=3Tj~c_?4Zl|-h*+_PfLIdNO&dQ9S+vd`CoL9%>S_H~VA+ zrE28UxMGRmeH*>1X;)32-Lrcb)Gw7c+cijdNa{-yIny*b1c4?|npUz`Pz#r1-gPUr zQhf0&n2PzJEg_kFVfKY`)wN$+oKT>WE<4i;!blcZ4UfgYe3%|`w1dnv1 zw`ly6KG^w~-kV7$!!u;&Z!_BFz~JLwSrH+-Mc)+{w=*?)ZBf}B!5U{*;ucfOV3V^d zs-*A2-#*E63~p>+8s1-aJuf38=A?UfG>~;yJ3LCQ5rreak7TQAY>MQ1C8K>NE+=&i zx|^16W%|~OUFXr@!3<}N>^2#EDm6Q~je*TWbfKyAE6>XE_{mEaN$U+pptUwG-{02rubnS!<%t|rJ@3@&1PtM9I+JHs21hvBeIzyz1V@Wh4%G69l|nF zraV)c?Be4?K8gglVEsek{7l-(A!J)%>Ri#m6S z`28?cIp%8(l{u&DtOKk+Dckx9f_OMae@?^c)emzOZheq>SmcKA(r*Q-J2EIaB* z79W)t@pY){AJ32H^0M~{X@5N_tfGr#lt}=9r*`O0c{YUIT*@;Y#?FPZy7DjZSj@LE zi2LBxiO~{FFjyM>R@%90R6~haSN(;RKZ0snvAnzLCO@AB5w@;(YriI=8gcIp3vt3GJ;q$^=~Ht&R2&aVaUp zgIP&v<6U9{yhhTl8raIIdc>~HfX?apT1DLfs^Xf)ChQ*%Y2KOF2 z#Re~d>~Kc;W@5z;Y-mcLTHkFGs(MhP7#!z%Eh6IPHO|q*Ik{Hl6=qWgAz8uVmo}#l z1+&88T1snvYRfSkDmYtPsqq*6>h~xz`mvhYiaQt-Z}<=`?t#&40W2ZHcR5t~8IpZA znekbj$CAd^>=#0-Z<=AU(yIe`*JPVjwYnuDbOKPCZN&vt)lygd9DR08vmg<jiYOZGuI25e`k@4RKKZ2PGV zhG)a&W%ZxD2pu>-Rt=fTIZ>K)A;2TPqcb{?)dz*Oj|(r-ayl`87hmkH7!%gMMs$q6 
zykp0pes&YK#H=9g|AQZ;XgSP3%_bB&Umqp7IIJ{q@U$j<5j5M-4EoYubwd0X2ML?ta{KhPk=XS!H^P zFu$#?&_9)Vj(uwc>b9<`8<|2fAqKD&MeLYJkm{&MsGl1q2NToQAzwKvyI;1928NqMuk@kX%Cj zV$cv6``_-LW)g)$hG7z65Aj0^at5Fvx1n0*gIfenaONJ@Vjd4XC;-(rRRHyApRcw0 z=&KjL64m9%hDwr_=#Ay>{s>CpPxyS4iE^6I6;h;LY7iM9gL`B)vG2V%KZY1fe@vj8 z-mV!(L{*}UpLVVp7HER#cV}ft66^Lmt-@yto2dS_FW=ViVY%04G7~c`J{nxq6kbgW zz8^H%|2~oDUd=1+=ag9%{nZ+7OcGT|HZku+TrJ;hGWp!8&1_W|kM)?etmUTp3N-6Q z9=@#-O`$@wv6NpG2LtW!;)ns;%}g>J#`h)8_yjb_cr`-+?!F0UlgmYX!I8y9G| zQiM%}7iaBo`~l=KDa#w0MZ}Z>Cc=Md3^<*d2+MmGJ#?ddlHO%yo@^`65=3?9ICJrp zTT8E@vNvWTbNdXo_B_kX1%r4s;2o<{qs#aL++ci>b}GUmnsrsf^kJbRk8mZB(TM!9 zIGFei5a~>*UN2`#D`vU8=R9_Ayd`v!dMuOa!mQzw(JTL=x_P`skkYl|@7|z# zKv8RjNEGBa$zOnYpmc%0EA-&zKkr&~)k>mEkK56xBfAVm`YPm?Vad#n^fIX1+_;oJ zv_C}UR|DMuk-4l}5XH-S-B#(cde~}Q)*9P8m-qMoDj}NC?)If6l30pH>JtJ0rxXCd zm7kXQYY_aLC4X5-|8Yh)Us#~Ile0JUmo%Cf5D-cc{Z~!@&oANM{0G@TJS@n^84CA= zx&Mnt{Bm3UyCjsaqKE(h<(~)mrK!Kh?*phP|Jfh_8W{K!5BL{ Lorem - + ipsum dolor sit amet @@ -310,6 +310,13 @@ def test_nested_unordered_lists assert_equal [Sablon::Numbering::Definition.new(1001, 'ListBullet')], @numbering.definitions end + def test_unknown_tag + e = assert_raises ArgumentError do + process('') + end + assert_match(/Don't know how to handle node:/, e.message) + end + private def process(input) @@ -321,6 +328,251 @@ def normalize_wordml(wordml) end end +class HTMLConverterStyleTest < Sablon::TestCase + def setup + super + @env = Sablon::Environment.new(nil) + @converter = Sablon::HTMLConverter.new + end + + # testing direct CSS style -> WordML conversion for paragraphs + + def test_paragraph_with_background_color + input = '

' + expected_output = para_with_ppr('') + assert_equal normalize_wordml(expected_output), process(input) + end + + def test_paragraph_with_borders + # Basic single line black border + input = '

' + ppr = <<-DOCX.strip + + + + + + + DOCX + expected_output = para_with_ppr(ppr) + assert_equal normalize_wordml(expected_output), process(input) + # border with a line style + input = '

' + ppr = <<-DOCX.strip + + + + + + + DOCX + expected_output = para_with_ppr(ppr) + assert_equal normalize_wordml(expected_output), process(input) + # border with line style and color + input = '

' + ppr = <<-DOCX.strip + + + + + + + DOCX + expected_output = para_with_ppr(ppr) + assert_equal normalize_wordml(expected_output), process(input) + end + + def test_paragraph_with_text_align + input = '

' + expected_output = para_with_ppr('') + assert_equal normalize_wordml(expected_output), process(input) + end + + def test_paragraph_with_vertical_align + input = '

' + expected_output = para_with_ppr('') + assert_equal normalize_wordml(expected_output), process(input) + end + + def test_paragraph_with_unsupported_property + input = '

' + expected_output = para_with_ppr('') + assert_equal normalize_wordml(expected_output), process(input) + end + + def test_run_with_background_color + input = '

test

' + expected_output = run_with_rpr('') + assert_equal normalize_wordml(expected_output), process(input) + end + + def test_run_with_color + input = '

test

' + expected_output = run_with_rpr('') + assert_equal normalize_wordml(expected_output), process(input) + end + + def test_run_with_font_size + input = '

test

' + expected_output = run_with_rpr('') + assert_equal normalize_wordml(expected_output), process(input) + + # test that non-numeric are ignored + input = '

test

' + assert_equal normalize_wordml(expected_output), process(input) + + # test that floats round up + input = '

test

' + assert_equal normalize_wordml(expected_output), process(input) + end + + def test_run_with_font_style + input = '

test

' + expected_output = run_with_rpr('') + assert_equal normalize_wordml(expected_output), process(input) + + # test that non-numeric are ignored + input = '

test

' + expected_output = run_with_rpr('') + assert_equal normalize_wordml(expected_output), process(input) + end + + def test_run_with_font_wieght + input = '

test

' + expected_output = run_with_rpr('') + assert_equal normalize_wordml(expected_output), process(input) + end + + def test_run_with_text_decoration + # testing underline configurations + input = '

test

' + expected_output = run_with_rpr('') + assert_equal normalize_wordml(expected_output), process(input) + + input = '

test

' + expected_output = run_with_rpr('') + assert_equal normalize_wordml(expected_output), process(input) + + input = '

test

' + expected_output = run_with_rpr('') + assert_equal normalize_wordml(expected_output), process(input) + + # testing line-through + input = '

test

' + expected_output = run_with_rpr('') + assert_equal normalize_wordml(expected_output), process(input) + + # testing that unsupported values are passed through as a toggle + input = '

test

' + expected_output = run_with_rpr('') + assert_equal normalize_wordml(expected_output), process(input) + + input = '

test

' + expected_output = run_with_rpr('') + assert_equal normalize_wordml(expected_output), process(input) + end + + def test_run_with_vertical_align + input = '

test

' + expected_output = run_with_rpr('') + assert_equal normalize_wordml(expected_output), process(input) + + input = '

test

' + expected_output = run_with_rpr('') + assert_equal normalize_wordml(expected_output), process(input) + end + + def test_run_with_unsupported_property + input = '

test

' + expected_output = 'test' + assert_equal normalize_wordml(expected_output), process(input) + end + + # tests with nested runs and styles + + def test_paragraph_props_passed_to_runs + input = '

Loremipsum

' + expected_output = <<-DOCX.strip + + + + + + + + + + Lorem + + + + + + ipsum + + + DOCX + assert_equal normalize_wordml(expected_output), process(input) + end + + def test_run_prop_override_paragraph_prop + input = '

Loremipsum

' + expected_output = <<-DOCX.strip + + + + + + + + + + Lorem + + + + + + ipsum + + + DOCX + assert_equal normalize_wordml(expected_output), process(input) + end + + private + + def process(input) + @converter.process(input, @env) + end + + def para_with_ppr(ppr_str) + para_str = '%s' + format(para_str, ppr_str) + end + + def run_with_rpr(rpr_str) + para_str = <<-DOCX.strip + + + + + + + %s + + test + + + DOCX + format(para_str, rpr_str) + end + + def normalize_wordml(wordml) + wordml.gsub(/^\s+/, '').tr("\n", '') + end +end + class HTMLConverterASTTest < Sablon::TestCase def setup super @@ -331,49 +583,49 @@ def setup def test_div input = '
Lorem ipsum dolor sit amet
' ast = @converter.processed_ast(input) - assert_equal ']>]>', ast.inspect + assert_equal ']>]>', ast.inspect end def test_p input = '

Lorem ipsum dolor sit amet

' ast = @converter.processed_ast(input) - assert_equal ']>]>', ast.inspect + assert_equal ']>]>', ast.inspect end def test_b input = '

Lorem ipsum dolor sit amet

' ast = @converter.processed_ast(input) - assert_equal ', ]>]>', ast.inspect + assert_equal ', ]>]>', ast.inspect end def test_i input = '

Lorem ipsum dolor sit amet

' ast = @converter.processed_ast(input) - assert_equal ', ]>]>', ast.inspect + assert_equal ', ]>]>', ast.inspect end def test_br_in_strong input = '
Lorem
ipsum
dolor
' par = @converter.processed_ast(input).grep(Sablon::HTMLConverter::Paragraph).first - assert_equal "[, , , , ]", par.runs.inspect + assert_equal "[, , , , ]", par.runs.inspect end def test_br_in_em input = '
Lorem
ipsum
dolor
' par = @converter.processed_ast(input).grep(Sablon::HTMLConverter::Paragraph).first - assert_equal "[, , , , ]", par.runs.inspect + assert_equal "[, , , , ]", par.runs.inspect end def test_nested_strong_and_em input = '
Lorem ipsum dolor
' par = @converter.processed_ast(input).grep(Sablon::HTMLConverter::Paragraph).first - assert_equal "[, , ]", par.runs.inspect + assert_equal "[, , ]", par.runs.inspect end def test_ignore_last_br_in_div input = '
Lorem ipsum dolor sit amet
' par = @converter.processed_ast(input).grep(Sablon::HTMLConverter::Paragraph).first - assert_equal "[]", par.runs.inspect + assert_equal "[]", par.runs.inspect end def test_ignore_br_in_blank_div @@ -385,25 +637,25 @@ def test_ignore_br_in_blank_div def test_headings input = '

First

Second

Third

' ast = @converter.processed_ast(input) - assert_equal "]>, ]>, ]>]>", ast.inspect + assert_equal "]>, ]>, ]>]>", ast.inspect end def test_h_with_formatting input = '

Lorem ipsum dolor sit amet

' ast = @converter.processed_ast(input) - assert_equal ", , , ]>]>", ast.inspect + assert_equal ", , , ]>]>", ast.inspect end def test_ul input = '
  • Lorem
  • ipsum
' ast = @converter.processed_ast(input) - assert_equal "]>, ]>]>", ast.inspect + assert_equal "]>, ]>]>", ast.inspect end def test_ol input = '
  1. Lorem
  2. ipsum
' ast = @converter.processed_ast(input) - assert_equal "]>, ]>]>", ast.inspect + assert_equal "]>, ]>]>", ast.inspect end def test_num_id @@ -437,39 +689,48 @@ def get_numpr_prop_from_ast(ast, key) end class NodePropertiesTest < Sablon::TestCase + def setup + # struct to simplify prop whitelisting during tests + @inc_props = Struct.new(:props) do + def include?(value) + true + end + end + end + def test_empty_node_properties_converison # test empty properties - props = Sablon::HTMLConverter::NodeProperties.new('w:pPr', {}) - assert props.inspect == '' - assert props.to_docx.nil? + props = Sablon::HTMLConverter::NodeProperties.new('w:pPr', {}, @inc_props.new) + assert_equal props.inspect, '' + assert_equal props.to_docx, nil end def test_simple_node_property_converison props = { 'pStyle' => 'Paragraph' } - props = Sablon::HTMLConverter::NodeProperties.new('w:pPr', props) - assert props.inspect == 'pStyle=Paragraph' - assert props.to_docx == '' + props = Sablon::HTMLConverter::NodeProperties.new('w:pPr', props, @inc_props.new) + assert_equal props.inspect, 'pStyle=Paragraph' + assert_equal props.to_docx, '' end def test_node_property_with_nil_value_converison props = { 'b' => nil } - props = Sablon::HTMLConverter::NodeProperties.new('w:rPr', props) - assert props.inspect == 'b' - assert props.to_docx == '' + props = Sablon::HTMLConverter::NodeProperties.new('w:rPr', props, @inc_props.new) + assert_equal props.inspect, 'b' + assert_equal props.to_docx, '' end def test_node_property_with_hash_value_converison props = { 'shd' => { color: 'clear', fill: '123456', test: nil } } - props = Sablon::HTMLConverter::NodeProperties.new('w:rPr', props) - assert props.inspect == 'shd={:color=>"clear", :fill=>"123456", :test=>nil}' - assert props.to_docx == '' + props = Sablon::HTMLConverter::NodeProperties.new('w:rPr', props, @inc_props.new) + assert_equal props.inspect, 'shd={:color=>"clear", :fill=>"123456", :test=>nil}' + assert_equal props.to_docx, '' end def 
test_node_property_with_array_value_converison props = { 'numPr' => [{ 'ilvl' => 1 }, { 'numId' => 34 }] } - props = Sablon::HTMLConverter::NodeProperties.new('w:pPr', props) - assert props.inspect == 'numPr=[{"ilvl"=>1}, {"numId"=>34}]' - assert props.to_docx == '' + props = Sablon::HTMLConverter::NodeProperties.new('w:pPr', props, @inc_props.new) + assert_equal props.inspect, 'numPr=[{"ilvl"=>1}, {"numId"=>34}]' + assert_equal props.to_docx, '' end def test_complex_node_properties_conversion @@ -499,14 +760,40 @@ def test_complex_node_properties_conversion DOCX - props = Sablon::HTMLConverter::NodeProperties.new('w:pPr', props) - assert props.to_docx == output + props = Sablon::HTMLConverter::NodeProperties.new('w:pPr', props, @inc_props.new) + assert_equal props.to_docx, output + end + + def test_setting_property_value + props = {} + props = Sablon::HTMLConverter::NodeProperties.new('w:pPr', props, @inc_props.new) + props['rStyle'] = 'FootnoteText' + assert_equal({ 'rStyle' => 'FootnoteText' }, props.instance_variable_get(:@properties)) + end + + def test_properties_filtered_on_init + props = { 'pStyle' => 'Paragraph', 'rStyle' => 'EndnoteText' } + props = Sablon::HTMLConverter::NodeProperties.new('w:rPr', props, %[rStyle]) + assert_equal({ 'rStyle' => 'EndnoteText' }, props.instance_variable_get(:@properties)) + end + + def test_transferred_properties + props = { 'pStyle' => 'Paragraph', 'rStyle' => 'EndnoteText' } + trans = Sablon::HTMLConverter::NodeProperties.transferred_properties(props, %[pStyle]) + assert_equal({ 'rStyle' => 'EndnoteText' }, trans) end def test_node_properties_paragraph_factory props = { 'pStyle' => 'Paragraph' } props = Sablon::HTMLConverter::NodeProperties.paragraph(props) - assert props.inspect == 'pStyle=Paragraph' - assert props.to_docx == '' + assert_equal 'pStyle=Paragraph', props.inspect + assert_equal props.to_docx, '' + end + + def test_node_properties_run_factory + props = { 'color' => 'FF00FF' } + props = 
Sablon::HTMLConverter::NodeProperties.run(props) + assert_equal 'color=FF00FF', props.inspect + assert_equal '', props.to_docx end end diff --git a/test/html_test.rb b/test/html_test.rb index 258890e7..afd89ee4 100644 --- a/test/html_test.rb +++ b/test/html_test.rb @@ -1,9 +1,10 @@ # -*- coding: utf-8 -*- require "test_helper" -require "support/xml_snippets" +require "support/html_snippets" class SablonHTMLTest < Sablon::TestCase include Sablon::Test::Assertions + include HTMLSnippets def setup super @@ -16,7 +17,7 @@ def test_generate_document_from_template_with_styles_and_html template_path = @base_path + "fixtures/insertion_template.docx" output_path = @base_path + "sandbox/html.docx" template = Sablon.template template_path - context = {'html:content' => content} + context = { 'html:content' => content } template.render_to_file output_path, context assert_docx_equal @sample_path, output_path @@ -26,7 +27,7 @@ def test_generate_document_from_template_without_styles_and_html template_path = @base_path + "fixtures/insertion_template_no_styles.docx" output_path = @base_path + "sandbox/html_no_styles.docx" template = Sablon.template template_path - context = {'html:content' => content} + context = { 'html:content' => content } e = assert_raises(ArgumentError) do template.render_to_file output_path, context @@ -37,13 +38,12 @@ def test_generate_document_from_template_without_styles_and_html end private + def content - <<-HTML -

Sablon HTML insertion

-

Text

-
Lorem ipsum dolor sit ametconsectetur adipiscing elitSuspendisse a tempus turpis. Duis urna justo, vehicula vitae ultricies vel, congue at sem. Fusce turpis turpis, aliquet id pulvinar aliquam, iaculis non elit. Nulla feugiat lectus nulla, in dictum ipsum cursus ac. Quisque at odio neque. Sed ac tortor iaculis, bibendum leo ut, malesuada velit. Donec iaculis sed urna eget pharetra. Praesent ornare fermentum turpis, placerat iaculis urna bibendum vitae. Nunc in quam consequat, tristique tellus in, commodo turpis. Curabitur ullamcorper odio purus, lobortis egestas magna laoreet vitae. Nunc fringilla velit ante, eu aliquam nisi cursus vitae. Suspendisse sit amet dui egestas, volutpat nisi vel, mattis justo. Nullam pellentesque, ipsum eget blandit pharetra, augue elit aliquam mauris, vel mollis nisl augue ut ipsum.
-

Lists

-
  1. Vestibulum 
    1. ante ipsum primis 
  2. in faucibus orci luctus 
    1. et ultrices posuere cubilia Curae; 
      1. Aliquam vel dolor 
      2. sed sem maximus 
    2. fermentum in non odio. 
      1. Fusce hendrerit ornare mollis. 
    3. Nunc scelerisque nibh nec turpis tempor pulvinar. 
  3. Donec eros turpis, 
  4. aliquet vel volutpat sit amet, 
    1. semper eu purus. 
    2. Proin ac erat nec urna efficitur vulputate. 
      1. Quisque varius convallis ultricies. 
      2. Nullam vel fermentum eros. 
Pellentesque nulla leo, auctor ornare erat sed, rhoncus congue diam. Duis non porttitor nulla, ut eleifend enim. Pellentesque non tempor sem.
Mauris auctor egestas arcu, 
  1. id venenatis nibh dignissim id. 
  2. In non placerat metus. 
  • Nunc sed consequat metus. 
  • Nulla consectetur lorem consequat, 
  • malesuada dui at, lacinia lectus. 
  1. Aliquam efficitur 
  2. lorem a mauris feugiat, 
  3. at semper eros pellentesque. 
Nunc lacus diam, consectetur ut odio sit amet, placerat pharetra erat. Sed commodo ut sem id congue. Sed eget neque elit. Curabitur at erat tortor. Maecenas eget sapien vitae est sagittis accumsan et nec orci. Integer luctus at nisl eget venenatis. Nunc nunc eros, consectetur at tortor et, tristique ultrices elit. Nulla in turpis nibh.
  • Nam consectetur 
    • venenatis tempor. 
  • Aenean 
    • blandit
      • porttitor massa, 
        • non efficitur 
          • metus. 
  • Duis faucibus nunc nec venenatis faucibus. 
  • Aliquam erat volutpat. 
Quisque non neque ut lacus eleifend volutpat quis sed lacus.
Praesent ultrices purus eu quam elementum, sit amet faucibus elit interdum. In lectus orci,
elementum quis dictum ac, porta ac ante. Fusce tempus ac mauris id cursus. Phasellus a erat nulla. Mauris dolor orci, malesuada auctor dignissim non, posuere nec odio. Etiam hendrerit justo nec diam ullamcorper, nec blandit elit sodales.
-HTML + html_str = snippet('html_test_content') + # combine all white space + html_str = html_str.gsub(/\s+/, ' ') + # clear any white space between block level tags and other content + html_str.gsub(%r{\s*<(/?(?:h\d|div|p|br|ul|ol|li).*?)>\s*}, '<\1>') end end diff --git a/test/support/html_snippets.rb b/test/support/html_snippets.rb new file mode 100644 index 00000000..a36615d2 --- /dev/null +++ b/test/support/html_snippets.rb @@ -0,0 +1,9 @@ +module HTMLSnippets + def snippet(name) + File.read(File.expand_path("#{name}.html", snippet_path)) + end + + def snippet_path + @snippet_path ||= File.expand_path("../../fixtures/html", __FILE__) + end +end From 00226a78db9b0086d15c8972eee9795260f2423d Mon Sep 17 00:00:00 2001 From: Matthew Stadelman Date: Thu, 20 Jul 2017 19:35:40 -0400 Subject: [PATCH 06/12] Set ruby zip version (#63) * pin ruby version at 1.1.1 for testing * Update version to require rubyzip >= 1.1.1 instead of >= 1.1 Fixes #57 --- Gemfile.lock | 10 +++++----- sablon.gemspec | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/Gemfile.lock b/Gemfile.lock index 28e67380..da9cd629 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -3,15 +3,15 @@ PATH specs: sablon (0.0.21) nokogiri (>= 1.6.0) - rubyzip (>= 1.1) + rubyzip (>= 1.1.1) GEM remote: https://rubygems.org/ specs: - mini_portile2 (2.1.0) + mini_portile2 (2.2.0) minitest (5.8.0) - nokogiri (1.7.1) - mini_portile2 (~> 2.1.0) + nokogiri (1.8.0) + mini_portile2 (~> 2.2.0) rake (10.4.2) rubyzip (1.2.1) xml-simple (1.1.5) @@ -27,4 +27,4 @@ DEPENDENCIES xml-simple BUNDLED WITH - 1.14.5 + 1.14.6 diff --git a/sablon.gemspec b/sablon.gemspec index b5fc8532..0a26932b 100644 --- a/sablon.gemspec +++ b/sablon.gemspec @@ -20,7 +20,7 @@ Gem::Specification.new do |spec| spec.require_paths = ["lib"] spec.add_runtime_dependency 'nokogiri', ">= 1.6.0" - spec.add_runtime_dependency 'rubyzip', ">= 1.1" + spec.add_runtime_dependency 'rubyzip', ">= 1.1.1" spec.add_development_dependency "bundler", ">= 1.6" 
spec.add_development_dependency "rake", "~> 10.0" From f3307934f0dda78eb4f8a8fee0a39605f0278c24 Mon Sep 17 00:00:00 2001 From: Matthew Stadelman Date: Thu, 7 Sep 2017 11:40:12 -0400 Subject: [PATCH 07/12] Implement config module (#67) * Create initial version of config module to register HTML tags * Add tests for Configuration::HTMLTag * Fully test the configuration module * Minor changes to configuration files and ensure allowed_children is always array of symbols * Create initial methods in converter utilizing the config and added tests for them The methods are currently unused to allow easier testing * convert properties to strings as final step in NodeProperties#filter * Add document fragment as root level tag in configuration This greatly simplifies logic when assessing parent-child validity * Fix unit tests in converter_test * Additional fixes to html element initialization. my ul and ol definitions accidentally had pstyle instead of pStyle. I also didn't need to call .to_sym on the doc fragment key. It gets converted to a symbol during HTMLTag initialization * Refactor HTML converter to use the new config module * Prevent p and div from being nested inside an inline tag * Update config to not allow mixed lists * Require all children of ol or ul tags to be an li tag or a deeper nesting of the parent tag * WIP: Create initial AST classes for a List(ul & ol) and a ListParagraph(li) This classes also define the new interface I am going to design for all AST nodes. This commit by itself does not affect normal code execution * Standardize all ast node initialization methods. #ci skip This standardization provides a lot of flexiblity while keeping everything consistent. It would also be a necessity if I allow the user to define their own AST classes. 
* Begin work on a revamped HTML to AST converter * Update default config to use new List and ListParagraph AST nodes * Update ast builder to properly handle node properties I previously wasn't getting styles applied correctly from pass-through nodes * Remove all extraneous code from the HTML converter class. Most of this logic has been refactored into the ast_builder and ast node sub classes. * Refactor AST node classes to handle styles more effectively in the new system. The largest change is that lists are handled within the AST class instead of inside the converter itself * Don't convert keys to symbols inside NodeProperties#filter * Move node properties testing into a separate file The single converter_test file was becoming extremely cluttered and hard to work with. I also fixed test_transferred_properties to use the new instance method instead of a class method * Refactor NodeProperties class to setup transferred properties on initialization * Update node properties test to check transferred_properties instance var * Properly handled nested lists so they share the same numId Also added an inspect method to the List ast class and fixed the inspect method in the Paragraph ast class * Move the AST test class into its own file. Also fixed the get_numpr_prop method * Fix List AST node so ListParagraphs no longer get nested within each other Styles also appear to properly transfer * Fix recursive method convert_style_property to exit when the Node class is encountered * Create a test file for the ast_builder (incomplete) [ci skip] * Update html_sample fixture with corrected behavior. Due to the changes I defer converting the CSS to OpenXML until I actually instantiate an AST node. That means when working with "pass-through" HTML elements that only apply a style the CSS properties get thrown around instead of WordML properties. 
Previously I converted the CSS to WordML immediately and in the case of text-decoration the WordML produced from the conversion was not always the same so it didn't get overridden properly i.e. line-through produces a w:strike while underline produces a w:u. Now due to the fact I defer conversion text-decoration will be overridden correctly as it remains 'text-decoration' until the Run AST class itself is created. * Modify style hash order so inline styles can generally override tag styles. This is only the case for non-toggle properties or if the toggle property has a value that can be set to false. i.e. . Otherwise the style will persist because CSS can only override not remove. * Strip whitespace from style properties after splitting the style string * Finish testing the ASTBuilder The remainder of the code is exercised by converter.rb * Move all AST class CSS style conversions from class constants to a config instance var This instance variable is accessed from the class level method style_conversion, which can be overridden in subclasses as desired. This allows the end user to add their own AST nodes without having to separately register their styles with the config instance. * Add methods to the configuration object to add and remove CSS style conversions * Add a style converter for sz The same string to numeric conversion is used in border and font-size this pseudo key ensures that things are done consistently. * Add test to ensure newly registered tags can be converted * Add a test that converts a registered HTML node that provides its own AST class * Add test to ensure registered style converters are successfully used * Update configuration_test to hit the register and remove style converter methods * Add initial HTML tag conversion docs * Tweak comments and add content to section describing CSS style customizations * Fix grammar and readability in new Config section of README * Add Table of contents to README Generated using github-markdown-toc. 
https://github.com/ekalinin/github-markdown-toc * Fix typo in README [ci skip] --- README.md | 100 ++++++- lib/sablon.rb | 6 + lib/sablon/configuration/configuration.rb | 159 +++++++++++ lib/sablon/configuration/html_tag.rb | 94 +++++++ lib/sablon/html/ast.rb | 266 +++++++++++------- lib/sablon/html/ast_builder.rb | 78 ++++++ lib/sablon/html/converter.rb | 188 +------------ test/configuration_test.rb | 122 +++++++++ test/fixtures/html_sample.docx | Bin 24070 -> 24071 bytes test/html/ast_builder_test.rb | 65 +++++ test/html/ast_test.rb | 117 ++++++++ test/html/converter_test.rb | 313 ++++++---------------- test/html/node_properties_test.rb | 113 ++++++++ 13 files changed, 1100 insertions(+), 521 deletions(-) create mode 100644 lib/sablon/configuration/configuration.rb create mode 100644 lib/sablon/configuration/html_tag.rb create mode 100644 lib/sablon/html/ast_builder.rb create mode 100644 test/configuration_test.rb create mode 100644 test/html/ast_builder_test.rb create mode 100644 test/html/ast_test.rb create mode 100644 test/html/node_properties_test.rb diff --git a/README.md b/README.md index 47490db0..3d013a55 100644 --- a/README.md +++ b/README.md @@ -8,6 +8,28 @@ and efficient. *Note: Sablon is still in early development. 
Please report if you encounter any issues along the way.* +#### Table of Contents +* [Installation](#installation) +* [Usage](#usage) + * [Writing Templates](#writing-templates) + * [Content Insertion](#content-insertion) + * [WordProcessingML](#wordprocessingml) + * [HTML](#html) + * [Conditionals](#conditionals) + * [Loops](#loops) + * [Nesting](#nesting) + * [Comments](#comments) + * [Configuration (Beta)](#configuration-beta) + * [Customizing HTML Tag Conversion](#customizing-html-tag-conversion) + * [Customizing CSS Style Conversion](#customizing-css-style-conversion) + * [Executable](#executable) + * [Examples](#examples) + * [Using a Ruby script](#using-a-ruby-script) + * [Using the sablon executable](#using-the-sablon-executable) +* [Contributing](#contributing) +* [Inspiration](#inspiration) + + ## Installation Add this line to your application's Gemfile: @@ -131,13 +153,13 @@ template.render_to_file File.expand_path("~/Desktop/output.docx"), context Currently, HTML insertion is somewhat limited. It is recommended that the block level tags such as `p` and `div` are not nested within each other, otherwise the final document may not generate as anticipated. List tags (`ul` and `ol`) and inline tags (`span`, `b`, `em`, etc.) can be nested as deeply as needed. -Not all tags are supported. Currently supported tags are defined in [converter.rb](lib/sablon/html/converter.rb) for paragraphs in method `prepare_paragraph` and for text runs in `prepare_run`. +Not all tags are supported. Currently supported tags are defined in [configuration.rb](lib/sablon/configuration/configuration.rb) for paragraphs in method `prepare_paragraph` and for text runs in `prepare_run`. -Basic conversion of CSS inline styles into matching WordML properties in supported through the `style=" ... "` attribute in the HTML markup. Not all possible styles are supported and only a small subset of CSS styles have a direct WordML equivalent. Styles are passed onto nested elements. 
The currently supported styles are also defined in [converter.rb](lib/sablon/html/converter.rb) in method `process_style`. Simple toggle properties that aren't directly supported can be added using the `text-decoration: ` style attribute with the proper WordML tag name as the value. Paragraph and Run property reference can be found at: +Basic conversion of CSS inline styles into matching WordML properties in supported through the `style=" ... "` attribute in the HTML markup. Not all possible styles are supported and only a small subset of CSS styles have a direct WordML equivalent. Styles are passed onto nested elements. The currently supported styles are also defined in [configuration.rb](lib/sablon/configuration/configuration.rb) in method `process_style`. Simple toggle properties that aren't directly supported can be added using the `text-decoration: ` style attribute with the proper WordML tag name as the value. Paragraph and Run property reference can be found at: * http://officeopenxml.com/WPparagraphProperties.php * http://officeopenxml.com/WPtextFormatting.php -If you wish to write out your HTML code in an indented human readable fashion, or you are pulling content from the ERB templating engine in rails the following regular expression can help eliminate extraneous whitespace in the final document. +If you wish to write out your HTML code in an indented human readable fashion, or you are pulling content from the ERB templating engine in rails the following regular expression can help eliminate extraneous whitespace in the final document. ```ruby # combine all white space html_str = html_str.gsub(/\s+/, ' ') @@ -204,6 +226,78 @@ styles for HTML insertion. «endComment» ``` +### Configuration (Beta) + +The Sablon::Configuration singleton is a new feature that allows the end user to customize HTML parsing to their needs without needing to fork and edit the source code of the gem. 
This API is still in a beta state and may be subject to change as future needs are identified beyond HTML conversion. + +The example below show how to expose the configuration instance: +```ruby +Sablon.configure do |config| + # manipulate config object +end +``` + +The default set of registered HTML tags and CSS property conversions are defined in [configuration.rb](lib/sablon/configuration/configuration.rb). + +#### Customizing HTML Tag Conversion + +Any HTML tag can be added using the configuration object even if it needs a custom AST class to handle conversion logic. Simple inline tags that only modify the style of text (i.e. the already supported `` tag) can be added without an AST class as shown below: +```ruby +Sablon.configure do |config| + config.register_html_tag(:bgcyan, :inline, properties: { highlight: 'cyan' }) +end +``` +The above tag simply adds a background color to text using the `` property. + + +More complex business logic can be supported by adding a new class under the `Sablon::HTMLConverter` namespace. The new class will likely subclass `Sablon::HTMLConverter::Node` or `Sablon::HTMLConverter::Collection` depending on the needed behavior. The current AST classes serve as additional examples and can be found in [ast.rb](/lib/sablon/html/ast.rb). When registering a new HTML tag that uses a custom AST class the class must be passed in either by name using a lowercased and underscored symbol or the class object itself. + +The block below shows how to register a new HTML tag that adds the following AST class: `Sablon::HTMLConverter::InstrText`. +```ruby +module Sablon + class HTMLConverter + class InstrText < Node + # implementation details ... + end + end +end +# register tag +Sablon.configure do |config| + config.register_html_tag(:bgcyan, :inline, ast_class: :instr_text) +end +``` + +Existing tags can be overwritten using the `config.register_html_tag` method or removed entirely using `config.remove_html_tag`. 
+```ruby +# remove tag +Sablon.configure do |config| + # remove support for the span tag + config.remove_html_tag(:span) +end +``` + + +#### Customizing CSS Style Conversion + +The conversion of CSS stored in an element's `style="..."` attribute can be customized using the configuration object as well. Adding a new style conversion or overriding an existing one is done using the `config.register_style_converter` method. It accepts three arguments the name of the AST node (as a lowercased and underscored symbol) the style applies to, the name of the CSS property (needs to be a string in most cases) and a lambda that accepts a single argument, the property value. The example below shows how to add a new style that sets the `` property. +```ruby +# add style conversion +Sablon.configure do |config| + # register new conversion for the Sablon::HTMLConverter::Run AST class. + converter = lambda { |v| return 'highlight', v } + config.register_style_converter(:run, 'custom-highlight', converter) +end +``` + +Existing conversions can be overwritten using the `config.register_style_converter` method or removed entirely using `config.remove_style_converter`. +```ruby +# remove tag +Sablon.configure do |config| + # remove support for conversion of font-size for the Run AST class + config.remove_style_converter(:run, 'font-size') +end +``` + ### Executable The `sablon` executable can be used to process templates on the command-line. diff --git a/lib/sablon.rb b/lib/sablon.rb index fffbfe37..45015b2f 100644 --- a/lib/sablon.rb +++ b/lib/sablon.rb @@ -2,6 +2,8 @@ require 'nokogiri' require "sablon/version" +require "sablon/configuration/configuration" + require "sablon/numbering" require "sablon/context" require "sablon/environment" @@ -18,6 +20,10 @@ module Sablon class TemplateError < ArgumentError; end class ContextError < ArgumentError; end + def self.configure + yield(Configuration.instance) if block_given? 
+ end + def self.template(path) Template.new(path) end diff --git a/lib/sablon/configuration/configuration.rb b/lib/sablon/configuration/configuration.rb new file mode 100644 index 00000000..ef02c7c1 --- /dev/null +++ b/lib/sablon/configuration/configuration.rb @@ -0,0 +1,159 @@ +require 'singleton' +require 'sablon/configuration/html_tag' + +module Sablon + # Handles storing configuration data for the sablon module + class Configuration + include Singleton + + attr_accessor :permitted_html_tags, :defined_style_conversions + + def initialize + initialize_html_tags + initialize_css_style_conversion + end + + # Adds a new tag to the permitted tags hash or replaces an existing one + def register_html_tag(tag_name, type = :inline, **options) + tag = HTMLTag.new(tag_name, type, **options) + @permitted_html_tags[tag.name] = tag + end + + # Removes a tag from the permitted tgs hash, returning it + def remove_html_tag(tag_name) + @permitted_html_tags.delete(tag_name) + end + + # Adds a new style property converter for the specified ast class and + # CSS property name. The ast_class variable should be the class name + # in lowercased snakecase as a symbol, i.e. MyClass -> :my_class. + # The converter passed in must be a proc that accepts + # a single argument (the value) and returns two values: the WordML property + # name and its value. The converted property value can be a string, hash + # or array. + def register_style_converter(ast_node, prop_name, converter) + # create a new ast node hash if needed + unless @defined_style_conversions[ast_node] + @defined_style_conversions[ast_node] = {} + end + # add the style converter to the node's hash + @defined_style_conversions[ast_node][prop_name] = converter + end + + # Deletes a CSS converter from the hash by specifying the AST class + # in lowercased snake case and the property name. 
+ def remove_style_converter(ast_node, prop_name) + @defined_style_conversions[ast_node].delete(prop_name) + end + + private + + # Defines all of the initial HTML tags to be used by HTMLconverter + def initialize_html_tags + @permitted_html_tags = {} + tags = { + # special tag used for elements with no parent, i.e. top level + '#document-fragment' => { type: :block, ast_class: :root, allowed_children: :_block }, + # block level tags + div: { type: :block, ast_class: :paragraph, properties: { pStyle: 'Normal' } }, + p: { type: :block, ast_class: :paragraph, properties: { pStyle: 'Paragraph' } }, + h1: { type: :block, ast_class: :paragraph, properties: { pStyle: 'Heading1' } }, + h2: { type: :block, ast_class: :paragraph, properties: { pStyle: 'Heading2' } }, + h3: { type: :block, ast_class: :paragraph, properties: { pStyle: 'Heading3' } }, + h4: { type: :block, ast_class: :paragraph, properties: { pStyle: 'Heading4' } }, + h5: { type: :block, ast_class: :paragraph, properties: { pStyle: 'Heading5' } }, + h6: { type: :block, ast_class: :paragraph, properties: { pStyle: 'Heading6' } }, + ol: { type: :block, ast_class: :list, properties: { pStyle: 'ListNumber' }, allowed_children: %i[ol li] }, + ul: { type: :block, ast_class: :list, properties: { pStyle: 'ListBullet' }, allowed_children: %i[ul li] }, + li: { type: :block, ast_class: :list_paragraph }, + # inline style tags + span: { type: :inline, ast_class: nil, properties: {} }, + strong: { type: :inline, ast_class: nil, properties: { b: nil } }, + b: { type: :inline, ast_class: nil, properties: { b: nil } }, + em: { type: :inline, ast_class: nil, properties: { i: nil } }, + i: { type: :inline, ast_class: nil, properties: { i: nil } }, + u: { type: :inline, ast_class: nil, properties: { u: 'single' } }, + # inline content tags + text: { type: :inline, ast_class: :run, properties: {}, allowed_children: [] }, + br: { type: :inline, ast_class: :newline, properties: {}, allowed_children: [] } + } + # add all tags to the 
config object + tags.each do |tag_name, settings| + type = settings.delete(:type) + register_html_tag(tag_name, type, **settings) + end + end + + # Defines an initial set of CSS -> WordML conversion lambdas stored in + # a nested hash structure where the first key is the AST class and the + # second is the conversion lambda + def initialize_css_style_conversion + @defined_style_conversions = { + # styles shared or common logic across all node types go here. + # Special conversion lambdas such as :_border can be + # defined here for reuse across several AST nodes. Care must + # be taken to avoid possible naming conflicts, hence the underscore. + # AST class keys should be stored with their names converted from + # camelcase to lowercased snakecase, i.e. TestCase = test_case + node: { + 'background-color' => lambda { |v| + return 'shd', { val: 'clear', fill: v.delete('#') } + }, + _border: lambda { |v| + props = { sz: 2, val: 'single', color: '000000' } + vals = v.split + vals[1] = 'single' if vals[1] == 'solid' + # + props[:sz] = @defined_style_conversions[:node][:_sz].call(vals[0]) + props[:val] = vals[1] if vals[1] + props[:color] = vals[2].delete('#') if vals[2] + # + return props + }, + _sz: lambda { |v| + return nil unless v + (2 * Float(v.gsub(/[^\d.]/, '')).ceil).to_s + }, + 'text-align' => ->(v) { return 'jc', v } + }, + # Styles specific to the Paragraph AST class + paragraph: { + 'border' => lambda { |v| + props = @defined_style_conversions[:node][:_border].call(v) + # + return 'pBdr', [ + { top: props }, { bottom: props }, + { left: props }, { right: props } + ] + }, + 'vertical-align' => ->(v) { return 'textAlignment', v } + }, + # Styles specific to a run of text + run: { + 'color' => ->(v) { return 'color', v.delete('#') }, + 'font-size' => lambda { |v| + return 'sz', @defined_style_conversions[:node][:_sz].call(v) + }, + 'font-style' => lambda { |v| + return 'b', nil if v =~ /bold/ + return 'i', nil if v =~ /italic/ + }, + 'font-weight' => ->(v) { 
return 'b', nil if v =~ /bold/ }, + 'text-decoration' => lambda { |v| + supported = %w[line-through underline] + props = v.split + return props[0], 'true' unless supported.include? props[0] + return 'strike', 'true' if props[0] == 'line-through' + return 'u', 'single' if props.length == 1 + return 'u', { val: props[1], color: 'auto' } if props.length == 2 + return 'u', { val: props[1], color: props[2].delete('#') } + }, + 'vertical-align' => lambda { |v| + return 'vertAlign', 'subscript' if v =~ /sub/ + return 'vertAlign', 'superscript' if v =~ /super/ + } + } + } + end + end +end diff --git a/lib/sablon/configuration/html_tag.rb b/lib/sablon/configuration/html_tag.rb new file mode 100644 index 00000000..2e6211e1 --- /dev/null +++ b/lib/sablon/configuration/html_tag.rb @@ -0,0 +1,94 @@ +module Sablon + class Configuration + # Stores the information for a single HTML tag. This information + # is used by the HTMLConverter. An optional AST class can be defined, + # and if so conversion stops there and it is assumed the AST class + # will handle any child nodes unless the element is a block level tag. + # In the case of a block level tag the child nodes are processed by the + # AST builder again. If the AST class is omitted it is assumed the node + # should be "passed through" only transferring it's properties onto + # children. A block level tag must have an AST class associated with + # it. The block and inline status of tags is not affected by CSS. + # Permitted child tags are specified using the :allowed_children optional + # arg. The default value is [:_inline, :ul, :ol]. :_inline is a special + # reference to all inline type tags, :_block is equivalent for block + # type tags. 
+ # + # == Parameters + # * name - symbol or string of the HTML element tag name + # * type - The type of HTML tag needs to be :inline or :block + # * ast_class - class instance or symbol, the AST class or it's name + # used to process the HTML node + # * options - collects all other keyword arguments, Current kwargs are + # `:properties`, `:attributes` and `:allowed_children`. + # + # Example + # HTMLTag.new(:div, :block, ast_class: Sablon::HTMLConverter::Paragraph, + # properties: { pStyle: 'Normal' }) + class HTMLTag + attr_reader :name, :type, :ast_class, :attributes, :properties, + :allowed_children + + # Setup HTML tag information + def initialize(name, type, ast_class: nil, **options) + # Set basic params converting some args to symbols for consistency + @name = name.to_sym + @type = type.to_sym + self.ast_class = ast_class if ast_class + + # Ensure block level tags have an AST class + if @type == :block && @ast_class.nil? + raise ArgumentError, "Block level tag #{name} must have an AST class." + end + + # Set attributes from optinos hash, currently unused during AST generation + @attributes = options.fetch(:attributes, {}) + # WordML properties defined by the tag, i.e. for the tag, etc. + @properties = options.fetch(:properties, {}) + # Set permitted child tags or tag groups + self.allowed_children = options[:allowed_children] + end + + # checks if the given tag is a permitted child element + def allowed_child?(tag) + if @allowed_children.include?(tag.name) + true + elsif @allowed_children.include?(:_inline) && tag.type == :inline + true + elsif @allowed_children.include?(:_block) && tag.type == :block + true + else + false + end + end + + private + + def allowed_children=(value) + if value.nil? + @allowed_children = %i[_inline ol ul] + return + else + value = [value] unless value.is_a? 
Array + end + @allowed_children = value.map(&:to_sym) + end + + # converts a string or symbol to a class defined under + # Sablon::HTMLConverter + def ast_class=(value) + if value.is_a? Class + @ast_class = value + return + else + value = value.to_s + end + # camel case the word and get class, similar logic to + # ActiveSupport::Inflector.constantize but refactored to be specific + # to the HTMLConverter class + value.gsub!(/(?:^|_)([a-z])/) { Regexp.last_match[1].capitalize } + @ast_class = Sablon::HTMLConverter.const_get(value) + end + end + end +end diff --git a/lib/sablon/html/ast.rb b/lib/sablon/html/ast.rb index bf280d8b..9f7e29c7 100644 --- a/lib/sablon/html/ast.rb +++ b/lib/sablon/html/ast.rb @@ -1,76 +1,70 @@ +require "sablon/html/ast_builder" + module Sablon class HTMLConverter + # A top level abstract class to handle common logic for all AST nodes class Node PROPERTIES = [].freeze - # styles shared or common logic across all node types go here. Any - # undefined styles are passed straight through "as is" to the - # properties hash. Keys that are symbols will not get called directly - # when processing the style string and are suitable for internal-only - # usage across different classes. 
- STYLE_CONVERSION = { - 'background-color' => lambda { |v| - return 'shd', { val: 'clear', fill: v.delete('#') } - }, - border: lambda { |v| - props = { sz: 2, val: 'single', color: '000000' } - vals = v.split - vals[1] = 'single' if vals[1] == 'solid' - # - props[:sz] = (2 * Float(vals[0].gsub(/[^\d.]/, '')).ceil).to_s if vals[0] - props[:val] = vals[1] if vals[1] - props[:color] = vals[2].delete('#') if vals[2] - # - return props - }, - 'text-align' => ->(v) { return 'jc', v } - } - # This proc is used to allow unmapped styles to pass through - STYLE_CONVERSION.default_proc = proc do |hash, key| - ->(v) { return key, v } - end - STYLE_CONVERSION.freeze - def accept(visitor) - visitor.visit(self) + def self.node_name + @node_name ||= name.split('::').last + end + + # Returns a hash defined on the configuration object by default. However, + # this method can be overridden by subclasses to return a different + # node's style conversion config (i.e. :run) or a hash unrelated to the + # config itself. The config object is used for all built-in classes to + # allow for end-user customization via the configuration object + def self.style_conversion + # converts camelcase to underscored + key = node_name.gsub(/([a-z])([A-Z])/, '\1_\2').downcase.to_sym + Sablon::Configuration.instance.defined_style_conversions.fetch(key, {}) end # maps the CSS style property to it's OpenXML equivalent. Not all CSS # properties have an equivalent, nor share the same behavior when # defined on different node types (Paragraph, Table and Run). - def self.process_style(style_str) - return {} unless style_str - # - styles = style_str.split(';').map { |pair| pair.split(':') } + def self.process_properties(properties) # process the styles as a hash and store values style_attrs = {} - Hash[styles].each do |key, value| - key, value = convert_style_attr(key.strip, value.strip) + properties.each do |key, value| + unless key.is_a? 
Symbol + key, value = *convert_style_property(key.strip, value.strip) + end style_attrs[key] = value if key end style_attrs end # handles conversion of a single attribute allowing recursion through - # super classes - def self.convert_style_attr(key, value) - if self::STYLE_CONVERSION[key] - self::STYLE_CONVERSION[key].call(value) + # super classes. If the key exists and conversion is succesful a + # symbol is returned to avoid conflicts with a CSS prop sharing the + # same name. Keys without a conversion class are returned as is + def self.convert_style_property(key, value) + if style_conversion.key?(key) + key, value = style_conversion[key].call(value) + key = key.to_sym if key + [key, value] + elsif self == Node + [key, value] else - superclass.convert_style_attr(key, value) + superclass.convert_style_property(key, value) end end - # Simplifies usage at call sites - def self.transferred_properties(properties) - NodeProperties.transferred_properties(properties, self::PROPERTIES) + def accept(visitor) + visitor.visit(self) end - def self.node_name - @node_name ||= name.split('::').last + # Simplifies usage at call sites + def transferred_properties + @properties.transferred_properties end end class NodeProperties + attr_reader :transferred_properties + def self.paragraph(properties) new('w:pPr', properties, Paragraph::PROPERTIES) end @@ -79,20 +73,9 @@ def self.run(properties) new('w:rPr', properties, Run::PROPERTIES) end - # creates a hash of all properties that aren't consumed by the node - # so they can be propagated to child nodes - def self.transferred_properties(properties, whitelist) - props = properties.map do |key, value| - next if whitelist.include? 
key - [key, value] - end - # filter out nils and return hash - Hash[props.compact] - end - def initialize(tagname, properties, whitelist) @tagname = tagname - @properties = filter(properties, whitelist) + filter_properties(properties, whitelist) end def inspect @@ -113,13 +96,20 @@ def to_docx private - def filter(properties, whitelist) - props = properties.map do |key, value| - next unless whitelist.include? key - [key, value] + # processes properties adding those on the whitelist to the + # properties instance variable and those not to the transferred_properties + # isntance variable + def filter_properties(properties, whitelist) + @transferred_properties = {} + @properties = {} + # + properties.each do |key, value| + if whitelist.include? key.to_s + @properties[key] = value + else + @transferred_properties[key] = value + end end - # filter out nils and return hash - Hash[props.compact] end # processes attributes defined on the node into wordML property syntax @@ -169,6 +159,15 @@ def inspect end class Root < Collection + def initialize(env, node) + # strip text nodes from the root level element, these are typically + # extra whitespace from indenting the markup + node.search('./text()').remove + + # convert children from HTML to AST nodes + super(ASTBuilder.html_to_ast(env, node.children, {})) + end + def grep(pattern) visitor = GrepVisitor.new(pattern) accept(visitor) @@ -184,22 +183,15 @@ class Paragraph < Node PROPERTIES = %w[framePr ind jc keepLines keepNext numPr outlineLvl pBdr pStyle rPr sectPr shd spacing tabs textAlignment].freeze - STYLE_CONVERSION = { - 'border' => lambda { |v| - props = Node::STYLE_CONVERSION[:border].call(v) - # - return 'pBdr', [ - { top: props }, { bottom: props }, - { left: props }, { right: props } - ] - }, - 'vertical-align' => ->(v) { return 'textAlignment', v } - }.freeze attr_accessor :runs - def initialize(properties, runs) + def initialize(env, node, properties) + properties = self.class.process_properties(properties) 
@properties = NodeProperties.paragraph(properties) - @runs = runs + # + trans_props = transferred_properties + @runs = ASTBuilder.html_to_ast(env, node.children, trans_props) + @runs = Collection.new(@runs) end def to_docx @@ -212,43 +204,107 @@ def accept(visitor) end def inspect - "" + "" end end + # Manages the child nodes of a list type tag + class List < Collection + def initialize(env, node, properties) + # intialize values + @list_tag = node.name + # + if node.ancestors(".//#{@list_tag}").length.zero? + # Only register a definition when upon the first list tag encountered + @definition = env.numbering.register(properties[:pStyle]) + end + + # update attributes of all child nodes + transfer_node_attributes(node.children, node.attributes) + + # Move any list tags that are a child of a list item up one level + process_child_nodes(node) + + # strip text nodes from the list level element, this is typically + # extra whitespace from indenting the markup + node.search('./text()').remove + + # convert children from HTML to AST nodes + super(ASTBuilder.html_to_ast(env, node.children, properties)) + end + + def inspect + "" + end + + private + + # handles passing all attributes on the parent down to children + def transfer_node_attributes(nodes, attributes) + nodes.each do |child| + # update all attributes + merge_attributes(child, attributes) + + # set attributes specific to list items + if @definition + child['pStyle'] = @definition.style + child['numId'] = @definition.numid + end + child['ilvl'] = child.ancestors(".//#{@list_tag}").length - 1 + end + end + + # merges parent and child attributes together, preappending the parent's + # values to allow the child node to override it if the value is already + # defined on the child node. + def merge_attributes(child, parent_attributes) + parent_attributes.each do |name, par_attr| + child_attr = child[name] ? 
child[name].split(';') : [] + child[name] = par_attr.value.split(';').concat(child_attr).join('; ') + end + end + + # moves any list tags that are a child of a list item tag up one level + # so they become a sibling instead of a child + def process_child_nodes(node) + node.xpath("./li/#{@list_tag}").each do |list| + # transfer attributes from parent now because the list tag will + # no longer be a child and won't inheirit them as usual + transfer_node_attributes(list.children, list.parent.attributes) + list.parent.add_next_sibling(list) + end + end + end + + # Sets list item specific attributes registered on the node + class ListParagraph < Paragraph + def initialize(env, node, properties) + list_props = { + pStyle: node['pStyle'], + numPr: [{ ilvl: node['ilvl'] }, { numId: node['numId'] }] + } + properties = properties.merge(list_props) + super + end + + private + + def transferred_properties + super + end + end + + # Create a run of text in the document class Run < Node PROPERTIES = %w[b i caps color dstrike emboss imprint highlight outline rStyle shadow shd smallCaps strike sz u vanish vertAlign].freeze - STYLE_CONVERSION = { - 'color' => ->(v) { return 'color', v.delete('#') }, - 'font-size' => lambda { |v| - return 'sz', (2 * Float(v.gsub(/[^\d.]/, '')).ceil).to_s - }, - 'font-style' => lambda { |v| - return 'b', nil if v =~ /bold/ - return 'i', nil if v =~ /italic/ - }, - 'font-weight' => ->(v) { return 'b', nil if v =~ /bold/ }, - 'text-decoration' => lambda { |v| - supported = %w[line-through underline] - props = v.split - return props[0], 'true' unless supported.include? 
props[0] - return 'strike', 'true' if props[0] == 'line-through' - return 'u', 'single' if props.length == 1 - return 'u', { val: props[1], color: 'auto' } if props.length == 2 - return 'u', { val: props[1], color: props[2].delete('#') } - }, - 'vertical-align' => lambda { |v| - return 'vertAlign', 'subscript' if v =~ /sub/ - return 'vertAlign', 'superscript' if v =~ /super/ - } - }.freeze attr_reader :string - def initialize(properties, string) + def initialize(_env, node, properties) + properties = self.class.process_properties(properties) @properties = NodeProperties.run(properties) - @string = string + @string = node.text end def to_docx @@ -261,14 +317,16 @@ def inspect private - def text content = @string.tr("\u00A0", ' ') "#{content}" end end + # Creates a blank line in the word document class Newline < Node + def initialize(*); end + def to_docx "" end diff --git a/lib/sablon/html/ast_builder.rb b/lib/sablon/html/ast_builder.rb new file mode 100644 index 00000000..64c18324 --- /dev/null +++ b/lib/sablon/html/ast_builder.rb @@ -0,0 +1,78 @@ +module Sablon + class HTMLConverter + # Converts a nokogiri HTML fragment into an equivalent AST structure + class ASTBuilder + attr_reader :nodes + + def self.html_to_ast(env, nodes, properties) + builder = new(env, nodes, properties) + builder.nodes + end + + private + + def initialize(env, nodes, properties) + @env = env + @nodes = process_nodes(nodes, properties).compact + end + + # Loops over HTML nodes converting them to their configured AST class + def process_nodes(html_nodes, properties) + html_nodes.flat_map do |node| + # get tags from config + parent_tag = fetch_tag(node.parent.name) if node.parent.name + tag = fetch_tag(node.name) + + # check node hierarchy + validate_structure(parent_tag, tag) + + # merge properties + local_props = merge_node_properties(node, tag, properties) + if tag.ast_class + tag.ast_class.new(@env, node, local_props) + else + process_nodes(node.children, local_props) + end + end + end + 
+ # retrieves a HTMLTag instance from the cpermitted_html_tags hash or + # raises an ArgumentError if the tag is not registered in the hash + def fetch_tag(tag_name) + tag_name = tag_name.to_sym + unless Sablon::Configuration.instance.permitted_html_tags[tag_name] + raise ArgumentError, "Don't know how to handle HTML tag: #{tag_name}" + end + Sablon::Configuration.instance.permitted_html_tags[tag_name] + end + + # Checking that the current tag is an allowed child of the parent_tag. + # If the parent tag is nil then a block level tag is required. + def validate_structure(parent, child) + if parent.ast_class == Root && child.type == :inline + msg = "#{child.name} needs to be wrapped in a block level tag." + elsif parent && !parent.allowed_child?(child) + msg = "#{child.name} is not a valid child element of #{parent.name}." + else + return + end + raise ContextError, "Invalid HTML structure: #{msg}" + end + + # Merges node properties in a sppecifc + def merge_node_properties(node, tag, parent_properties) + # Process any styles, defined on the node into a hash + if node['style'] + style_props = node['style'].split(';').map do |prop| + prop.split(':').map(&:strip) + end + style_props = Hash[style_props] + else + style_props = {} + end + # allow inline styles to override parent styles passed down + parent_properties.merge(tag.properties).merge(style_props) + end + end + end +end diff --git a/lib/sablon/html/converter.rb b/lib/sablon/html/converter.rb index db4d64c2..7e5d6f05 100644 --- a/lib/sablon/html/converter.rb +++ b/lib/sablon/html/converter.rb @@ -3,69 +3,8 @@ module Sablon class HTMLConverter - class ASTBuilder - Layer = Struct.new(:items, :ilvl) - - def initialize(nodes) - @layers = [Layer.new(nodes, false)] - @root = Root.new([]) - end - - def to_ast - @root - end - - def new_layer(ilvl: false) - @layers.push Layer.new([], ilvl) - end - - def next - current_layer.items.shift - end - - def push(node) - @layers.last.items.push node - end - - def push_all(nodes) - 
nodes.each(&method(:push)) - end - - def done? - !current_layer.items.any? - end - - def nested? - ilvl > 0 - end - - def ilvl - @layers.select { |layer| layer.ilvl }.size - 1 - end - - def emit(node) - @root.nodes << node - end - - private - - def current_layer - if @layers.any? - last_layer = @layers.last - if last_layer.items.any? - last_layer - else - @layers.pop - current_layer - end - else - Layer.new([], false) - end - end - end - def process(input, env) - @numbering = env.numbering + @env = env processed_ast(input).to_docx end @@ -77,130 +16,7 @@ def processed_ast(input) def build_ast(input) doc = Nokogiri::HTML.fragment(input) - @builder = ASTBuilder.new(doc.children) - - while !@builder.done? - ast_next_paragraph - end - @builder.to_ast - end - - private - - def initialize - @numbering = nil - end - - # Adds the appropriate style class to the node - def prepare_paragraph(node) - # set default styles based on HTML element allowing for h1, h2, etc. - styles = Hash.new do |hash, key| - tag, num = key.match(/([a-z]+)(\d*)/)[1..2] - { 'pStyle' => hash[tag]['pStyle'] + num } if hash.key?(tag) - end - styles.merge!('div' => 'Normal', 'p' => 'Paragraph', 'h' => 'Heading', - 'ul' => 'ListBullet', 'ol' => 'ListNumber') - styles['li'] = @definition.style if @definition - styles.each { |k, v| styles[k] = { 'pStyle' => v } } - unless styles[node.name] - raise ArgumentError, "Don't know how to handle node: #{node.inspect}" - end - # - merge_node_properties(node, {}, styles[node.name], Paragraph) - end - - # Adds properties to the run, from the parent, the style node attributes - # and finally any element specfic properties. 
A modified properties hash - # is returned - def prepare_run(node, properties) - # HTML element based styles - styles = { - 'span' => {}, 'text' => {}, 'br' => {}, - 'strong' => { 'b' => nil }, 'b' => { 'b' => nil }, - 'em' => { 'i' => nil }, 'i' => { 'i' => nil }, - 'u' => { 'u' => 'single' } - } - - unless styles.key?(node.name) - raise ArgumentError, "Don't know how to handle node: #{node.inspect}" - end - # combine all properties, return the new hash - merge_node_properties(node, properties, styles[node.name], Run) - end - - def merge_node_properties(node, par_props, elm_props, ast_class) - # perform an initial conversion for any leftover CSS props passed - # in from the node's parent - properties = par_props.map do |k, v| - ast_class.convert_style_attr(k, v) - end - properties = Hash[properties] - - # Process any styles, defined on the node - properties.merge!(ast_class.process_style(node['style'])) - - # Set the element specific attributes, overriding any other values - properties.merge(elm_props) - end - - # handles passing all attributes on the parent down to children - # preappending parent attributes so child can overwrite if present - def merge_node_attributes(node, attributes) - node.children.each do |child| - attributes.each do |name, atr| - catr = child[name] ? child[name] : '' - child[name] = atr.value.split(';').concat(catr.split(';')).join('; ') - end - end - end - - def ast_next_paragraph - node = @builder.next - return if node.text? - - properties = prepare_paragraph(node) - - # handle special cases - if node.name =~ /ul|ol/ - @builder.new_layer ilvl: true - unless @builder.nested? 
- @definition = @numbering.register(properties['pStyle']) - end - merge_node_attributes(node, node.attributes) - @builder.push_all(node.children) - return - elsif node.name == 'li' - properties['numPr'] = [ - { 'ilvl' => @builder.ilvl }, { 'numId' => @definition.numid } - ] - end - - # create word_ml node - @builder.new_layer - trans_props = Paragraph.transferred_properties(properties) - @builder.emit Paragraph.new(properties, ast_runs(node.children, trans_props)) - end - - def ast_runs(nodes, properties) - runs = nodes.flat_map do |node| - begin - local_props = prepare_run(node, properties) - rescue ArgumentError - raise unless %w[ul ol p div].include?(node.name) - merge_node_attributes(node, node.parent.attributes) - @builder.push(node) - next nil - end - # - if node.text? - Run.new(local_props, node.text) - elsif node.name == 'br' - Newline.new - else - ast_runs(node.children, local_props).nodes - end - end - Collection.new(runs.compact) + Root.new(@env, doc) end end end diff --git a/test/configuration_test.rb b/test/configuration_test.rb new file mode 100644 index 00000000..d62a659d --- /dev/null +++ b/test/configuration_test.rb @@ -0,0 +1,122 @@ +# -*- coding: utf-8 -*- +require "test_helper" + +class ConfigurationTest < Sablon::TestCase + def setup + super + @config = Sablon::Configuration.send(:new) + end + + def test_register_tag + options = { + ast_class: :paragraph, + attributes: { dummy: 'value' }, + properties: { pstyle: 'ListBullet' }, + allowed_children: %i[_inline ol ul li] + } + # test initialization without type + tag = @config.register_html_tag(:test_tag, **options) + assert_equal @config.permitted_html_tags[:test_tag], tag + assert_equal tag.name, :test_tag + assert_equal tag.type, :inline + assert_equal tag.ast_class, Sablon::HTMLConverter::Paragraph + assert_equal tag.attributes, dummy: 'value' + assert_equal tag.properties, pstyle: 'ListBullet' + assert_equal tag.allowed_children, %i[_inline ol ul li] + + # test initialization with type + tag 
= @config.register_html_tag('test_tag2', :block, **options) + assert_equal @config.permitted_html_tags[:test_tag2], tag + assert_equal tag.name, :test_tag2 + assert_equal tag.type, :block + end + + def test_remove_tag + tag = @config.register_html_tag(:test) + assert_equal @config.remove_html_tag(:test), tag + assert_nil @config.permitted_html_tags[:test] + end + + def test_register_style_converter_on_existing_ast_class + converter = ->(v) { return "test-attr-#{v}" } + @config.register_style_converter(:run, 'my-test-attr', converter) + # + assert @config.defined_style_conversions[:run]['my-test-attr'], 'converter should be stored in hash' + assert_equal 'test-attr-123', @config.defined_style_conversions[:run]['my-test-attr'].call(123) + end + + def test_register_style_converter_on_newast_class + converter = ->(v) { return "test-attr-#{v}" } + @config.register_style_converter(:unset_ast_class, 'my-test-attr', converter) + # + assert @config.defined_style_conversions[:unset_ast_class]['my-test-attr'], 'converter should be stored in hash' + end + + def test_remove_style_converter + converter = ->(v) { return "test-attr-#{v}" } + converter = @config.register_style_converter(:run, 'my-test-attr', converter) + # + assert_equal converter, @config.remove_style_converter(:run, 'my-test-attr') + assert_nil @config.defined_style_conversions[:run]['my-test-attr'] + end +end + +class ConfigurationHTMLTagTest < Sablon::TestCase + # test basic instantiation of an HTMLTag + def test_html_tag_defaults + tag = Sablon::Configuration::HTMLTag.new(:a, :inline) + assert_equal tag.name, :a + assert_equal tag.type, :inline + assert_nil tag.ast_class + assert_equal tag.attributes, {} + assert_equal tag.properties, {} + assert_equal tag.allowed_children, %i[_inline ol ul] + end + + # Exercising more of the logic used to conform args into valid + def test_html_tag_full_init + args = ['a', 'inline', ast_class: Sablon::HTMLConverter::Run] + tag = Sablon::Configuration::HTMLTag.new(*args) + 
assert_equal tag.name, :a + assert_equal tag.type, :inline + assert_equal tag.ast_class, Sablon::HTMLConverter::Run + # + options = { + ast_class: :run, + attributes: { dummy: 'value1' }, + properties: { dummy2: 'value2' }, + allowed_children: 'text' + } + tag = Sablon::Configuration::HTMLTag.new('a', 'inline', **options) + # + assert_equal tag.name, :a + assert_equal tag.type, :inline + assert_equal tag.ast_class, Sablon::HTMLConverter::Run + assert_equal tag.attributes, dummy: 'value1' + assert_equal tag.properties, dummy2: 'value2' + assert_equal tag.allowed_children, [:text] + end + + def test_html_tag_init_block_without_class + e = assert_raises ArgumentError do + Sablon::Configuration::HTMLTag.new(:form, :block) + end + assert_equal "Block level tag form must have an AST class.", e.message + end + + def test_html_tag_allowed_children + # define different tags for testing + text = Sablon::Configuration::HTMLTag.new(:text, :inline) + div = Sablon::Configuration::HTMLTag.new(:div, :block, ast_class: :paragraph) + olist = Sablon::Configuration::HTMLTag.new(:ol, :block, ast_class: :paragraph, allowed_children: %i[_block]) + + # test default allowances + assert div.allowed_child?(text) # all inline elements allowed + assert div.allowed_child?(olist) # tag name is included even though it is bock leve + assert_equal div.allowed_child?(div), false # other block elms are not allowed + + # test olist with allowances for all blocks but no inline + assert olist.allowed_child?(div) # all block elements allowed + assert_equal olist.allowed_child?(text), false # no inline elements + end +end diff --git a/test/fixtures/html_sample.docx b/test/fixtures/html_sample.docx index 0ef9361025d6d2225bb2cff17a25f4925a5a979c..3185b5606022d17e31e6650005300e9dd3a3642e 100644 GIT binary patch delta 3129 zcmZ9Oc{CL4+s9{yu{R_;*_Rly?^_XKER$@5A=$HUnS`;WlBJC7OV%M<_B~^I>}z7m zPO>G8r7T51(tFGIiGW#>wBH+zW@3}w*nux0wHi12`K{r0H6RUd62@YAZ0+( zGpKnBq 
z_hQOw%>gJG-9moZ3{oH+san|H`rKQ_2+=e->)zd=H}b1=g3>Sj6xFwaPh`*!0th@^ z#%Z^6`IWbVor`v-I>jr^7p)yD?u^pE;vPhqI;~_qs#|KDlb?uS9NVMHc6A&Z_#tJw zH}2jgO|TWt+SiF(!2K4f>r0iCn6VN`8T03D(vZzJ8xcL`xIpGi9B|ZH)QLCgm1ZePDFM+zW!MmeYe-AIo^YICDlW1t%Js*px>%v&L_SkmPSe z<31^(aNJJJz#05;lsw3Wm9!gS3~}Nv39gc?a0O2SBxKB zkTNxwr5mU%@nj;^+uyB446q;tP~4^3o*^&_ThiCQ7`UeHS^K-WZVt>>x}6xaJH!>6 zm%@Jkh`I8oDcA3XkjYO;G9=Z7PSI-u>(JZX1(~Z zEa#K_ly72!OG(}#!xs9b6SzrQ_lP1A8*UOD6h(#NF;}_q<;97SK~IK$Dl~U5UeKLC zccFgJ--`MsHNgqLNdP&~8CW=_1*C^%B~LrtDT{h)7O`KS@{7Rg-K;1deNj&os^?7M z4d$r3TcEW9?Y^^YJTjI^lNUnCe(=s67x zDtdF{=jH4!&~5$LGbyALs?UlRGcECejD zz4Ws#RO&!OXevy4BtqMpEVy3o=T%{_jO&OFJ)Rn_yP3i{E+#z&y-L1q#T6Q^f#Z$s zd=wccM0zC+RvViKZ;F}T>#-uYOUcj2eHWH0gcU`iPhi-~S1gN#i#Y|!JFipOU3>30 zuK|A|_o_EQ(20$q(`Wmbo=!w;oqjp7imvy95sZ_(qJ<*T9*0Pr-j+e8h`#h@x)#E+ zvB3`D@ck0#Qri%b#4S*qo_jGM|G{BrHr35wRNLyE5lDm1ykxZ*9Q(c^Oieh6$TS%D z1etXKO}e#N7&0HV|Hh?gNmOAuA6lfps8U+xueGOiSJ)VP{rjepk;l6ZZXi~nh!yiY zvT)(H(6nk}6`$E-Wd}wN2{z4=2f+TI)xJmND*0DQj@gL3tU~w)v_Y%s(OFD*x*NI1 ztlhX{M~Jk~*BP3h`!+XNHsDeHt285&D2pH>n#0~Al7)72on8C{sBPS>Dd87I!*VZ( zymA@vjESyT=0j`DCJ+4E<74E#86D^6*8sbm_PT0Fx+opWHJ}W+o6)MLI{n6|E}3_C z?dw}#b89l;GD;~_ANYG}xPAn>YR)^-v4PTeaYSO=i8hod$xu$(!Iuid{kFjDJPBMw z-&hl^`$jxA>WfP@@gwep?Sy9zQy6~^JdnDP8oz9-1Dx(CM|fGWv>^IvGe?mM+g%?Xl>9cNq2IGqS?CgLcUC zBE6}OO0U%`b778>Q`sB&gYSWW-q$GN@B+vFPEgbB1GWQ8#xEmU#ry#_unON3)9jZIw)iimJ zDD7#)6fsrY5Ib77<>`-?`_**5nA; z1ht`Px$Ry4m3tlR1IU-0!jBNpOyus1sfKbpzu0M*;@x(ccL=(d{g#R%7VM+ZFSmAw9HdzE z1?|3=AWCDBBS|wRk~L*=#3$p{gsL`$>jrhvuPf%`X?jRr>Ee`3@7kJvd)tgk|E8>h zYE&CPekkVASQ7ty{R4w&qur9&r^q|GsL^s-edx1&=X8$eaYM*(7NM+mzU8&e?1`AP zS1G7^qnZkF3|zE*fznzWjTEIYzxGP@t`{l#M5R&3lW8m}@*W%PK1D{hN^VY2zN>!& zFF^}m0ufnctsbVgZug-p6$_{XiLOn!sbnA}ZVI_^c7v8@&0;wpI0J_(vpf!N2N-dM zpB%M?Rd^Y6ug#9d_T8e+Ft0OPRIeGMvT?dNIgsa@lXk~sk&xqq1}1D>722w}swv+8 z7C}t(TN9V)KAE59?75dmKVWO>uHsg(IZV5@|KhFO7f8w8GBn&s)Tr(V+WadtVD8-o z9i46=Y#d5|llq78K!$a3b|F;3NWu=+8C;Uwd9OR~otKnkx3$a)?VRUSUDB28U2{!K 
zq|1@D*pvC^C8ck)(MA#jZNJJ=MeIYzo~6oQ@5o-3Z+&#wvs8cEfm`jrZTz1XqvP>d9D7 zZdAvfs3AB@Xu&-`HA#1DZVnFIOClGOwqwo&VfA+au&8K zj=twSw)wS#{~9}J^xQr% zzLu(gN|+i;*>m`2sdNq^R?6q#YIXe_Dy+0G{L4=s`yfBn_T%hTV)?ykGznZUwfK#&Z{|I2dyG?vx|3;?J&{Wp(Q{G0a-@VM*f z>+0$JpSroC0RURv z(|ex&8~=>Ch~(rxdo=|4{1PPv0N|q!0I>b5=cOLzjUETa4M}tUPY;o7Ac8Sw+n5C8De)dbA}u{N~>C&Y5}6na}4t=l$=AYb1HtNJ3|bASJs9001RGrBkn4MORKj zb_UhWB)orU288=V31sPiNC;dgbe`)KwIo5GH4N3hzL}u{fMhrTFrE$!^$l>6aPoBw z_ICCO5)bqCnzXhIZ?@Q%3LgTWO-Hi%q#OzsaJZ4^M(S9^lALHckC4I3nRXA z$s8*$`9&U7b=?f01MK8DC+*~NYg>-4l37Z56P^~mu=nlf*K>g^1w@VQA2@HL7Yo#r z3Z;tFbVA1J>~TBeqY}R6b2iSeP+ypqRkOBAyKgC6@a*z$@qI%sT1;e79Qah6x#nED zTsE5ITHay|-*^mek2x{*NWJEWBK$HDFCliUQxBEb#<~!f+$3Q!p5kRn>jK6?*iAoN zkgAx?iafr5+GZbo04-U#Elxl;wKgZ0`J@%`6fMAZw{UGS-kkud5vmbkdE=)Kz}aGa zVUoy(atAFQST;ky0SWqQZ10*hPT>sz!PIRwS!3(5JkD+Ev zQJ$Rh>qlw6PAt92Oj-LuPkR{STa)QAoT@N=tG2~+;Zwa_GtxJG+ZsK@29IUE`(ECP zt;dUEF+vq#p>k8UIU7+(GyyL6!&DQ$XUBwQl^%jy5GVeHM&j7jC42*@o3v}A&< z&sSnj)0+9l9{t8ZC$4}a$lAjt!}d+@s7zA6;Qm}!(c}9F!lN?!ajWTjbVt&h!Gn>5 z65r#zUiXG2vODvc7L zqOJ9QIBun4^;(uRq?T3ko?Dw#SNQX@mdpiu66;g(Uin3phxyL1HWKl6CX*SD2^Oy_OhUu#W%^*(RX{h>qe;4X ziYJ)b6KtQa3->xfZy6n~FhTZ0KR{>YMP670=mZnA62)gP^0jv{MMA7B9!G|#8A>_& z^peB?i2Ii5Va!u$f=hn+d{ccX-R`6Y?!y8nF`2)44N3h?HKYhDPdwAw9UW31p19-1 zGq*)7Sby+kTt@eXP!V)pxO+rO0ec%@pa&8LuHSNLUF6fL%gwUXza(1$Lx{^eB$>tr zXX(wR;x?I=@v&9Xq+)J@uF=&sqoFz?Gil(vg8Yp@>U`9S(WI1m)3+domLIchSIMAl z7TGY1FtzBXbsAk)sZ-11aavOflnvZGh5`l>%AVOQylDM=Gv zl2WVkk3|cQsCG+VO_-biEy8DFH7wLEm^Q6)+Uq@JRV&wSU2ZR5VwTPLQ@X< z$HwItU>fu0Zur{RXM94%S$T}X`QiJ2zogcV47))@_0y$my<^W~8Fa^sF6Ug-4NF|q z>@a^pi{_cKMM$(VE43A5rsboa_r1!H!pxO-eqs5%8ega_L0jsY9$qiUO~d4$;GGq7 zcu9{o2R^&rkKs|}S!aNZp`;%g?|^W@)2SB_;hISBfy~TJsayAVc!Q>8Xhp^D^T(?d zN+rkq?&5a&HrT+iyTQh2k5^~x&A+a4WyYO2g%;?14Zg-ZF+y*9NSaiT9EV!e=8zox zv1{8C!z#vS=#C3#nN-gWF_(j9P5g1Ub|R9_P0O=V&=pGyPXtD zL~9VCHC`u22A?o#CV`vV@3!+4au^Fbd{3O?rFnWTu6Gw)oF2^$lN$a7*HOqy3-!bb;vc`wMuE#!3RtK=d+g*)1H;d(sXeI2@uDPD## zjgtrouIZ-ooJd}*211@|DD#dfz5G~*8aj+_m_M^%*wv2VuK=YuXt!VPh5a&cA!gGK 
ztb4m~ZDvF+=-I}*HyPgazfpl9fW{jL?_prdoRzIaD777_U0Lk+2Fl=3>O|gn_VPHy zEyvmKrWMD2&kIWE%6})r*r_lwIVL5MrY8i(b;G`&P?|>@5v{B59g9PmP+1JD*U-En zz7=pb#9Nu?6_F@N{r)CrXjy4 z=3(-LT%}k$_`kPet2;i=r?D1=-_<&8wKR( z?F0*Jn-7ak8T?~}D)&dX?zfWo8uS9mI1HDMiS#TL*0Qt%Z3qOR1sY`nDV_FEoqFruEr@76pj!H6=Z*Yz-ZkyEKmP@ql)@T*f=PT|e`#Gpwy%Z^NJtpcRmu zJGB_zmJmAP=y9U)7KA3e1T{#u+n^X!rW=hqRwH{sCT#8lVYhX$m{wXt>j~d$bz(vp zq}G(WjcUKn^d8mN+5ZdqJW9+=opZpYNZHif?BeZ7au3;R+brze{D>rNJT2BsGo!+m9~MPP}z&g(@kyezc7 zHc-CWC3vyRmq0p#pt#V6W@I36)yX|oZn-oeV1B`)O;ZLVq)?!5u-aVGj{72&c!6zD(q#?ai)-cYZo;3l4Rm19hX=m3{ihB=T3;Y~$a2+r`)HifqnoyiVG`HQ` zwfu`A%3;3akzRlq+|n%0Vi-cDSn092b%=fJIb>#(bTEXruelOb;{@NU_|&CfK_)!V zh{ya8+~7t;X^O1n>RAIig2l9){j)o&v;KiuH0T%H$-Dtu>q9xHwAW^3{w;qve#QN5*7LQ3@B;SgROzM`{}9=yu4{x4bq{6@?>SDb zrX24y`l!q3jlI`U3vi2?aS?6bNn+5_@-phjaU<3TN&GEWna4PFBngiS4VXX4Ey=cX zq<#ZRX}6B@UT`rhe~Tr+JEUO7GxuJ>FkEwO!TfU2mlUFrIqPaSkD&8 z=`M(YVsG2<|C4z;QABS`4giWS|IOgASQ~!6vpII;)a_BE0C3}U&YAe0t@An3%2*?1 zW_Lpb2}qjs|B~HzvE;@>3IL$jIo;>^KZRMfKi(V|6z=65cxKK2Ez==Wn5ifMz+Vpl zIR90Cri-o8<;Hp;Y0m#V63Iz2giS{Zo*!F~oFHjStdS)vt096Md{(79{gO{#?qGFx F`X4k1Y^?wQ diff --git a/test/html/ast_builder_test.rb b/test/html/ast_builder_test.rb new file mode 100644 index 00000000..021622db --- /dev/null +++ b/test/html/ast_builder_test.rb @@ -0,0 +1,65 @@ +# -*- coding: utf-8 -*- +require "test_helper" + +# Tests some low level private methods in the ASTBuilder class. 
#process_nodes +# and self.html_to_ast are covered extensively in converter_test.rb +class HTMLConverterASTBuilderTest < Sablon::TestCase + def setup + super + @env = Sablon::Environment.new(nil) + end + + def test_fetch_tag + @bulider = new_builder + tag = Sablon::Configuration.instance.permitted_html_tags[:span] + assert_equal @bulider.send(:fetch_tag, :span), tag + # check that strings are converted into symbols + assert_equal @bulider.send(:fetch_tag, 'span'), tag + # test uknown tag raises error + e = assert_raises ArgumentError do + @bulider.send(:fetch_tag, :unknown_tag) + end + assert_equal "Don't know how to handle HTML tag: unknown_tag", e.message + end + + def test_validate_structure + @bulider = new_builder + root = Sablon::Configuration.instance.permitted_html_tags['#document-fragment'.to_sym] + div = Sablon::Configuration.instance.permitted_html_tags[:div] + span = Sablon::Configuration.instance.permitted_html_tags[:span] + # test valid relationship + assert_nil @bulider.send(:validate_structure, div, span) + # test inverted relationship + e = assert_raises ArgumentError do + @bulider.send(:validate_structure, span, div) + end + assert_equal "Invalid HTML structure: div is not a valid child element of span.", e.message + # test inline tag with no parent + e = assert_raises ArgumentError do + @bulider.send(:validate_structure, root, span) + end + assert_equal "Invalid HTML structure: span needs to be wrapped in a block level tag.", e.message + end + + def test_merge_properties + @builder = new_builder + node = Nokogiri::HTML.fragment('Test').children[0] + tag = Struct.new(:properties).new(rStyle: 'Normal') + # test that properties are merged across all three arguments + props = @builder.send(:merge_node_properties, node, tag, 'background-color' => '#00F') + assert_equal({ 'background-color' => '#00F', rStyle: 'Normal', 'color' => '#F00', 'text-decoration' => 'underline wavy' }, props) + # test that parent properties are overriden by tag properties + 
props = @builder.send(:merge_node_properties, node, tag, rStyle: 'Citation', 'background-color' => '#00F') + assert_equal({ 'background-color' => '#00F', rStyle: 'Normal', 'color' => '#F00', 'text-decoration' => 'underline wavy' }, props) + # test that inline properties override parent styles + node = Nokogiri::HTML.fragment('Test').children[0] + props = @builder.send(:merge_node_properties, node, tag, 'color' => '#00F') + assert_equal({ rStyle: 'Normal', 'color' => '#F00' }, props) + end + + private + + def new_builder(nodes = [], properties = {}) + Sablon::HTMLConverter::ASTBuilder.new(@env, nodes, properties) + end +end diff --git a/test/html/ast_test.rb b/test/html/ast_test.rb new file mode 100644 index 00000000..c232ec67 --- /dev/null +++ b/test/html/ast_test.rb @@ -0,0 +1,117 @@ +# -*- coding: utf-8 -*- +require "test_helper" + +class HTMLConverterASTTest < Sablon::TestCase + def setup + super + @converter = Sablon::HTMLConverter.new + @converter.instance_variable_set(:@env, Sablon::Environment.new(nil)) + end + + def test_div + input = '
Lorem ipsum dolor sit amet
' + ast = @converter.processed_ast(input) + assert_equal ']>]>', ast.inspect + end + + def test_p + input = '

Lorem ipsum dolor sit amet

' + ast = @converter.processed_ast(input) + assert_equal ']>]>', ast.inspect + end + + def test_b + input = '

Lorem ipsum dolor sit amet

' + ast = @converter.processed_ast(input) + assert_equal ', ]>]>', ast.inspect + end + + def test_i + input = '

Lorem ipsum dolor sit amet

' + ast = @converter.processed_ast(input) + assert_equal ', ]>]>', ast.inspect + end + + def test_br_in_strong + input = '
Lorem
ipsum
dolor
' + par = @converter.processed_ast(input).grep(Sablon::HTMLConverter::Paragraph).first + assert_equal "[, , , , ]", par.runs.inspect + end + + def test_br_in_em + input = '
Lorem
ipsum
dolor
' + par = @converter.processed_ast(input).grep(Sablon::HTMLConverter::Paragraph).first + assert_equal "[, , , , ]", par.runs.inspect + end + + def test_nested_strong_and_em + input = '
Lorem ipsum dolor
' + par = @converter.processed_ast(input).grep(Sablon::HTMLConverter::Paragraph).first + assert_equal "[, , ]", par.runs.inspect + end + + def test_ignore_last_br_in_div + input = '
Lorem ipsum dolor sit amet
' + par = @converter.processed_ast(input).grep(Sablon::HTMLConverter::Paragraph).first + assert_equal "[]", par.runs.inspect + end + + def test_ignore_br_in_blank_div + input = '

' + par = @converter.processed_ast(input).grep(Sablon::HTMLConverter::Paragraph).first + assert_equal "[]", par.runs.inspect + end + + def test_headings + input = '

First

Second

Third

' + ast = @converter.processed_ast(input) + assert_equal "]>, ]>, ]>]>", ast.inspect + end + + def test_h_with_formatting + input = '

Lorem ipsum dolor sit amet

' + ast = @converter.processed_ast(input) + assert_equal ", , , ]>]>", ast.inspect + end + + def test_ul + input = '
  • Lorem
  • ipsum
' + ast = @converter.processed_ast(input) + assert_equal "]>, ]>]>]>", ast.inspect + end + + def test_ol + input = '
  1. Lorem
  2. ipsum
' + ast = @converter.processed_ast(input) + assert_equal "]>, ]>]>]>", ast.inspect + end + + def test_num_id + ast = @converter.processed_ast('
  1. Some
  2. Lorem
  • ipsum
  1. dolor
  2. sit
') + assert_equal %w[1001 1001 1002 1003 1003], get_numpr_prop_from_ast(ast, :numId) + end + + def test_nested_lists_have_the_same_numid + ast = @converter.processed_ast('
  • Lorem
    • ipsum
      • dolor
') + assert_equal %w[1001 1001 1001], get_numpr_prop_from_ast(ast, :numId) + end + + def test_keep_nested_list_order + input = '
  • 1
    • 1.1
      • 1.1.1
    • 1.2
  • 2
    • 1.3
      • 1.3.1
' + ast = @converter.processed_ast(input) + assert_equal %w[1001], get_numpr_prop_from_ast(ast, :numId).uniq + assert_equal %w[0 1 2 1 0 1 2], get_numpr_prop_from_ast(ast, :ilvl) + end + + private + + # returns the numid attribute from paragraphs + def get_numpr_prop_from_ast(ast, key) + values = [] + ast.grep(Sablon::HTMLConverter::ListParagraph).each do |para| + numpr = para.instance_variable_get('@properties')[:numPr] + numpr.each { |val| values.push(val[key]) if val[key] } + end + values + end +end diff --git a/test/html/converter_test.rb b/test/html/converter_test.rb index 67a5f028..160f1430 100644 --- a/test/html/converter_test.rb +++ b/test/html/converter_test.rb @@ -314,7 +314,7 @@ def test_unknown_tag e = assert_raises ArgumentError do process('') end - assert_match(/Don't know how to handle node:/, e.message) + assert_match(/Don't know how to handle HTML tag:/, e.message) end private @@ -520,8 +520,8 @@ def test_run_prop_override_paragraph_prop expected_output = <<-DOCX.strip - + @@ -540,6 +540,88 @@ def test_run_prop_override_paragraph_prop assert_equal normalize_wordml(expected_output), process(input) end + def test_inline_style_overrides_tag_style + # Note: a toggle property can not be removed once it becomes a symbol + # unless there is a specific CSS style that will set it to false. This + # is because CSS styles can only override parent properties not remove them. + input = '

test

' + expected_output = run_with_rpr('') + assert_equal normalize_wordml(expected_output), process(input) + end + + def test_conversion_of_a_registered_tag_without_ast_class + # This registers a new tag with the configuration object and then trys + # to convert it + Sablon.configure do |config| + config.register_html_tag(:bgcyan, :inline, properties: { highlight: 'cyan' }) + end + # + input = '

test

' + expected_output = run_with_rpr('') + assert_equal normalize_wordml(expected_output), process(input) + + # remove the tag to avoid any accidental side effects + Sablon.configure do |config| + config.remove_html_tag(:bgcyan) + end + end + + def test_conversion_of_a_registered_tag_with_ast_class + Sablon.configure do |config| + # create the AST class and then pass it onto the register tag method + ast_class = Class.new(Sablon::HTMLConverter::Node) do + def self.name + 'TestInstr' + end + + def initialize(_env, node, _properties) + @content = node.text + end + + def inspect + @content + end + + def to_docx + " #{@content} " + end + end + # + config.register_html_tag(:test_instr, :inline, ast_class: ast_class) + end + # + input = '

test

' + expected_output = <<-DOCX.strip + + + + + test + + DOCX + assert_equal normalize_wordml(expected_output), process(input) + + # remove the tag to avoid any accidental side effects + Sablon.configure do |config| + config.remove_html_tag(:test_instr) + end + end + + def test_conversion_of_registered_style_attribute + Sablon.configure do |config| + converter = ->(v) { return :highlight, v } + config.register_style_converter(:run, 'test-highlight', converter) + end + # + input = '

test

' + expected_output = run_with_rpr('') + assert_equal normalize_wordml(expected_output), process(input) + # + Sablon.configure do |config| + config.remove_style_converter(:run, 'test-highlight') + end + end + private def process(input) @@ -547,7 +629,7 @@ def process(input) end def para_with_ppr(ppr_str) - para_str = '%s' + para_str = '%s' format(para_str, ppr_str) end @@ -572,228 +654,3 @@ def normalize_wordml(wordml) wordml.gsub(/^\s+/, '').tr("\n", '') end end - -class HTMLConverterASTTest < Sablon::TestCase - def setup - super - @converter = Sablon::HTMLConverter.new - @converter.instance_variable_set(:@numbering, Sablon::Environment.new(nil).numbering) - end - - def test_div - input = '
Lorem ipsum dolor sit amet
' - ast = @converter.processed_ast(input) - assert_equal ']>]>', ast.inspect - end - - def test_p - input = '

Lorem ipsum dolor sit amet

' - ast = @converter.processed_ast(input) - assert_equal ']>]>', ast.inspect - end - - def test_b - input = '

Lorem ipsum dolor sit amet

' - ast = @converter.processed_ast(input) - assert_equal ', ]>]>', ast.inspect - end - - def test_i - input = '

Lorem ipsum dolor sit amet

' - ast = @converter.processed_ast(input) - assert_equal ', ]>]>', ast.inspect - end - - def test_br_in_strong - input = '
Lorem
ipsum
dolor
' - par = @converter.processed_ast(input).grep(Sablon::HTMLConverter::Paragraph).first - assert_equal "[, , , , ]", par.runs.inspect - end - - def test_br_in_em - input = '
Lorem
ipsum
dolor
' - par = @converter.processed_ast(input).grep(Sablon::HTMLConverter::Paragraph).first - assert_equal "[, , , , ]", par.runs.inspect - end - - def test_nested_strong_and_em - input = '
Lorem ipsum dolor
' - par = @converter.processed_ast(input).grep(Sablon::HTMLConverter::Paragraph).first - assert_equal "[, , ]", par.runs.inspect - end - - def test_ignore_last_br_in_div - input = '
Lorem ipsum dolor sit amet
' - par = @converter.processed_ast(input).grep(Sablon::HTMLConverter::Paragraph).first - assert_equal "[]", par.runs.inspect - end - - def test_ignore_br_in_blank_div - input = '

' - par = @converter.processed_ast(input).grep(Sablon::HTMLConverter::Paragraph).first - assert_equal "[]", par.runs.inspect - end - - def test_headings - input = '

First

Second

Third

' - ast = @converter.processed_ast(input) - assert_equal "]>, ]>, ]>]>", ast.inspect - end - - def test_h_with_formatting - input = '

Lorem ipsum dolor sit amet

' - ast = @converter.processed_ast(input) - assert_equal ", , , ]>]>", ast.inspect - end - - def test_ul - input = '
  • Lorem
  • ipsum
' - ast = @converter.processed_ast(input) - assert_equal "]>, ]>]>", ast.inspect - end - - def test_ol - input = '
  1. Lorem
  2. ipsum
' - ast = @converter.processed_ast(input) - assert_equal "]>, ]>]>", ast.inspect - end - - def test_num_id - ast = @converter.processed_ast('
  1. Some
  2. Lorem
  • ipsum
  1. dolor
  2. sit
') - assert_equal [1001, 1001, 1002, 1003, 1003], get_numpr_prop_from_ast(ast, 'numId') - end - - def test_nested_lists_have_the_same_numid - ast = @converter.processed_ast('
  • Lorem
    • ipsum
      • dolor
') - assert_equal [1001, 1001, 1001], get_numpr_prop_from_ast(ast, 'numId') - end - - def test_keep_nested_list_order - input = '
  • 1
    • 1.1
      • 1.1.1
    • 1.2
  • 2
    • 1.3
      • 1.3.1
' - ast = @converter.processed_ast(input) - assert_equal [1001], get_numpr_prop_from_ast(ast, 'numId').uniq - assert_equal [0, 1, 2, 1, 0, 1, 2], get_numpr_prop_from_ast(ast, 'ilvl') - end - - private - - # returns the numid attribute from paragraphs - def get_numpr_prop_from_ast(ast, key) - values = [] - ast.grep(Sablon::HTMLConverter::Paragraph).each do |para| - numpr = para.instance_variable_get('@properties')['numPr'] - numpr.each { |val| values.push(val[key]) if val[key] } - end - values - end -end - -class NodePropertiesTest < Sablon::TestCase - def setup - # struct to simplify prop whitelisting during tests - @inc_props = Struct.new(:props) do - def include?(value) - true - end - end - end - - def test_empty_node_properties_converison - # test empty properties - props = Sablon::HTMLConverter::NodeProperties.new('w:pPr', {}, @inc_props.new) - assert_equal props.inspect, '' - assert_equal props.to_docx, nil - end - - def test_simple_node_property_converison - props = { 'pStyle' => 'Paragraph' } - props = Sablon::HTMLConverter::NodeProperties.new('w:pPr', props, @inc_props.new) - assert_equal props.inspect, 'pStyle=Paragraph' - assert_equal props.to_docx, '' - end - - def test_node_property_with_nil_value_converison - props = { 'b' => nil } - props = Sablon::HTMLConverter::NodeProperties.new('w:rPr', props, @inc_props.new) - assert_equal props.inspect, 'b' - assert_equal props.to_docx, '' - end - - def test_node_property_with_hash_value_converison - props = { 'shd' => { color: 'clear', fill: '123456', test: nil } } - props = Sablon::HTMLConverter::NodeProperties.new('w:rPr', props, @inc_props.new) - assert_equal props.inspect, 'shd={:color=>"clear", :fill=>"123456", :test=>nil}' - assert_equal props.to_docx, '' - end - - def test_node_property_with_array_value_converison - props = { 'numPr' => [{ 'ilvl' => 1 }, { 'numId' => 34 }] } - props = Sablon::HTMLConverter::NodeProperties.new('w:pPr', props, @inc_props.new) - assert_equal props.inspect, 
'numPr=[{"ilvl"=>1}, {"numId"=>34}]' - assert_equal props.to_docx, '' - end - - def test_complex_node_properties_conversion - props = { - 'top1' => 'val1', - 'top2' => [ - { 'mid0' => nil }, - { 'mid1' => [ - { 'bottom1' => { key1: 'abc' } }, - { 'bottom2' => 'xyz' } - ] }, - { 'mid2' => 'val2' } - ], - 'top3' => { key1: 1, key2: '2', key3: nil, key4: true, key5: false } - } - output = <<-DOCX.gsub(/^\s*/, '').delete("\n") - - - - - - - - - - - - - DOCX - props = Sablon::HTMLConverter::NodeProperties.new('w:pPr', props, @inc_props.new) - assert_equal props.to_docx, output - end - - def test_setting_property_value - props = {} - props = Sablon::HTMLConverter::NodeProperties.new('w:pPr', props, @inc_props.new) - props['rStyle'] = 'FootnoteText' - assert_equal({ 'rStyle' => 'FootnoteText' }, props.instance_variable_get(:@properties)) - end - - def test_properties_filtered_on_init - props = { 'pStyle' => 'Paragraph', 'rStyle' => 'EndnoteText' } - props = Sablon::HTMLConverter::NodeProperties.new('w:rPr', props, %[rStyle]) - assert_equal({ 'rStyle' => 'EndnoteText' }, props.instance_variable_get(:@properties)) - end - - def test_transferred_properties - props = { 'pStyle' => 'Paragraph', 'rStyle' => 'EndnoteText' } - trans = Sablon::HTMLConverter::NodeProperties.transferred_properties(props, %[pStyle]) - assert_equal({ 'rStyle' => 'EndnoteText' }, trans) - end - - def test_node_properties_paragraph_factory - props = { 'pStyle' => 'Paragraph' } - props = Sablon::HTMLConverter::NodeProperties.paragraph(props) - assert_equal 'pStyle=Paragraph', props.inspect - assert_equal props.to_docx, '' - end - - def test_node_properties_run_factory - props = { 'color' => 'FF00FF' } - props = Sablon::HTMLConverter::NodeProperties.run(props) - assert_equal 'color=FF00FF', props.inspect - assert_equal '', props.to_docx - end -end diff --git a/test/html/node_properties_test.rb b/test/html/node_properties_test.rb new file mode 100644 index 00000000..2f9b48cc --- /dev/null +++ 
b/test/html/node_properties_test.rb @@ -0,0 +1,113 @@ +# -*- coding: utf-8 -*- +require "test_helper" + +class NodePropertiesTest < Sablon::TestCase + def setup + # struct to simplify prop whitelisting during tests + @inc_props = Struct.new(:props) do + def include?(*) + true + end + end + end + + def test_empty_node_properties_converison + # test empty properties + props = Sablon::HTMLConverter::NodeProperties.new('w:pPr', {}, @inc_props.new) + assert_equal props.inspect, '' + assert_nil props.to_docx + end + + def test_simple_node_property_converison + props = { 'pStyle' => 'Paragraph' } + props = Sablon::HTMLConverter::NodeProperties.new('w:pPr', props, @inc_props.new) + assert_equal props.inspect, 'pStyle=Paragraph' + assert_equal props.to_docx, '' + end + + def test_node_property_with_nil_value_converison + props = { 'b' => nil } + props = Sablon::HTMLConverter::NodeProperties.new('w:rPr', props, @inc_props.new) + assert_equal props.inspect, 'b' + assert_equal props.to_docx, '' + end + + def test_node_property_with_hash_value_converison + props = { 'shd' => { color: 'clear', fill: '123456', test: nil } } + props = Sablon::HTMLConverter::NodeProperties.new('w:rPr', props, @inc_props.new) + assert_equal props.inspect, 'shd={:color=>"clear", :fill=>"123456", :test=>nil}' + assert_equal props.to_docx, '' + end + + def test_node_property_with_array_value_converison + props = { 'numPr' => [{ 'ilvl' => 1 }, { 'numId' => 34 }] } + props = Sablon::HTMLConverter::NodeProperties.new('w:pPr', props, @inc_props.new) + assert_equal props.inspect, 'numPr=[{"ilvl"=>1}, {"numId"=>34}]' + assert_equal props.to_docx, '' + end + + def test_complex_node_properties_conversion + props = { + 'top1' => 'val1', + 'top2' => [ + { 'mid0' => nil }, + { 'mid1' => [ + { 'bottom1' => { key1: 'abc' } }, + { 'bottom2' => 'xyz' } + ] }, + { 'mid2' => 'val2' } + ], + 'top3' => { key1: 1, key2: '2', key3: nil, key4: true, key5: false } + } + output = <<-DOCX.gsub(/^\s*/, '').delete("\n") + + + + 
+ + + + + + + + + DOCX + props = Sablon::HTMLConverter::NodeProperties.new('w:pPr', props, @inc_props.new) + assert_equal props.to_docx, output + end + + def test_setting_property_value + props = {} + props = Sablon::HTMLConverter::NodeProperties.new('w:pPr', props, @inc_props.new) + props['rStyle'] = 'FootnoteText' + assert_equal({ 'rStyle' => 'FootnoteText' }, props.instance_variable_get(:@properties)) + end + + def test_properties_filtered_on_init + props = { 'pStyle' => 'Paragraph', 'rStyle' => 'EndnoteText' } + props = Sablon::HTMLConverter::NodeProperties.new('w:rPr', props, %w[rStyle]) + assert_equal({ 'rStyle' => 'EndnoteText' }, props.instance_variable_get(:@properties)) + end + + def test_transferred_properties + props = { 'pStyle' => 'Paragraph', 'rStyle' => 'EndnoteText' } + props = Sablon::HTMLConverter::NodeProperties.new(nil, props, %w[pStyle]) + trans = props.transferred_properties + assert_equal({ 'rStyle' => 'EndnoteText' }, trans) + end + + def test_node_properties_paragraph_factory + props = { 'pStyle' => 'Paragraph' } + props = Sablon::HTMLConverter::NodeProperties.paragraph(props) + assert_equal 'pStyle=Paragraph', props.inspect + assert_equal props.to_docx, '' + end + + def test_node_properties_run_factory + props = { 'color' => 'FF00FF' } + props = Sablon::HTMLConverter::NodeProperties.run(props) + assert_equal 'color=FF00FF', props.inspect + assert_equal '', props.to_docx + end +end From 19fc64b6e791954c3a55ea53c8bb5ee36595cd94 Mon Sep 17 00:00:00 2001 From: Matthew Stadelman Date: Thu, 7 Sep 2017 12:20:25 -0400 Subject: [PATCH 08/12] Support s, sub, and sup tags (#68) * Add support for , , and tags to configuration * Update converter test with new unit tests for the three added tags * Update integration test to use , , and in various combinations --- lib/sablon/configuration/configuration.rb | 6 +++ test/fixtures/html/html_test_content.html | 7 ++-- test/fixtures/html_sample.docx | Bin 24071 -> 24121 bytes test/html/converter_test.rb | 48 
++++++++++++++++++++++ 4 files changed, 58 insertions(+), 3 deletions(-) diff --git a/lib/sablon/configuration/configuration.rb b/lib/sablon/configuration/configuration.rb index ef02c7c1..d5a5a667 100644 --- a/lib/sablon/configuration/configuration.rb +++ b/lib/sablon/configuration/configuration.rb @@ -54,6 +54,7 @@ def initialize_html_tags tags = { # special tag used for elements with no parent, i.e. top level '#document-fragment' => { type: :block, ast_class: :root, allowed_children: :_block }, + # block level tags div: { type: :block, ast_class: :paragraph, properties: { pStyle: 'Normal' } }, p: { type: :block, ast_class: :paragraph, properties: { pStyle: 'Paragraph' } }, @@ -66,6 +67,7 @@ def initialize_html_tags ol: { type: :block, ast_class: :list, properties: { pStyle: 'ListNumber' }, allowed_children: %i[ol li] }, ul: { type: :block, ast_class: :list, properties: { pStyle: 'ListBullet' }, allowed_children: %i[ul li] }, li: { type: :block, ast_class: :list_paragraph }, + # inline style tags span: { type: :inline, ast_class: nil, properties: {} }, strong: { type: :inline, ast_class: nil, properties: { b: nil } }, @@ -73,6 +75,10 @@ def initialize_html_tags em: { type: :inline, ast_class: nil, properties: { i: nil } }, i: { type: :inline, ast_class: nil, properties: { i: nil } }, u: { type: :inline, ast_class: nil, properties: { u: 'single' } }, + s: { type: :inline, ast_class: nil, properties: { strike: 'true' } }, + sub: { type: :inline, ast_class: nil, properties: { vertAlign: 'subscript' } }, + sup: { type: :inline, ast_class: nil, properties: { vertAlign: 'superscript' } }, + # inline content tags text: { type: :inline, ast_class: :run, properties: {}, allowed_children: [] }, br: { type: :inline, ast_class: :newline, properties: {}, allowed_children: [] } diff --git a/test/fixtures/html/html_test_content.html b/test/fixtures/html/html_test_content.html index b9c2f2e1..6c580071 100644 --- a/test/fixtures/html/html_test_content.html +++ 
b/test/fixtures/html/html_test_content.html @@ -14,9 +14,10 @@

Text

iaculis urna bibendum vitae. Nunc in quam consequat, tristique tellus in, commodo turpis. Curabitur ullamcorper odio purus, lobortis egestas magna laoreet vitae. Nunc fringilla velit ante, eu aliquam nisi cursus vitae. - Suspendisse sit amet dui egestas, volutpat nisi vel, mattis justo. Nullam - pellentesque, ipsum eget blandit pharetra, augue elit aliquam mauris, - vel mollis nisl augue ut ipsum. + Suspendisse sit amet dui egestas, volutpat + nisi vel, mattis justo. Nullam pellentesque, ipsum eget blandit pharetra, + augue elit aliquam mauris, vel mollis nisl augue ut + ipsum.

Lists

diff --git a/test/fixtures/html_sample.docx b/test/fixtures/html_sample.docx index 3185b5606022d17e31e6650005300e9dd3a3642e..abe2b44c61e7b89981fc0f2d114920b73f91653c 100644 GIT binary patch delta 3047 zcmZ8jc{J4BAO2d#*bS4N>}1U#*%iMMW65sD-q=H!%2ti-MV83e_w3tD_7G(+3?fTP znjw|QSYErPpLBk|bKdvf^EuD+x#xL4=iYPwdEz>$+B>OOuES|)&j0{G4_G)$HL&2Q zXil)Hi%RGZ`9xj$hdiXs{zJs+DkM%dqq0aU))O(@>}fSOlo0?Vw6PcD&tWs&gcff$ddp;zSzT&_@le+%k)_Lt**|;SH*oPn6wKh}&Q@$;hes zu)rx%F{e6zWa!H3Y8EN4i3^rFoX;pRcV)FOXXaMc%g%zLuQF&5HWL7Ok9~E&JCTdD zFARPznb#PVYuclXOXB-rim6Dz^QD9icA(3**NbP#2hQa-yvlSH;bzN8xi;dqRbf+% zx#dOCPGn|$d!3io^0f@(-oA!OP)k6yl%gI6z06;Xuoy9L2nbJUHkDOsO3Zw_r8OC1 zxfzKXi;Ubxh)j<7e0&}`v7d4En}%_$$gKHtVHe(mn#X3+IJEr@=0n94k)*`-FXB}C$o=mXpk{Gjn+>JnQa+g165Z zQueo2#7G?ei|c(2GnQ{6O>qeDj_1$b2k&CB_pZV@cPjU*RAgUA8%`SVXfc*8PBY37 z|8m$Ac+5$DnwL78qiCzQB4}a<*X#4cuC#Kca@7sR{J?<-!kFB8g3_uAm8N|AP`4ce zG?|v{d;Xw6nRpxR&n%X1XOlh#Au_7@^k0k4dMbF4RuuFRnTXLqP2*>9nH?U!!^%=# z(AE4vIB#IDPhp@3K%1aSMU$;|y%>Ps{I1jtbG$SE^*uDo4Z}gKgHs ziF^^YqG>(Waq|rVXQ8z!$7W3}L-8ryjS*2U32GlCi848XDwS2#ZcBWr{x%Js5lI)B zRH+PyrKbr#hn?O7)&Dme*k<-A^(#nfnY`19`PrBnN3-4$gq zw_k^YU9(C{yLY6O%Hb8!6nLfJIb;=k)mcgKz-30)OA}C{-t~CZ;vsj*LB2Bsftx=$ zATshdp@Uvir=k(F2ZbaJr<{c)~rRH1K+0E|{S|Z``%eGNwZY=?BaeH#g{D3k-9`Lihl3KP4~yf zPM^VPGt(Ub9YQ^;*|AN;>~TB~Oi<3bLp0U|23;w4+F6q{+dJ%X(xU_#YfJvZfRFN1 z@&i)jJPqo0gI~*NOteiKjWX3&>)R^ym%#4uew)3-Lr`NM(~r=fRoQQdCEy39O6PI42wtUHJAJuW(p^nK-{+~D-BI+EH;T+$ngm}H3)sFNB!ox{~ z&m8HfD1Cms>I-pM!~_(iQ*M%x*q0?DdD)AScrHS-4Yq0``}`R^)D}}!<}ycvtz%9U zoyZASjtqG(8CK)QHI9A^IbJXE_Y#Ns`24h5J7sZz`}hsk)(%| z9Yb~Ry~YPy2R0n?oPt$j^R;2GdnCDf(eg z2UEX%@uc(~ICH=k-HJ(3H2?$RwclrTb>nwhFCj zaE}Xcd9`juR+p{x-Bf#Y?aJ-113jq65U)N9gaRrQ1QWI4*T%^(4@`FUvsoSL zx63-whX>raK7BdU>^6`z=*y&UZz)ipjMA8U{j6ds_1vn?z9ZG%#l_%A0qS>Q)8AUy zU)grUc$S~YVpPwXu0@Y>t_)oDRd5Ntoc!eI;jw|H+q8(^YJD{ z<<`q+a_hv>`F#DbyZa?pZH)R(GCY3lUr9sHwT!EbLVPKD3&@oLhxW%VAvvzgM_xWKCL)ET&D2W()A*s5i}cXUqAVtk9~z z&t3R~m3L*mRMVubI!u_WQra${Z`i&b+@rTy)}^-jRBoY&2YpFZBN%ief9g_Hc4b$% 
zhpb1G(+xD^==&GOdJV_1QUL{1ywI?0qJRGaJ>{Ohe%!oiHA4ZHO6D3w73T-z4K1yn z7C5siiXmBnGd8K`xEU0l5&C^bN|02Uf8P*#?;)tP;x@Wu*FFCsVM0Vmj(AYs#PX?$ zxFYt(+27au=y^4wCok7r-N$viX6E>KT<#<2yfVukp*g51$Hi)CMhH!nC4`R^!awb` z&-_V`V|YQMTkx}XE2mDfh2hwGV6X}KopPA}0At&EQ-3|x=++C4HjmJn>7uuDKX*`U zvqA!bMfo#}u=exs;ILakPr@?8Ue72t)Pz=K{sY&)34ny`Q$rarP@f3+3V*0c-+IgkQ}i~nKeD^XYCg1pc-n0sV{9`?6@Y{ z)62llF6v3D%N7u7X417F?m^f#D;?=0TlCvtPbms{ZFopfk_+w9`0M8QtbE1B(cf!oj9wO| zI_CM{G-w;<-kS5~0Bf5TlbBwz`zFtbRAQy&RW#lFac%IIovL)lj(oM%^WmIESJwEf zBgy+8+J_&^3UJD_xjmcr=w6=i?hU@oPx0LE>senU=bgtK=>8)>HfdZ8ran!F&FYDu zQ=+F^Lvu>VdiCH_Qevb-e@d1t9O(XN5b~$t5*vin`F((GLaLtP8dCEVW$d+1(c50{ z6!G>3|BjwC4ltqLX%u85d*6r}01g=dfa4_C{$5lkRTk-twR8|U)wIr6vN?eOK+EIb znmH`iK}7iX9A^+?-vb%|&_0>NemdoUb3Api=DJYt>u@S+6`KENXZZM;YVBAs0I(UJ z)ISVo^Z_9FR*>6y^zBezceFx?pYOlTtd6Nr26_MpGzI{^f0$nxVH=DDu)dDWr!Nn8 o$#AZ8+owhGdT2BPl9KpqP9S)-V?PNXKk4`S0f&AV57iSK6d;kCd delta 2896 zcmZXWcQhOP8pjiRhN?}CS|w5|w$hiP#7OOt7`2O{c!knfRobEzwMWfRqqTR1zV@t= z){dH?h|wA?y={8$J?EbLJLfsy&-eK~&w2j+wzLBuwF8;qFiI+R002M-Q1hj#VXgvF z{?fV*px_@^q`>}x1geZbAWBmzMxGe~+W|qphG98|ZOcpa0D!9+MiFuaU0;LEQOOFh zOxdr-L||Jb6D}%EFA0ksY;h6ZL)0}}gHcd}5>dr$Dj+MBX5``4;#-br=}aZWz`^Jn zDEkdCS8opI!yLqh#)pe#{c?r&PmWhMerYy;S;S(B&3w#UiV5 z=`((28V}O2o9?oHlEr{%dyXU$&X^nq5;^3ueoK)h)(}2O-L&`|@1^S=x)5NOB$8;& zA*kZc#Pl@%;mzDB^9ahKJ#gZZN(`w=5lvX#ZQq*^?BwsexPdig@A89~w6 zdXT(YH#O%6@mA8Ml<7g8mv(XzFu}|AzNJ~gtD{XHws-AX zYD;^YivEUlTGWqUsDH`v(Gv-+yQI*>XxB|6RlD~UA!UrEP@zb?Sl>TNjlz}>b}oky zweHscY;9Pi7O6n*r93_u6?|Hl&iD5z*X!?=fowI&D@fp!1Dyg-q{+N0LUPd1@5bRQm3^%ugiIZufxY5N)612I$YJ&1EP;X}dd1Ws5g=RV>T z0oJ;rF<5~XH`R7=E3e~x_pKDk{Af2Ibr*X;BMfILzwBvpi@Lv;LukJ8_85T2kJAT~ zlFIlhbL!Cf!{dsT6p>yVF&pdv+kz*97SIHo0Rt(FN9+01gf}XEzbZ)$#kk9Y(TDx^ zw9e^8MYyk$@+Gj+m~$BR(iOWh$ua?P+Md4{oUgwRBxu9qm0k`6i@Wo(_XO;nv9U@) zw%FEEYFG!DO<@AG)opY!E;!qixjiT{UHXMT=k*Bg?QK2)|GkwE&-$k5R3Wjltbz-{ zMWH7>c?>thQ5_pjie!s2#8fF<-z=nDOoa?n7h_oBVzSP-Ic>vmRF}d z6gU*NG5D}bqv$H-887KFk3@GUOV~zMTrOuf>x}|4o<7`}8%QBwbDrrt`pDr1_clCs 
zXoG2-9%U0o;&-u$;bz&{;*&WC>Y4TF$OgtTao-7}eZ2;F!pT|&{m}k+M;QL)(HZj2 zyuQb?>wp6R7XwWsYpgy!5eTLAHr=?Z#kM_RK;z$6|LWFeL0t|Us+P|1LA1Y4@LPzN z4#AC;mm=!`Mw!sVIrb{5E|Clv?xziAKR$$bgDBnbg>+^*n3!e}NZOCwkIEUN zGy5EJEPo>-Y0Xg|IM-EW>u1m1W;?{BefhDiNGXtKE%kloeT690j96HKE9c}>g4TL{ zWY9#9C>Ot&tDuVPTKzNrNRL&8cu=eXC}P)y&ZQ82Cd{|kQ7MQH7r@+rsq#%_-oM47 zI`fY<3WOKVEXI2{vKHRcd?Y>WAIW&~gr+3%xD)iO)Nr=zwco~-#Yi`~*}Uzdk@rBr zz$+AKY>EG9Kdi<4nD^L@V`W^oY{<&y@On~LSn~>$bJSsSGr7qiYZDi?!A~nvP_u@S zdLgqZsQg)dGVmIjL5I-hK$^MF^wosI=wkQf-gi5>$o1O;je2!2Sd4U)1s!x0m8XPq zxo9f}g~#KX-ny=Pch=+}(yv72V0rFzUE7bUWh#6~@4^I8#ncFaoLP9+=04JUrL}a! z4Qy~xHkQQa-tc+;Af6#`0qzP#l@wFYVCg?B7N^J-umc^s&D_6gQ)r7i08k`1+dYow zr1e?OFj3mCD$n9~o2Tte#+@Y|S3TDO*vs&CD1AO=Q=uu+2C*4^?_Zw*?AT6 zDaNS)HBrT41b%Ylk;VTkaTFQFEs@(PvPRs=n|_>$ewmJHG_9+a!N8?Em*^2PNNGCj z>n{~=`%$6KHJbJBa!$s^+~I}&O_!agQIH>20ns8rewMn-bmveH3d8R!;>&J%D4cbA6A^gG~ja&QCictNo1ohzpZX2X8TETQ^uO zYt>Dn864a%%nTRa%g=POSjOiE0F!sGO6*o&)sY!`YfDNAB+AJ4ofGB+`tKC74Le%; zXm}UzjIj`pp1)OE0hJ%Fful^NO&dx81`tLQ@7{_&*Bt6`Tgpe(u|xNs-c)o@eNgt0I3G(LJb{RFf}QRiZ{a>dI7#0Rzo;;n zjcUPjXj3kFYH5B$hSyLEjCIGuUCJF@PKqD2IYB*eBb_uJ4#|eo4Sg+j=K*l#0*imD zZ*hF;Q(l+)jtsOYBc#mxX9LsqvGj}7zQ_4XyVa*(^B+0<*hl|Q9IW$E4xu1ty*1S* z)MN + + Lorem + + + ipsum dolor + + sit amet +
+ DOCX + assert_equal normalize_wordml(expected_output), process(input) + end + + def test_convert_sub_tags_inside_p + input = '

Lorem ipsum dolor sit amet

' + expected_output = <<-DOCX.strip + + + Lorem + + + ipsum dolor + + sit amet + + DOCX + assert_equal normalize_wordml(expected_output), process(input) + end + + def test_convert_sup_tags_inside_p + input = '

Lorem ipsum dolor sit amet

' + expected_output = <<-DOCX.strip + + + Lorem + + + ipsum dolor + + sit amet + + DOCX + assert_equal normalize_wordml(expected_output), process(input) + end + def test_convert_br_tags_inside_strong input = '

Lorem ipsum
dolor sit amet
' expected_output = <<-DOCX From 04d7233daabdd34be01d349a92e37e9a04c602f0 Mon Sep 17 00:00:00 2001 From: Matthew Stadelman Date: Fri, 15 Sep 2017 08:53:39 -0400 Subject: [PATCH 09/12] Minor Improvements html parsing (#69) * Limited the allowed children of block level HTML tags to only be inline This is because anything that geneates a paragraph tag (w:p) cannot be nested inside another prargraph otherwise the document will become invalid. * Add drop_text? logic to ast_builder class This matches the old behavior where text nodes that were siblings to paragraph nodes were dropped but it does it in a more formal way than just skipping that node in the processing loop. * Add comments describing each AST class in more detail. * Add to_docx and initialize method to the Node base class * Fix node#initalize and add super calls to node subclasses * Refactor Run and Paragraph AST classes to use super in their to_docx method * Simplify conditional used in Node#to_docx * Update to_docx method to handle node attributes properly Previously it would not add a space between the tag itself and the first attribute. * Give subclasses more flexibility to define child content of nodes. Previously any children needed to implement the `to_docx` method which lead to alot of structs wrapping plain text content. Now with the `children_to_docx` method a subclass can delegate that directly to it's child nodes as with Paragraph, `runs.to_docx` or specify plain text directly as the Run and Newline classes do. 
--- lib/sablon/configuration/configuration.rb | 16 +++--- lib/sablon/html/ast.rb | 69 +++++++++++++++++++---- lib/sablon/html/ast_builder.rb | 12 ++++ 3 files changed, 77 insertions(+), 20 deletions(-) diff --git a/lib/sablon/configuration/configuration.rb b/lib/sablon/configuration/configuration.rb index d5a5a667..3df2f331 100644 --- a/lib/sablon/configuration/configuration.rb +++ b/lib/sablon/configuration/configuration.rb @@ -56,14 +56,14 @@ def initialize_html_tags '#document-fragment' => { type: :block, ast_class: :root, allowed_children: :_block }, # block level tags - div: { type: :block, ast_class: :paragraph, properties: { pStyle: 'Normal' } }, - p: { type: :block, ast_class: :paragraph, properties: { pStyle: 'Paragraph' } }, - h1: { type: :block, ast_class: :paragraph, properties: { pStyle: 'Heading1' } }, - h2: { type: :block, ast_class: :paragraph, properties: { pStyle: 'Heading2' } }, - h3: { type: :block, ast_class: :paragraph, properties: { pStyle: 'Heading3' } }, - h4: { type: :block, ast_class: :paragraph, properties: { pStyle: 'Heading4' } }, - h5: { type: :block, ast_class: :paragraph, properties: { pStyle: 'Heading5' } }, - h6: { type: :block, ast_class: :paragraph, properties: { pStyle: 'Heading6' } }, + div: { type: :block, ast_class: :paragraph, properties: { pStyle: 'Normal' }, allowed_children: :_inline }, + p: { type: :block, ast_class: :paragraph, properties: { pStyle: 'Paragraph' }, allowed_children: :_inline }, + h1: { type: :block, ast_class: :paragraph, properties: { pStyle: 'Heading1' }, allowed_children: :_inline }, + h2: { type: :block, ast_class: :paragraph, properties: { pStyle: 'Heading2' }, allowed_children: :_inline }, + h3: { type: :block, ast_class: :paragraph, properties: { pStyle: 'Heading3' }, allowed_children: :_inline }, + h4: { type: :block, ast_class: :paragraph, properties: { pStyle: 'Heading4' }, allowed_children: :_inline }, + h5: { type: :block, ast_class: :paragraph, properties: { pStyle: 'Heading5' }, 
allowed_children: :_inline }, + h6: { type: :block, ast_class: :paragraph, properties: { pStyle: 'Heading6' }, allowed_children: :_inline }, ol: { type: :block, ast_class: :list, properties: { pStyle: 'ListNumber' }, allowed_children: %i[ol li] }, ul: { type: :block, ast_class: :list, properties: { pStyle: 'ListBullet' }, allowed_children: %i[ul li] }, li: { type: :block, ast_class: :list_paragraph }, diff --git a/lib/sablon/html/ast.rb b/lib/sablon/html/ast.rb index 9f7e29c7..13ab94b6 100644 --- a/lib/sablon/html/ast.rb +++ b/lib/sablon/html/ast.rb @@ -52,16 +52,44 @@ def self.convert_style_property(key, value) end end + def initialize(_env, _node, _properties) + @attributes ||= {} + end + def accept(visitor) visitor.visit(self) end + # Simplifies usage at call sites by only requiring them to supply + # the tag name to use and any child AST nodes to render + def to_docx(tag) + prop_str = @properties.to_docx if @properties + # + "<#{tag}#{attributes_to_docx}>#{prop_str}#{children_to_docx}" + end + + private + # Simplifies usage at call sites def transferred_properties @properties.transferred_properties end + + # Gracefully handles conversion of an attributes hash into a + # string + def attributes_to_docx + return '' if @attributes.nil? || @attributes.empty? + ' ' + @attributes.map { |k, v| %(#{k}="#{v}") }.join(' ') + end + + # Acts like an abstract method allowing subclases full flexibility to + # define any content inside the tags. 
+ def children_to_docx + '' + end end + # Manages the properties for an AST node class NodeProperties attr_reader :transferred_properties @@ -136,6 +164,8 @@ def transform_attr(key, value) end end + # A container for an array of AST nodes with convenience methods to + # work with the internal array as if it were a regular node class Collection < Node attr_reader :nodes def initialize(nodes) @@ -158,6 +188,8 @@ def inspect end end + # Stores all of the AST nodes from the current fragment of HTML being + # parsed class Root < Collection def initialize(env, node) # strip text nodes from the root level element, these are typically @@ -179,6 +211,8 @@ def inspect end end + # An AST node representing the top level content container for a word + # document. These cannot be nested within other paragraph elements class Paragraph < Node PROPERTIES = %w[framePr ind jc keepLines keepNext numPr outlineLvl pBdr pStyle rPr sectPr shd spacing @@ -186,6 +220,7 @@ class Paragraph < Node attr_accessor :runs def initialize(env, node, properties) + super properties = self.class.process_properties(properties) @properties = NodeProperties.paragraph(properties) # @@ -195,7 +230,7 @@ def initialize(env, node, properties) end def to_docx - "#{@properties.to_docx}#{runs.to_docx}" + super('w:p') end def accept(visitor) @@ -206,6 +241,12 @@ def accept(visitor) def inspect "" end + + private + + def children_to_docx + runs.to_docx + end end # Manages the child nodes of a list type tag @@ -276,7 +317,8 @@ def process_child_nodes(node) end end - # Sets list item specific attributes registered on the node + # Sets list item specific attributes registered on the node to properly + # generate a list paragraph class ListParagraph < Paragraph def initialize(env, node, properties) list_props = { @@ -294,46 +336,49 @@ def transferred_properties end end - # Create a run of text in the document + # Create a run of text in the document, runs cannot be nested within + # each other class Run < Node 
PROPERTIES = %w[b i caps color dstrike emboss imprint highlight outline rStyle shadow shd smallCaps strike sz u vanish vertAlign].freeze - attr_reader :string def initialize(_env, node, properties) + super properties = self.class.process_properties(properties) @properties = NodeProperties.run(properties) @string = node.text end def to_docx - "#{@properties.to_docx}#{text}" + super('w:r') end def inspect - "" + "" end private - def text + def children_to_docx content = @string.tr("\u00A0", ' ') "#{content}" end end # Creates a blank line in the word document - class Newline < Node + class Newline < Run def initialize(*); end - def to_docx - "" - end - def inspect "" end + + private + + def children_to_docx + "" + end end end end diff --git a/lib/sablon/html/ast_builder.rb b/lib/sablon/html/ast_builder.rb index 64c18324..a0e4130e 100644 --- a/lib/sablon/html/ast_builder.rb +++ b/lib/sablon/html/ast_builder.rb @@ -23,6 +23,9 @@ def process_nodes(html_nodes, properties) parent_tag = fetch_tag(node.parent.name) if node.parent.name tag = fetch_tag(node.name) + # remove all text nodes if the tag doesn't accept them + node.search('./text()').remove if drop_text?(tag) + # check node hierarchy validate_structure(parent_tag, tag) @@ -59,6 +62,15 @@ def validate_structure(parent, child) raise ContextError, "Invalid HTML structure: #{msg}" end + # If the node doesn't allow inline elements, or text specifically, + # drop all text nodes. This is largely meant to prevent whitespace + # between tags from rasing an invalid structure error. Although it + # will purge the node whether it contains nonblank characters or not. 
+ def drop_text?(child) + text = fetch_tag(:text) + !child.allowed_child?(text) + end + # Merges node properties in a sppecifc def merge_node_properties(node, tag, parent_properties) # Process any styles, defined on the node into a hash From 6a808adee20e1d711a6dd53dc0f175e4f4e1db2b Mon Sep 17 00:00:00 2001 From: Matthew Stadelman Date: Thu, 28 Sep 2017 08:17:47 -0400 Subject: [PATCH 10/12] Html tag properties bug fix (#70) * Fix bug when the user registers a new HTML tag If the user registers a new HTML tag and specifies style properties for the element using string keys, the converter will attempt to process them as if they came from the `style=` attribute. For non-string values this causes an error. The fix was to convert all property keys specified during registration to symbols. * Add test in converter_test.rb that would fail without this bugfix --- lib/sablon/configuration/html_tag.rb | 5 ++++- test/html/converter_test.rb | 2 +- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/lib/sablon/configuration/html_tag.rb b/lib/sablon/configuration/html_tag.rb index 2e6211e1..86f610f6 100644 --- a/lib/sablon/configuration/html_tag.rb +++ b/lib/sablon/configuration/html_tag.rb @@ -43,8 +43,11 @@ def initialize(name, type, ast_class: nil, **options) # Set attributes from optinos hash, currently unused during AST generation @attributes = options.fetch(:attributes, {}) - # WordML properties defined by the tag, i.e. for the tag, etc. + # WordML properties defined by the tag, i.e. for the tag, + # etc. All the keys need to be symbols to avoid getting reparsed + # with the element's CSS attributes. 
@properties = options.fetch(:properties, {}) + @properties = Hash[@properties.map { |k, v| [k.to_sym, v] }] # Set permitted child tags or tag groups self.allowed_children = options[:allowed_children] end diff --git a/test/html/converter_test.rb b/test/html/converter_test.rb index 6700a801..b0193192 100644 --- a/test/html/converter_test.rb +++ b/test/html/converter_test.rb @@ -601,7 +601,7 @@ def test_conversion_of_a_registered_tag_without_ast_class # This registers a new tag with the configuration object and then trys # to convert it Sablon.configure do |config| - config.register_html_tag(:bgcyan, :inline, properties: { highlight: 'cyan' }) + config.register_html_tag(:bgcyan, :inline, properties: { 'highlight' => { val: 'cyan' } }) end # input = '

test

' From f796489f860ea0e61a4372d9aa2bae1d881f5760 Mon Sep 17 00:00:00 2001 From: Quyet Bui Date: Sun, 29 Oct 2017 14:21:02 +0700 Subject: [PATCH 11/12] multiple images in a template --- lib/sablon/processor/document.rb | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/lib/sablon/processor/document.rb b/lib/sablon/processor/document.rb index c92f91e8..91f56eac 100644 --- a/lib/sablon/processor/document.rb +++ b/lib/sablon/processor/document.rb @@ -118,8 +118,8 @@ def self.encloses?(start_field, end_field) end class ImageBlock < ParagraphBlock - def self.parent(node) - node.ancestors + def self.placeholder(node) + parent(node).xpath('following-sibling::w:p') end def self.encloses?(start_field, end_field) @@ -133,9 +133,9 @@ def replace(content) return end - pic_prop = self.class.parent(start_field).at_xpath('.//pic:cNvPr', pic: Sablon::Processor::Relationships::PICTURE_NS_URI) + pic_prop = self.class.placeholder(start_field).at_xpath('.//pic:cNvPr', pic: Sablon::Processor::Relationships::PICTURE_NS_URI) pic_prop.attributes['name'].value = content.first.name - blip = self.class.parent(start_field).at_xpath('.//a:blip', a: Sablon::Processor::Relationships::MAIN_NS_URI) + blip = self.class.placeholder(start_field).at_xpath('.//a:blip', a: Sablon::Processor::Relationships::MAIN_NS_URI) blip.attributes['embed'].value = content.first.rid start_field.remove end_field.remove From 2f9844e02871b1653a1840ab4f61da5f884c7cfc Mon Sep 17 00:00:00 2001 From: Quyet Bui Date: Sun, 29 Oct 2017 14:28:16 +0700 Subject: [PATCH 12/12] image can be from a remote source --- lib/sablon.rb | 1 + lib/sablon/content.rb | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/lib/sablon.rb b/lib/sablon.rb index 4f6630fc..c4db06f2 100644 --- a/lib/sablon.rb +++ b/lib/sablon.rb @@ -1,5 +1,6 @@ require 'zip' require 'nokogiri' +require 'open-uri' require "sablon/version" require "sablon/configuration/configuration" diff --git a/lib/sablon/content.rb 
b/lib/sablon/content.rb index bdec9b57..eff2aaf1 100644 --- a/lib/sablon/content.rb +++ b/lib/sablon/content.rb @@ -50,7 +50,8 @@ def inspect end def initialize(path) - super "#{Integer(rand * 1e9)}-#{File.basename(path)}", IO.binread(path) + # Links from Amazon S3 might have ?1498548740 part + super "#{Integer(rand * 1e9)}-#{File.basename(path).split('?').first}", open(path).read end def append_to(paragraph, display_node, env) end