From b133b5340aaf80e1de54f42fb4bf168a2ad86b74 Mon Sep 17 00:00:00 2001 From: nogginly Date: Sat, 20 May 2023 23:30:55 -0400 Subject: [PATCH 1/3] Support grad backprop when add/sub use broadcast --- spec/grad/gates_arithmetic_spec.cr | 108 +++++++++++++++++++++++++++++ src/grad/backends/agnostic.cr | 33 +++++++-- src/grad/gates/arithmetic.cr | 49 +++++-------- src/grad/variable.cr | 4 +- 4 files changed, 155 insertions(+), 39 deletions(-) diff --git a/spec/grad/gates_arithmetic_spec.cr b/spec/grad/gates_arithmetic_spec.cr index 848b8a15..8ada4530 100644 --- a/spec/grad/gates_arithmetic_spec.cr +++ b/spec/grad/gates_arithmetic_spec.cr @@ -82,6 +82,60 @@ describe Num::Grad do end {% end %} + it "backpropogates for addition with broadcast" do + ctx = Num::Grad::Context(Float32Tensor).new + + a = ctx.variable([ + [1_f32, 2_f32, 3_f32, 4_f32], + [5_f32, 6_f32, 7_f32, 8_f32], + [9_f32, 10_f32, 11_f32, 12_f32], + [13_f32, 14_f32, 15_f32, 16_f32], + ]) + b = ctx.variable([ + 1_f32, 1_f32, 1_f32, 1_f32, + ]) + + result = a + b + result.backprop + + expected_a = [[1_f32, 1_f32, 1_f32, 1_f32], + [1_f32, 1_f32, 1_f32, 1_f32], + [1_f32, 1_f32, 1_f32, 1_f32], + [1_f32, 1_f32, 1_f32, 1_f32]].to_tensor + expected_b = [4_f32, 4_f32, 4_f32, 4_f32].to_tensor + + Num::Testing.tensor_equal(a.grad, expected_a).should be_true + Num::Testing.tensor_equal(b.grad, expected_b).should be_true + end + + {% if flag?(:opencl) %} + it "backpropogates for addition with broadcast opencl", tags: "opencl" do + ctx = Num::Grad::Context(Float32ClTensor).new + + a = ctx.variable([ + [1_f32, 2_f32, 3_f32, 4_f32], + [5_f32, 6_f32, 7_f32, 8_f32], + [9_f32, 10_f32, 11_f32, 12_f32], + [13_f32, 14_f32, 15_f32, 16_f32], + ].to_tensor(OCL)) + b = ctx.variable([ + 1_f32, 1_f32, 1_f32, 1_f32, + ].to_tensor(OCL)) + + result = a + b + result.backprop + + expected_a = [[1_f32, 1_f32, 1_f32, 1_f32], + [1_f32, 1_f32, 1_f32, 1_f32], + [1_f32, 1_f32, 1_f32, 1_f32], + [1_f32, 1_f32, 1_f32, 1_f32]].to_tensor + expected_b = 
[4_f32, 4_f32, 4_f32, 4_f32].to_tensor + + Num::Testing.tensor_equal(a.grad.cpu, expected_a).should be_true + Num::Testing.tensor_equal(b.grad.cpu, expected_b).should be_true + end + {% end %} + it "backpropogates for subtraction" do ctx = Num::Grad::Context(Float32Tensor).new @@ -112,6 +166,60 @@ describe Num::Grad do end {% end %} + it "backpropogates for subtraction with broadcast" do + ctx = Num::Grad::Context(Float32Tensor).new + + a = ctx.variable([ + [1_f32, 2_f32, 3_f32, 4_f32], + [5_f32, 6_f32, 7_f32, 8_f32], + [9_f32, 10_f32, 11_f32, 12_f32], + [13_f32, 14_f32, 15_f32, 16_f32], + ]) + b = ctx.variable([ + 1_f32, 1_f32, 1_f32, 1_f32, + ]) + + result = a - b + result.backprop + + expected_a = [[1_f32, 1_f32, 1_f32, 1_f32], + [1_f32, 1_f32, 1_f32, 1_f32], + [1_f32, 1_f32, 1_f32, 1_f32], + [1_f32, 1_f32, 1_f32, 1_f32]].to_tensor + expected_b = [-4_f32, -4_f32, -4_f32, -4_f32].to_tensor + + Num::Testing.tensor_equal(a.grad, expected_a).should be_true + Num::Testing.tensor_equal(b.grad, expected_b).should be_true + end + + {% if flag?(:opencl) %} + it "backpropogates for subtraction with broadcast opencl", tags: "opencl" do + ctx = Num::Grad::Context(Float32ClTensor).new + + a = ctx.variable([ + [1_f32, 2_f32, 3_f32, 4_f32], + [5_f32, 6_f32, 7_f32, 8_f32], + [9_f32, 10_f32, 11_f32, 12_f32], + [13_f32, 14_f32, 15_f32, 16_f32], + ].to_tensor(OCL)) + b = ctx.variable([ + 1_f32, 1_f32, 1_f32, 1_f32, + ].to_tensor(OCL)) + + result = a - b + result.backprop + + expected_a = [[1_f32, 1_f32, 1_f32, 1_f32], + [1_f32, 1_f32, 1_f32, 1_f32], + [1_f32, 1_f32, 1_f32, 1_f32], + [1_f32, 1_f32, 1_f32, 1_f32]].to_tensor + expected_b = [-4_f32, -4_f32, -4_f32, -4_f32].to_tensor + + Num::Testing.tensor_equal(a.grad.cpu, expected_a).should be_true + Num::Testing.tensor_equal(b.grad.cpu, expected_b).should be_true + end + {% end %} + it "backpropogates for multiplication" do ctx = Num::Grad::Context(Float32Tensor).new diff --git a/src/grad/backends/agnostic.cr 
b/src/grad/backends/agnostic.cr index 7e0339f5..cb40496b 100644 --- a/src/grad/backends/agnostic.cr +++ b/src/grad/backends/agnostic.cr @@ -24,14 +24,39 @@ module Num::Grad extend self + # + # This returns the appropriate backward gradient processing for + # addition and subtraction based on the + # size and rank of the two variables + # + private def sum_grad_backward(gradient : U, a : U, b : U) : Array(U) forall U + # if a.size == 1 || b.size == 1 + # # broadcast of 1 element, sum it all up + # gless = U.new([1]) { gradient.sum } + # b.size == 1 ? [gradient, gless] : [gless, gradient] + if a.rank != b.rank + # broadcast along an axis, so sum dwn by axis + swap = a.rank > b.rank + gless = gradient + (b.rank - a.rank).abs.times do + gless = gless.sum(0) + end + swap ? [gradient, gless] : [gless, gradient] + else + [gradient, gradient] + end + end + # :nodoc: - def add_backward(gradient : U) : Array(U) forall U - [gradient, gradient] + def add_backward(gradient : U, a : Variable(U), b : Variable(U)) : Array(U) forall U + sum_grad_backward(gradient, a.value, b.value) end # :nodoc: - def subtract_backward(gradient : U) : Array(U) forall U - [gradient, gradient * -1] + def subtract_backward(gradient : U, a : Variable(U), b : Variable(U)) : Array(U) forall U + r = sum_grad_backward(gradient, a.value, b.value) + r[1] = -r[1] + r end # :nodoc: diff --git a/src/grad/gates/arithmetic.cr b/src/grad/gates/arithmetic.cr index 8b2fdbed..7906b12e 100644 --- a/src/grad/gates/arithmetic.cr +++ b/src/grad/gates/arithmetic.cr @@ -22,61 +22,44 @@ # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
# :nodoc: -class Num::Grad::AddGate(T) < Num::Grad::Gate(T) +abstract class Num::Grad::TwoOpGate(T) < Num::Grad::Gate(T) + getter a : Num::Grad::Variable(T) + getter b : Num::Grad::Variable(T) + @@name = "TwoOp" + # :nodoc: - def backward(payload : Num::Grad::Payload(T)) : Array(T) - Num::Grad.add_backward(payload.variable.grad) + def initialize(@a, @b) end + abstract def backward(payload : Num::Grad::Payload(T)) : Array(T) + # :nodoc: def cache(result : Num::Grad::Variable(T), *args) a, b = args - result.grad = T.zeros_like(result.value) result.requires_grad = true - Num::Grad.register("Add", self, result, a, b) + Num::Grad.register(@@name, self, result, a, b) end end # :nodoc: -class Num::Grad::SubtractGate(T) < Num::Grad::Gate(T) - # :nodoc: - def backward(payload : Num::Grad::Payload(T)) : Array(T) - Num::Grad.subtract_backward(payload.variable.grad) - end +class Num::Grad::AddGate(T) < Num::Grad::TwoOpGate(T) + @@name = "Add" # :nodoc: - def cache(result : Num::Grad::Variable(T), *args) - a, b = args - result.grad = T.zeros_like(result.value) - result.requires_grad = true - - Num::Grad.register("Sub", self, result, a, b) + def backward(payload : Num::Grad::Payload(T)) : Array(T) + Num::Grad.add_backward(payload.variable.grad, a, b) end end # :nodoc: -class Num::Grad::TwoOpGate(T) < Num::Grad::Gate(T) - getter a : Num::Grad::Variable(T) - getter b : Num::Grad::Variable(T) - @@name = "TwoOp" +class Num::Grad::SubtractGate(T) < Num::Grad::TwoOpGate(T) + @@name = "Sub" # :nodoc: - def initialize(@a : Num::Grad::Variable(T), @b : Num::Grad::Variable(T)) - end - def backward(payload : Num::Grad::Payload(T)) : Array(T) - [] of T - end - - # :nodoc: - def cache(result : Num::Grad::Variable(T), *args) - a, b = args - result.grad = T.zeros_like(result.value) - result.requires_grad = true - - Num::Grad.register(@@name, self, result, a, b) + Num::Grad.subtract_backward(payload.variable.grad, a, b) end end diff --git a/src/grad/variable.cr b/src/grad/variable.cr index 
cb0889e6..69a64016 100644 --- a/src/grad/variable.cr +++ b/src/grad/variable.cr @@ -62,7 +62,7 @@ class Num::Grad::Variable(T) # f = a + b # => [5.0] # f.backprop # ``` - operator_op :+, Num::Grad::AddGate(T) + operator_op :+, Num::Grad::AddGate(T), self, other # Subtracts a variable from another variable and stores # the derivative of the operation in the computational @@ -83,7 +83,7 @@ class Num::Grad::Variable(T) # f = a - b # => [-1.0] # f.backprop # ``` - operator_op :-, Num::Grad::SubtractGate(T) + operator_op :-, Num::Grad::SubtractGate(T), self, other # Multiples a variable to another variable and stores # the derivative of the operation in the computational From ae5670e2111acbf13b4ca986d85cfbfebe31329b Mon Sep 17 00:00:00 2001 From: nogginly Date: Sat, 20 May 2023 23:50:15 -0400 Subject: [PATCH 2/3] Cleaned up the scalar broadcast case --- src/grad/backends/agnostic.cr | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/grad/backends/agnostic.cr b/src/grad/backends/agnostic.cr index cb40496b..438ddb1c 100644 --- a/src/grad/backends/agnostic.cr +++ b/src/grad/backends/agnostic.cr @@ -30,10 +30,6 @@ module Num::Grad # size and rank of the two variables # private def sum_grad_backward(gradient : U, a : U, b : U) : Array(U) forall U - # if a.size == 1 || b.size == 1 - # # broadcast of 1 element, sum it all up - # gless = U.new([1]) { gradient.sum } - # b.size == 1 ? [gradient, gless] : [gless, gradient] if a.rank != b.rank # broadcast along an axis, so sum dwn by axis swap = a.rank > b.rank @@ -41,6 +37,9 @@ module Num::Grad (b.rank - a.rank).abs.times do gless = gless.sum(0) end + if a.size == 1 || b.size == 1 + gless = gless.sum(0) + end swap ? 
[gradient, gless] : [gless, gradient] else [gradient, gradient] From c1819bd1c116c684cd6c4c26539a5dea15de0d18 Mon Sep 17 00:00:00 2001 From: nogginly Date: Sat, 20 May 2023 23:59:43 -0400 Subject: [PATCH 3/3] Add scalar broadcast test cases --- spec/grad/gates_arithmetic_spec.cr | 108 +++++++++++++++++++++++++++++ 1 file changed, 108 insertions(+) diff --git a/spec/grad/gates_arithmetic_spec.cr b/spec/grad/gates_arithmetic_spec.cr index 8ada4530..3204049e 100644 --- a/spec/grad/gates_arithmetic_spec.cr +++ b/spec/grad/gates_arithmetic_spec.cr @@ -136,6 +136,60 @@ describe Num::Grad do end {% end %} + it "backpropogates for addition with scalar broadcast" do + ctx = Num::Grad::Context(Float32Tensor).new + + a = ctx.variable([ + [1_f32, 2_f32, 3_f32, 4_f32], + [5_f32, 6_f32, 7_f32, 8_f32], + [9_f32, 10_f32, 11_f32, 12_f32], + [13_f32, 14_f32, 15_f32, 16_f32], + ]) + b = ctx.variable([ + 1_f32, + ]) + + result = a + b + result.backprop + + expected_a = [[1_f32, 1_f32, 1_f32, 1_f32], + [1_f32, 1_f32, 1_f32, 1_f32], + [1_f32, 1_f32, 1_f32, 1_f32], + [1_f32, 1_f32, 1_f32, 1_f32]].to_tensor + expected_b = [16_f32].to_tensor + + Num::Testing.tensor_equal(a.grad, expected_a).should be_true + Num::Testing.tensor_equal(b.grad, expected_b).should be_true + end + + {% if flag?(:opencl) %} + it "backpropogates for addition with scalar broadcast opencl", tags: "opencl" do + ctx = Num::Grad::Context(Float32ClTensor).new + + a = ctx.variable([ + [1_f32, 2_f32, 3_f32, 4_f32], + [5_f32, 6_f32, 7_f32, 8_f32], + [9_f32, 10_f32, 11_f32, 12_f32], + [13_f32, 14_f32, 15_f32, 16_f32], + ].to_tensor(OCL)) + b = ctx.variable([ + 1_f32, + ].to_tensor(OCL)) + + result = a + b + result.backprop + + expected_a = [[1_f32, 1_f32, 1_f32, 1_f32], + [1_f32, 1_f32, 1_f32, 1_f32], + [1_f32, 1_f32, 1_f32, 1_f32], + [1_f32, 1_f32, 1_f32, 1_f32]].to_tensor + expected_b = [16_f32].to_tensor + + Num::Testing.tensor_equal(a.grad.cpu, expected_a).should be_true + Num::Testing.tensor_equal(b.grad.cpu, 
expected_b).should be_true + end + {% end %} + it "backpropogates for subtraction" do ctx = Num::Grad::Context(Float32Tensor).new @@ -220,6 +274,60 @@ describe Num::Grad do end {% end %} + it "backpropogates for subtraction with scalar broadcast" do + ctx = Num::Grad::Context(Float32Tensor).new + + a = ctx.variable([ + [1_f32, 2_f32, 3_f32, 4_f32], + [5_f32, 6_f32, 7_f32, 8_f32], + [9_f32, 10_f32, 11_f32, 12_f32], + [13_f32, 14_f32, 15_f32, 16_f32], + ]) + b = ctx.variable([ + 1_f32, + ]) + + result = a - b + result.backprop + + expected_a = [[1_f32, 1_f32, 1_f32, 1_f32], + [1_f32, 1_f32, 1_f32, 1_f32], + [1_f32, 1_f32, 1_f32, 1_f32], + [1_f32, 1_f32, 1_f32, 1_f32]].to_tensor + expected_b = [-16_f32].to_tensor + + Num::Testing.tensor_equal(a.grad, expected_a).should be_true + Num::Testing.tensor_equal(b.grad, expected_b).should be_true + end + + {% if flag?(:opencl) %} + it "backpropogates for subtraction with scalar broadcast opencl", tags: "opencl" do + ctx = Num::Grad::Context(Float32ClTensor).new + + a = ctx.variable([ + [1_f32, 2_f32, 3_f32, 4_f32], + [5_f32, 6_f32, 7_f32, 8_f32], + [9_f32, 10_f32, 11_f32, 12_f32], + [13_f32, 14_f32, 15_f32, 16_f32], + ].to_tensor(OCL)) + b = ctx.variable([ + 1_f32, + ].to_tensor(OCL)) + + result = a - b + result.backprop + + expected_a = [[1_f32, 1_f32, 1_f32, 1_f32], + [1_f32, 1_f32, 1_f32, 1_f32], + [1_f32, 1_f32, 1_f32, 1_f32], + [1_f32, 1_f32, 1_f32, 1_f32]].to_tensor + expected_b = [-16_f32].to_tensor + + Num::Testing.tensor_equal(a.grad.cpu, expected_a).should be_true + Num::Testing.tensor_equal(b.grad.cpu, expected_b).should be_true + end + {% end %} + it "backpropogates for multiplication" do ctx = Num::Grad::Context(Float32Tensor).new