From d5eec7a180e9dd7aa3c5aea75ac79714de893648 Mon Sep 17 00:00:00 2001 From: Marek Otahal Date: Fri, 30 Oct 2020 20:52:07 +0100 Subject: [PATCH 1/3] only create layers as needed for given mode. --- dain.py | 15 ++++++++------- exp_mlp.py | 12 ++++++------ train_utils.py | 15 +++++++++------ 3 files changed, 23 insertions(+), 19 deletions(-) diff --git a/dain.py b/dain.py index 7b1efc3..1a4674d 100644 --- a/dain.py +++ b/dain.py @@ -14,16 +14,18 @@ def __init__(self, mode='adaptive_avg', mean_lr=0.00001, gate_lr=0.001, scale_lr self.gate_lr = gate_lr self.scale_lr = scale_lr - # Parameters for adaptive average + # Parameters for adaptive average; aka Dain(1) self.mean_layer = nn.Linear(input_dim, input_dim, bias=False) self.mean_layer.weight.data = torch.FloatTensor(data=np.eye(input_dim, input_dim)) - # Parameters for adaptive std - self.scaling_layer = nn.Linear(input_dim, input_dim, bias=False) - self.scaling_layer.weight.data = torch.FloatTensor(data=np.eye(input_dim, input_dim)) + # Parameters for adaptive scaling; Dain(1+2) + if mode == 'adaptive_scale' or mode == 'full': + self.scaling_layer = nn.Linear(input_dim, input_dim, bias=False) + self.scaling_layer.weight.data = torch.FloatTensor(data=np.eye(input_dim, input_dim)) - # Parameters for adaptive scaling - self.gating_layer = nn.Linear(input_dim, input_dim) + # Parameters for adaptive gating; Dain(1+2+3) + if mode == 'full': + self.gating_layer = nn.Linear(input_dim, input_dim) self.eps = 1e-8 @@ -49,7 +51,6 @@ def forward(self, x): # Perform the first + second step (adaptive averaging + adaptive scaling ) elif self.mode == 'adaptive_scale': - # Step 1: avg = torch.mean(x, 2) adaptive_avg = self.mean_layer(avg) diff --git a/exp_mlp.py b/exp_mlp.py index 0fbea4b..b58c9f0 100644 --- a/exp_mlp.py +++ b/exp_mlp.py @@ -27,7 +27,7 @@ def forward(self, x): return x -def run_experiments_ablation(model, mode, train_epochs=20, window=10, normalization=None): +def run_experiments_ablation(model, mode, 
train_epochs=1, window=10, normalization=None): results1 = train_evaluate_anchored(model, window=window, train_epochs=train_epochs, horizon=0, splits=[1, 2, 3, 4, 5, 6, 7, 8], @@ -43,13 +43,13 @@ def run_experiments_ablation(model, mode, train_epochs=20, window=10, normalizat mean_lr, std_lr, scale_lr = 1e-06, 0.001, 10 # Baseline 1 -model = lambda: MLP(mode=None, mean_lr=mean_lr, gate_lr=scale_lr, scale_lr=std_lr) -run_experiments_ablation(model, 'mlp_std', window=15, normalization='std') +#model = lambda: MLP(mode=None, mean_lr=mean_lr, gate_lr=scale_lr, scale_lr=std_lr) +#run_experiments_ablation(model, 'mlp_std', window=15, normalization='std') # Baseline 2 -model = lambda: MLP(mode='avg', mean_lr=mean_lr, gate_lr=scale_lr, scale_lr=std_lr) -run_experiments_ablation(model, 'mlp_sample_avg', window=15, normalization=None) +#model = lambda: MLP(mode='avg', mean_lr=mean_lr, gate_lr=scale_lr, scale_lr=std_lr) +#run_experiments_ablation(model, 'mlp_sample_avg', window=15, normalization=None) # Proposed Method -model = lambda: MLP(mode='full', mean_lr=mean_lr, gate_lr=scale_lr, scale_lr=std_lr) +model = lambda: MLP(mode='adaptive_scale', mean_lr=mean_lr, gate_lr=scale_lr, scale_lr=std_lr) run_experiments_ablation(model, 'mlp_full', window=15, normalization=None) diff --git a/train_utils.py b/train_utils.py index c24db6b..369eebc 100644 --- a/train_utils.py +++ b/train_utils.py @@ -11,12 +11,15 @@ def lob_epoch_trainer(model, loader, lr=0.0001, optimizer=optim.RMSprop): model.train() - model_optimizer = optimizer([ - {'params': model.base.parameters()}, - {'params': model.dean.mean_layer.parameters(), 'lr': lr * model.dean.mean_lr}, - {'params': model.dean.scaling_layer.parameters(), 'lr': lr * model.dean.scale_lr}, - {'params': model.dean.gating_layer.parameters(), 'lr': lr * model.dean.gate_lr}, - ], lr=lr) + dean_params = [{'params': model.base.parameters()}] + if model.dean.mode in ['adaptive_avg', 'adaptive_scale', 'full']: + dean_params.append({'params': 
model.dean.mean_layer.parameters(), 'lr': lr * model.dean.mean_lr}) + if model.dean.mode in ['adaptive_scale', 'full']: + dean_params.append({'params': model.dean.scaling_layer.parameters(), 'lr': lr * model.dean.scale_lr}) + if model.dean.mode == 'full': + dean_params.append({'params': model.dean.gating_layer.parameters(), 'lr': lr * model.dean.gate_lr}) + + model_optimizer = optimizer(dean_params, lr=lr) criterion = CrossEntropyLoss() train_loss, counter = 0, 0 From 5a28cbc50878fab710db9607f5b74d9466412d5d Mon Sep 17 00:00:00 2001 From: Marek Otahal Date: Fri, 30 Oct 2020 21:03:53 +0100 Subject: [PATCH 2/3] do not duplicate codepath for DAIN 1+2+3 do not repeat the same code (for easier changes in one place only) --- dain.py | 92 ++++++++++++++++++++++++--------------------------------- 1 file changed, 38 insertions(+), 54 deletions(-) diff --git a/dain.py b/dain.py index 1a4674d..4f4fcf3 100644 --- a/dain.py +++ b/dain.py @@ -7,6 +7,8 @@ class DAIN_Layer(nn.Module): def __init__(self, mode='adaptive_avg', mean_lr=0.00001, gate_lr=0.001, scale_lr=0.00001, input_dim=144): super(DAIN_Layer, self).__init__() + assert mode in [None, 'avg', 'adaptive_avg', 'adaptive_scale', 'full'], f'Unsupported mode: {mode}!'\ + 'Use one of: None, "avg", "adaptive_avg", "adaptive_scale", "full". 
' print("Mode = ", mode) self.mode = mode @@ -32,64 +34,46 @@ def __init__(self, mode='adaptive_avg', mean_lr=0.00001, gate_lr=0.001, scale_lr def forward(self, x): # Expecting (n_samples, dim, n_feature_vectors) + ## Other methods: # Nothing to normalize if self.mode == None: - pass - + return x # Do simple average normalization elif self.mode == 'avg': avg = torch.mean(x, 2) avg = avg.resize(avg.size(0), avg.size(1), 1) x = x - avg - - # Perform only the first step (adaptive averaging) - elif self.mode == 'adaptive_avg': - avg = torch.mean(x, 2) - adaptive_avg = self.mean_layer(avg) - adaptive_avg = adaptive_avg.resize(adaptive_avg.size(0), adaptive_avg.size(1), 1) - x = x - adaptive_avg - - # Perform the first + second step (adaptive averaging + adaptive scaling ) - elif self.mode == 'adaptive_scale': - # Step 1: - avg = torch.mean(x, 2) - adaptive_avg = self.mean_layer(avg) - adaptive_avg = adaptive_avg.resize(adaptive_avg.size(0), adaptive_avg.size(1), 1) - x = x - adaptive_avg - - # Step 2: - std = torch.mean(x ** 2, 2) - std = torch.sqrt(std + self.eps) - adaptive_std = self.scaling_layer(std) - adaptive_std[adaptive_std <= self.eps] = 1 - - adaptive_std = adaptive_std.resize(adaptive_std.size(0), adaptive_std.size(1), 1) - x = x / (adaptive_std) - - elif self.mode == 'full': - - # Step 1: - avg = torch.mean(x, 2) - adaptive_avg = self.mean_layer(avg) - adaptive_avg = adaptive_avg.resize(adaptive_avg.size(0), adaptive_avg.size(1), 1) - x = x - adaptive_avg - - # # Step 2: - std = torch.mean(x ** 2, 2) - std = torch.sqrt(std + self.eps) - adaptive_std = self.scaling_layer(std) - adaptive_std[adaptive_std <= self.eps] = 1 - - adaptive_std = adaptive_std.resize(adaptive_std.size(0), adaptive_std.size(1), 1) - x = x / adaptive_std - - # Step 3: - avg = torch.mean(x, 2) - gate = F.sigmoid(self.gating_layer(avg)) - gate = gate.resize(gate.size(0), gate.size(1), 1) - x = x * gate - - else: - assert False - - return x + return x + + ## DAIN: + # Perform the first 
step: adaptive averaging; DAIN(1) + # Step 1: + avg = torch.mean(x, 2) + adaptive_avg = self.mean_layer(avg) + adaptive_avg = adaptive_avg.resize(adaptive_avg.size(0), adaptive_avg.size(1), 1) + x = x - adaptive_avg + if self.mode == 'adaptive_avg': + return x + + # Perform the second step: adaptive averaging + adaptive scaling; DAIN(1+2) + # Step 2: + std = torch.mean(x ** 2, 2) + std = torch.sqrt(std + self.eps) + adaptive_std = self.scaling_layer(std) + adaptive_std[adaptive_std <= self.eps] = 1 + + adaptive_std = adaptive_std.resize(adaptive_std.size(0), adaptive_std.size(1), 1) + x = x / adaptive_std + if self.mode == 'adaptive_scale': + return x + + # Perform the third step: adaptive avg + adaptive scale + gating; DAIN(1+2+3) + # Step 3: + avg = torch.mean(x, 2) + gate = F.sigmoid(self.gating_layer(avg)) + gate = gate.resize(gate.size(0), gate.size(1), 1) + x = x * gate + if self.mode == 'full': + return x + + assert False, "You fool! Should not reach here." From a2a7686f07fe471de3eec9eea545e1f53b937dec Mon Sep 17 00:00:00 2001 From: Marek Otahal Date: Fri, 30 Oct 2020 21:11:36 +0100 Subject: [PATCH 3/3] revert mistaken changes in exp_mlp --- exp_mlp.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/exp_mlp.py b/exp_mlp.py index b58c9f0..0fbea4b 100644 --- a/exp_mlp.py +++ b/exp_mlp.py @@ -27,7 +27,7 @@ def forward(self, x): return x -def run_experiments_ablation(model, mode, train_epochs=1, window=10, normalization=None): +def run_experiments_ablation(model, mode, train_epochs=20, window=10, normalization=None): results1 = train_evaluate_anchored(model, window=window, train_epochs=train_epochs, horizon=0, splits=[1, 2, 3, 4, 5, 6, 7, 8], @@ -43,13 +43,13 @@ def run_experiments_ablation(model, mode, train_epochs=1, window=10, normalizati mean_lr, std_lr, scale_lr = 1e-06, 0.001, 10 # Baseline 1 -#model = lambda: MLP(mode=None, mean_lr=mean_lr, gate_lr=scale_lr, scale_lr=std_lr) -#run_experiments_ablation(model, 'mlp_std', 
window=15, normalization='std') +model = lambda: MLP(mode=None, mean_lr=mean_lr, gate_lr=scale_lr, scale_lr=std_lr) +run_experiments_ablation(model, 'mlp_std', window=15, normalization='std') # Baseline 2 -#model = lambda: MLP(mode='avg', mean_lr=mean_lr, gate_lr=scale_lr, scale_lr=std_lr) -#run_experiments_ablation(model, 'mlp_sample_avg', window=15, normalization=None) +model = lambda: MLP(mode='avg', mean_lr=mean_lr, gate_lr=scale_lr, scale_lr=std_lr) +run_experiments_ablation(model, 'mlp_sample_avg', window=15, normalization=None) # Proposed Method -model = lambda: MLP(mode='adaptive_scale', mean_lr=mean_lr, gate_lr=scale_lr, scale_lr=std_lr) +model = lambda: MLP(mode='full', mean_lr=mean_lr, gate_lr=scale_lr, scale_lr=std_lr) run_experiments_ablation(model, 'mlp_full', window=15, normalization=None)